├── README.md ├── 第1章 数字人文下的汉字处理 ├── README.md ├── 第一章 数字人文下的汉字处理.pdf └── 篆体字识别 │ ├── ALL-Font │ ├── 370-B-v2-1.ttf │ ├── BaiZhouZhuanShuJiaoHan-1.ttf │ ├── ChaoShiJiXiJiaoZhuanTiFan-1.ttf │ ├── ChaoShiJiXiYinZhuanTiFan-1.ttf │ ├── FZTJLSK.TTF │ ├── FangYuanYinZhangZhuanTi-2.ttf │ ├── HOT-HTenshoStd-L-2.otf │ └── hktenkokk-1.ttf │ ├── DeleteBlankIMG.py │ ├── TTF.py │ ├── TTF2IMG.py │ ├── VGG16-train.py │ └── vgg16.py ├── 第2章 数字人文下的汉语分词 ├── README.md └── 第二章 数字人文下的汉字分词.pdf ├── 第3章 数字人文下的词性自动标注 ├── Bi-LSTM-CRF │ ├── LSTM_CRF │ │ ├── data │ │ │ ├── test_data │ │ │ ├── train_data │ │ │ └── word2id.pkl │ │ ├── lstm_crf_data_helper.py │ │ ├── lstm_crf_main.py │ │ └── lstm_crf_model.py │ ├── README.md │ └── env4BiLSTM-CRF.txt ├── CRF │ ├── conlleval.py │ ├── crf_learn.exe │ ├── crf_test.exe │ ├── data │ │ ├── test.txt │ │ └── train.txt │ ├── libcrfpp.dll │ ├── readme.md │ └── template ├── HMM │ ├── HMM.py │ └── data │ │ ├── corpus.txt │ │ ├── output.txt │ │ └── test.txt ├── README.md └── 第三章 数字人文下的词性自动标注.pdf ├── 第4章 数字人文下的实体识别 ├── BERT-NER-pytorch_sample │ ├── LICENSE │ ├── README.md │ ├── bert.py │ ├── conlleval.py │ ├── printmodel.py │ ├── requirements.txt │ ├── run.pid │ ├── run.sh │ ├── run_ner.py │ ├── run_test.sh │ ├── settings.py │ └── train_data_cixing │ │ ├── test.txt │ │ ├── train.txt │ │ └── valid.txt ├── ChinsesNER-pytorch-master │ ├── .gitignore │ ├── README.md │ ├── data │ │ ├── dev │ │ ├── train │ │ └── trainset │ ├── data_manager.py │ ├── main.py │ ├── model.py │ ├── models │ │ ├── config.yml │ │ ├── data.pkl │ │ └── params.pkl │ └── utils.py ├── README.md ├── 数据预处理 │ ├── data │ │ └── filename.txt │ ├── data_charseq │ │ └── filename.txt │ ├── data_seq │ │ └── filename.txt │ ├── pro_ner.py │ └── train_test_divide.py └── 第四章 数字人文下的命名实体识别.pdf ├── 第5章 数字人文下的模型预训练 ├── README.md ├── pytorch_chinese_lm_pretrain │ ├── README.md │ ├── data │ │ ├── eval.txt │ │ └── train.txt │ ├── output │ │ └── README.md │ ├── run_bert.sh │ ├── run_bert_from_scratch.sh │ ├── run_language_model_bert.py │ ├── run_language_model_ernie.py │ ├── run_language_model_roberta.py │ └── run_roberta.sh ├── transfer.py └── 第五章 数字人文下的模型预训练.pdf ├── 第6章 数字人文下的知识图谱构建及应用 ├── FLASK │ ├── .idea │ │ ├── .gitignore │ │ ├── .name │ │ ├── falsk_test.iml │ │ ├── inspectionProfiles │ │ │ └── profiles_settings.xml │ │ ├── misc.xml │ │ ├── modules.xml │ │ └── vcs.xml │ ├── FindSim.py │ ├── Params.py │ ├── app.py │ ├── data │ │ ├── char_vocabs.txt │ │ └── stopword.txt │ ├── entity_extractor.py │ ├── https │ │ ├── default │ │ ├── https.conf │ │ └── nginx.conf │ ├── kbqa.py │ ├── model │ │ ├── NB.m │ │ ├── ch_ner_model.h5 │ │ └── tf.pkl │ ├── predict.py │ ├── search_answer.py │ ├── static │ │ ├── 0.png │ │ └── default.css │ └── templates │ │ ├── 0.png │ │ ├── result.html │ │ └── search.html └── README.md ├── 第7章 数字人文下的文本分类 ├── README.md ├── 循环神经网络_文本分类.py ├── 第七章 数字人文下的文本分类.pdf └── 非遗信息 全.xlsx ├── 第8章 数字人文下的文本聚类 ├── README.md ├── code │ ├── cluster │ │ ├── DBSCAN.py │ │ ├── H_C.py │ │ ├── Hierarchy_C.py │ │ ├── kmeans-all.py │ │ ├── kmeans.py │ │ ├── kmeans(余弦相似度) │ │ │ ├── __init__.py │ │ │ ├── basealgorithm.py │ │ │ ├── basefunction.py │ │ │ └── kmeans-cos.py │ │ ├── mean-shift.py │ │ ├── onehot.py │ │ └── pca.py │ ├── ex_entity.py │ ├── ex_key │ │ ├── basextract.py │ │ └── extraction-keywords.py │ ├── some_deal │ │ ├── data.py │ │ ├── divide.py │ │ ├── keywords.py │ │ ├── porpotion.py │ │ ├── shufa.py │ │ ├── test.py │ │ └── title_info.py │ ├── tsne_plot │ │ ├── 3D+tsne2维画图.py │ │ ├── heatmap.py │ │ ├── tsne.py │ │ └── ttt.py │ ├── 
vsm.py │ ├── word2vec │ │ ├── doc2vec.py │ │ └── word2vector.py │ └── 分词 │ │ └── 中科院分词.py ├── 第八章 数字人文下的文本聚类.pdf └── 非遗信息 全.xlsx └── 第9章 数字人文下的机器翻译 ├── README.md ├── opennmt ├── README.md ├── data │ ├── pred.txt │ ├── src-test.txt │ └── tgt-test.txt ├── model │ └── 说明.txt ├── onmt │ ├── .idea │ │ ├── misc.xml │ │ ├── modules.xml │ │ ├── onmt.iml │ │ ├── vcs.xml │ │ └── workspace.xml │ ├── __init__.py │ ├── bin │ │ ├── __init__.py │ │ ├── preprocess.py │ │ ├── train.py │ │ └── translate.py │ ├── decoders │ │ ├── __init__.py │ │ ├── cnn_decoder.py │ │ ├── decoder.py │ │ ├── ensemble.py │ │ └── transformer.py │ ├── encoders │ │ ├── __init__.py │ │ ├── cnn_encoder.py │ │ ├── encoder.py │ │ ├── ggnn_encoder.py │ │ ├── mean_encoder.py │ │ ├── rnn_encoder.py │ │ └── transformer.py │ ├── inputters │ │ ├── MakeToken.py │ │ ├── __init__.py │ │ ├── datareader_base.py │ │ ├── dataset_base.py │ │ ├── inputter.py │ │ └── text_dataset.py │ ├── model_builder.py │ ├── models │ │ ├── __init__.py │ │ ├── model.py │ │ ├── model_saver.py │ │ ├── sru.py │ │ └── stacked_rnn.py │ ├── modules │ │ ├── __init__.py │ │ ├── average_attn.py │ │ ├── conv_multi_step_attention.py │ │ ├── copy_generator.py │ │ ├── embeddings.py │ │ ├── gate.py │ │ ├── global_attention.py │ │ ├── multi_headed_attn.py │ │ ├── position_ffn.py │ │ ├── source_noise.py │ │ ├── sparse_activations.py │ │ ├── sparse_losses.py │ │ ├── structured_attention.py │ │ ├── util_class.py │ │ └── weight_norm.py │ ├── opts.py │ ├── train_single.py │ ├── trainer.py │ ├── translate │ │ ├── __init__.py │ │ ├── beam_search.py │ │ ├── decode_strategy.py │ │ ├── greedy_search.py │ │ ├── penalties.py │ │ ├── translation.py │ │ └── translator.py │ └── utils │ │ ├── __init__.py │ │ ├── alignment.py │ │ ├── cnn_factory.py │ │ ├── distributed.py │ │ ├── earlystopping.py │ │ ├── logging.py │ │ ├── loss.py │ │ ├── misc.py │ │ ├── optimizers.py │ │ ├── parse.py │ │ ├── report_manager.py │ │ ├── rnn_factory.py │ │ └── statistics.py ├── setup.py └── translate.py └── 第九章 数字人文下的机器翻译.pdf /README.md: -------------------------------------------------------------------------------- 1 | # Resources for Digital Humanities Tutorial 2 | 3 | 《数字人文教程》配套资源合集 4 | 5 | 内含: 6 | 7 | - 代码 8 | - ppt 9 | -------------------------------------------------------------------------------- /第1章 数字人文下的汉字处理/README.md: -------------------------------------------------------------------------------- 1 | 2 | # 此处内容为本书第一章篆体字自动识别项目源代码、PPT。 3 | 4 | 运行环境: 5 | 6 | Python3.7 7 | 8 | tensorflow=2.3 9 | 10 | pillow=8.2 11 | 12 | fonttools=4.24.4 13 | 14 | -------------------------------------------------------------------------------- /第1章 数字人文下的汉字处理/第一章 数字人文下的汉字处理.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第1章 数字人文下的汉字处理/第一章 数字人文下的汉字处理.pdf -------------------------------------------------------------------------------- /第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/370-B-v2-1.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/370-B-v2-1.ttf -------------------------------------------------------------------------------- /第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/BaiZhouZhuanShuJiaoHan-1.ttf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/BaiZhouZhuanShuJiaoHan-1.ttf -------------------------------------------------------------------------------- /第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/ChaoShiJiXiJiaoZhuanTiFan-1.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/ChaoShiJiXiJiaoZhuanTiFan-1.ttf -------------------------------------------------------------------------------- /第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/ChaoShiJiXiYinZhuanTiFan-1.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/ChaoShiJiXiYinZhuanTiFan-1.ttf -------------------------------------------------------------------------------- /第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/FZTJLSK.TTF: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/FZTJLSK.TTF -------------------------------------------------------------------------------- /第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/FangYuanYinZhangZhuanTi-2.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/FangYuanYinZhangZhuanTi-2.ttf -------------------------------------------------------------------------------- /第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/HOT-HTenshoStd-L-2.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/HOT-HTenshoStd-L-2.otf -------------------------------------------------------------------------------- /第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/hktenkokk-1.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/hktenkokk-1.ttf -------------------------------------------------------------------------------- /第1章 数字人文下的汉字处理/篆体字识别/DeleteBlankIMG.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | from tqdm import tqdm 6 | 7 | 8 | def clean_blank(): 9 | # 解决全白图片的方案: 10 | blank_img_array = [255] * np.ones((30, 30, 3)) # 创建一个空白图片矩阵 11 | img_dir = "data" # 设置待清除空白图片的文件夹路径 12 | for each_cls in tqdm(os.listdir(img_dir), desc='正在清除空白图片'): 13 | dir_path = os.path.join(img_dir, each_cls) 14 | for each_img in os.listdir(dir_path): 15 | image_path = os.path.join(dir_path, each_img) 16 | # 加载图片,并将图片转成ndarray类型 17 | img_array = tf.keras.preprocessing.image.img_to_array(tf.keras.preprocessing.image.load_img(image_path)) 18 | # 判断每张图片是否是空白图片,若是,则删除。 19 | if (blank_img_array == img_array).all(): 20 | os.remove(image_path) 21 | else: 22 | continue 23 | 24 | 25 | if __name__ == '__main__': 26 | clean_blank() 27 | 
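Note: the chapter-1 pipeline renders seal-script images from the fonts in ALL-Font (TTF.py / TTF2IMG.py), removes blank renderings (DeleteBlankIMG.py above), and then trains a classifier with VGG16-train.py / vgg16.py, whose sources are not reproduced in this listing. The sketch below is only a minimal illustration of that last step under the TensorFlow 2.3 environment named in the chapter README; the "data" directory name, the 32×32 resize (tf.keras.applications.VGG16 requires inputs of at least 32×32, while the generated images are 30×30), the batch size, and the epoch count are assumptions, not the repository's actual training settings.

```python
# Hypothetical sketch only -- not the repository's VGG16-train.py / vgg16.py.
# Assumes the images written by TTF2IMG.py and cleaned by DeleteBlankIMG.py
# sit under "data/<character>/<n>.png" and are resized to 32x32, because
# tf.keras.applications.VGG16 requires spatial inputs of at least 32x32.
import tensorflow as tf

IMG_SIZE = (32, 32)   # assumed resize target; the generated images are 30x30
BATCH_SIZE = 64       # assumed

# Build labelled datasets from the class-per-folder layout produced by TTF2IMG.py.
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    "data", image_size=IMG_SIZE, batch_size=BATCH_SIZE,
    validation_split=0.1, subset="training", seed=42)
val_ds = tf.keras.preprocessing.image_dataset_from_directory(
    "data", image_size=IMG_SIZE, batch_size=BATCH_SIZE,
    validation_split=0.1, subset="validation", seed=42)
num_classes = len(train_ds.class_names)

# VGG16 trained from scratch (weights=None) with a small classification head.
base = tf.keras.applications.VGG16(weights=None, include_top=False,
                                   input_shape=IMG_SIZE + (3,))
model = tf.keras.Sequential([
    tf.keras.layers.experimental.preprocessing.Rescaling(1.0 / 255),
    base,
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(num_classes, activation="softmax"),
])
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])
model.fit(train_ds, validation_data=val_ds, epochs=10)  # epoch count assumed
```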
-------------------------------------------------------------------------------- /第1章 数字人文下的汉字处理/篆体字识别/TTF.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | from PIL import Image, ImageFont, ImageDraw 5 | from fontTools.ttLib import TTFont 6 | 7 | ''' 8 | 字体转图片 9 | ''' 10 | 11 | 12 | def char_to_img(all_chara, img_dir, uniMap, font, img_size): 13 | i = 0 14 | for chara in all_chara: 15 | # 判断是否存在该字 16 | if ord(chara) in uniMap: 17 | # 新建长宽为300像素,背景色为白色的画布对象 18 | im = Image.new("RGB", (img_size, img_size), "white") 19 | draw = ImageDraw.Draw(im) 20 | # 从画布的坐标(0, 0)处绘制黑色汉字文本 21 | draw.text((0, 0), chara, fill="#000", font=font) 22 | # 获取图像中非零区域边界并裁剪 23 | im = im.crop(im.getbbox()) 24 | # 保存汉字图像 25 | if not os.path.exists(img_dir + "/" + chara + "/"): 26 | os.mkdir(img_dir + "/" + chara + "/") 27 | save_path = img_dir + "/" + chara + "/" + str(len(os.listdir(img_dir + "/" + chara + "/"))) + ".png" 28 | im.save(save_path) 29 | 30 | 31 | if __name__ == '__main__': 32 | start = time.clock() 33 | TTF_DIR = "ALL-Font" # 存放.tff字体文件夹 34 | img_dir = "data" # 生成图片存储路径 35 | img_size = 30 # 生成图片存储路径 36 | 37 | # 判断是否存在文件夹,若否,则创建 38 | if not os.path.exists(img_dir): 39 | os.makedirs(img_dir) 40 | 41 | # 选取需要保存的汉字 42 | all_chara = [chr(i) for i in range(19968,26000)] 43 | 44 | # 遍历每一个.tff字体文件 45 | for each_font in os.listdir(TTF_DIR): 46 | ttf_path = TTF_DIR + "/" + each_font 47 | print("********" + ttf_path + "*****************") 48 | # 创建int型unicode编码与字符映射表 49 | fontmap = TTFont(ttf_path) 50 | uniMap = fontmap['cmap'].tables[0].ttFont.getBestCmap() 51 | # 加载并创建指定大小的字体对象 52 | font = ImageFont.truetype(ttf_path, img_size) 53 | char_to_img(all_chara, img_dir, uniMap, font, img_size) 54 | print('time spent: {}'.format(time.clock()-start)) -------------------------------------------------------------------------------- /第1章 数字人文下的汉字处理/篆体字识别/TTF2IMG.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | from PIL import Image, ImageFont, ImageDraw 5 | from fontTools.ttLib import TTFont 6 | 7 | ''' 8 | 字体转图片 9 | ''' 10 | 11 | 12 | def char_to_img(all_chara, img_dir, uniMap, font, img_size): 13 | i = 0 14 | for chara in all_chara: 15 | # 判断是否存在该字 16 | if ord(chara) in uniMap: 17 | # 新建长宽为300像素,背景色为白色的画布对象 18 | im = Image.new("RGB", (img_size, img_size), "white") 19 | draw = ImageDraw.Draw(im) 20 | # 从画布的坐标(0, 0)处绘制黑色汉字文本 21 | draw.text((0, 0), chara, fill="#000", font=font) 22 | # 获取图像中非零区域边界并裁剪 23 | im = im.crop(im.getbbox()) 24 | # 保存汉字图像 25 | if not os.path.exists(img_dir + "/" + chara + "/"): 26 | os.mkdir(img_dir + "/" + chara + "/") 27 | save_path = img_dir + "/" + chara + "/" + str(len(os.listdir(img_dir + "/" + chara + "/"))) + ".png" 28 | im.save(save_path) 29 | 30 | 31 | if __name__ == '__main__': 32 | start = time.clock() 33 | TTF_DIR = "ALL-Font" # 存放.tff字体文件夹 34 | img_dir = "data" # 生成图片存储路径 35 | img_size = 30 # 生成图片存储路径 36 | 37 | # 判断是否存在文件夹,若否,则创建 38 | if not os.path.exists(img_dir): 39 | os.makedirs(img_dir) 40 | 41 | # 选取需要保存的汉字 42 | all_chara = [chr(i) for i in range(19968,26000)] 43 | 44 | # 遍历每一个.tff字体文件 45 | for each_font in os.listdir(TTF_DIR): 46 | ttf_path = TTF_DIR + "/" + each_font 47 | print("********" + ttf_path + "*****************") 48 | # 创建int型unicode编码与字符映射表 49 | fontmap = TTFont(ttf_path) 50 | uniMap = fontmap['cmap'].tables[0].ttFont.getBestCmap() 51 | # 加载并创建指定大小的字体对象 52 | font = ImageFont.truetype(ttf_path, img_size) 53 | 
char_to_img(all_chara, img_dir, uniMap, font, img_size) 54 | print('time spent: {}'.format(time.clock()-start)) -------------------------------------------------------------------------------- /第2章 数字人文下的汉语分词/README.md: -------------------------------------------------------------------------------- 1 | ## 非物质文化遗产自动分词系统 2 | 此项目文件夹提供的是“第二章:数字人文下的汉语分词”中非物质文化遗产自动分词系统的源代码,系统功能及使用方法可参见教材第二章。 3 | ### 源码下载 4 | 由于源文件数量较多,因此请从百度云盘下载源代码: 5 | >下载链接:https://pan.baidu.com/s/14RAwSzgTBDI3asVUdASLGQ 6 | > 提取码:rqf0 7 | 8 | - 下载的源代码包含两部分: 9 | - (1)系统源码 10 | - (2)编译后的可执行文件 11 | ### 使用方式 12 | - #### 运行编译后的可执行文件(推荐): 13 | 首先进入下载的文件夹,再进入`dist(打包完成的)`,再进入`ICHAutoWordSegGUI`,找到并运行`ICHAutoWordSegGUI.exe`文件即可。 14 | - #### 运行系统源码(进阶): 15 | 1. 在conda中配置PyQT运行环境:本项目依赖的环境见项目文件夹中的`environment.yml`,配置方式请参考参考在“在Pycharm配置QtDesigner和PyUIC”的有关教程。 16 | 2. 在下载的文件夹中,找到并运行`ICHAutoWordSegGUI.py`模块即可。 -------------------------------------------------------------------------------- /第2章 数字人文下的汉语分词/第二章 数字人文下的汉字分词.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第2章 数字人文下的汉语分词/第二章 数字人文下的汉字分词.pdf -------------------------------------------------------------------------------- /第3章 数字人文下的词性自动标注/Bi-LSTM-CRF/LSTM_CRF/data/word2id.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第3章 数字人文下的词性自动标注/Bi-LSTM-CRF/LSTM_CRF/data/word2id.pkl -------------------------------------------------------------------------------- /第3章 数字人文下的词性自动标注/Bi-LSTM-CRF/LSTM_CRF/lstm_crf_data_helper.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | import pickle as pickle 3 | 4 | import numpy as np 5 | 6 | 7 | # 用于字标注向索引的转换,因为tensorflow中的crf接收的数据是数字形式的索引 8 | def tags2id(tags_list, tag2label): 9 | ''' 10 | :param tags_list: 11 | :param tag2label:字标注向索引映射的字典 12 | :return: 13 | ''' 14 | tags_label_list = [] 15 | for tags in tags_list: 16 | tags_label = [] 17 | for tag in tags: 18 | tags_label.append(tag2label[tag]) 19 | tags_label_list.append(tags_label) 20 | print('final tags2id') 21 | return tags_label_list 22 | 23 | 24 | # 用于获得数据 25 | def get_data(file_location): 26 | ''' 27 | :param file_location: 文件的存放路径,注意文件中的字和字标注是空格隔开的 28 | :return: 两个大的List,前面的list里面一个个小list存放的是一个个句子,后面的list里面一个个小list存放的是一个个句子对应的标签 29 | ''' 30 | sentences_list = [] 31 | tags_list = [] 32 | with open(file_location, 'r', encoding='utf-8') as fr: 33 | sentence_list = [] 34 | tag_list = [] 35 | for line in fr.readlines(): 36 | if line != '\n': 37 | [word, tag] = line.strip().split() 38 | sentence_list.append(word) 39 | tag_list.append(tag) 40 | else: 41 | sentences_list.append(sentence_list) 42 | tags_list.append(tag_list) 43 | sentence_list = [] 44 | tag_list = [] 45 | print('final get_data') 46 | return sentences_list, tags_list 47 | 48 | 49 | # 用于获得训练集中每个字对应的id,返回的是键为字值为id的一个字典 50 | # 注意这个dict只用于当前训练集,换训练集需要自己生成 51 | def get_word_id(file_location): 52 | with open(file_location, 'rb') as fr: 53 | word2id_dict = pickle.load(fr) 54 | print('final get_word_id') 55 | return word2id_dict 56 | 57 | 58 | # 用于初始化字向量,这里并没有通过word2vec获得,而是通过随机正太分布获得 59 | def random_embedding(word2id_dict, embedding_size): 60 | ''' 61 | :param word2id_dict: 用于获得总的字符个数 62 | :param embedding: 每个字的维度 63 | :return: 字向量组 64 | ''' 65 | embedding_mat = 
np.random.uniform(-0.25, 0.25, (len(word2id_dict), embedding_size)) 66 | embedding_mat = np.array(embedding_mat).astype(np.float32) 67 | print('final random_embedding') 68 | return embedding_mat 69 | 70 | 71 | # 获得一个句子中每个字对应的索引 72 | def sentence2id(sentences, word2id_dict): 73 | ''' 74 | :param sentences: 所有句子 75 | :param word2id_dict: 记录字和字对应索引的字典 76 | :return: 包含所有句子中每个字索引的List 77 | ''' 78 | sentences_id_list = [] 79 | for sentence in sentences: 80 | sentence_id_list = [] 81 | for word in sentence: 82 | if str(word).isdigit(): 83 | word = '' 84 | elif ('\u0041' <= word <= '\u005a') or ('\u0061' <= word <= '\u007a'): 85 | word = '' 86 | if word not in word2id_dict.keys(): 87 | word = '' 88 | sentence_id_list.append(word2id_dict[word]) 89 | sentences_id_list.append(sentence_id_list) 90 | print('final sentence2id') 91 | return sentences_id_list 92 | 93 | 94 | # 对句子和标签都可进行填充,同时获得每个序列长度的列表 95 | def padding_sentences(sentences_index, pad_mark=0): 96 | ''' 97 | :param sentences_index: 每个句子各个字或者字标注对应的索引 98 | :param pad_mark: 用什么进行填充,默认为用零进行填充 99 | :return: 填充后的各个句子或标注的索引和序列长度列表 100 | ''' 101 | sen_max_len = max(map(lambda x: len(x), sentences_index)) 102 | sen_index_list, sen_len_list = [], [] 103 | for sen_index in sentences_index: 104 | sen_index = list(sen_index) 105 | new_sentence_index = sen_index[:sen_max_len] + [pad_mark] * max(sen_max_len - len(sen_index), 0) 106 | sen_index_list.append(new_sentence_index) 107 | sen_len_list.append(min(len(sen_index), sen_max_len)) 108 | return np.array(sen_index_list), np.array(sen_len_list) 109 | 110 | 111 | # if __name__ == '__main__': 112 | # word2id = get_word_id('data/word2id.pkl') 113 | # for key, id in word2id.items(): 114 | # if id == 0: 115 | # print(key) 116 | -------------------------------------------------------------------------------- /第3章 数字人文下的词性自动标注/Bi-LSTM-CRF/LSTM_CRF/lstm_crf_model.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | import tensorflow as tf 3 | from tensorflow.contrib.crf import crf_log_likelihood 4 | from tensorflow.contrib.rnn import LSTMCell 5 | 6 | 7 | class BiLSTM_CRF(object): 8 | def __init__(self, hidden_dim, num_tags, input_x, sequence_lengths, dropout_pl, labels): 9 | self.hidden_dim = hidden_dim 10 | self.num_tags = num_tags 11 | self.input_x = input_x 12 | self.sequence_lengths = sequence_lengths 13 | self.dropout_pl = dropout_pl 14 | self.labels = labels 15 | 16 | # 建立模型,执行正向传播,返回正向传播得到的值 17 | def positive_propagation(self): 18 | with tf.variable_scope('lstm-crf'): 19 | cell_fw = LSTMCell(self.hidden_dim) 20 | cell_bw = LSTMCell(self.hidden_dim) 21 | # inputs(self.input_x)的shape通常是[batch_size, sequence_length, dim_embedding] 22 | # output_fw_seq和output_bw_seq的shape都是[batch_size, sequence_length, num_units] 23 | (output_fw_seq, output_bw_seq), _ = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, self.input_x, 24 | self.sequence_lengths, dtype=tf.float32) 25 | out_put = tf.concat([output_fw_seq, output_bw_seq], axis=-1) # 对正反向的输出进行合并 26 | out_put = tf.nn.dropout(out_put, self.dropout_pl) # 防止过拟合 27 | # 循环神经网络之后进行一次线性变换,用于把输出转换为crf_log_likelihood的接收格式,主要 28 | # 是把最后一维的维度转换为num_tags,以便于随后进行优化 29 | with tf.variable_scope('proj'): 30 | W = tf.get_variable(name='W', 31 | shape=[2 * self.hidden_dim, self.num_tags], 32 | initializer=tf.contrib.layers.xavier_initializer(), 33 | dtype=tf.float32 34 | ) 35 | b = tf.get_variable(name='b', 36 | shape=[self.num_tags], 37 | initializer=tf.zeros_initializer, 38 | dtype=tf.float32 39 | ) 40 | s = tf.shape(out_put) 
41 | # 正向传播的结果计算 42 | out_put = tf.reshape(out_put, [-1, 2 * self.hidden_dim]) # 就是一个维度变换 43 | pred = tf.matmul(out_put, W) + b # 进行线性变换 44 | # s[1]是所选取的最大句子长度 45 | logits = tf.reshape(pred, [-1, s[1], self.num_tags]) 46 | 47 | # CRF损失值的计算 48 | # transition_params是CRF的转换矩阵,会被自动计算出来 49 | # tag_indices:填入维度为[batch_size, max_seq_len]的矩阵,也就是Golden标签,注意这里的标签都是以索引方式表示的这个就是真实的标签序列了 50 | # sequence_lengths:维度为[batch_size]的向量,记录了每个序列的长度 51 | # inputs:unary potentials,也就是每个标签的预测概率值,这个值根据实际情况选择计算方法,CNN,RNN...都可以 52 | # crf_log_likelihood求的是CRF的损失值,牵扯到前向后向算法,会获得概率转移矩阵 53 | log_likelihood, transition_params = crf_log_likelihood(inputs=logits, tag_indices=self.labels, 54 | sequence_lengths=self.sequence_lengths) 55 | loss = -tf.reduce_mean(log_likelihood) 56 | return loss, transition_params, logits 57 | -------------------------------------------------------------------------------- /第3章 数字人文下的词性自动标注/Bi-LSTM-CRF/README.md: -------------------------------------------------------------------------------- 1 | # Bi-LSTM-CRF 2 | 利用Bi-LSTM-CRF实现词性标注 3 | 4 | ### 使用方式 5 | 1. 配置环境 6 | 本项目在`env4BiLSTM-CRF.txt`所列环境中测试通过,其中下列python版本和tensorflow版本为必须项: 7 | > Python == 3.7.6 8 | > tensorflow == 1.12.0 9 | 2. 首先进入`LSTM_CRF`目录,然后直接运行`lstm_crf_main.py`即可。 10 | -------------------------------------------------------------------------------- /第3章 数字人文下的词性自动标注/Bi-LSTM-CRF/env4BiLSTM-CRF.txt: -------------------------------------------------------------------------------- 1 | Package Version 2 | ------------------- --------- 3 | absl-py 0.7.0 4 | astor 0.7.1 5 | astroid 2.3.3 6 | certifi 2020.6.20 7 | cycler 0.10.0 8 | Cython 0.29.15 9 | gast 0.2.2 10 | grpcio 1.16.1 11 | h5py 2.9.0 12 | isort 4.3.21 13 | joblib 0.16.0 14 | Keras-Applications 1.0.6 15 | Keras-Preprocessing 1.0.5 16 | kiwisolver 1.1.0 17 | lazy-object-proxy 1.4.3 18 | Markdown 3.0.1 19 | matplotlib 3.2.1 20 | mccabe 0.6.1 21 | mkl-fft 1.0.10 22 | mkl-random 1.0.2 23 | numpy 1.15.4 24 | pandas 0.25.3 25 | patsy 0.5.1 26 | pip 18.1 27 | protobuf 3.6.1 28 | pylint 2.4.4 29 | pyparsing 2.4.7 30 | python-dateutil 2.8.1 31 | pytz 2019.3 32 | scikit-learn 0.21.3 33 | scipy 1.2.1 34 | seaborn 0.9.0 35 | setuptools 40.6.3 36 | six 1.12.0 37 | statsmodels 0.10.1 38 | tensorboard 1.12.2 39 | tensorflow 1.12.0 40 | termcolor 1.1.0 41 | tornado 6.0.3 42 | typed-ast 1.4.0 43 | Werkzeug 1.0.1 44 | wheel 0.32.3 45 | wrapt 1.11.2 46 | -------------------------------------------------------------------------------- /第3章 数字人文下的词性自动标注/CRF/crf_learn.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第3章 数字人文下的词性自动标注/CRF/crf_learn.exe -------------------------------------------------------------------------------- /第3章 数字人文下的词性自动标注/CRF/crf_test.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第3章 数字人文下的词性自动标注/CRF/crf_test.exe -------------------------------------------------------------------------------- /第3章 数字人文下的词性自动标注/CRF/libcrfpp.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第3章 数字人文下的词性自动标注/CRF/libcrfpp.dll -------------------------------------------------------------------------------- /第3章 
数字人文下的词性自动标注/CRF/readme.md: -------------------------------------------------------------------------------- 1 | #### 使用CRF实现词性标注的步骤 2 | 以Window10为例: 3 | - 1.打开终端,并进入CRF项目文件夹 4 | - 同时按Win+R打开“运行”窗口。 5 | - 输入`cmd`并按`确定`。 6 | - 在终端窗口中,输入`cd`+`空格`+`/d`+`空格`+`CRF文件夹绝对路径`,如`cd /d code-for-digital-humanities-tutorial\第三章 数字人文下的词性自动标注\CRF` 7 | 8 | 9 | - 2.依次在终端中输入下述的CRF运行指令,即可实现基于CRF的词性标注模型的训练、测试、性能评估。 10 | 11 | #### CRF运行指令 12 | 依次在终端命令行输入下列命令,按Enter执行。 13 | 14 | 1.训练模型 15 | > crf_learn -p 8 template data/train.txt model 16 | 17 | 2.测试模型 18 | > crf_test -m model data/test.txt >output.txt 19 | 20 | 3.评估模型在测试集上的效果 21 | > python conlleval.py < output.txt >prf.txt -------------------------------------------------------------------------------- /第3章 数字人文下的词性自动标注/CRF/template: -------------------------------------------------------------------------------- 1 | # Unigram 2 | U00:%x[-2,0] 3 | U01:%x[-1,0] 4 | U02:%x[0,0] 5 | U03:%x[1,0] 6 | U04:%x[2,0] 7 | U05:%x[-1,0]/%x[0,0] 8 | U06:%x[0,0]/%x[1,0] 9 | 10 | # Bigram 11 | B 12 | -------------------------------------------------------------------------------- /第3章 数字人文下的词性自动标注/HMM/data/output.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第3章 数字人文下的词性自动标注/HMM/data/output.txt -------------------------------------------------------------------------------- /第3章 数字人文下的词性自动标注/README.md: -------------------------------------------------------------------------------- 1 | ## 词性标注代码 2 | 3 | 本文件夹包含将BiLSTM-CRF,CRF,HMM三种序列标注模型用于词性标注任务的方式 4 | -------------------------------------------------------------------------------- /第3章 数字人文下的词性自动标注/第三章 数字人文下的词性自动标注.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第3章 数字人文下的词性自动标注/第三章 数字人文下的词性自动标注.pdf -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/BERT-NER-pytorch_sample/README.md: -------------------------------------------------------------------------------- 1 | 9 | # BERT NER 10 | 11 | :bangbang: Model without Extra `X` label checkout branch [experiment](https://github.com/kamalkraj/BERT-NER/tree/experiment) 12 | 13 | Use google BERT to do CoNLL-2003 NER ! 
14 | 15 | 16 | # Requirements 17 | 18 | - `python3` 19 | - `pip3 install -r requirements.txt` 20 | 21 | # Run 22 | 23 | CUDA_VISIBLE_DEVICES=2 nohup python run_ner.py --data_dir=data/data_0/ --bert_model=/home/pgrad/pretrain_models/hflroberta/ --task_name=ner --output_dir=out/bert-base --max_seq_length=128 --do_train --train_batch_size=220 --num_train_epochs=3 --do_eval --warmup_proportion=0.4 > hflrobertabase.log 24 | 25 | 26 | # Result 27 | 28 | ### Validation Data 29 | ``` 30 | precision recall f1-score support 31 | 32 | MISC 0.9407 0.9304 0.9355 273 33 | LOC 0.9650 0.9881 0.9764 419 34 | PER 0.9844 0.9783 0.9813 322 35 | ORG 0.9794 0.9852 0.9822 337 36 | 37 | avg / total 0.9683 0.9734 0.9708 1351 38 | ``` 39 | ### Test Data 40 | ``` 41 | precision recall f1-score support 42 | 43 | ORG 0.9152 0.9073 0.9113 464 44 | PER 0.9767 0.9692 0.9730 260 45 | LOC 0.9397 0.9263 0.9330 353 46 | MISC 0.8276 0.9014 0.8629 213 47 | 48 | avg / total 0.9198 0.9240 0.9217 1290 49 | ``` 50 | 51 | ## Pretrained model download from [here](https://drive.google.com/file/d/1UKE2UVFStXZFtXFgZObGg5mo_MzW-ZoC/view?usp=sharing) 52 | 53 | # Inference 54 | 55 | ```python 56 | from bert import Ner 57 | 58 | model = Ner("out/") 59 | 60 | output = model.predict("Steve went to Paris") 61 | 62 | print(output) 63 | # { 64 | # "Steve": { 65 | # "tag": "B-PER", 66 | # "confidence": 0.999879002571106 67 | # }, 68 | # "went": { 69 | # "tag": "O", 70 | # "confidence": 0.9968552589416504 71 | # }, 72 | # "to": { 73 | # "tag": "O", 74 | # "confidence": 0.9996656179428101 75 | # }, 76 | # "Paris": { 77 | # "tag": "B-LOC", 78 | # "confidence": 0.999504804611206 79 | # } 80 | # } 81 | 82 | ``` 83 | 84 | 85 | ### Tensorflow version 86 | 87 | - https://github.com/kyzhouhzau/BERT-NER 88 | -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/BERT-NER-pytorch_sample/bert.py: -------------------------------------------------------------------------------- 1 | """BERT NER Inference.""" 2 | 3 | from __future__ import absolute_import, division, print_function 4 | 5 | import json 6 | import os 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | from nltk import word_tokenize 11 | from pytorch_pretrained_bert.modeling import (CONFIG_NAME, WEIGHTS_NAME, 12 | BertConfig, 13 | BertForTokenClassification) 14 | from pytorch_pretrained_bert.tokenization import BertTokenizer 15 | 16 | 17 | class Ner: 18 | 19 | def __init__(self,model_dir: str): 20 | self.model , self.tokenizer, self.model_config = self.load_model(model_dir) 21 | self.label_map = self.model_config["label_map"] 22 | self.max_seq_length = self.model_config["max_seq_length"] 23 | self.label_map = {int(k):v for k,v in self.label_map.items()} 24 | self.model.eval() 25 | 26 | def load_model(self, model_dir: str, model_config: str = "model_config.json"): 27 | model_config = os.path.join(model_dir,model_config) 28 | model_config = json.load(open(model_config)) 29 | output_config_file = os.path.join(model_dir, CONFIG_NAME) 30 | output_model_file = os.path.join(model_dir, WEIGHTS_NAME) 31 | config = BertConfig(output_config_file) 32 | model = BertForTokenClassification(config, num_labels=model_config["num_labels"]) 33 | model.load_state_dict(torch.load(output_model_file)) 34 | tokenizer = BertTokenizer.from_pretrained(model_config["bert_model"],do_lower_case=False) 35 | return model, tokenizer, model_config 36 | 37 | def tokenize(self, text: str): 38 | """ tokenize input""" 39 | words = word_tokenize(text) 40 | tokens = [] 41 | 
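        # valid_positions marks the first WordPiece of each word with 1 and its
        # continuation pieces with 0, so that one label per original word can be
        # recovered from the sub-token predictions later in predict().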
valid_positions = [] 42 | for i,word in enumerate(words): 43 | token = self.tokenizer.tokenize(word) 44 | tokens.extend(token) 45 | for i in range(len(token)): 46 | if i == 0: 47 | valid_positions.append(1) 48 | else: 49 | valid_positions.append(0) 50 | return tokens, valid_positions 51 | 52 | def preprocess(self, text: str): 53 | """ preprocess """ 54 | tokens, valid_positions = self.tokenize(text) 55 | ## insert "[CLS]" 56 | tokens.insert(0,"[CLS]") 57 | ## insert "[SEP]" 58 | tokens.append("[SEP]") 59 | segment_ids = [] 60 | for i in range(len(tokens)): 61 | segment_ids.append(0) 62 | input_ids = self.tokenizer.convert_tokens_to_ids(tokens) 63 | input_mask = [1] * len(input_ids) 64 | while len(input_ids) < self.max_seq_length: 65 | input_ids.append(0) 66 | input_mask.append(0) 67 | segment_ids.append(0) 68 | return input_ids,input_mask,segment_ids,valid_positions 69 | 70 | def predict(self, text: str): 71 | input_ids,input_mask,segment_ids,valid_positions = self.preprocess(text) 72 | input_ids = torch.tensor([input_ids],dtype=torch.long) 73 | input_mask = torch.tensor([input_mask],dtype=torch.long) 74 | segment_ids = torch.tensor([segment_ids],dtype=torch.long) 75 | with torch.no_grad(): 76 | logits = self.model(input_ids, segment_ids, input_mask) 77 | logits = F.softmax(logits,dim=2) 78 | logits_label = torch.argmax(logits,dim=2) 79 | logits_label = logits_label.detach().cpu().numpy() 80 | # import ipdb; ipdb.set_trace() 81 | logits_confidence = [values[label].item() for values,label in zip(logits[0],logits_label[0])] 82 | 83 | logits_label = [logits_label[0][index] for index,i in enumerate(input_mask[0]) if i.item()==1] 84 | logits_label.pop(0) 85 | logits_label.pop() 86 | 87 | assert len(logits_label) == len(valid_positions) 88 | labels = [] 89 | for valid,label in zip(valid_positions,logits_label): 90 | if valid: 91 | labels.append(self.label_map[label]) 92 | words = word_tokenize(text) 93 | assert len(labels) == len(words) 94 | output = [word:{"tag":label,"confidence":confidence} for word,label,confidence in zip(words,labels,logits_confidence)] 95 | return output 96 | -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/BERT-NER-pytorch_sample/printmodel.py: -------------------------------------------------------------------------------- 1 | from transformers import BertTokenizer,BertModel 2 | 3 | tokenizer = BertTokenizer.from_pretrained("pretrain_models/sikuroberta_vocabtxt") 4 | model = BertModel.from_pretrained("pretrain_models/sikuroberta_vocabtxt") 5 | print(model) 6 | 7 | -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/BERT-NER-pytorch_sample/requirements.txt: -------------------------------------------------------------------------------- 1 | pytorch-pretrained-bert==0.6.1 2 | seqeval==0.0.5 3 | torch==1.0.1.post2 4 | tqdm==4.31.1 5 | nltk==3.4 -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/BERT-NER-pytorch_sample/run.pid: -------------------------------------------------------------------------------- 1 | 74651 2 | -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/BERT-NER-pytorch_sample/run.sh: -------------------------------------------------------------------------------- 1 | ### 2 | # @Author: your name 3 | # @Date: 2021-04-15 10:54:17 4 | # @LastEditTime: 2021-04-15 11:00:54 5 | # @LastEditors: Please set LastEditors 6 | # @Description: In User Settings Edit 7 | # @FilePath: 
/BERT-NER-Pytorch/run.sh 8 | ### 9 | CUDA_VISIBLE_DEVICES=1 python run_ner.py --data_dir=train_data_cixing/ \ 10 | --bert_model=pretrain_models/sikuroberta_vocabtxt/ \ 11 | --task_name=ner \ 12 | --output_dir=output/train_data_cixing_out/ \ 13 | --max_seq_length=128 \ 14 | --do_train --do_eval --eval_batch_size=64 --train_batch_size=64 --num_train_epochs 10 \ 15 | --warmup_proportion=0.4 > logsikubert0.log 2>&1 & echo $! > run.pid -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/BERT-NER-pytorch_sample/run_test.sh: -------------------------------------------------------------------------------- 1 | ### 2 | # @Author: your name 3 | # @Date: 2021-04-15 10:54:17 4 | # @LastEditTime: 2021-04-15 11:00:54 5 | # @LastEditors: Please set LastEditors 6 | # @Description: In User Settings Edit 7 | # @FilePath: /BERT-NER-Pytorch/run.sh 8 | ### 9 | CUDA_VISIBLE_DEVICES=1 python run_ner.py --data_dir=train_data_book/data_2/ \ 10 | --bert_model=pretrain_models/sikuroberta_vocabtxt/ \ 11 | --task_name=ner \ 12 | --output_dir=output/train_book_sikuroberta_vocabtxt \ 13 | --max_seq_length=128 \ 14 | --do_eval --eval_batch_size=64 --train_batch_size=64 --num_train_epochs 5 \ 15 | --warmup_proportion=0.4 > logsikubert4.log 2>&1 & echo $! > run.pid -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/BERT-NER-pytorch_sample/settings.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: your name 3 | Date: 2021-04-14 21:26:33 4 | LastEditTime: 2021-04-15 10:53:15 5 | LastEditors: Please set LastEditors 6 | Description: In User Settings Edit 7 | FilePath: /BERT-CRF-Pytorch/processors/settings.py 8 | ''' 9 | LABELS=["X","O",'B-w', 'I-w', 'E-w', 'S-w', 'B-a', 'I-a', 'E-a', 'S-a', 'B-n', 'I-n', 'E-n', 'S-n', 'B-v', 'I-v', 'E-v', 'S-v', 'B-r', 'I-r', 'E-r', 'S-r', 'B-y', 'I-y', 'E-y', 'S-y', 'B-ns', 'I-ns', 'E-ns', 'S-ns', 'B-nr', 'I-nr', 'E-nr', 'S-nr', 'B-c', 'I-c', 'E-c', 'S-c', 'B-d', 'I-d', 'E-d', 'S-d', 'B-u', 'I-u', 'E-u', 'S-u', 'B-p', 'I-p', 'E-p', 'S-p', 'B-t', 'I-t', 'E-t', 'S-t', 'B-f', 'I-f', 'E-f', 'S-f', 'B-m', 'I-m', 'E-m', 'S-m', 'B-vs', 'I-vs', 'E-vs', 'S-vs', 'B-q', 'I-q', 'E-q', 'S-q', 'B-j', 'I-j', 'E-j', 'S-j', 'B-vy', 'I-vy', 'E-vy', 'S-vy', 'B-nx', 'I-nx', 'E-nx', 'S-nx', 'B-zn', 'I-zn', 'E-zn', 'S-zn', 'B-i', 'I-i', 'E-i', 'S-i', 'B-x', 'I-x', 'E-x', 'S-x', 'B-vw', 'I-vw', 'E-vw', 'S-vw', 'B-zv', 'I-zv', 'E-zv', 'S-zv', 'B-za', 'I-za', 'E-za', 'S-za', 'B-s', 'I-s', 'E-s', 'S-s',"[CLS]","[SEP]"] 10 | 11 | #LABELS=["X","O",'nr-B', 'nr-I', 'nr-E', 'nr-S', 'n-B', 'n-I', 'n-E', 'n-S', 'w-B', 'w-I', 'w-E', 'w-S', 'ns-B', 'ns-I', 'ns-E', 'ns-S', 'u-B', 'u-I', 'u-E', 'u-S', 'v-B', 'v-I', 'v-E', 'v-S', 'p-B', 'p-I', 'p-E', 'p-S', 'nx-B', 'nx-I', 'nx-E', 'nx-S', 'd-B', 'd-I', 'd-E', 'd-S', 'r-B', 'r-I', 'r-E', 'r-S', 'a-B', 'a-I', 'a-E', 'a-S', 'c-B', 'c-I', 'c-E', 'c-S', 't-B', 't-I', 't-E', 't-S', 'm-B', 'm-I', 'm-E', 'm-S', 'q-B', 'q-I', 'q-E', 'q-S', 'y-B', 'y-I', 'y-E', 'y-S', 'j-B', 'j-I', 'j-E', 'j-S', 'nc-B', 'nc-I', 'nc-E', 'nc-S', 'nrx-B', 'nrx-I', 'nrx-E', 'nrx-S', 'f-B', 'f-I', 'f-E', 'f-S', 'gv-B', 'gv-I', 'gv-E', 'gv-S', 'i-B', 'i-I', 'i-E', 'i-S',"[CLS]","[SEP]"] 12 | #LABELS=["X","O",'B-T','I-T','E-T','S-T',"[CLS]","[SEP]"] 13 | #LABELS=["X","O",'B','N','E','S',"[CLS]","[SEP]"] 14 | #LABELS=["X","O",'B-A','I-A','E-A','S-A',"[CLS]","[SEP]"] 15 | #LABELS=["X","O",'B','I','E','S',"[CLS]","[SEP]"] 16 | 
#LABELS=["X","O",'B-BOOK','M-BOOK','E-BOOK','S-BOOK',"[CLS]","[SEP]"] 17 | 18 | -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/BERT-NER-pytorch_sample/train_data_cixing/test.txt: -------------------------------------------------------------------------------- 1 | 使 S-v 2 | 司 B-nr 3 | 徒 E-nr 4 | 禁 S-v 5 | 掠 B-nr 6 | 欒 I-nr 7 | 氏 E-nr 8 | 者 S-r 9 | , S-w 10 | 歸 S-v 11 | 所 S-r 12 | 取 S-v 13 | 焉 S-y 14 | , S-w 15 | 使 S-v 16 | 候 S-n 17 | 出 S-v 18 | 諸 S-j 19 | 轘 B-ns 20 | 轅 E-ns 21 | 。 S-w 22 | -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/BERT-NER-pytorch_sample/train_data_cixing/train.txt: -------------------------------------------------------------------------------- 1 | 太 B-n 2 | 子 E-n 3 | 曰 S-v 4 | : S-w 5 | 6 | 使 S-v 7 | 問 S-v 8 | 弦 B-nr 9 | 多 E-nr 10 | 以 S-p 11 | 琴 S-n 12 | , S-w 13 | 曰 S-v 14 | : S-w 15 | 16 | 十 B-t 17 | 二 I-t 18 | 月 E-t 19 | 甲 B-t 20 | 戌 E-t 21 | , S-w 22 | 晉 S-ns 23 | 作 S-v 24 | 六 S-m 25 | 軍 S-n 26 | 。 S-w 27 | 28 | -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/BERT-NER-pytorch_sample/train_data_cixing/valid.txt: -------------------------------------------------------------------------------- 1 | 使 S-v 2 | 司 B-nr 3 | 徒 E-nr 4 | 禁 S-v 5 | 掠 B-nr 6 | 欒 I-nr 7 | 氏 E-nr 8 | 者 S-r 9 | , S-w 10 | 歸 S-v 11 | 所 S-r 12 | 取 S-v 13 | 焉 S-y 14 | , S-w 15 | 使 S-v 16 | 候 S-n 17 | 出 S-v 18 | 諸 S-j 19 | 轘 B-ns 20 | 轅 E-ns 21 | 。 S-w 22 | -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/ChinsesNER-pytorch-master/.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | __pycache__ -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/ChinsesNER-pytorch-master/README.md: -------------------------------------------------------------------------------- 1 | # ChinsesNER-pytorch 2 | 3 | ### train 4 | 5 | setp 1: edit **models/config.yml** 6 | 7 | embedding_size: 100 8 | hidden_size: 128 9 | model_path: models/ 10 | batch_size: 20 11 | dropout: 0.5 12 | tags: 13 | - ORG 14 | - PER 15 | 16 | step 2: train 17 | 18 | python3 main.py train 19 | or 20 | cn = ChineseNER("train") 21 | cn.train() 22 | 23 | ... 
24 | epoch [4] |██████ | 154/591 25 | loss 0.46 26 | evaluation 27 | ORG recall 1.00 precision 1.00 f1 1.00 28 | -------------------------------------------------- 29 | epoch [4] |██████ | 155/591 30 | loss 1.47 31 | evaluation 32 | ORG recall 0.92 precision 0.92 f1 0.92 33 | -------------------------------------------------- 34 | epoch [4] |██████ | 156/591 35 | loss 0.46 36 | evaluation 37 | ORG recall 0.94 precision 1.00 f1 0.97 38 | 39 | ### predict 40 | 41 | python3 main.py predict 42 | or 43 | cn = ChineseNER("predict") 44 | cn.predict() 45 | 46 | 请输入文本: 海利装饰材料有限公司 47 | [{'start': 0, 'stop': 10, 'word': '海利装饰材料有限公司', 'type': 'ORG'}] 48 | 49 | ### REFERENCES 50 | - [Log-Linear Models, MEMMs, and CRFs](http://www.cs.columbia.edu/~mcollins/crf.pdf) 51 | - [Neural Architectures for Named Entity Recognition](https://arxiv.org/pdf/1603.01360.pdf) 52 | -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/ChinsesNER-pytorch-master/models/config.yml: -------------------------------------------------------------------------------- 1 | embedding_size: 100 2 | hidden_size: 128 3 | model_path: models/ 4 | batch_size: 20 5 | dropout: 0.5 6 | tags: 7 | - ORG 8 | - PER -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/ChinsesNER-pytorch-master/models/data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第4章 数字人文下的实体识别/ChinsesNER-pytorch-master/models/data.pkl -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/ChinsesNER-pytorch-master/models/params.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第4章 数字人文下的实体识别/ChinsesNER-pytorch-master/models/params.pkl -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/ChinsesNER-pytorch-master/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | ''' 3 | @Author: yanwii 4 | @Date: 2018-11-07 13:52:12 5 | ''' 6 | 7 | def format_result(result, text, tag): 8 | entities = [] 9 | for i in result: 10 | begin, end = i 11 | entities.append({ 12 | "start":begin, 13 | "stop":end + 1, 14 | "word":text[begin:end+1], 15 | "type":tag 16 | }) 17 | return entities 18 | 19 | def get_tags(path, tag, tag_map): 20 | begin_tag = tag_map.get("B-" + tag) 21 | mid_tag = tag_map.get("I-" + tag) 22 | end_tag = tag_map.get("E-" + tag) 23 | single_tag = tag_map.get("S") 24 | o_tag = tag_map.get("O") 25 | begin = -1 26 | end = 0 27 | tags = [] 28 | last_tag = 0 29 | for index, tag in enumerate(path): 30 | if tag == begin_tag and index == 0: 31 | begin = 0 32 | elif tag == begin_tag: 33 | begin = index 34 | elif tag == end_tag and last_tag in [mid_tag, begin_tag] and begin > -1: 35 | end = index 36 | tags.append([begin, end]) 37 | elif tag == o_tag or tag == single_tag: 38 | begin = -1 39 | last_tag = tag 40 | return tags 41 | 42 | def f1_score(tar_path, pre_path, tag, tag_map): 43 | origin = 0. 44 | found = 0. 45 | right = 0. 
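    # origin = number of gold entity spans, found = number of predicted spans,
    # right = number of predicted spans that exactly match a gold span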
46 | for fetch in zip(tar_path, pre_path): 47 | tar, pre = fetch 48 | tar_tags = get_tags(tar, tag, tag_map) 49 | pre_tags = get_tags(pre, tag, tag_map) 50 | 51 | origin += len(tar_tags) 52 | found += len(pre_tags) 53 | 54 | for p_tag in pre_tags: 55 | if p_tag in tar_tags: 56 | right += 1 57 | 58 | recall = 0. if origin == 0 else (right / origin) 59 | precision = 0. if found == 0 else (right / found) 60 | f1 = 0. if recall+precision == 0 else (2*precision*recall)/(precision + recall) 61 | print("\t{}\trecall {:.2f}\tprecision {:.2f}\tf1 {:.2f}".format(tag, recall, precision, f1)) 62 | return recall, precision, f1 -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## 命名实体识别代码 3 | 4 | 该部分为本书第四章节对应的源代码,包含数据预处理和使用LSTM-CRF与BERT实现命名实体识别的代码实现 5 | 6 | 7 | ## 数据预处理模块 8 | 9 | 1.将预处理的数据放入data文件夹下,其格式需与filename.txt保持一致。 10 | 2.运行pro_ner.py将数据转为BIOES标注格式 11 | 3.运行train_test_divide.py划分训练集与测试集 12 | 13 | 14 | ## 命名实体识别模块 15 | 16 | 1.BILSTM-CRF代码见ChinsesNER-pytorch-master文件夹 17 | 2.BERT代码见BERT-NER-pytorch_sample文件夹 18 | -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/数据预处理/data/filename.txt: -------------------------------------------------------------------------------- 1 | 軒轅/nr 乃/d 修/v 德/n 振/v 兵/nrx ,/w 治/v 五氣/n ,/w 藝/v 五種/nrx ,/w 撫/v 萬民/nrx ,/w 度/v 四方/n ,/w 教/v 熊羆/nrx 貔貅/nrx 貙虎/nrx ,/w 以/p 與/p 炎帝/nr 戰/v 於/p 阪泉/ns 之/u 野/n 。/w -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/数据预处理/data_charseq/filename.txt: -------------------------------------------------------------------------------- 1 | 軒 B-nr 2 | 轅 E-nr 3 | 乃 O 4 | 修 O 5 | 德 O 6 | 振 O 7 | 兵 O 8 | , O 9 | 治 O 10 | 五 O 11 | 氣 O 12 | , O 13 | 藝 O 14 | 五 O 15 | 種 O 16 | , O 17 | 撫 O 18 | 萬 O 19 | 民 O 20 | , O 21 | 度 O 22 | 四 O 23 | 方 O 24 | , O 25 | 教 O 26 | 熊 O 27 | 羆 O 28 | 貔 O 29 | 貅 O 30 | 貙 O 31 | 虎 O 32 | , O 33 | 以 O 34 | 與 O 35 | 炎 B-nr 36 | 帝 E-nr 37 | 戰 O 38 | 於 O 39 | 阪 B-ns 40 | 泉 E-ns 41 | 之 O 42 | 野 O 43 | 。 O 44 | 45 | -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/数据预处理/data_seq/filename.txt: -------------------------------------------------------------------------------- 1 | 軒轅 nr 2 | 乃 d 3 | 修 v 4 | 德 n 5 | 振 v 6 | 兵 nrx 7 | , w 8 | 治 v 9 | 五氣 n 10 | , w 11 | 藝 v 12 | 五種 nrx 13 | , w 14 | 撫 v 15 | 萬民 nrx 16 | , w 17 | 度 v 18 | 四方 n 19 | , w 20 | 教 v 21 | 熊羆 nrx 22 | 貔貅 nrx 23 | 貙虎 nrx 24 | , w 25 | 以 p 26 | 與 p 27 | 炎帝 nr 28 | 戰 v 29 | 於 p 30 | 阪泉 ns 31 | 之 u 32 | 野 n 33 | 。 w 34 | 35 | -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/数据预处理/pro_ner.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from string import punctuation 4 | 5 | from tqdm import tqdm 6 | from zhon import hanzi 7 | 8 | punc = hanzi.punctuation + punctuation 9 | 10 | 11 | def word_pos2word_seq(filepath, resultfolder='data_seq'): 12 | """ 13 | word/tag转换为word\ttag 14 | word指代词,tag指代词性标签 15 | (不带BIES) 16 | """ 17 | if resultfolder=='data_seq': 18 | if not os.path.exists('data_seq'):os.makedirs('data_seq') # 创建输出结果文件夹 19 | data_name = os.path.split(filepath)[1][:-4] # 获取当前输入数据文件文件名(不含前面的文件夹路径和最后的.txt) 20 | with open(filepath, 'rt', encoding='utf8')as f: 21 | with open('{}/{}.txt'.format(resultfolder, data_name), 'w', encoding='utf8') as r: 22 | for line in 
tqdm(f.readlines()): # 遍历读取每一行数据 23 | if line == '\n': # 若该行为空行则跳过 24 | # r.write('\n') 25 | continue 26 | content_lst = line.strip('\n\r').strip(' ') # 去除每行末尾空格 27 | content_lst = re.sub(' ', ' ', content_lst).split(' ')# 去除连续多余的空格为1个并按照空格拆分为列表: [word/tag, word2/tag2, ……] 28 | 29 | char_tag_lst = [c.split('/') for c in content_lst] 30 | char_lst = [c[0] for c in char_tag_lst] # word列表 31 | tag_lst = [c[1] for c in char_tag_lst] # tag列表 32 | 33 | for char, tag in zip(char_lst, tag_lst): 34 | r.write(char + '\t' + tag + '\n') 35 | r.write('\n') # 每行结束之后增加一个空行用于区分不同行转换出的序列 36 | 37 | 38 | def word_seq2char_seq(filepath,resultfolder='data_charseq'): 39 | """ 40 | word\ttag转换为char\ttag 41 | (带BIES) 42 | """ 43 | if resultfolder=='data_charseq': 44 | if not os.path.exists('data_charseq'): 45 | os.makedirs('data_charseq') 46 | data_name = os.path.split(filepath)[1] #[:-4] # 数据文件名 47 | sep_char = ' ' # 生成的文件 word tag中的分隔符 48 | 49 | with open(filepath, 'rt', encoding='utf-8-sig')as f: 50 | with open('{}/{}'.format(resultfolder,data_name), 'w', encoding='utf-8')as r: 51 | for line in tqdm(f.readlines()): # 遍历读取每行 word\t tag\n 52 | if line == '\n': 53 | # r.write(' \n') # crf_learn并不认可数据中使用’\n’作为sentence间的分割符(空行),但能够识别‘space(空格)\n’的空行分隔符。 54 | r.write('\n') # bert_ner_pytorch的断句 使用’\n’作为sentence间的分割符(空行) 55 | continue 56 | word_tag_lst = line.strip('\n').split('\t') 57 | word = word_tag_lst[0] # word 58 | tag = word_tag_lst[1] # tag 59 | 60 | char_lst = list(word) # 每行单个字组成的列表 61 | tag_lst = [] 62 | if tag not in ['nr', 'ns', 't']: # 本次识别的实体词性标签 63 | for char in word: 64 | r.write(char + sep_char +'O\n') # 将非此次需要识别的标签认定为O 65 | continue 66 | 67 | if len(word) == 1: # 单个字组成的实体,用S-tag表示 68 | # char_lst.append(word) 69 | tag_lst.append('S-' + tag) 70 | elif len(word) == 2: # 双字实体 71 | # char_lst.extend([word[0],word[1]]) 72 | tag_lst.extend(['B-' + tag, 'E-' + tag]) 73 | else: # 三字以上实体 74 | for id, char in enumerate(word): 75 | # char_lst.append(char) 76 | if id == 0: 77 | tag_lst.append('B-' + tag) 78 | elif id < len(word) - 1: 79 | tag_lst.append('I-' + tag) 80 | else: 81 | tag_lst.append('E-' + tag) 82 | for char, tag in zip(char_lst, tag_lst): 83 | r.write(char + sep_char + tag + '\n') 84 | 85 | 86 | def main(): 87 | word_pos2word_seq('data/filename.txt') 88 | word_seq2char_seq('data_seq/filename.txt') 89 | 90 | if __name__ == '__main__': 91 | main() 92 | -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/数据预处理/train_test_divide.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os import path 3 | 4 | from sklearn.model_selection import KFold 5 | from sklearn.model_selection import train_test_split 6 | from tqdm import tqdm 7 | 8 | 9 | def data_merge(folder, merged_txt='data_merged.txt'): 10 | """ 将多个txt的数据合并 """ 11 | if type(folder) == list: # 若folder为list,则汇总合并多个文件夹内所有文件的path 12 | paths = [] 13 | [paths.extend(path.join(folder_i, f) for f in os.listdir(folder_i)) for folder_i in folder] 14 | else: 15 | paths = [path.join(folder, f) for f in os.listdir(folder)] 16 | 17 | with open(merged_txt, 'w+', encoding='utf-8')as dmf: 18 | for p in tqdm(paths): 19 | with open(p, 'rt', encoding='utf-8')as pf: 20 | dmf.writelines(pf.readlines()) 21 | 22 | 23 | def load_data(filepaths=None, datafile=None): 24 | """ 加载文件夹内所有数据 """ 25 | if filepaths is None: 26 | filepaths = [datafile] 27 | data = [] 28 | [data.extend(open(path, 'rt', encoding='utf-8').readlines()) for path in filepaths] 29 | # random.shuffle(data) 
30 | return data 31 | 32 | 33 | def train_test_divide(data): 34 | """ 读取数据文件并划分为训练集、测试集 """ 35 | # random.shuffle(data) 36 | train_data, test_data = train_test_split(data, test_size=0.1) 37 | 38 | with open('train_data.txt', 'w+', encoding='utf-8')as tr: 39 | tr.write('\n'.join(train_data)) 40 | with open('test_data.txt', 'w+', encoding='utf-8')as te: 41 | te.write('\n'.join(test_data)) 42 | return train_data, test_data 43 | 44 | 45 | def train_test_divide_kfold(data, outputfolder='data'): 46 | """ 十折交叉验证划分数据训练集、测试集 """ 47 | [os.makedirs(outputfolder + os.sep + 'data_{}'.format(i)) for i in range(10) if not os.path.exists(outputfolder + os.sep + 'data_{}'.format(i))] 48 | 49 | kf = KFold(n_splits=10, shuffle=False) # shuffle 是否打乱数据,此处为否 50 | k = 0 51 | for train_index, test_index in kf.split(data): 52 | train_list = [data[tr].strip('\r\n') for tr in train_index] 53 | test_list = [data[te].strip('\r\n') for te in test_index] 54 | with open(outputfolder + os.sep + 'data_' + str(k) + os.sep + 'train.tsv', 'w+', encoding='utf-8')as tr: 55 | tr.write('\n'.join(train_list)) 56 | with open(outputfolder + os.sep + 'data_' + str(k) + os.sep + 'test.tsv', 'w+', encoding='utf-8')as te: 57 | te.write('\n'.join(test_list)) 58 | k += 1 59 | print('第{}折完成!'.format(k)) 60 | 61 | 62 | if __name__ == '__main__': 63 | file = '待划分文件路径(含文件名)' 64 | data = load_data(datafile=file) 65 | train_test_divide_kfold(data, '结果文件路径(含文件名)') -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/第四章 数字人文下的命名实体识别.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第4章 数字人文下的实体识别/第四章 数字人文下的命名实体识别.pdf -------------------------------------------------------------------------------- /第5章 数字人文下的模型预训练/README.md: -------------------------------------------------------------------------------- 1 | ## 语言模型预训练 2 | 3 | 本部分包含预训练BERT类模型和将bin模型转换为ckpt模型的代码 4 | 5 | 1.pytorch_chinese_lm_pretrain文件夹内包含bert类模型预训练的基础代码,可直接采用训练语句使用,也可通过修改sh文件夹中的内容使用。此处实现参照了transformers库的预训练预训练代码和中文模型预训练的github项目(https://github.com/zhusleep/pytorch_chinese_lm_pretrain) 6 | 7 | 2.transfer.py用于将预训练完成的bin文件转为ckpt格式,可供tensorflow框架加载。 8 | 9 | ## 建议运行环境 10 | ``` 11 | torch==1.6.0 12 | 13 | transformers==3.4.0 14 | 15 | 1.15.0<= tensorflow <2.0 16 | ``` 17 | -------------------------------------------------------------------------------- /第5章 数字人文下的模型预训练/pytorch_chinese_lm_pretrain/README.md: -------------------------------------------------------------------------------- 1 | # 基于pytorch的中文语言模型预训练 2 | 3 | 提供三种中文语言模型预训练的方法。预训练bert类模型对硬件的要求较高,建议在16G以上显存的设备上运行代码。 4 | 5 | ## bert-base-chinese 6 | 7 | (https://huggingface.co/bert-base-chinese) 8 | ​ 9 | 10 | 基于官方案例实现bert模型训练。 11 | 12 | https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling 13 | (本文使用的transformers版本为3.4.0) 14 | ``` 15 | python run_language_model_bert.py --output_dir=output --model_type=bert --model_name_or_path=bert-base-chinese --do_train --train_data_file=data/train.txt --do_eval --eval_data_file=data/eval.txt --mlm --per_device_train_batch_size=4 --save_total_limit=1 --num_train_epochs=5 16 | 17 | ``` 18 | 会自动从官网上下载bert-base-chinese模型来继续训练。 19 | 20 | ## roberta-wwm-ext 21 | 22 | (https://github.com/ymcui/Chinese-BERT-wwm) 23 | 24 | 25 | 要基于上面的代码run_language_model_roberta.py继续预训练roberta。还需要做两个改动。 26 | * 
下载roberta-wwm-ext到本地目录hflroberta,在config.json中修改“model_type”:"roberta"为"model_type":"bert"。 27 | * 对上面的run_language_modeling.py中的AutoModel和AutoTokenizer都进行替换为BertModel和BertTokenizer。 28 | 29 | 假设config.json已经改好,可以运行如下命令。 30 | ``` 31 | python run_language_model_roberta.py --output_dir=output --model_type=bert --model_name_or_path=hflroberta --do_train --train_data_file=data/train.txt --do_eval --eval_data_file=data/eval.txt --mlm --per_device_train_batch_size=4 --save_total_limit=1 --num_train_epochs=5 32 | ``` 33 | 34 | ### ernie 35 | https://github.com/nghuyong/ERNIE-Pytorch) 36 | 37 | ernie是百度发布的基于百度知道贴吧等中文语料结合实体预测等任务生成的预训练模型。这个模型的准确率在某些任务上要优于bert-base-chinese和roberta。如果基于ernie1.0模型做领域数据预训练的话只需要一步修改。 38 | 39 | * 下载ernie1.0到本地目录ernie,在config.json中增加字段"model_type":"bert"。 40 | 运行 41 | ``` 42 | python run_language_model_ernie.py --output_dir=output --model_type=bert --model_name_or_path=ernie --do_train --train_data_file=train.txt --do_eval --eval_data_file=eval.txt --mlm --per_device_train_batch_size=4 --save_total_limit=1 --num_train_epochs=5 43 | 44 | ``` 45 | -------------------------------------------------------------------------------- /第5章 数字人文下的模型预训练/pytorch_chinese_lm_pretrain/output/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /第5章 数字人文下的模型预训练/pytorch_chinese_lm_pretrain/run_bert.sh: -------------------------------------------------------------------------------- 1 | ### 2 | # @Author: your name 3 | # @Date: 2021-05-15 20:18:45 4 | # @LastEditTime: 2021-06-08 20:27:08 5 | # @LastEditors: Please set LastEditors 6 | # @Description: In User Settings Edit 7 | # @FilePath: /pytorch_chinese_lm_pretrain-master/run_bert.sh 8 | ### 9 | TRAIN_FILE='train.txt' 10 | TEST_FILE='eval.txt' 11 | PreTrain_Model='bert-base-chinese' 12 | mkdir -p log 13 | CUDA_VISIBLE_DEVICES=0,1 python run_language_model_bert.py \ 14 | --output_dir=output/$PreTrain_Model \ 15 | --model_type=bert \ 16 | --overwrite_output_dir \ 17 | --save_total_limit=3 \ 18 | --num_train_epochs=10 \ 19 | --learning_rate=5e-4 \ 20 | --local_rank=-1 \ 21 | --model_name_or_path=$PreTrain_Model \ 22 | --do_train \ 23 | --train_data_file=$TRAIN_FILE \ 24 | --do_eval \ 25 | --eval_data_file=$TEST_FILE \ 26 | --mlm \ 27 | --per_device_train_batch_size=32 \ 28 | > log/log_$PreTrain_Model.log 2>&1 & echo $! 
> log/run_$PreTrain_Model.pid 29 | -------------------------------------------------------------------------------- /第5章 数字人文下的模型预训练/pytorch_chinese_lm_pretrain/run_bert_from_scratch.sh: -------------------------------------------------------------------------------- 1 | ### 2 | # @Author: your name 3 | # @Date: 2021-05-14 20:57:35 4 | # @LastEditTime: 2021-05-30 14:55:08 5 | # @LastEditors: your name 6 | # @Description: In User Settings Edit 7 | # @FilePath: /pytorch_chinese_lm_pretrain-master/run_roberta.sh 8 | ### 9 | TRAIN_FILE='train.txt' 10 | TEST_FILE='eval.txt' 11 | PreTrain_Model='roberta-base' 12 | From_Scratch='/home/admin/zihe.zhu/pytorch_chinese_lm_pretrain-master/train_tokenizer/pretrained_models/' 13 | mkdir -p log 14 | CUDA_VISIBLE_DEVICES=0,1 nohup python full_copy.py \ 15 | --output_dir=output/$PreTrain_Model \ 16 | --model_type=bert \ 17 | --overwrite_output_dir \ 18 | --save_total_limit=3 \ 19 | --num_train_epochs=10 \ 20 | --learning_rate=5e-4 \ 21 | --local_rank=-1 \ 22 | --cache_dir=$From_Scratch \ 23 | --config_name=$From_Scratch \ 24 | --tokenizer_name=$From_Scratch \ 25 | --do_train \ 26 | --train_data_file=$TRAIN_FILE \ 27 | --do_eval \ 28 | --eval_data_file=$TEST_FILE \ 29 | --mlm \ 30 | --per_device_train_batch_size=32 \ 31 | > log/log_$PreTrain_Model.log 2>&1 & echo $! > log/run_$PreTrain_Model.pid -------------------------------------------------------------------------------- /第5章 数字人文下的模型预训练/pytorch_chinese_lm_pretrain/run_roberta.sh: -------------------------------------------------------------------------------- 1 | ### 2 | # @Author: your name 3 | # @Date: 2021-05-14 20:57:35 4 | # @LastEditTime: 2021-05-30 14:55:16 5 | # @LastEditors: your name 6 | # @Description: In User Settings Edit 7 | # @FilePath: /pytorch_chinese_lm_pretrain-master/run_roberta.sh 8 | ### 9 | TRAIN_FILE='train.txt' 10 | TEST_FILE='eval.txt' 11 | PreTrain_Model='roberta-base' 12 | mkdir -p log 13 | CUDA_VISIBLE_DEVICES=0,1 nohup python full_copy.py \ 14 | --output_dir=output/$PreTrain_Model \ 15 | --model_type=bert \ 16 | --overwrite_output_dir \ 17 | --save_total_limit=3 \ 18 | --num_train_epochs=10 \ 19 | --learning_rate=5e-4 \ 20 | --local_rank=-1 \ 21 | --model_name_or_path=$PreTrain_Model \ 22 | --do_train \ 23 | --train_data_file=$TRAIN_FILE \ 24 | --do_eval \ 25 | --eval_data_file=$TEST_FILE \ 26 | --mlm \ 27 | --per_device_train_batch_size=32 \ 28 | > log/log_$PreTrain_Model.log 2>&1 & echo $! > log/run_$PreTrain_Model.pid -------------------------------------------------------------------------------- /第5章 数字人文下的模型预训练/transfer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | """ 4 | Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint. 
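Parameter names are rewritten via var_map (e.g. LayerNorm weight/bias -> gamma/beta,
weight -> kernel) and the tensors listed in tensors_to_transpose are transposed,
so the weights can be restored as a TensorFlow 1.x BERT checkpoint.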
5 | """ 6 | 7 | import os 8 | 9 | import numpy as np 10 | import tensorflow as tf 11 | import torch 12 | from transformers import BertModel 13 | 14 | 15 | def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name: str): 16 | 17 | """ 18 | :param model:BertModel Pytorch model instance to be converted 19 | :param ckpt_dir: Tensorflow model directory 20 | :param model_name: model name 21 | :return: 22 | Currently supported Huggingface models: 23 | Y BertModel 24 | N BertForMaskedLM 25 | N BertForPreTraining 26 | N BertForMultipleChoice 27 | N BertForNextSentencePrediction 28 | N BertForSequenceClassification 29 | N BertForQuestionAnswering 30 | """ 31 | 32 | tensors_to_transpose = ("dense.weight", "attention.self.query", "attention.self.key", "attention.self.value") 33 | 34 | var_map = ( 35 | ("layer.", "layer_"), 36 | ("word_embeddings.weight", "word_embeddings"), 37 | ("position_embeddings.weight", "position_embeddings"), 38 | ("token_type_embeddings.weight", "token_type_embeddings"), 39 | (".", "/"), 40 | ("LayerNorm/weight", "LayerNorm/gamma"), 41 | ("LayerNorm/bias", "LayerNorm/beta"), 42 | ("weight", "kernel"), 43 | ) 44 | 45 | if not os.path.isdir(ckpt_dir): 46 | os.makedirs(ckpt_dir) 47 | 48 | state_dict = model.state_dict() 49 | 50 | def to_tf_var_name(name: str): 51 | for patt, repl in iter(var_map): 52 | name = name.replace(patt, repl) 53 | return "bert/{}".format(name) 54 | 55 | def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session): 56 | tf_dtype = tf.dtypes.as_dtype(tensor.dtype) 57 | tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer()) 58 | session.run(tf.variables_initializer([tf_var])) 59 | session.run(tf_var) 60 | return tf_var 61 | 62 | tf.reset_default_graph() 63 | with tf.Session() as session: 64 | for var_name in state_dict: 65 | tf_name = to_tf_var_name(var_name) 66 | torch_tensor = state_dict[var_name].numpy() 67 | if any([x in var_name for x in tensors_to_transpose]): 68 | torch_tensor = torch_tensor.T 69 | tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session) 70 | tf.keras.backend.set_value(tf_var, torch_tensor) 71 | tf_weight = session.run(tf_var) 72 | print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor))) 73 | 74 | saver = tf.train.Saver(tf.trainable_variables()) 75 | saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_").replace(".ckpt", "") + ".ckpt")) 76 | 77 | def convert(pytorch_bin_path: str, pytorch_bin_model: str, tf_ckpt_path: str, tf_ckpt_model: str): 78 | 79 | model = BertModel.from_pretrained( 80 | pretrained_model_name_or_path=pytorch_bin_path, 81 | state_dict=torch.load(os.path.join(pytorch_bin_path, pytorch_bin_model), map_location='cpu') 82 | ) 83 | 84 | convert_pytorch_checkpoint_to_tf(model=model, ckpt_dir=tf_ckpt_path, model_name=tf_ckpt_model) 85 | 86 | if __name__ == '__main__': 87 | bin_path = r'/home/admin/pretrain_models/sikuroberta_vocabtxt' 88 | bin_model = 'pytorch_model.bin' 89 | ckpt_path = r'/home/admin/pretrain_models/sikuroberta_vocabtxt_ckpt' 90 | ckpt_model = 'bert_model.ckpt' 91 | 92 | convert(bin_path, bin_model, ckpt_path, ckpt_model) 93 | 94 | -------------------------------------------------------------------------------- /第5章 数字人文下的模型预训练/第五章 数字人文下的模型预训练.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第5章 
数字人文下的模型预训练/第五章 数字人文下的模型预训练.pdf -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /workspace.xml -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/.idea/.name: -------------------------------------------------------------------------------- 1 | app.py -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/.idea/falsk_test.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 16 | -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/FindSim.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 _*- 3 | """ 4 | @Time:2021-05-10 20:07 5 | @Author:Veigar 6 | @File: FindSim.py 7 | @Github:https://github.com/veigaran 8 | """ 9 | #!/usr/bin/env python 10 | # -*- coding:utf-8 _*- 11 | 12 | import jieba 13 | 14 | from Params import Params 15 | 16 | 17 | class FindSim(Params): 18 | def __init__(self): 19 | super().__init__() 20 | self.result = {} 21 | 22 | def find_sim_words(self, question): 23 | """ 24 | 当全匹配失败时,就采用相似度计算来找相似的词 25 | :param question: 26 | :return: 27 | """ 28 | import re 29 | import string 30 | from gensim.models import KeyedVectors 31 | 32 | jieba.load_userdict(self.vocab_path) 33 | self.model = KeyedVectors.load_word2vec_format(self.word2vec_path, binary=False) 34 | 35 | sentence = re.sub("[{}]", re.escape(string.punctuation), question) 36 | sentence = re.sub("[,。‘’;:?、!【】]", " ", sentence) 37 | sentence = sentence.strip() 38 | 39 | words = [w.strip() for w in jieba.cut(sentence) if w.strip() not in self.stopwords and len(w.strip()) >= 2] 40 | 41 | alist = [] 42 | 43 | for word in words: 44 | temp = [self.medicine_entities, self.generic_entities, self.cate_entities, self.indication_entities] 45 | for i in range(len(temp)): 46 | flag = '' 47 | if i == 0: 48 | flag = "Medicine" 49 | elif i == 1: 50 | flag = "genericNameFormat" 51 | elif i == 2: 52 | flag = "list_cate" 53 | else: 54 | flag = "indications" 55 | scores = self.simCal(word, temp[i], flag) 56 | alist.extend(scores) 57 | temp1 = sorted(alist, key=lambda k: k[1], reverse=True) 58 | if temp1: 59 | self.result[temp1[0][2]] = [temp1[0][0]] 60 | 61 | def editDistanceDP(self, 
s1, s2): 62 | """ 63 | 采用DP方法计算编辑距离 64 | :param s1: 65 | :param s2: 66 | :return: 67 | """ 68 | m = len(s1) 69 | n = len(s2) 70 | solution = [[0 for j in range(n + 1)] for i in range(m + 1)] 71 | for i in range(len(s2) + 1): 72 | solution[0][i] = i 73 | for i in range(len(s1) + 1): 74 | solution[i][0] = i 75 | 76 | for i in range(1, m + 1): 77 | for j in range(1, n + 1): 78 | if s1[i - 1] == s2[j - 1]: 79 | solution[i][j] = solution[i - 1][j - 1] 80 | else: 81 | solution[i][j] = 1 + min(solution[i][j - 1], min(solution[i - 1][j], 82 | solution[i - 1][j - 1])) 83 | return solution[m][n] 84 | 85 | def simCal(self, word, entities, flag): 86 | """ 87 | 计算词语和字典中的词的相似度 88 | 相同字符的个数/min(|A|,|B|) + 余弦相似度 89 | :param word: str 90 | :param entities:List 91 | :return: 92 | """ 93 | a = len(word) 94 | scores = [] 95 | for entity in entities: 96 | sim_num = 0 97 | b = len(entity) 98 | c = len(set(entity + word)) 99 | temp = [] 100 | for w in word: 101 | if w in entity: 102 | sim_num += 1 103 | if sim_num != 0: 104 | score1 = sim_num / c # overlap score 105 | temp.append(score1) 106 | try: 107 | score2 = self.model.similarity(word, entity) # 余弦相似度分数 108 | temp.append(score2) 109 | except: 110 | pass 111 | score3 = 1 - self.editDistanceDP(word, entity) / (a + b) # 编辑距离分数 112 | if score3: 113 | temp.append(score3) 114 | 115 | score = sum(temp) / len(temp) 116 | if score >= 0.7: 117 | scores.append((entity, score, flag)) 118 | 119 | scores.sort(key=lambda k: k[1], reverse=True) 120 | return scores 121 | -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/Params.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 _*- 3 | """ 4 | @Time:2021-05-10 20:07 5 | @Author:Veigar 6 | @File: Params.py 7 | @Github:https://github.com/veigaran 8 | """ 9 | import os 10 | import pickle 11 | 12 | import ahocorasick 13 | import joblib 14 | 15 | 16 | class Params: 17 | def __init__(self): 18 | cur_dir = '/'.join(os.path.abspath(__file__).split('/')[:-1]) 19 | # 路径 20 | self.vocab_path = os.path.join(cur_dir, 'data/vocab.txt') 21 | self.stopwords_path = os.path.join(cur_dir, 'data/stop_words.utf8') 22 | self.word2vec_path = r'F:\A文档\python学习\Competition\Medication\代码\data\merge_sgns_bigram_char300.txt' # os.path.join(cur_dir, 'data/merge_sgns_bigram_char300.txt') 23 | self.stopwords = [w.strip() for w in open(self.stopwords_path, 'r', encoding='utf8') if w.strip()] 24 | 25 | # 意图分类模型文件 26 | self.tfidf_path = os.path.join(cur_dir, 'model/tf.pkl') 27 | self.nb_test_path = os.path.join(cur_dir, 'model/SVM.m') # 测试nb模型 28 | self.tfidf_model = pickle.load(open(self.tfidf_path, "rb")) 29 | self.nb_model = joblib.load(self.nb_test_path) 30 | 31 | self.person_path = os.path.join(cur_dir, 'data/人物.txt') 32 | self.alias_path = os.path.join(cur_dir, 'data/别名.txt') 33 | self.surname_path = os.path.join(cur_dir, 'data/姓氏.txt') 34 | self.country_path = os.path.join(cur_dir, 'data/国家.txt') 35 | self.school_path = os.path.join(cur_dir, 'data/学派.txt') 36 | self.rank_path = os.path.join(cur_dir, 'data/等级.txt') 37 | self.field_path = os.path.join(cur_dir, 'data/领域.txt') 38 | 39 | self.person_entities = [w.strip() for w in open(self.person_path, encoding='utf8') if w.strip()] 40 | self.alias_entities = [w.strip() for w in open(self.alias_path, encoding='utf8') if w.strip()] 41 | self.surname_entities = [w.strip() for w in open(self.surname_path, encoding='utf8') if w.strip()] 42 | self.country_entities = 
[w.strip() for w in open(self.country_path, encoding='utf8') if w.strip()] 43 | self.school_entities = [w.strip() for w in open(self.school_path, encoding='utf8') if w.strip()] 44 | self.rank_entities = [w.strip() for w in open(self.rank_path, encoding='utf8') if w.strip()] 45 | self.field_entities = [w.strip() for w in open(self.field_path, encoding='utf8') if w.strip()] 46 | 47 | # 构造领域actree 48 | self.person_tree = self.build_actree(list(set(self.person_entities))) 49 | self.alias_tree = self.build_actree(list(set(self.alias_entities))) 50 | self.surname_tree = self.build_actree(list(set(self.surname_entities))) 51 | self.country_tree = self.build_actree(list(set(self.country_entities))) 52 | self.school_tree = self.build_actree(list(set(self.school_entities))) 53 | self.rank_tree = self.build_actree(list(set(self.rank_entities))) 54 | self.field_tree = self.build_actree(list(set(self.field_entities))) 55 | 56 | self.name_qwds = ['英文名是什么', '通用名是什么', '一般叫什么', '哪些名字', '什么名字'] 57 | self.country_qwds = ['国家是什么', '国家', '属于哪个国家'] 58 | self.children_qwds = ['子女有哪些', '子女是谁', '儿子是谁', '孩子有哪些', '孩子是谁'] 59 | self.father_qwds = ['父亲是谁', '爸爸是谁', '父亲', '爸爸', '爸爸是什么名字'] 60 | 61 | def build_actree(self, wordlist): 62 | """ 63 | 构造actree,加速过滤 64 | :param wordlist: 65 | :return: 66 | """ 67 | actree = ahocorasick.Automaton() 68 | # 向树中添加单词 69 | for index, word in enumerate(wordlist): 70 | actree.add_word(word, (index, word)) 71 | actree.make_automaton() 72 | return actree 73 | -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request, render_template 2 | 3 | import kbqa 4 | 5 | app = Flask(__name__, static_url_path='') 6 | 7 | 8 | @app.route('/') 9 | def hello_world(): 10 | return render_template('search.html') 11 | 12 | 13 | @app.route('/wstmsearch', methods=['GET', 'POST']) 14 | def wstm_search(): 15 | answer = str 16 | if request.method == 'POST': 17 | # 取出待搜索keyword 18 | keyword = request.form['keyword'] 19 | handler = kbqa.KBQA() 20 | # question = input("用户:") 21 | question = keyword 22 | answer = handler.qa_main(question) 23 | print('ok') 24 | print("AI机器人:", answer) 25 | print("*" * 50) 26 | 27 | return render_template('result.html', search_result=answer, keyword=question) 28 | return render_template('search.html') 29 | 30 | 31 | 32 | if __name__ == '__main__': 33 | app.run() 34 | # app = Flask(__name__) 35 | # app.config['SERVER_NAME'] = 'veiagra.top' 36 | # app.run(debug=True, host='0.0.0.0', port=443, 37 | # ssl_context=('./etc/nginx/ssl_certs/veiagra.pem', '/etc/nginx/ssl_certs/veiagra.key')) 38 | -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/https/default: -------------------------------------------------------------------------------- 1 | ## 2 | # You should look at the following URL's in order to grasp a solid understanding 3 | # of Nginx configuration files in order to fully unleash the power of Nginx. 4 | # http://wiki.nginx.org/Pitfalls 5 | # http://wiki.nginx.org/QuickStart 6 | # http://wiki.nginx.org/Configuration 7 | # 8 | # Generally, you will want to move this file somewhere, and start with a clean 9 | # file but keep this around for reference. Or just disable in sites-enabled. 10 | # 11 | # Please see /usr/share/doc/nginx-doc/examples/ for more detailed examples. 
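# 补充说明:本文件是问答服务部署时的 Nginx 站点配置示例,将 80 端口的请求通过 proxy_pass 反向代理到本机 5000 端口的 gunicorn/Flask 应用;下文定义的 upstream flask(127.0.0.1:5000/5001)可在多实例部署时用于负载均衡。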
12 | ## 13 | 14 | # Default server configuration 15 | # 16 | 17 | upstream flask { 18 | server 127.0.0.1:5000; 19 | server 127.0.0.1:5001; 20 | } 21 | 22 | server { 23 | listen 80 default_server; 24 | listen [::]:80 default_server; 25 | root /var/www/html; 26 | 27 | # Add index.php to the list if you are using PHP 28 | index index.html index.htm index.nginx-debian.html; 29 | 30 | server_name www.veiagra.top; 31 | location / { 32 | # 请求转发到gunicorn服务器 33 | proxy_pass http://127.0.0.1:5000; 34 | # 请求转发到多个gunicorn服务器 35 | # proxy_pass http://flask; 36 | # 设置请求头,并将头信息传递给服务器端 37 | proxy_set_header Host $host; 38 | # 设置请求头,传递原始请求ip给 gunicorn 服务器 39 | proxy_set_header X-Real-IP $remote_addr; 40 | } 41 | 42 | #location / { 43 | # First attempt to serve request as file, then 44 | # as directory, then fall back to displaying a 404. 45 | try_files $uri $uri/ =404; 46 | #} 47 | 48 | # SSL configuration 49 | # 50 | # listen 443 ssl default_server; 51 | # listen [::]:443 ssl default_server; 52 | # 53 | # Note: You should disable gzip for SSL traffic. 54 | # See: https://bugs.debian.org/773332 55 | # 56 | # Read up on ssl_ciphers to ensure a secure configuration. 57 | # See: https://bugs.debian.org/765782 58 | # 59 | # Self signed certs generated by the ssl-cert package 60 | # Don't use them in a production server! 61 | # 62 | # include snippets/snakeoil.conf; 63 | # pass the PHP scripts to FastCGI server listening on 127.0.0.1:9000 64 | # 65 | #location ~ \.php$ { 66 | # include snippets/fastcgi-php.conf; 67 | # 68 | # # With php7.0-cgi alone: 69 | # fastcgi_pass 127.0.0.1:9000; 70 | # # With php7.0-fpm: 71 | # fastcgi_pass unix:/run/php/php7.0-fpm.sock; 72 | #} 73 | 74 | # deny access to .htaccess files, if Apache's document root 75 | # concurs with nginx's one 76 | # 77 | #location ~ /\.ht { 78 | # deny all; 79 | #} 80 | } 81 | 82 | 83 | # Virtual Host configuration for example.com 84 | # 85 | # You can move that to a different file under sites-available/ and symlink that 86 | # to sites-enabled/ to enable it. 
87 | # 88 | #server { 89 | # listen 80; 90 | # listen [::]:80; 91 | # 92 | # server_name example.com; 93 | # 94 | # root /var/www/example.com; 95 | # index index.html; 96 | # 97 | # location / { 98 | # try_files $uri $uri/ =404; 99 | # } 100 | #} 101 | -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/https/https.conf: -------------------------------------------------------------------------------- 1 | upstream veiagra_api 2 | { 3 | ip_hash; 4 | server 127.0.0.1:5000; 5 | server 127.0.0.1:5001; 6 | } 7 | server { 8 | listen *:80; 9 | listen [::]:80; 10 | listen *:443 ssl; 11 | listen [::]:443 ssl; 12 | server_name www.veiagra.top; 13 | ssl_certificate /etc/nginx/ssl_certs/veiagra.pem; 14 | ssl_certificate_key /etc/nginx/ssl_certs/veiagra.key; 15 | #ssl_session_cache shared:SSL:1m; 16 | ssl_session_timeout 5m; 17 | ssl_ciphers HIGH:!aNULL:!MD5; 18 | ssl_prefer_server_ciphers on; 19 | location / { 20 | # 请求转发到gunicorn服务器 21 | proxy_pass http://127.0.0.1:5000; 22 | # 请求转发到多个gunicorn服务器 23 | # proxy_pass http://flask; 24 | # 设置请求头,并将头信息传递给服务器端 25 | proxy_set_header Host $host; 26 | # 设置请求头,传递原始请求ip给 gunicorn 服务器 27 | proxy_set_header X-Real-IP $remote_addr; 28 | } 29 | } 30 | 31 | server { 32 | listen 80; 33 | server_name www.veiagra.top; # 域名 34 | # 强制跳转https 35 | rewrite ^(.*) https://$server_name$1 permanent; 36 | location / { 37 | # 请求转发到gunicorn服务器 38 | proxy_pass http://127.0.0.1:5000; 39 | # 请求转发到多个gunicorn服务器 40 | # proxy_pass http://flask; 41 | # 设置请求头,并将头信息传递给服务器端 42 | proxy_set_header Host $host; 43 | # 设置请求头,传递原始请求ip给 gunicorn 服务器 44 | proxy_set_header X-Real-IP $remote_addr; 45 | } 46 | } -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/https/nginx.conf: -------------------------------------------------------------------------------- 1 | 2 | #user nobody; 3 | worker_processes 1; 4 | 5 | #error_log logs/error.log; 6 | #error_log logs/error.log notice; 7 | #error_log logs/error.log info; 8 | 9 | #pid logs/nginx.pid; 10 | 11 | 12 | events { 13 | worker_connections 1024; 14 | } 15 | 16 | 17 | 18 | http { 19 | include mime.types; 20 | default_type application/octet-stream; 21 | 22 | #log_format main '$remote_addr - $remote_user [$time_local] "$request" ' 23 | # '$status $body_bytes_sent "$http_referer" ' 24 | # '"$http_user_agent" "$http_x_forwarded_for"'; 25 | 26 | #access_log logs/access.log main; 27 | 28 | sendfile on; 29 | #tcp_nopush on; 30 | 31 | #keepalive_timeout 0; 32 | keepalive_timeout 65; 33 | 34 | #gzip on; 35 | 36 | 37 | upstream flask { 38 | server 127.0.0.1:5000; 39 | server 127.0.0.1:5001; 40 | } 41 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 42 | 43 | server { 44 | listen 80 default_server; 45 | server_name www.veiagra.top; 46 | listen [::]:80 default_server; 47 | 48 | #charset koi8-r; 49 | #access_log logs/host.access.log main; 50 | 51 | #root /var/www/html; 52 | #index index.html index.htm index.nginx-debian.html; 53 | 54 | server_name _; 55 | 56 | location / { 57 | # 请求转发到gunicorn服务器 58 | #proxy_pass http://127.0.0.1:8000; 59 | # 请求转发到多个gunicorn服务器 60 | proxy_pass http://flask; 61 | # 设置请求头,并将头信息传递给服务器端 62 | #proxy_set_header Host $host; 63 | # 设置请求头,传递原始请求ip给 gunicorn 服务器 64 | #proxy_set_header X-Real-IP $remote_addr; 65 | 66 | } 67 | 68 | 69 | #error_page 404 /404.html; 70 | 71 | # redirect server error pages to the static page /50x.html 72 | # 73 | error_page 500 502 503 504 /50x.html; 74 | location = /50x.html { 75 | root html; 
76 | } 77 | 78 | # proxy the PHP scripts to Apache listening on 127.0.0.1:80 79 | # 80 | #location ~ \.php$ { 81 | # proxy_pass http://127.0.0.1; 82 | #} 83 | 84 | # pass the PHP scripts to FastCGI server listening on 127.0.0.1:9000 85 | # 86 | #location ~ \.php$ { 87 | # root html; 88 | # fastcgi_pass 127.0.0.1:9000; 89 | # fastcgi_index index.php; 90 | # fastcgi_param SCRIPT_FILENAME /scripts$fastcgi_script_name; 91 | # include fastcgi_params; 92 | #} 93 | 94 | # deny access to .htaccess files, if Apache's document root 95 | # concurs with nginx's one 96 | # 97 | #location ~ /\.ht { 98 | # deny all; 99 | #} 100 | } 101 | 102 | 103 | # another virtual host using mix of IP-, name-, and port-based configuration 104 | # 105 | #server { 106 | # listen 8000; 107 | # listen somename:8080; 108 | # server_name somename alias another.alias; 109 | 110 | # location / { 111 | # root html; 112 | # index index.html index.htm; 113 | # } 114 | #} 115 | 116 | 117 | # HTTPS server 118 | # 119 | #server { 120 | # listen 443 ssl; 121 | # server_name localhost; 122 | 123 | # ssl_certificate cert.pem; 124 | # ssl_certificate_key cert.key; 125 | 126 | # ssl_session_cache shared:SSL:1m; 127 | # ssl_session_timeout 5m; 128 | 129 | # ssl_ciphers HIGH:!aNULL:!MD5; 130 | # ssl_prefer_server_ciphers on; 131 | 132 | # location / { 133 | # root html; 134 | # index index.html index.htm; 135 | # } 136 | #} 137 | 138 | } 139 | -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/kbqa.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 _*- 3 | """ 4 | @Time:2021-05-10 20:09 5 | @Author:Veigar 6 | @File: kbqa.py 7 | @Github:https://github.com/veigaran 8 | """ 9 | 10 | 11 | class KBQA: 12 | def __init__(self): 13 | pass 14 | # self.extractor = EntityExtractor() 15 | # self.searcher = AnswerSearching() 16 | 17 | def qa_main(self, input_str): 18 | answer = "对不起,您的问题我不知道,我今后会努力改进的。" 19 | entities = self.extractor.extractor(input_str) 20 | if not entities: 21 | return answer 22 | sqls = self.searcher.question_parser(entities) 23 | final_answer = self.searcher.searching(sqls) 24 | if not final_answer: 25 | return answer 26 | else: 27 | return '\n'.join(final_answer) 28 | 29 | 30 | if __name__ == "__main__": 31 | handler = KBQA() 32 | while True: 33 | question = input("请输入:") 34 | if not question: 35 | break 36 | answer = handler.qa_main(question) 37 | print("", answer) 38 | print("*"*50) -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/model/NB.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第6章 数字人文下的知识图谱构建及应用/FLASK/model/NB.m -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/model/ch_ner_model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第6章 数字人文下的知识图谱构建及应用/FLASK/model/ch_ner_model.h5 -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/model/tf.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第6章 数字人文下的知识图谱构建及应用/FLASK/model/tf.pkl -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/predict.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 _*- 3 | """ 4 | @Time:2021-05-10 20:06 5 | @Author:Veigar 6 | @File: predict.py 7 | @Github:https://github.com/veigaran 8 | """ 9 | import pickle 10 | 11 | import jieba 12 | import torch 13 | 14 | 15 | def get_seg_features(string): 16 | """ 17 | 对句子分词,构造词的长度特征,为BIES格式, 18 | [对]对应的特征为[4], 不设为0,因为pad的id就是0 19 | [句子]对应的特征为[1,3], 20 | [中华人民]对应的特征为[1,2,2,3] 21 | """ 22 | seg_feature = [] 23 | 24 | for word in jieba.cut(string): 25 | if len(word) == 1: 26 | seg_feature.append(4) 27 | else: 28 | tmp = [2] * len(word) 29 | tmp[0] = 1 30 | tmp[-1] = 3 31 | seg_feature.extend(tmp) 32 | return seg_feature 33 | 34 | 35 | def prepare_dataset(sentences, char_to_id, tag_to_id, lower=False, test=False): 36 | """ 37 | 把文本型的样本和标签,转化为index,便于输入模型 38 | 需要在每个样本和标签前后加, 39 | 但由于pytorch-crf这个包里面会自动添加的转移概率, 40 | 所以我们不用在手动加入。 41 | """ 42 | 43 | def f(x): 44 | return x.lower() if lower else x 45 | 46 | data = [] 47 | for s in sentences: 48 | 49 | chars = [w[0] for w in s] 50 | tags = [w[-1] for w in s] 51 | 52 | """ 句子转化为index """ 53 | chars_idx = [char_to_id[f(c) if f(c) in char_to_id else ''] for c in chars] 54 | 55 | """ 对句子分词,构造词的长度特征 """ 56 | segs_idx = get_seg_features("".join(chars)) 57 | 58 | if not test: 59 | tags_idx = [tag_to_id[t] for t in tags] 60 | 61 | else: 62 | tags_idx = [tag_to_id[""] for _ in tags] 63 | 64 | assert len(chars_idx) == len(segs_idx) == len(tags_idx) 65 | data.append([chars, chars_idx, segs_idx, tags_idx]) 66 | 67 | return data 68 | 69 | 70 | def result_to_json(string, tags): 71 | """ 按规范的格式输出预测结果 """ 72 | item = {"string": string, "entities": []} 73 | entity_name = "" 74 | entity_start = 0 75 | idx = 0 76 | for char, tag in zip(string, tags): 77 | if tag[0] == "S": 78 | item["entities"].append({"word": char, "start": idx, "end": idx + 1, "type": tag[2:]}) 79 | elif tag[0] == "B": 80 | entity_name += char 81 | entity_start = idx 82 | elif tag[0] == "I": 83 | entity_name += char 84 | elif tag[0] == "E": 85 | entity_name += char 86 | item["entities"].append({"word": entity_name, "start": entity_start, "end": idx + 1, "type": tag[2:]}) 87 | entity_name = "" 88 | else: 89 | entity_name = "" 90 | entity_start = idx 91 | idx += 1 92 | return item 93 | 94 | 95 | def predict(input_str): 96 | map_file = r'./model/maps.pkl' 97 | with open(map_file, "rb") as f: 98 | char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f) 99 | 100 | """ 用cpu预测 """ 101 | model_file = r'./model/medical_ner.ckpt' 102 | model = torch.load(model_file, map_location="cpu") 103 | # model.eval() 104 | 105 | if not input_str: 106 | input_str = input("请输入文本: ") 107 | 108 | _, char_ids, seg_ids, _ = prepare_dataset([input_str], char_to_id, tag_to_id, test=True)[0] 109 | char_tensor = torch.LongTensor(char_ids).view(1, -1) 110 | seg_tensor = torch.LongTensor(seg_ids).view(1, -1) 111 | 112 | with torch.no_grad(): 113 | """ 得到维特比解码后的路径,并转换为标签 """ 114 | paths = model(char_tensor, seg_tensor) 115 | tags = [id_to_tag[idx] for idx in paths[0]] 116 | res = result_to_json(input_str, tags) 117 | entity_type = res["entities"][0]['type'] 118 | word = res["entities"][0]['word'] 119 | result = {} 120 | if entity_type == "DRU": 
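    # 补充注释:这里只取识别结果中的第一个实体;所加载的模型为 medical_ner.ckpt,当实体标签为 DRU 时,把实体词以 person 字段的形式写入 result 返回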
121 | result["person"] = [word] 122 | # pprint(result_to_json(input_str, tags)) 123 | print(entity_type, word, '\n', result) 124 | return result 125 | 126 | 127 | -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/static/0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第6章 数字人文下的知识图谱构建及应用/FLASK/static/0.png -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/static/default.css: -------------------------------------------------------------------------------- 1 | body{margin:0;padding:0;background:#ffe;font-size:14px;font-family:'微软雅黑','宋体',sans-serif;color:#231F20;overflow:auto} 2 | a {color:#000;font-size:14px;} 3 | #main{width:100%;} 4 | #wrap{position:relative;margin:0 auto;width:1100px;height:680px;margin-top:10px;} 5 | #text{width:400px;height:425px;left:60px;top:80px;position:absolute;} 6 | #code{display:none;font-size:16px;} 7 | #clock-box {position:absolute;left:60px;top:550px;font-size:28px;display:none;} 8 | #clock-box a {font-size:28px;text-decoration:none;} 9 | #clock{margin-left:48px;} 10 | #clock .digit {font-size:64px;} 11 | #canvas{margin:0 auto;width:1100px;height:680px;} 12 | #error{margin:0 auto;text-align:center;margin-top:60px;display:none;} 13 | .hand{cursor:pointer;} 14 | .say{margin-left:5px;} 15 | .space{margin-right:150px;} 16 | -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/templates/0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第6章 数字人文下的知识图谱构建及应用/FLASK/templates/0.png -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/templates/result.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 古汉语典籍问答系统-检索结果 6 | 64 | 65 | 66 | 67 |
82 | {{ search_result }}
86 | 87 | 88 | -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/templates/search.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 古汉语典籍自动问答系统 6 | 64 | 65 | 66 | 67 | 68 | 70 | 71 | {##} 72 | 73 |
84 | 85 | -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/README.md: -------------------------------------------------------------------------------- 1 | ## 知识图谱自动构建与应用源代码 2 | 3 | 此处提供FLASK文件和检索模块,完整源代码和数据请参见本课题组另一项目的链接:https://github.com/veigaran/ZUOZHUAN_KBQA 4 | 5 | ## 建议运行环境 6 | ``` 7 | jieba==0.42.1 8 | 9 | py2neo 10 | 11 | ahocorasick 12 | 13 | flask 14 | ``` 15 | 16 | -------------------------------------------------------------------------------- /第7章 数字人文下的文本分类/README.md: -------------------------------------------------------------------------------- 1 | ## 文本分类源代码 2 | 3 | 此文件夹内为使用RNN对非物质文化遗产文本进行分类的源代码 4 | 5 | 文本分类实现可参考下列仓库,包括常见的机器学习算法,如naive bayes,SVM等,也包括深度学习算法,如CNN、RNN,BERT等 6 | 代码实现: 7 | https://github.com/veigaran/NLP_ROAD/tree/master/2-%E5%9F%BA%E4%BA%8E%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E7%9A%84%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB 8 | https://github.com/veigaran/NLP_ROAD/tree/master/4-%E6%B7%B1%E5%BA%A6%E5%AD%A6%E4%B9%A0%E4%B8%8B%E7%9A%84%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB 9 | -------------------------------------------------------------------------------- /第7章 数字人文下的文本分类/第七章 数字人文下的文本分类.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第7章 数字人文下的文本分类/第七章 数字人文下的文本分类.pdf -------------------------------------------------------------------------------- /第7章 数字人文下的文本分类/非遗信息 全.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第7章 数字人文下的文本分类/非遗信息 全.xlsx -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/README.md: -------------------------------------------------------------------------------- 1 | ## 文本聚类源代码 2 | 3 | 文件夹内为基于K-means的文本自动聚类源代码,使用方法请参照教材第八章内容。详细代码说明见[此仓库](https://github.com/veigaran/NLP_ROAD/tree/master/3-%E5%9F%BA%E4%BA%8E%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E7%9A%84%E6%96%87%E6%9C%AC%E8%81%9A%E7%B1%BB)。 4 | -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/cluster/DBSCAN.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn.cluster import DBSCAN 4 | 5 | 6 | def DBscan(X): 7 | ##产生随机数据的中心 8 | # centers = [[1, 1], [-1, -1], [1, -1]] 9 | ##产生的数据个数 10 | # n_samples = 750 11 | ##生产数据:此实验结果受cluster_std的影响,或者说受eps 和cluster_std差值影响 12 | # X, lables_true = make_blobs(n_samples=n_samples, centers=centers, cluster_std=0.4, 13 | # random_state=0) 14 | ##设置分层聚类函数 15 | db = DBSCAN(eps=0.5, min_samples=50) 16 | ##训练数据 17 | db.fit(X) 18 | ##初始化一个全是False的bool类型的数组 19 | core_samples_mask = np.zeros_like(db.labels_, dtype=bool) 20 | ''' 21 | 这里是关键点(针对这行代码:xy = X[class_member_mask & ~core_samples_mask]): 22 | db.core_sample_indices_ 表示的是某个点在寻找核心点集合的过程中暂时被标为噪声点的点(即周围点 23 | 小于min_samples),并不是最终的噪声点。在对核心点进行联通的过程中,这部分点会被进行重新归类(即标签 24 | 并不会是表示噪声点的-1),也可也这样理解,这些点不适合做核心点,但是会被包含在某个核心点的范围之内 25 | ''' 26 | core_samples_mask[db.core_sample_indices_] = True 27 | 28 | ##每个数据的分类 29 | lables = db.labels_ 30 | 31 | ##分类个数:lables中包含-1,表示噪声点 32 | n_clusters_ = len(np.unique(lables)) - (1 if -1 in lables else 0) 33 | 34 | ##绘图 35 | unique_labels = set(lables) 36 | ''' 37 | 1)np.linspace 返回[0,1]之间的len(unique_labels) 个数 38 | 2)plt.cm 
一个颜色映射模块 39 | 3)生成的每个colors包含4个值,分别是rgba 40 | 4)其实这行代码的意思就是生成4个可以和光谱对应的颜色值 41 | ''' 42 | colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels))) 43 | 44 | plt.figure(1) 45 | plt.clf() 46 | 47 | for k, col in zip(unique_labels, colors): 48 | ##-1表示噪声点,这里的k表示黑色 49 | if k == -1: 50 | col = 'k' 51 | 52 | ##生成一个True、False数组,lables == k 的设置成True 53 | class_member_mask = (lables == k) 54 | 55 | ##两个数组做&运算,找出即是核心点又等于分类k的值 markeredgecolor='k', 56 | xy = X[class_member_mask & core_samples_mask] 57 | plt.plot(xy[:, 0], xy[:, 1], 'o', c=col, markersize=14) 58 | ''' 59 | 1)~优先级最高,按位对core_samples_mask 求反,求出的是噪音点的位置 60 | 2)& 于运算之后,求出虽然刚开始是噪音点的位置,但是重新归类却属于k的点 61 | 3)对核心分类之后进行的扩展 62 | ''' 63 | xy = X[class_member_mask & ~core_samples_mask] 64 | plt.plot(xy[:, 0], xy[:, 1], 'o', c=col, markersize=6) 65 | 66 | plt.title('Estimated number of clusters: %d' % n_clusters_) 67 | print(n_clusters_) 68 | plt.show() 69 | 70 | def main(): 71 | X_w2v_100 = np.loadtxt("./Word2vector/w2v_sentence_vec_100D.txt") # word2vec_sentence_size=100 72 | DBscan(X_w2v_100) 73 | 74 | if __name__ == '__main__': 75 | main() -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/cluster/Hierarchy_C.py: -------------------------------------------------------------------------------- 1 | from itertools import cycle ##python自带的迭代器模块 2 | 3 | import jieba 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | from sklearn.cluster import AgglomerativeClustering 7 | from sklearn.cluster import MeanShift, estimate_bandwidth 8 | from sklearn.decomposition import PCA 9 | 10 | 11 | # 读取txt文档 12 | def read_txt(path): 13 | f = open(path, 'r', encoding='UTF-8') 14 | lines = f.readlines() 15 | f.close() 16 | return lines 17 | 18 | 19 | # onehot编码,返回np.array() 20 | # Reference https://blog.csdn.net/Dorothy_Xue/article/details/84641417 21 | def onehot(text): 22 | # 对原有文档用jieba分词,并建立字典 23 | data = [] 24 | words = [] 25 | for sentence in text: 26 | sentence = sentence.strip() 27 | seg_list = jieba.cut(sentence, cut_all=False) 28 | seg_list = '/'.join(seg_list) 29 | temp = seg_list.split('/') 30 | for word in temp: 31 | words.append(word) 32 | data.append(seg_list) 33 | dic = list(set(words)) # 去重 34 | 35 | # 手动onehot编码 36 | vector = [] 37 | for i in range(0, len(data)): 38 | temp = [] 39 | for j in range(0, len(dic)): 40 | if dic[j] in data[i].split('/'): 41 | temp.append(1) 42 | else: 43 | temp.append(0) 44 | temp = np.array(temp) 45 | vector.append(temp) 46 | length = len(vector) 47 | vector = np.array(vector) 48 | return vector 49 | 50 | 51 | # 设置聚类函数,X是二维列表,绘制聚类示意图 52 | # Reference https://www.cnblogs.com/lc1217/p/6963687.html 53 | def Hierarchy(X): 54 | linkages = ['ward', 'average', 'complete'] 55 | n_clusters_ = 6 56 | ac = AgglomerativeClustering(linkage=linkages[2], n_clusters=n_clusters_) 57 | # ac = DBSCAN(eps=0.1, min_samples=5) 58 | ##训练数据 59 | ac.fit(X) 60 | 61 | ##每个数据的分类 62 | lables = ac.labels_ 63 | 64 | ##绘图 65 | plt.figure(1) 66 | plt.clf() 67 | 68 | colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk') 69 | for k, col in zip(range(n_clusters_), colors): 70 | ##根据lables中的值是否等于k,重新组成一个True、False的数组 71 | my_members = lables == k 72 | ##X[my_members, 0] 取出my_members对应位置为True的值的横坐标 73 | plt.plot(X[my_members, 0], X[my_members, 1], col + '.') 74 | 75 | plt.title('Estimated number of clusters: %d' % n_clusters_) 76 | plt.show() 77 | 78 | 79 | def Mean_shift(X): 80 | # 产生随机数据的中心 81 | # centers = [[1, 1], [-1, -1], [1, -1]] 82 | # 产生的数据个数 83 | # n_samples=10000 84 | # 生产数据 85 | # X, _ = 
make_blobs(n_samples=n_samples, centers= centers, cluster_std=0.6,random_state =0) 86 | 87 | # 带宽,也就是以某个点为核心时的搜索半径 88 | bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=500) 89 | # 设置均值偏移函数 90 | ms = MeanShift(bandwidth=bandwidth, bin_seeding=True) 91 | # 训练数据 92 | ms.fit(X) 93 | # 每个点的标签 94 | labels = ms.labels_ 95 | print(labels) 96 | # 簇中心的点的集合 97 | cluster_centers = ms.cluster_centers_ 98 | # 总共的标签分类 99 | labels_unique = np.unique(labels) 100 | # 聚簇的个数,即分类的个数 101 | n_clusters_ = len(labels_unique) 102 | 103 | print("number of estimated clusters : %d" % n_clusters_) 104 | 105 | # 绘图 106 | plt.figure(1) 107 | plt.clf() 108 | 109 | colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk') 110 | for k, col in zip(range(n_clusters_), colors): 111 | # 根据lables中的值是否等于k,重新组成一个True、False的数组 112 | my_members = labels == k 113 | cluster_center = cluster_centers[k] 114 | # X[my_members, 0] 取出my_members对应位置为True的值的横坐标 115 | plt.plot(X[my_members, 0], X[my_members, 1], col + '.') 116 | plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=14) 117 | plt.title('Estimated number of clusters: %d' % n_clusters_) 118 | plt.show() 119 | 120 | 121 | def main(): 122 | path = "./title_info.txt" 123 | text = read_txt(path) 124 | vector = onehot(text) 125 | # 降维 PCA/SVD 126 | pca = PCA(n_components=2) # 降到2维 127 | pca.fit(vector) 128 | new_vector = pca.fit_transform(vector) 129 | # 层次聚类 130 | Hierarchy(new_vector) 131 | 132 | # 均值聚类 133 | # Mean_shift(new_vector) 134 | 135 | 136 | if __name__ == '__main__': 137 | main() 138 | -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/cluster/kmeans-all.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.spatial.distance import cdist 3 | from sklearn import metrics 4 | from sklearn.cluster import KMeans 5 | from sklearn.metrics import silhouette_score 6 | 7 | 8 | def kmeans(X): 9 | K = range(2, 11) 10 | meandistortions = [] 11 | Scores = [] # 存放轮廓系数 12 | CH=[] 13 | all = [] 14 | '''kmeans = KMeans(n_clusters=8) 15 | kmeans.fit(X) 16 | lables = kmeans.labels_ 17 | meandistortions.append(sum(np.min(cdist(X, kmeans.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0]) 18 | Scores.append( 19 | silhouette_score(X, lables, metric='euclidean')) # euclidean 欧氏距离 20 | CH.append(metrics.calinski_harabasz_score(X, lables)) 21 | print("标签:", lables) 22 | np.savetxt("D:\我\非遗\高维聚类结果\\400_6类_label.txt",lables) 23 | test_stat = {} 24 | l = lables.tolist() 25 | # print(l) 26 | for i in set(l): 27 | test_stat[i] = l.count(i) 28 | print(test_stat)''' 29 | for k in K: 30 | kmeans = KMeans(n_clusters=k) 31 | kmeans.fit(X) 32 | lables = kmeans.labels_ 33 | meandistortions.append(sum(np.min(cdist(X, kmeans.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0]) 34 | Scores.append( 35 | silhouette_score(X, lables, metric='euclidean')) # euclidean 欧氏距离 36 | CH.append(metrics.calinski_harabasz_score(X, lables)) 37 | print("标签:", lables) 38 | if(k==6): 39 | np.savetxt("D:\我\非遗\高维聚类结果\\tfidf\\6类_label3.txt", lables) 40 | elif(k==8): 41 | np.savetxt("D:\我\非遗\高维聚类结果\\tfidf\\8类_label3.txt", lables) 42 | elif(k==10): 43 | np.savetxt("D:\我\非遗\高维聚类结果\\tfidf\\10类_label3.txt", lables) 44 | test_stat = {} 45 | l = lables.tolist() 46 | # print(l) 47 | for i in set(l): 48 | test_stat[i] = l.count(i) 49 | print(test_stat) 50 | 51 | '''降维可视化 52 | tsne = TSNE(perplexity=30, n_components=2, init='pca') TSNE降维,降到2D 53 | data = 
tsne.fit_transform(X) 54 | 55 | x_min, x_max = np.min(data, 0), np.max(data, 0) 56 | data = (data - x_min) / (x_max - x_min) # 归一化 57 | 58 | plt.figure() 59 | for i in range(data.shape[0]): 60 | plt.text(data[i, 0], data[i, 1], str(lables[i]), 61 | color=plt.cm.Set1(lables[i] / 10.), 62 | fontdict={'weight': 'bold', 'size': 9}) 63 | plt.xticks([]) 64 | plt.yticks([]) 65 | plt.show()''' 66 | 67 | 68 | #np.save("D:\我\非遗\高维聚类结果\\100w\\number.txt", test_stat) 69 | print("轮廓系数:", Scores) 70 | #np.savetxt("D:\我\非遗\Word2vector\标签\轮廓系数_50.txt", Scores) 71 | print("成本函数:", meandistortions) 72 | #np.savetxt("D:\我\非遗\Word2vector\标签\成本函数_50.txt", meandistortions) 73 | #print("CH:", CH) 74 | #np.savetxt("D:\我\非遗\Word2vector\标签\CH_50.txt", CH) 75 | 76 | 77 | def main(): 78 | #X_w2v_50 = np.loadtxt("./Word2vector/w2v_sentence_vec_50D.txt") 79 | X_w2v_100 = np.loadtxt("./Word2vector/cbow-hn/w2v_sentence_vec_100D_cbow.txt") 80 | X_w2v_200 = np.loadtxt("./Word2vector/cbow-hn/w2v_sentence_vec_200D_cbow.txt") 81 | X_w2v_300 = np.loadtxt("./Word2vector/cbow-hn/w2v_sentence_vec_300D_cbow.txt") 82 | X_w2v_400 = np.loadtxt("./Word2vector/cbow-hn/w2v_sentence_vec_400D_cbow.txt") 83 | X_tfidf = np.loadtxt("words_tfidf2.txt") 84 | 85 | X_w2v_100_ns = np.loadtxt("./Word2vector/cbow-ns/w2v_sentence_vec_100D_ns.txt") 86 | X_w2v_200_ns = np.loadtxt("./Word2vector/cbow-ns/w2v_sentence_vec_200D_ns.txt") 87 | X_w2v_300_ns = np.loadtxt("./Word2vector/cbow-ns/w2v_sentence_vec_300D_ns.txt") 88 | X_w2v_400_ns = np.loadtxt("./Word2vector/cbow-ns/w2v_sentence_vec_400D_ns.txt") 89 | 90 | kmeans(X_tfidf) 91 | 92 | if __name__ == '__main__': 93 | main() -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/cluster/kmeans(余弦相似度)/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第8章 数字人文下的文本聚类/code/cluster/kmeans(余弦相似度)/__init__.py -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/cluster/kmeans(余弦相似度)/basealgorithm.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from cluster.basefunction import getCenter_mean 4 | from cluster.basefunction import similarity_cos, similarity_euclidean, similarity_manhattan 5 | 6 | ''' 7 | 此模块待填坑: 8 | 1.缺少层次聚类法函数 9 | 2.缺少报错信息raise 10 | ''' 11 | 12 | class kmeans(): 13 | def __init__(self,k=5,max_iteration=100000,min_center=0.001,select_decision="random",getcenter_function="mean",similarity_function="cos"): 14 | self.k = k 15 | self.max_iteration = max_iteration 16 | self.min_center = min_center 17 | self.select_decision = select_decision 18 | self.getcenter_function = getcenter_function 19 | self.similarity_function = similarity_function 20 | self.clustercenter = [[] for _ in range(k) ] 21 | self.clusterdata=[ [] for _ in range(k) ] 22 | pass 23 | 24 | def similarity(self,point_data): 25 | if(self.similarity_function=="cos"): 26 | result=[ similarity_cos(self.clustercenter[i],point_data) for i in range(self.k)] 27 | return result.index(max(result)) 28 | elif(self.similarity_function=="euclidean"): 29 | result = [similarity_euclidean(self.clustercenter[i], point_data) for i in range(self.k)] 30 | return result.index(min(result)) 31 | elif (self.similarity_function == "manhattan"): 32 | result = [similarity_manhattan(self.clustercenter[i], point_data) for i in range(self.k)] 33 | 
return result.index(min(result)) 34 | else: 35 | print("similarity 参数错误!") 36 | 37 | def getCenter(self,tmp_data,origin_center): 38 | if(self.getcenter_function=="mean"): 39 | return getCenter_mean(tmp_data,origin_center) 40 | else: 41 | print("similarity 参数错误!") 42 | 43 | def selected_point(self,data): 44 | if(self.select_decision=="random"): 45 | self.selected_random(data) 46 | else: 47 | print(" select_decision 参数错误!") 48 | 49 | def selected_random(self,data): 50 | tmp_set=set([]) 51 | if(len(data) 1)]) 82 | f.close() 83 | tmp_dict=model.extractKeyword(docs,10) 84 | w=open("C:/Users/sfe_williamsL/Desktop/毕业论文/keyword_10.txt","wt",encoding="utf-8") 85 | print(tmp_dict) 86 | for key in tmp_dict.keys(): 87 | w.write("\n".join(tmp_dict[key])+"\n") 88 | w.close() 89 | 90 | 91 | #word2vec计算 92 | doc_data = [] 93 | i = 0 94 | f = open("C:/Users/sfe_williamsL/Desktop/毕业论文/result_id.txt", "rt", encoding="utf-8") 95 | for line in f.readlines(): 96 | tmp_data = [] 97 | datas = line.split("\t") 98 | if (len(datas) < 2): 99 | continue 100 | docid = datas[0] 101 | content = datas[3] 102 | word_list =list(jieba.cut(content)) 103 | doc_data.append(word_list) 104 | i = i + 1 105 | f.close() 106 | wm=word2vect_model(path="C:/Users/sfe_williamsL/Desktop/毕业论文/data/word2vect_8",embedding_dim=8) 107 | wm.train(doc_data) 108 | ''' 109 | docs = {} 110 | model = tfidf() 111 | f = open("D:\我\非遗\高维聚类结果\\400维六类数据\\总.txt", "rt", encoding="utf-8") 112 | i=0 113 | for line in f.readlines(): 114 | if(not line.replace("\r","").replace("\n","")): 115 | continue 116 | i=i+1 117 | content = line.replace("\r","").replace("\n","") 118 | docs[i] = docs.get(i,content) 119 | f.close() 120 | print(docs) 121 | tmp_dict = model.extractKeyword(docs, 15) 122 | w = open("D:\我\非遗\高维聚类结果\\400维六类数据\\keyword_15.txt", "wt", encoding="utf-8") 123 | print(tmp_dict) 124 | for key in tmp_dict.keys(): 125 | w.write("\t".join(tmp_dict[key]) + "\n") 126 | w.close() -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/ex_key/extraction-keywords.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | 3 | import numpy 4 | import pandas 5 | from sklearn.feature_extraction.text import CountVectorizer 6 | from sklearn.feature_extraction.text import TfidfTransformer 7 | 8 | 9 | def extract(corpus): 10 | '''corpus = [] # 文档预料 空格连接 11 | 12 | # 读取预料 一行预料为一个文档 13 | path1 = "D:\我\非遗\高维聚类结果\\350_类11.txt" 14 | for line in open(path1, 'r', encoding="utf-8").readlines(): 15 | corpus.append(line.strip()) 16 | #print(corpus)''' 17 | corpus1 = ["我 来到 北京 清华大学", # 第一类文本切词后的结果,词之间以空格隔开 18 | "他 来到 了 网易 杭研 大厦", # 第二类文本的切词结果 19 | "小明 硕士 毕业 与 中国 科学院", # 第三类文本的切词结果 20 | "我 爱 北京 天安门"] 21 | 22 | contents = [ 23 | '我 是 中国 人。', 24 | '你 是 美国 人。', 25 | '他 叫 什么 名字?', 26 | '她 是 谁 啊?' 
27 | ] 28 | countVectorizer = CountVectorizer( 29 | '''min_df=0, 30 | token_pattern=r"\b\w+\b"''' 31 | ) # 增加了min_df=0参数,保留最小长度为0的分词,和token_pattern,设置分词的正则表达式。 32 | textVector = countVectorizer.fit_transform(corpus) 33 | transformer = TfidfTransformer(sublinear_tf=True) # 该类会统计每个词语的tf-idf权值 34 | tfidf = transformer.fit_transform(textVector) # .fit_transform()方法得到tf-idf矩阵 35 | weight = tfidf.toarray() 36 | #print(weight) 37 | word = countVectorizer.get_feature_names() 38 | #print(word) 39 | sort = numpy.argsort(weight, axis=1)[:, -10:] # 对tf-idf矩阵每行的值进行排序,输出对应索引,并取每行前五,得到sort,格式为numpy.ndarray 40 | keywords = pandas.Index(word)[sort].values 41 | tagDF = pandas.DataFrame({ 42 | 'tag1': keywords[:, 0], # 提取第一行,得到包含所有文档的第1个关键词的数组 43 | 'tag2': keywords[:, 1], # 提取第二行,得到包含所有文档的第2个关键词的数组 44 | 'tag3': keywords[:, 2], 45 | 'tag4': keywords[:, 3], 46 | 'tag5': keywords[:, 4], 47 | 'tag6': keywords[:, 5], 48 | 'tag7': keywords[:, 6], 49 | 'tag8': keywords[:, 7], 50 | 'tag9': keywords[:, 8], 51 | 'tag10': keywords[:, 9], 52 | }) 53 | tagDF.to_csv("D:\我\非遗\高维聚类结果\\400-10\\掉包keywords_10.txt",header=False,index=False) 54 | print(tagDF) 55 | 56 | def read(path): 57 | with codecs.open(path, 'r', 'utf8') as f: 58 | line = f.readlines() 59 | return line 60 | 61 | 62 | def corpus(data): 63 | final = [] 64 | for line in data: 65 | l = line.split(' ') 66 | res = [x.strip() for x in l if x.strip() != ''] 67 | cor = " ".join(res) 68 | final.append(cor) 69 | #print(final[1]) 70 | return final 71 | 72 | 73 | def main(): 74 | path = "D:\我\非遗\高维聚类结果\\400-10\\总.txt" 75 | data = read(path) 76 | final = corpus(data) 77 | #print(final) 78 | extract(final) 79 | #print(data) 80 | 81 | 82 | if __name__ == "__main__": 83 | main() -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/some_deal/data.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | 3 | import jieba 4 | import xlrd 5 | 6 | 7 | def readxls(path, col): 8 | xl = xlrd.open_workbook(path) 9 | sheet = xl.sheets()[0] 10 | data = list(sheet.col_values(col))[1:] 11 | return data 12 | 13 | 14 | def readtxt(path): 15 | with codecs.open(path, 'r', 'utf8') as f: 16 | line = f.readline() 17 | line.replace('\\u3000','') 18 | data = list(line) 19 | return data 20 | 21 | 22 | def uni(title, info): 23 | uni_lis = [] 24 | for i, j in zip(title, info): 25 | if i != '' and j != '': 26 | n = i+' '+j 27 | uni_lis.append(n) 28 | return uni_lis 29 | 30 | 31 | def writetxt(path, txt): 32 | with codecs.open(path, 'a', 'utf-8') as f: 33 | for i in txt: 34 | f.write('\t'+str(i)+'\n') 35 | 36 | 37 | def cutwords(data, stopwords): 38 | #分词 39 | word_lis = [] 40 | for line in data: 41 | slist = jieba.cut(line, cut_all=False) 42 | output = " ".join(slist) 43 | for key in output.split(' '): 44 | if key not in stopwords: 45 | word_lis.append(key) 46 | return word_lis 47 | 48 | 49 | def main(): 50 | path_xls = ".\非遗国家级.xlsx" 51 | path_txt = ".\info.txt" 52 | path_stopword = ".\停用词.txt" 53 | title = readxls(path_xls, 0) 54 | info = readxls(path_xls, 8) 55 | stopwords = readtxt(path_stopword) # 读取停用词 56 | data = uni(title, info) # 标题和详细信息结合 57 | #random.shuffle(result) 58 | #train_list = data[:int(len(data) * 0.9)] 59 | #test_list = data[int(len(data) * 0.9):] 60 | writetxt(path_txt, data) # 输出全部训练数据 61 | #train_data = cutwords(train_list, stopwords) # 分词 62 | #test_data = cutwords(test_list, stopwords) 63 | #writetxt("./cut_words.txt", cutwords(data, stopwords)) # 分词 64 | 65 | 66 | if 
__name__=="__main__": 67 | main() 68 | -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/some_deal/divide.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | 3 | import jieba 4 | import jieba.analyse 5 | import numpy as np 6 | 7 | data0 = [] 8 | data1 = [] 9 | data2 = [] 10 | data3 = [] 11 | data4 = [] 12 | data5 = [] 13 | data6 = [] 14 | data7 = [] 15 | data8 = [] 16 | data9 = [] 17 | 18 | def readtxt(path): 19 | with codecs.open(path, 'r', 'utf8') as f: 20 | line = f.readlines() 21 | return line 22 | 23 | 24 | def divide(line, lables): 25 | for i, j in zip(lables, range(len(line))): 26 | line[j] = line[j].strip('\n') 27 | #res = [x.strip() for x in line[j] if x.strip() != ''] 28 | if i == 0: 29 | data0.append(line[j]) 30 | elif i == 1: 31 | data1.append(line[j]) 32 | elif i == 2: 33 | data2.append(line[j]) 34 | elif i == 3: 35 | data3.append(line[j]) 36 | elif i == 4: 37 | data4.append(line[j]) 38 | else: 39 | data5.append(line[j]) 40 | 41 | 42 | #print(data0) 43 | 44 | def extract_kw(data): 45 | #print(str(data)) 46 | kw = jieba.analyse.extract_tags(str(data), topK=30, withWeight=False, allowPOS=()) 47 | print(kw) 48 | 49 | 50 | def write(data, path): 51 | with codecs.open(path, 'a', encoding='utf8') as f: 52 | for line in data: 53 | f.write(line+' '+'\n') 54 | f.write('\n') 55 | f.close() 56 | 57 | 58 | def main(): 59 | path="D:/我/非遗/cut_words_entity.txt" 60 | pathtxt="D:/我/非遗/title_info.txt" 61 | path_lables = "D:\我\非遗\高维聚类结果\\400-ns\\6类_label.txt" 62 | line = readtxt(pathtxt) 63 | #print(line) 64 | lables = np.loadtxt(path_lables) 65 | #print(lables) 66 | divide(line, lables) 67 | write(data0, "D:\我\非遗\高维聚类结果\\400-6\\类1.txt") 68 | write(data1, "D:\我\非遗\高维聚类结果\\400-6\\类2.txt") 69 | write(data2, "D:\我\非遗\高维聚类结果\\400-6\\类3.txt") 70 | write(data3, "D:\我\非遗\高维聚类结果\\400-6\\类4.txt") 71 | write(data4, "D:\我\非遗\高维聚类结果\\400-6\\类5.txt") 72 | write(data5, "D:\我\非遗\高维聚类结果\\400-6\\类6.txt") 73 | #write(data6, "D:\我\非遗\高维聚类结果\\400-10\\类7.txt") 74 | #write(data7, "D:\我\非遗\高维聚类结果\\400-10\\类8.txt") 75 | #write(data8, "D:\我\非遗\高维聚类结果\\400-10\\类9.txt") 76 | #write(data9, "D:\我\非遗\高维聚类结果\\400-10\\类10.txt") 77 | 78 | main() -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/some_deal/keywords.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | 3 | 4 | def readtxt(path): 5 | with codecs.open(path,'r',encoding='utf-8') as f: 6 | line=f.readlines() #line为一个列表 一行一个元素 7 | line = [x.strip('\r\n') for x in line] 8 | print(line) 9 | return line 10 | 11 | def trans(data): 12 | words=[] 13 | for line in data: 14 | line = line.split(',') 15 | with codecs.open(r'D:\我\非遗\高维聚类结果\400维十类\掉包keywords_15.txt', 'a', 'utf-8')as f: 16 | for i in line: 17 | f.write(str(i)+'\n') 18 | f.write('\n') 19 | 20 | 21 | 22 | 23 | def main(): 24 | path6 = r'D:\我\非遗\高维聚类结果\400维六类数据\掉包keywords_15.txt' 25 | path8 = r'D:\我\非遗\高维聚类结果\400维八类\掉包keywords_15.txt' 26 | path10 = r'D:\我\非遗\高维聚类结果\400维十类\掉包keywords_15.txt' 27 | data = readtxt(path10) 28 | trans(data) 29 | 30 | 31 | 32 | if __name__ == '__main__': 33 | main() -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/some_deal/porpotion.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | 3 | import xlrd 4 | from pylab import * 5 | 6 | mpl.rcParams['font.sans-serif'] = ['SimHei'] 7 
| 8 | 9 | # 读取类别txt 10 | def readtxt(path): 11 | with codecs.open(path, 'r', 'utf8') as f: 12 | line = f.readlines() 13 | return line 14 | 15 | # 读取excel 非遗信息全 16 | def readxls(path,col): 17 | xl=xlrd.open_workbook(path) 18 | sheet=xl.sheets()[0] 19 | data=list(sheet.col_values(col))[1:] 20 | return data 21 | 22 | # 读类别里标题信息 23 | def ex_title(data): 24 | ti=[] 25 | temp=[] 26 | for line in data: 27 | l = line.split(' ') 28 | ti.append(l[0]) 29 | #print(ti) 30 | return ti 31 | 32 | 33 | # 去除多余空字符 34 | def corpus(data): 35 | final = [] 36 | for line in data: 37 | l = line.strip(' ').split(' ') 38 | res = [x.strip() for x in l if x.strip() != '\xa0' or x.strip() != '\u3000' or x.strip() !='\ue81b'\ 39 | or x.strip() !=' '] 40 | cor = " ".join(res) 41 | final.append(cor) 42 | #print(final) 43 | return final 44 | 45 | 46 | # 计算各类数量 47 | def calculate(ti,dic): 48 | sum=0 49 | list=[] 50 | dic1 = {'民间文学':0,'传统音乐':0,'传统舞蹈':0,'传统戏剧':0,'曲艺':0,'传统体育、游艺与杂技':0,\ 51 | '传统美术':0,'传统技艺':0,'传统医药':0,'民俗':0} 52 | #print(dic1) 53 | for i in dic: 54 | for t in ti: 55 | if t == i: 56 | list.append(t) 57 | dic1[dic[i]]+=1 58 | #else: 59 | # print(t) 60 | for v in dic1.values(): 61 | sum=sum+v 62 | for i in ti: 63 | if i not in list: 64 | print(i) 65 | print(sum) 66 | print(dic1) 67 | #print(list) 68 | #print(len(list)) 69 | return dic1 70 | 71 | def writetxt(path,txt): 72 | with codecs.open(path,'a','utf-8') as f: 73 | for i in txt: 74 | f.write(str(i)+'\n') 75 | 76 | 77 | def main(): 78 | path1="D:\我\非遗\高维聚类结果\\400-10\\类2.txt" 79 | #path_cla = "D:\我\非遗\高维聚类结果\标题加类别.txt" 80 | path_xls = "D:\我\非遗\非遗初始语料\非遗国家级.xlsx" 81 | #path_list = "D:\我\非遗\高维聚类结果\\400维六类数据\类4结果.txt" 82 | #ti_cla = readtxt(path_cla) 83 | #print(ti_cla) 84 | title = readxls(path_xls, 0) 85 | classes = readxls(path_xls, 4) 86 | final_title=corpus(title) # 去除奇异字符 87 | dic=dict(zip(final_title, corpus(classes))) # 标题:类别 字典 88 | #print(dic) 89 | 90 | data = readtxt(path1) 91 | ti = corpus(ex_title(data)) 92 | #print(ti) 93 | dic_num=calculate(ti,dic) 94 | 95 | #writetxt(path_list,calculate(ti,dic)) 96 | 97 | 98 | if __name__ == '__main__': 99 | main() -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/some_deal/shufa.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | 3 | import xlrd 4 | 5 | 6 | def readxls(path): 7 | xl=xlrd.open_workbook(path) 8 | sheet=xl.sheets()[0] 9 | data=[] 10 | for i in range(1, sheet.nrows): 11 | data.append(sheet.row_values(i)) 12 | return data 13 | 14 | 15 | def form(data): 16 | for i in data: 17 | if i[1]!='': 18 | i[1] = '\t'+'SC:'+i[1] 19 | if i[2]!='': 20 | i[2] = '\t' + 'USE:' + i[2] 21 | if i[3]!='': 22 | i[3] = '\t' + 'UF:' + i[3] 23 | if i[4]!='': 24 | i[4] = '\t' + 'AD:' + i[4] 25 | if i[5] != '': 26 | i[5] = '\t' + 'NT:' + i[5] 27 | if i[6] != '': 28 | i[6] = '\t' + 'BT:' + i[6] 29 | if i[7] != '': 30 | i[7] = '\t' + 'RT:' + i[7] 31 | for j in data: 32 | for k in range(len(j)): 33 | j[k]=j[k].replace('/','\n\t ') 34 | return data 35 | 36 | def write(path,data): 37 | with codecs.open(path,'w',encoding='utf8') as f: 38 | for i in data: 39 | for j in i: 40 | if j != '': 41 | f.write(j+'\n') 42 | f.write('\n') 43 | 44 | 45 | def main(): 46 | path=r'C:\Users\lenovo\Desktop\情报语言学\书法.xlsx' 47 | path_put = r'C:\Users\lenovo\Desktop\情报语言学\书法.txt' 48 | data = readxls(path) 49 | write(path_put,form(data)) 50 | 51 | 52 | main() -------------------------------------------------------------------------------- /第8章 
数字人文下的文本聚类/code/some_deal/test.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | 3 | import jieba 4 | 5 | 6 | def readtxt(path): 7 | with codecs.open(path,'r',encoding='utf-8') as f: 8 | line=f.readlines() #line为一个列表 一行一个元素 9 | line = [x.strip('\r\n') for x in line] 10 | #print(line) 11 | return line 12 | 13 | 14 | # jieba分词(去除停用词后每行分词) 15 | def cut_words(data, stopwords): 16 | cut_word=[] 17 | words='' 18 | for line in data: 19 | jieba.load_userdict('./heritage_entity.txt') 20 | slist=jieba.cut(line,cut_all=False) 21 | #slist = [x.strip() for x in list(slist) if x.strip() != '\xa0' or x.strip() != '\u3000' or x.strip() !='\ue81b'\ 22 | # or x.strip() !=' '] 23 | for key in slist: 24 | if key not in stopwords and key !=' ' and key != '\xa0' and key !='\u3000' and key !='\ue81b': 25 | words+=key+' ' 26 | #output=" ".join(words) 27 | words+='\n' 28 | cut_word.append(words) 29 | return cut_word 30 | 31 | 32 | # 去除停用词、重复词词总量 33 | def wordlist(cut_word, stopwords): 34 | final=[] 35 | for line in data: 36 | slist=jieba.cut(line,cut_all=False) 37 | output=" ".join(list(slist)) 38 | for key in output.split(' '): 39 | if (key not in stopwords) and (key not in cut_word): 40 | final.append(key) 41 | return final 42 | 43 | 44 | def write(path,data): 45 | with codecs.open(path,'a','utf-8')as f: 46 | for i in data: 47 | f.write(str(i)) 48 | 49 | def corpus(data): 50 | final = [] 51 | for line in data: 52 | l = line.strip(' ').split(' ') 53 | res = [x.strip() for x in l if x.strip() != '\xa0' or x.strip() != '\u3000' or x.strip() !='\ue81b'\ 54 | or x.strip() !=' '] 55 | cor = " ".join(res) 56 | final.append(cor) 57 | #print(final) 58 | return final 59 | 60 | def main(): 61 | path1 = "./title_info.txt" 62 | path2 = "./cut_words.txt" 63 | path3 = "./CW_noplace.txt" # 基于实体词典分词 64 | stopwords_path = "./停用词.txt" 65 | data = readtxt(path1) 66 | stopwords = readtxt(stopwords_path) 67 | #stopwords = [x.replace('\r\n', '') for x in stopwords] 68 | 69 | #cut_word = cut_words(data, stopwords) 70 | cut_word_entity = cut_words(data, stopwords) 71 | print(cut_word_entity) 72 | #print(corpus(cut_word_entity)) 73 | #write(path2, cut_word) 74 | write(path3, cut_word_entity) 75 | 76 | 77 | if __name__=="__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/some_deal/title_info.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | 3 | import xlrd 4 | 5 | 6 | def readxls(path,col): 7 | xl=xlrd.open_workbook(path) 8 | sheet=xl.sheets()[0] 9 | data=list(sheet.col_values(col))[1:] 10 | return data 11 | 12 | def readtxt(path): 13 | with codecs.open(path,'r','utf8') as f: 14 | line=f.readline() 15 | data=list(line) 16 | return data 17 | 18 | #union 19 | def uni(title,info): 20 | uni_lis=[] 21 | n='' 22 | for i,j in zip(title,info): 23 | n=i+' '+j 24 | uni_lis.append(n) 25 | return uni_lis 26 | 27 | def writetxt(path,txt): 28 | with codecs.open(path,'a','utf-8') as f: 29 | for i in txt: 30 | f.write(str(i)+'\n') 31 | 32 | def main(): 33 | path_xls=".\非遗国家级.xlsx" 34 | path_txt=".\\title_info.txt" 35 | path_stopword=".\停用词.txt" 36 | #path_txt=r'C:\Users\lenovo\Desktop\非遗\title_info_onehot.txt' 37 | title=readxls(path_xls,0) 38 | info=readxls(path_xls,8) 39 | classes = readxls(path_xls,4) 40 | stopwords=readtxt(path_stopword) 41 | #data=uni(title,info) 42 | #ti_class = uni(title,classes) 43 | #dic=dict.fromkeys(title,classes) 44 | 
print(dict(zip(title,classes))) 45 | #writetxt(path_txt, data) 46 | #writetxt("D:\我\非遗\高维聚类结果\标题加类别.txt",ti_class) 47 | ''''#分词 48 | word_lis=[] 49 | for line in data: 50 | slist = jieba.cut(line, cut_all=False) 51 | output = " ".join(slist) 52 | for key in output.split(' '): 53 | if key not in stopwords: 54 | word_lis.append(key) 55 | 56 | # 参考官方文档运用sklearn.feature_extraction.text.TfidfVectorizer,将corpus文本转换为tfidf值的svm向量 57 | tfidfvec = TfidfVectorizer() 58 | cop_tfidf = tfidfvec.fit_transform(word_lis) 59 | weight = cop_tfidf.toarray() 60 | 61 | 62 | #降维 63 | X = np.array(weight) # 导入数据 64 | pca = PCA(n_components=2) # 降到2维 65 | pca.fit(X) # 训练 66 | newX = pca.fit_transform(X) # 降维后的数据 67 | # PCA(copy=True, n_components=2, whiten=False) 68 | # print(pca.explained_variance_ratio_) #输出贡献率 69 | print(newX) 70 | 71 | #层次聚类 72 | X = newX 73 | ##设置分层聚类函数 74 | linkages = ['ward', 'average', 'complete'] 75 | n_clusters_ = 6 76 | ac = AgglomerativeClustering(linkage=linkages[2], n_clusters=n_clusters_) 77 | ##训练数据 78 | ac.fit(X) 79 | 80 | ##每个数据的分类 81 | lables = ac.labels_ 82 | 83 | ##绘图 84 | plt.figure(1) 85 | plt.clf() 86 | 87 | colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk') 88 | for k, col in zip(range(n_clusters_), colors): 89 | # 根据lables中的值是否等于k,重新组成一个True、False的数组 90 | my_members = lables == k 91 | ##X[my_members, 0] 取出my_members对应位置为True的值的横坐标 92 | plt.plot(X[my_members, 0], X[my_members, 1], col + '.') 93 | 94 | plt.title('Estimated number of clusters: %d' % n_clusters_) 95 | plt.show()''' 96 | 97 | if __name__=="__main__": 98 | main() 99 | -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/tsne_plot/3D+tsne2维画图.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | from matplotlib.ticker import NullFormatter 6 | from sklearn import manifold, datasets 7 | 8 | # # Next line to silence pyflakes. This import is needed. 
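# NOTE: as written below, the sentence-vector matrix loaded from
# w2v_sentence_vec_100D.txt is immediately overwritten by make_s_curve(), so the
# figure shows the scikit-learn S-curve demo data rather than the loaded vectors.
# A minimal change to embed the loaded vectors instead (label path assumed, mirroring
# tsne.py in this folder) would be:
#
#   X = np.loadtxt("w2v_sentence_vec_100D.txt")
#   color = np.loadtxt("100-lables.txt")   # cluster labels, used only for colouring
#   # ...and skip the make_s_curve() call; the 3D S-curve subplot then no longer applies.
#
# Note also that sklearn.datasets.samples_generator has been removed in recent
# scikit-learn releases; make_s_curve is now imported directly from sklearn.datasets.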
9 | # Axes3D 10 | 11 | n_points = 2690 12 | # X是一个(1000, 3)的2维数据,color是一个(1000,)的1维数据 13 | X = np.loadtxt("D:\我\非遗\Word2vector\w2v_sentence_vec_100D.txt") 14 | X, color = datasets.samples_generator.make_s_curve(n_points, random_state=0) 15 | n_neighbors = 10 16 | n_components = 2 17 | 18 | fig = plt.figure(figsize=(8, 8)) 19 | # 创建了一个figure,标题为"Manifold Learning with 1000 points, 10 neighbors" 20 | plt.suptitle("Manifold Learning with %i points, %i neighbors" 21 | % (1000, n_neighbors), fontsize=14) 22 | 23 | 24 | '''绘制S曲线的3D图像''' 25 | ax = fig.add_subplot(211, projection='3d') 26 | ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=color, cmap=plt.cm.Spectral) 27 | ax.view_init(4, -72) # 初始化视角 28 | 29 | '''t-SNE''' 30 | t0 = time() 31 | tsne = manifold.TSNE(n_components=n_components, init='pca', random_state=0) 32 | Y = tsne.fit_transform(X) # 转换后的输出 33 | t1 = time() 34 | print("t-SNE: %.2g sec" % (t1 - t0)) # 算法用时 35 | ax = fig.add_subplot(2, 1, 2) 36 | plt.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral) 37 | plt.title("t-SNE (%.2g sec)" % (t1 - t0)) 38 | ax.xaxis.set_major_formatter(NullFormatter()) # 设置标签显示格式为空 39 | ax.yaxis.set_major_formatter(NullFormatter()) 40 | # plt.axis('tight') 41 | 42 | plt.show() -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/tsne_plot/heatmap.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import seaborn as sns 3 | from pylab import * 4 | 5 | mpl.rcParams['font.sans-serif'] = ['SimHei'] 6 | 7 | 8 | labels = ['民间文学', '传统音乐', '传统舞蹈', '传统戏剧', '曲艺', '传统体育、游艺与杂技', '传统美术', \ 9 | '传统技艺', '传统医药', '民俗'] 10 | classes = ['1', '2', '3', '4', '5', '6'] 11 | sum=[191,352,278,422,175,102,307,427,89,347] 12 | six = [927, 693, 235, 377, 209, 249] 13 | eight = [174, 141, 693, 920, 209, 198, 110, 245] 14 | 15 | def hotmap(X): 16 | '''for r,i in zip(range(len(eight)),X): 17 | print(i) 18 | for j in range(len(i)): 19 | i[j]=i[j]/eight[r]''' 20 | for i in X: 21 | print(i) 22 | for j,k in zip(range(len(i)),range(len(sum))): 23 | i[j]=i[j]/sum[k] 24 | #print(X) 25 | X = np.transpose(X) 26 | print(X) 27 | dt = pd.DataFrame(X, columns=['类1', '类2', '类3', '类4', '类5', '类6'], index=labels) 28 | print(dt) 29 | # pt = dt.pivot(index=labels,columns=classes,values=0) 30 | # cmap用matplotlib colormap 31 | ax = sns.heatmap(dt, cmap='YlGnBu') 32 | # rainbow为 matplotlib 的colormap名称 33 | ax.set_title('six classes heatmap') 34 | # ax.set_xlabel('classes') 35 | # ax.set_ylabel('') 36 | plt.show() 37 | 38 | 39 | def main(): 40 | w2v_six_class = [[82,127,55,182,31,31,118,158,28,115],[46,98,97,86,84,42,79,95,28,38],[7,31,25,45,2,8,16,31,11,59],[28,45,54,54,19,12,48,61,15,41],\ 41 | [13,4,12,38,6,6,16,29,3,82],[15,48,35,17,33,3,30,52,4,12]] 42 | w2v_eight = [[17,28,28,24,12,8,30,19,2,6],[12,23,13,8,30,2,14,31,3,5],[46,98,97,86,84,42,79,95,28,38],\ 43 | [87,127,52,179,27,31,121,157,28,111],[13,4,12,38,6,6,16,29,3,82],[6,25,17,39,2,7,13,25,11,53],\ 44 | [3,25,22,9,4,1,16,22,1,7],[7,23,37,39,10,5,18,48,13,45]] 45 | w2v_ten = [[12,23,13,8,30,2,14,31,3,5],[80,118,45,152,26,27,113,146,25,94],[46,98,97,86,84,42,79,95,28,38],\ 46 | [12,4,12,38,6,6,16,29,3,82],[7,19,18,40,0,8,9,18,2,52],[17,27,28,24,12,8,30,17,2,4],[3,25,22,9,3,1,16,21,1,7],\ 47 | [1,8,4,7,2,0,5,14,9,9],[0,1,0,0,1,0,0,1,0,0],[13,30,39,58,11,8,25,54,16,56]] 48 | hotmap(w2v_six_class) 49 | 50 | if __name__ == '__main__': 51 | main() 52 | 53 | -------------------------------------------------------------------------------- /第8章 
数字人文下的文本聚类/code/tsne_plot/tsne.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn.manifold import TSNE 4 | 5 | 6 | def plot_embedding(X, lables): 7 | tsne = TSNE(perplexity=30, n_components=2, init='pca') # TSNE降维,降到2D 8 | data = tsne.fit_transform(X) 9 | 10 | x_min, x_max = np.min(data, 0), np.max(data, 0) 11 | data = (data - x_min) / (x_max - x_min) 12 | 13 | plt.figure() 14 | for i in range(data.shape[0]): 15 | plt.text(data[i, 0], data[i, 1], str(lables[i]), 16 | color=plt.cm.Set1(lables[i] / 10.), 17 | fontdict={'weight': 'bold', 'size': 9}) 18 | plt.xticks([]) 19 | plt.yticks([]) 20 | plt.show() 21 | #plt.title(title) 22 | 23 | def main(): 24 | X = np.loadtxt("D:\我\非遗\Word2vector\w2v_sentence_vec_100D.txt") 25 | lables = np.loadtxt("D:\我\非遗\Word2vector\标签\\100-lables.txt") 26 | tsne = TSNE(perplexity=30, n_components=2, init='pca') # TSNE降维,降到2D 27 | data = tsne.fit_transform(X) 28 | print(data) 29 | plot_embedding(data, lables) 30 | 31 | 32 | main() 33 | ''' 34 | X = np.loadtxt("D:\我\非遗\Word2vector\w2v_sentence_vec_100D.txt") 35 | labels = np.loadtxt("D:\我\非遗\Word2vector\标签\\100-lables.txt") 36 | tsne = TSNE(perplexity=30, n_components=2, init='pca') # TSNE降维,降到2D 37 | data = tsne.fit_transform(X) 38 | plt.figure(1) 39 | plt.clf() 40 | colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk') 41 | for k, col in zip(range(6), colors): 42 | # 根据lables中的值是否等于k,重新组成一个True、False的数组 43 | my_members = labels == k 44 | # X[my_members, 0] 取出my_members对应位置为True的值的横坐标 45 | plt.plot(data[my_members, 0], data[my_members, 1], col + '.') 46 | 47 | #plt.title('Estimated number of clusters: %d' % n_clusters_) 48 | plt.show()''' 49 | -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/tsne_plot/ttt.py: -------------------------------------------------------------------------------- 1 | from itertools import cycle 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | from sklearn import manifold 6 | 7 | '''# read data 8 | path="D:\我\非遗\cut_words_entity.txt" 9 | num = [] 10 | with codecs.open(path, 'r', 'utf8') as f: 11 | line = f.readlines() 12 | for i in line: 13 | l=i.strip().replace('\n', '').split(' ') 14 | res = [x.strip() for x in l if x.strip() != ''] 15 | print(res) 16 | num.append(len(res)) 17 | print(max(num)) 18 | print(min(num)) 19 | print(np.average(num))''' 20 | X = np.loadtxt("D:\我\非遗\Word2vector\w2v_sentence_vec_50D.txt") 21 | y = np.loadtxt("D:\我\非遗\Word2vector\标签\lables_50.txt") 22 | tsne = manifold.TSNE(n_components=2, init='pca', random_state=501) 23 | X_tsne = tsne.fit_transform(X) 24 | 25 | print("Org data dimension is {}.\ 26 | Embedded data dimension is {}".format(X.shape[-1], X_tsne.shape[-1])) 27 | 28 | #嵌入空间可视化 29 | colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk') 30 | x_min, x_max = X_tsne.min(0), X_tsne.max(0) 31 | X_norm = (X_tsne - x_min) / (x_max - x_min) # 归一化 32 | plt.figure(figsize=(8, 8)) 33 | for i in range(X_norm.shape[0],): 34 | plt.text(X_norm[i, 0], X_norm[i, 1], str(y[i]), color=plt.cm.Set1(y[i] /20.), 35 | fontdict={'weight': 'bold', 'size': 9}) 36 | plt.xticks([]) 37 | plt.yticks([]) 38 | plt.show() 39 | ''' 40 | plt.figure(1) 41 | plt.clf() 42 | colors =['k','darkgrey','brown','r','peru','tan','gold','olive','y','sage','palegreen','g','c','deepskyblue','b','m','pink'] 43 | #colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk') 44 | for k, col in zip(range(6), colors): 45 | # 
根据lables中的值是否等于k,重新组成一个True、False的数组 46 | my_members = y == k 47 | # X[my_members, 0] 取出my_members对应位置为True的值的横坐标 48 | plt.plot(X_tsne[my_members, 0], X_tsne[my_members, 1], col + '.') 49 | plt.show()''' 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/vsm.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import math 3 | import os 4 | 5 | from sklearn.feature_extraction.text import CountVectorizer 6 | from sklearn.feature_extraction.text import TfidfTransformer 7 | 8 | path = r"C:\Users\lenovo\Desktop\信息检索系统\文摘" 9 | 10 | def readtxt(path): 11 | cate = [path +'\\'+ x for x in os.listdir(path)] 12 | print(cate) 13 | data = [] 14 | for f in cate: 15 | with codecs.open(f, 'r', 'utf8') as f: 16 | line = f.readlines() 17 | data.append(line) 18 | data_final=[] 19 | for i in range(len(data)): 20 | ll = '' 21 | for l in data[i]: 22 | l = l.replace('\r\n','').replace('.','').replace(',','').replace('"','').replace('--','').replace('\'','') 23 | ll = ll+l 24 | data_final.append(ll) 25 | print(data_final) 26 | return data_final 27 | 28 | 29 | if __name__ == "__main__": 30 | #corpus = [] # 文档预料 空格连接 31 | corpus = readtxt(path) 32 | # 读取预料 一行预料为一个文档 33 | #path1 = "D:\我\非遗\高维聚类结果\\350_类11.txt" 34 | #for line in open(path1, 'r', encoding="utf-8").readlines(): 35 | # corpus.append(line.strip()) 36 | # print corpus 37 | #time.sleep(5) 38 | 39 | # 将文本中的词语转换为词频矩阵 矩阵元素a[i][j] 表示j词在i类文本下的词频 40 | vectorizer = CountVectorizer( 41 | min_df=0, 42 | token_pattern=r"\b\w+\b" 43 | ) 44 | 45 | # 该类会统计每个词语的tf-idf权值 46 | transformer = TfidfTransformer() 47 | 48 | # 第一个fit_transform是计算tf-idf 第二个fit_transform是将文本转为词频矩阵 49 | tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus)) 50 | 51 | # 获取词袋模型中的所有词语 52 | word = vectorizer.get_feature_names() 53 | print(len(word)) 54 | # 将tf-idf矩阵抽取出来,元素w[i][j]表示j词在i类文本中的tf-idf权重 55 | weight = tfidf.toarray() 56 | print(weight) 57 | #path2 = "./words_tfidf2.txt" 58 | #result = codecs.open(path2, 'w', 'utf-8') 59 | #for j in range(len(word)): 60 | # result.write(word[j] + ' ') 61 | #result.write('\r\n\r\n') 62 | # 打印每类文本的tf-idf词语权重,第一个for遍历所有文本,第二个for便利某一类文本下的词语权重 63 | sum = 0 64 | sq1 = 0 65 | sq2 = 0 66 | for i in range(len(weight[0])): 67 | sum += weight[0][i] * weight[1][i] 68 | sq1 += pow(weight[0][i], 2) 69 | sq2 += pow(weight[1][i], 2) 70 | try: 71 | result = round(float(sum) / (math.sqrt(sq1) * math.sqrt(sq2)), 2) 72 | except ZeroDivisionError: 73 | result = 0.0 74 | print(result) 75 | 76 | #result.close() 77 | 78 | -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/word2vec/doc2vec.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | 3 | import gensim 4 | import numpy as np 5 | from gensim.models.doc2vec import Doc2Vec 6 | 7 | TaggededDocument = gensim.models.doc2vec.TaggedDocument 8 | 9 | 10 | def readtxt(path): 11 | data = [] 12 | with codecs.open(path,'r',encoding='utf-8') as f: 13 | doc = f.readlines() 14 | #for line in f.readlines(): 15 | #line = line.strip('\n') 16 | #data.append(line) 17 | return doc 18 | 19 | 20 | def train(x_train): 21 | # D2V参数解释: 22 | # min_count:忽略所有单词中单词频率小于这个值的单词。 23 | # window:窗口的尺寸。(句子中当前和预测单词之间的最大距离) 24 | # size:特征向量的维度 25 | # sample:高频词汇的随机降采样的配置阈值,默认为1e-3,范围是(0,1e-5)。 26 | # negative: 如果>0,则会采用negativesampling,用于设置多少个noise words(一般是5-20)。默认值是5。 27 | # workers:用于控制训练的并行数。 28 | model_dm = 
Doc2Vec(x_train, min_count=1, window=3, vector_size=160, sample=1e-3, negative=5, workers=4) 29 | # total_examples:统计句子数 30 | # epochs:在语料库上的迭代次数(epochs)。 31 | model_dm.train(x_train, total_examples=model_dm.corpus_count, epochs=70) 32 | model_dm.save('d2v_heritage_160.model') 33 | 34 | return model_dm 35 | 36 | def test(): 37 | model_dm = Doc2Vec.load("model/model_dm_wangyi") 38 | test_text = ['《', '舞林', '争霸' '》', '十强' '出炉', '复活', '舞者', '澳门', '踢馆'] 39 | inferred_vector_dm = model_dm.infer_vector(test_text) 40 | print(inferred_vector_dm) 41 | sims = model_dm.docvecs.most_similar([inferred_vector_dm], topn=10) 42 | 43 | return sims 44 | 45 | '''sims = test() 46 | for count, sim in sims: 47 | sentence = x_train[count] 48 | words = '' 49 | for word in sentence[0]: 50 | words = words + word + ' ' 51 | print(words, sim, len(sentence[0])) 52 | ''' 53 | 54 | 55 | def get_dataset(): 56 | with open(r'D:\我\非遗\cut_words_entity.txt', 'r', encoding='utf8') as f: 57 | docs = f.readlines() 58 | print(len(docs)) 59 | 60 | x_train = [] 61 | # y = np.concatenate(np.ones(len(docs))) 62 | for i, text in enumerate(docs): 63 | word_list = text.split(' ') 64 | l = len(word_list) 65 | word_list[l - 1] = word_list[l - 1].strip() 66 | document = TaggededDocument(word_list, tags=[i]) 67 | x_train.append(document) 68 | 69 | return x_train 70 | 71 | 72 | def main(): 73 | path = r'D:\我\非遗\cut_words_entity.txt' 74 | x_train = get_dataset() 75 | #print(x_train) 76 | train(x_train) 77 | #test() 78 | data = [] 79 | model = Doc2Vec.load('d2v_heritage_160.model') 80 | for i in range(2690): 81 | data.append(model.docvecs[i]) 82 | X = np.array(data) 83 | np.savetxt("./d2v_heritage_160.txt", X) 84 | # print(model.docvecs[10]) 85 | 86 | 87 | main() -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/word2vec/word2vector.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gensim.models import Word2Vec 3 | from gensim.models.word2vec import LineSentence 4 | 5 | 6 | # 训练word2vec模型 参数说明: 7 | # sentences: 包含句子的list,或迭代器 8 | # size: 词向量的维数,size越大需要越多的训练数据,同时能得到更好的模型 9 | # alpha: 初始学习速率,随着训练过程递减,最后降到 min_alpha 10 | # window: 上下文窗口大小,即预测当前这个词的时候最多使用距离为window大小的词 11 | # max_vocab_size: 词表大小,如果实际词的数量超过了这个值,过滤那些频率低的 12 | # workers: 并行度 13 | # iter: 训练轮数 14 | # sg=0 cbow,sg=1 skip-gram 15 | # hs=0 negative sampling, hs=1 hierarchy 16 | #sentences = word2vec.Text8Corpus(r'D:\我\非遗\cut_words_entity') 17 | #model.save('heritage.model') 保存模型 18 | # https://blog.csdn.net/laobai1015/article/details/86540813 参数解释 19 | 20 | def build_vec(list_sentence, model): 21 | list_vec_sentence = [] 22 | for sentence in list_sentence: # 每个sentence为一个list 23 | 24 | if len(sentence) > 1000: 25 | arrlists = [model[word] for word in sentence[0:1000]] 26 | x = np.average(arrlists, axis=0) 27 | else: 28 | arrlists = [model[word] for word in sentence] 29 | x = np.average(arrlists, axis=0) 30 | list_vec_sentence.append(x) 31 | return list_vec_sentence 32 | 33 | 34 | def main(): 35 | path = r'D:\我\非遗\cut_words_entity.txt' 36 | sentences = LineSentence(path) 37 | model = Word2Vec(sentences, sg=0, size=100, min_count=0) # sg=0 cbow,hs=0默认 negative sampling 38 | # model.wv.save_word2vec_format('heritage_word_100.bin', binary=True) 39 | model.save('heritage_ns_100.model') 40 | vec_sentence = build_vec(sentences, model) 41 | #print(vec_sentence) 42 | list_vec_sentence = [] 43 | for s in sentences: 44 | for word in s: 45 | arrlists = [model[word]] 46 | x = 
np.average(arrlists, axis=0) 47 | list_vec_sentence.append(x) 48 | np.savetxt("w2v_sentence_vec_100D_ns.txt", list_vec_sentence) 49 | ''' 50 | 51 | model = Word2Vec.load('./heritage.model') 52 | word = model.most_similar("赛龙舟") 53 | print(word) 54 | #print(model['']) 55 | ''' 56 | 57 | if __name__ == '__main__': 58 | main() -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/分词/中科院分词.py: -------------------------------------------------------------------------------- 1 | import pynlpir 2 | pynlpir.open() 3 | import codecs 4 | from ctypes import c_char_p 5 | 6 | 7 | def readtxt(path): 8 | with codecs.open(path,'r',encoding='utf-8') as f: 9 | line=f.readlines() #line为一个列表 一行一个元素 10 | line = [x.strip('\r\n') for x in line] 11 | #print(line) 12 | return line 13 | 14 | def cutwords(data,stopwords,en): 15 | words='' 16 | cut_word= [] 17 | for word in en: 18 | pynlpir.nlpir.ImportUserDict(c_char_p(word.encode())) 19 | for line in data: 20 | slist = pynlpir.segment(line, pos_tagging=False) 21 | print(slist) 22 | for key in slist: 23 | if key not in stopwords and key != ' ' and key != '\xa0' and key != '\u3000' and key != '\ue81b': 24 | words += key+' ' 25 | # output=" ".join(words) 26 | words += '\n' 27 | cut_word.append(words) 28 | print(cut_word) 29 | return cut_word 30 | 31 | 32 | def write(path,data): 33 | with codecs.open(path,'a','utf-8')as f: 34 | for i in data: 35 | f.write(str(i)) 36 | 37 | 38 | def main(): 39 | path1 = "D:\我\非遗\example.txt" 40 | stopwords_path = "D:\我\非遗\停用词.txt" 41 | path_entity = "D:\我\非遗\heritage_entity.txt" 42 | data = readtxt(path1) 43 | stopwords = readtxt(stopwords_path) 44 | entity = readtxt(path_entity) 45 | #print(entity) 46 | cutwords(data,stopwords,entity) 47 | 48 | 49 | if __name__ == '__main__': 50 | main() -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/第八章 数字人文下的文本聚类.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第8章 数字人文下的文本聚类/第八章 数字人文下的文本聚类.pdf -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/非遗信息 全.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第8章 数字人文下的文本聚类/非遗信息 全.xlsx -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/README.md: -------------------------------------------------------------------------------- 1 | ## 机器自动翻译源代码 2 | 3 | 使用开源的opennmt进行古汉语到现代汉语和英语自动翻译的源代码,详细使用说明见教材第九章的内容。 4 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/README.md: -------------------------------------------------------------------------------- 1 | ## 一、使用方法 2 | 3 | 1. 下载本仓库 4 | 2. 从百度网盘下载模型文件,链接:https://pan.baidu.com/s/1md7mVEH46AmisIZXwRkKrQ 提取码:ckdx 5 | 3. 
运行translate.py,根据需求更改输入 6 | 7 | ## 二、文件说明 8 | 9 | ### 1、data 10 | 11 | 包含原始训练语料src-train.txt、tgt-train.txt及验证测试语料src-valid.txt、tgt-valid.txt、src-test_.txt、tgt-test.txt共6个文件; 12 | 13 | 其中src开头为古文语料,tgt为目标英文语料 14 | 15 | ### 2、model 16 | 17 | 已训练完毕的翻译模型model.pt,因模型文件过大,已上传到百度网盘,链接:https://pan.baidu.com/s/1md7mVEH46AmisIZXwRkKrQ 提取码:ckdx 18 | 19 | ### 3、omnt 20 | 21 | 项目依赖文件 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/data/pred.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/data/src-test.txt: -------------------------------------------------------------------------------- 1 | 虽然 , 每 至 于 族 , 吾 见 其 难为 , 怵然 为戒 , 视 为 止 , 行 为迟 。 -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/data/tgt-test.txt: -------------------------------------------------------------------------------- 1 | test -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/model/说明.txt: -------------------------------------------------------------------------------- 1 | 将从网盘下载的model.pt文件放在此文件夹内 -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/.idea/onmt.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/__init__.py: -------------------------------------------------------------------------------- 1 | """ Main entry point of the ONMT library """ 2 | from __future__ import division, print_function 3 | 4 | import sys 5 | 6 | import onmt.utils.optimizers 7 | 8 | onmt.utils.optimizers.Optim = onmt.utils.optimizers.Optimizer 9 | sys.modules["onmt.Optim"] = onmt.utils.optimizers 10 | 11 | # For Flake 12 | __all__ = [onmt.inputters, onmt.encoders, onmt.decoders, onmt.models, 13 | onmt.utils, onmt.modules, "Trainer"] 14 | 15 | __version__ = "1.1.1" 16 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/bin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第9章 数字人文下的机器翻译/opennmt/onmt/bin/__init__.py -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/bin/translate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 
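# Usage sketch for the workflow described in the README above (paths are assumptions
# based on that layout: the downloaded model.pt placed under model/ and the test
# sentences under data/; adjust names as needed). From the opennmt/ directory:
#
#   python translate.py -model model/model.pt \
#       -src data/src-test.txt -output data/pred.txt \
#       -replace_unk -verbose
#
# -model points at the trained checkpoint, -src at the pre-segmented classical-Chinese
# input, and -output at the file the translations are written to; the full set of
# options accepted here is defined in onmt/opts.py (translate_opts).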
from __future__ import unicode_literals 5 | 6 | import onmt.opts as opts 7 | from onmt.translate.translator import build_translator 8 | from onmt.utils.logging import init_logger 9 | from onmt.utils.misc import split_corpus 10 | from onmt.utils.parse import ArgumentParser 11 | 12 | 13 | def translate(opt): 14 | ArgumentParser.validate_translate_opts(opt) 15 | logger = init_logger(opt.log_file) 16 | 17 | translator = build_translator(opt, report_score=True) 18 | src_shards = split_corpus(opt.src, opt.shard_size) 19 | tgt_shards = split_corpus(opt.tgt, opt.shard_size) 20 | shard_pairs = zip(src_shards, tgt_shards) 21 | 22 | for i, (src_shard, tgt_shard) in enumerate(shard_pairs): 23 | logger.info("Translating shard %d." % i) 24 | translator.translate( 25 | src=src_shard, 26 | tgt=tgt_shard, 27 | src_dir=opt.src_dir, 28 | batch_size=opt.batch_size, 29 | batch_type=opt.batch_type, 30 | attn_debug=opt.attn_debug, 31 | align_debug=opt.align_debug 32 | ) 33 | 34 | 35 | def _get_parser(): 36 | parser = ArgumentParser(description='translate.py') 37 | 38 | opts.config_opts(parser) 39 | opts.translate_opts(parser) 40 | return parser 41 | 42 | 43 | def main(): 44 | parser = _get_parser() 45 | 46 | opt = parser.parse_args() 47 | translate(opt) 48 | 49 | 50 | if __name__ == "__main__": 51 | main() 52 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/decoders/__init__.py: -------------------------------------------------------------------------------- 1 | """Module defining decoders.""" 2 | from onmt.decoders.cnn_decoder import CNNDecoder 3 | from onmt.decoders.decoder import InputFeedRNNDecoder, \ 4 | StdRNNDecoder 5 | from onmt.decoders.transformer import TransformerDecoder 6 | 7 | str2dec = {"rnn": StdRNNDecoder, "ifrnn": InputFeedRNNDecoder, 8 | "cnn": CNNDecoder, "transformer": TransformerDecoder 9 | } 10 | 11 | __all__ = ["DecoderBase", "TransformerDecoder", "StdRNNDecoder", "CNNDecoder", 12 | "InputFeedRNNDecoder", "str2dec"] 13 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/encoders/__init__.py: -------------------------------------------------------------------------------- 1 | """Module defining encoders.""" 2 | from onmt.encoders.cnn_encoder import CNNEncoder 3 | from onmt.encoders.ggnn_encoder import GGNNEncoder 4 | from onmt.encoders.mean_encoder import MeanEncoder 5 | from onmt.encoders.rnn_encoder import RNNEncoder 6 | from onmt.encoders.transformer import TransformerEncoder 7 | 8 | str2enc = {"ggnn": GGNNEncoder, "rnn": RNNEncoder, "brnn": RNNEncoder, 9 | "cnn": CNNEncoder, "transformer": TransformerEncoder, 10 | "mean": MeanEncoder} 11 | 12 | __all__ = ["EncoderBase", "TransformerEncoder", "RNNEncoder", "CNNEncoder", 13 | "MeanEncoder", "str2enc"] 14 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/encoders/cnn_encoder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of "Convolutional Sequence to Sequence Learning" 3 | """ 4 | import torch.nn as nn 5 | 6 | from onmt.encoders.encoder import EncoderBase 7 | from onmt.utils.cnn_factory import shape_transform, StackedCNN 8 | 9 | SCALE_WEIGHT = 0.5 ** 0.5 10 | 11 | 12 | class CNNEncoder(EncoderBase): 13 | """Encoder based on "Convolutional Sequence to Sequence Learning" 14 | :cite:`DBLP:journals/corr/GehringAGYD17`. 
15 | """ 16 | 17 | def __init__(self, num_layers, hidden_size, 18 | cnn_kernel_width, dropout, embeddings): 19 | super(CNNEncoder, self).__init__() 20 | 21 | self.embeddings = embeddings 22 | input_size = embeddings.embedding_size 23 | self.linear = nn.Linear(input_size, hidden_size) 24 | self.cnn = StackedCNN(num_layers, hidden_size, 25 | cnn_kernel_width, dropout) 26 | 27 | @classmethod 28 | def from_opt(cls, opt, embeddings): 29 | """Alternate constructor.""" 30 | return cls( 31 | opt.enc_layers, 32 | opt.enc_rnn_size, 33 | opt.cnn_kernel_width, 34 | opt.dropout[0] if type(opt.dropout) is list else opt.dropout, 35 | embeddings) 36 | 37 | def forward(self, input, lengths=None, hidden=None): 38 | """See :class:`onmt.modules.EncoderBase.forward()`""" 39 | self._check_args(input, lengths, hidden) 40 | 41 | emb = self.embeddings(input) 42 | # s_len, batch, emb_dim = emb.size() 43 | 44 | emb = emb.transpose(0, 1).contiguous() 45 | emb_reshape = emb.view(emb.size(0) * emb.size(1), -1) 46 | emb_remap = self.linear(emb_reshape) 47 | emb_remap = emb_remap.view(emb.size(0), emb.size(1), -1) 48 | emb_remap = shape_transform(emb_remap) 49 | out = self.cnn(emb_remap) 50 | 51 | return emb_remap.squeeze(3).transpose(0, 1).contiguous(), \ 52 | out.squeeze(3).transpose(0, 1).contiguous(), lengths 53 | 54 | def update_dropout(self, dropout): 55 | self.cnn.dropout.p = dropout 56 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/encoders/encoder.py: -------------------------------------------------------------------------------- 1 | """Base class for encoders and generic multi encoders.""" 2 | 3 | import torch.nn as nn 4 | 5 | from onmt.utils.misc import aeq 6 | 7 | 8 | class EncoderBase(nn.Module): 9 | """ 10 | Base encoder class. Specifies the interface used by different encoder types 11 | and required by :class:`onmt.Models.NMTModel`. 12 | 13 | .. mermaid:: 14 | 15 | graph BT 16 | A[Input] 17 | subgraph RNN 18 | C[Pos 1] 19 | D[Pos 2] 20 | E[Pos N] 21 | end 22 | F[Memory_Bank] 23 | G[Final] 24 | A-->C 25 | A-->D 26 | A-->E 27 | C-->F 28 | D-->F 29 | E-->F 30 | E-->G 31 | """ 32 | 33 | @classmethod 34 | def from_opt(cls, opt, embeddings=None): 35 | raise NotImplementedError 36 | 37 | def _check_args(self, src, lengths=None, hidden=None): 38 | n_batch = src.size(1) 39 | if lengths is not None: 40 | n_batch_, = lengths.size() 41 | aeq(n_batch, n_batch_) 42 | 43 | def forward(self, src, lengths=None): 44 | """ 45 | Args: 46 | src (LongTensor): 47 | padded sequences of sparse indices ``(src_len, batch, nfeat)`` 48 | lengths (LongTensor): length of each sequence ``(batch,)`` 49 | 50 | 51 | Returns: 52 | (FloatTensor, FloatTensor): 53 | 54 | * final encoder state, used to initialize decoder 55 | * memory bank for attention, ``(src_len, batch, hidden)`` 56 | """ 57 | 58 | raise NotImplementedError 59 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/encoders/mean_encoder.py: -------------------------------------------------------------------------------- 1 | """Define a minimal encoder.""" 2 | import torch 3 | from onmt.encoders.encoder import EncoderBase 4 | from onmt.utils.misc import sequence_mask 5 | 6 | 7 | class MeanEncoder(EncoderBase): 8 | """A trivial non-recurrent encoder. Simply applies mean pooling. 
9 | 10 | Args: 11 | num_layers (int): number of replicated layers 12 | embeddings (onmt.modules.Embeddings): embedding module to use 13 | """ 14 | 15 | def __init__(self, num_layers, embeddings): 16 | super(MeanEncoder, self).__init__() 17 | self.num_layers = num_layers 18 | self.embeddings = embeddings 19 | 20 | @classmethod 21 | def from_opt(cls, opt, embeddings): 22 | """Alternate constructor.""" 23 | return cls( 24 | opt.enc_layers, 25 | embeddings) 26 | 27 | def forward(self, src, lengths=None): 28 | """See :func:`EncoderBase.forward()`""" 29 | self._check_args(src, lengths) 30 | 31 | emb = self.embeddings(src) 32 | _, batch, emb_dim = emb.size() 33 | 34 | if lengths is not None: 35 | # we avoid padding while mean pooling 36 | mask = sequence_mask(lengths).float() 37 | mask = mask / lengths.unsqueeze(1).float() 38 | mean = torch.bmm(mask.unsqueeze(1), emb.transpose(0, 1)).squeeze(1) 39 | else: 40 | mean = emb.mean(0) 41 | 42 | mean = mean.expand(self.num_layers, batch, emb_dim) 43 | memory_bank = emb 44 | encoder_final = (mean, mean) 45 | return encoder_final, memory_bank, lengths 46 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/encoders/rnn_encoder.py: -------------------------------------------------------------------------------- 1 | """Define RNN-based encoders.""" 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from onmt.encoders.encoder import EncoderBase 5 | from onmt.utils.rnn_factory import rnn_factory 6 | from torch.nn.utils.rnn import pack_padded_sequence as pack 7 | from torch.nn.utils.rnn import pad_packed_sequence as unpack 8 | 9 | 10 | class RNNEncoder(EncoderBase): 11 | """ A generic recurrent neural network encoder. 12 | 13 | Args: 14 | rnn_type (str): 15 | style of recurrent unit to use, one of [RNN, LSTM, GRU, SRU] 16 | bidirectional (bool) : use a bidirectional RNN 17 | num_layers (int) : number of stacked layers 18 | hidden_size (int) : hidden size of each layer 19 | dropout (float) : dropout value for :class:`torch.nn.Dropout` 20 | embeddings (onmt.modules.Embeddings): embedding module to use 21 | """ 22 | 23 | def __init__(self, rnn_type, bidirectional, num_layers, 24 | hidden_size, dropout=0.0, embeddings=None, 25 | use_bridge=False): 26 | super(RNNEncoder, self).__init__() 27 | assert embeddings is not None 28 | 29 | num_directions = 2 if bidirectional else 1 30 | assert hidden_size % num_directions == 0 31 | hidden_size = hidden_size // num_directions 32 | self.embeddings = embeddings 33 | 34 | self.rnn, self.no_pack_padded_seq = \ 35 | rnn_factory(rnn_type, 36 | input_size=embeddings.embedding_size, 37 | hidden_size=hidden_size, 38 | num_layers=num_layers, 39 | dropout=dropout, 40 | bidirectional=bidirectional) 41 | 42 | # Initialize the bridge layer 43 | self.use_bridge = use_bridge 44 | if self.use_bridge: 45 | self._initialize_bridge(rnn_type, 46 | hidden_size, 47 | num_layers) 48 | 49 | @classmethod 50 | def from_opt(cls, opt, embeddings): 51 | """Alternate constructor.""" 52 | return cls( 53 | opt.rnn_type, 54 | opt.brnn, 55 | opt.enc_layers, 56 | opt.enc_rnn_size, 57 | opt.dropout[0] if type(opt.dropout) is list else opt.dropout, 58 | embeddings, 59 | opt.bridge) 60 | 61 | def forward(self, src, lengths=None): 62 | """See :func:`EncoderBase.forward()`""" 63 | self._check_args(src, lengths) 64 | 65 | emb = self.embeddings(src) 66 | # s_len, batch, emb_dim = emb.size() 67 | 68 | packed_emb = emb 69 | if lengths is not None and not self.no_pack_padded_seq: 70 | # Lengths 
data is wrapped inside a Tensor. 71 | lengths_list = lengths.view(-1).tolist() 72 | packed_emb = pack(emb, lengths_list) 73 | 74 | memory_bank, encoder_final = self.rnn(packed_emb) 75 | 76 | if lengths is not None and not self.no_pack_padded_seq: 77 | memory_bank = unpack(memory_bank)[0] 78 | 79 | if self.use_bridge: 80 | encoder_final = self._bridge(encoder_final) 81 | return encoder_final, memory_bank, lengths 82 | 83 | def _initialize_bridge(self, rnn_type, 84 | hidden_size, 85 | num_layers): 86 | 87 | # LSTM has hidden and cell state, other only one 88 | number_of_states = 2 if rnn_type == "LSTM" else 1 89 | # Total number of states 90 | self.total_hidden_dim = hidden_size * num_layers 91 | 92 | # Build a linear layer for each 93 | self.bridge = nn.ModuleList([nn.Linear(self.total_hidden_dim, 94 | self.total_hidden_dim, 95 | bias=True) 96 | for _ in range(number_of_states)]) 97 | 98 | def _bridge(self, hidden): 99 | """Forward hidden state through bridge.""" 100 | def bottle_hidden(linear, states): 101 | """ 102 | Transform from 3D to 2D, apply linear and return initial size 103 | """ 104 | size = states.size() 105 | result = linear(states.view(-1, self.total_hidden_dim)) 106 | return F.relu(result).view(size) 107 | 108 | if isinstance(hidden, tuple): # LSTM 109 | outs = tuple([bottle_hidden(layer, hidden[ix]) 110 | for ix, layer in enumerate(self.bridge)]) 111 | else: 112 | outs = bottle_hidden(self.bridge[0], hidden) 113 | return outs 114 | 115 | def update_dropout(self, dropout): 116 | self.rnn.dropout = dropout 117 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/inputters/MakeToken.py: -------------------------------------------------------------------------------- 1 | import MeCab 2 | import sentencepiece as spm 3 | 4 | 5 | def korean_token(datatxt): 6 | m = MeCab.Tagger() 7 | delete_tag = ['BOS/EOS', 'JKS', 'JKC', 'JKG', 'JKO', 'JKB', 'JKV', 'JKQ', 'JX', 'JC'] 8 | 9 | def del_post_pos(sentence): 10 | tokens = sentence.split() # 원본 문장 띄어쓰기로 분리 11 | 12 | dict_list = [] 13 | 14 | for token in tokens: # 띄어쓰기로 분리된 각 토큰 {'단어':'형태소 태그'} 와 같이 딕셔너리 생성 15 | m.parse('') 16 | node = m.parseToNode(token) 17 | word_list = [] 18 | morph_list = [] 19 | 20 | while node: 21 | morphs = node.feature.split(',') 22 | word_list.append(node.surface) 23 | morph_list.append(morphs[0]) 24 | node = node.next 25 | 26 | dict_list.append(dict(zip(word_list, morph_list))) 27 | 28 | for dic in dict_list: # delete_tag에 해당하는 단어 쌍 지우기 (조사에 해당하는 단어 지우기) 29 | for key in list(dic.keys()): 30 | if dic[key] in delete_tag: 31 | del dic[key] 32 | 33 | combine_word = [''.join(list(dic.keys())) for dic in dict_list] # 형태소로 분리된 각 단어 합치기 34 | result = ' '.join(combine_word) # 띄어쓰기로 분리된 각 토큰 합치기 35 | 36 | return result # 온전한 문장을 반환 37 | 38 | data = open(datatxt,'r', encoding='utf-8') 39 | 40 | with open("data/kor.txt", "w", encoding='utf-8') as f: 41 | for row in data: 42 | f.write(del_post_pos(row)) 43 | f.write('\n') 44 | 45 | spm.SentencePieceTrainer.Train( 46 | '--input=data/kor.txt \ 47 | --model_prefix=data/korean_tok \ 48 | --vocab_size=100000 \ 49 | --hard_vocab_limit=false' 50 | ) 51 | 52 | def english_token(datatxt): 53 | data = open(datatxt,'r', encoding='utf-8') 54 | 55 | with open("data/eng.txt", "w", encoding='utf-8') as f: 56 | for row in data: 57 | f.write(row) 58 | f.write('\n') 59 | 60 | spm.SentencePieceTrainer.Train( 61 | '--input=data/eng.txt \ 62 | --model_prefix=data/english_tok \ 63 | --vocab_size=100000 \ 64 | --hard_vocab_limit=false' 65 
| ) 66 | 67 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/inputters/__init__.py: -------------------------------------------------------------------------------- 1 | """Module defining inputters. 2 | 3 | Inputters implement the logic of transforming raw data to vectorized inputs, 4 | e.g., from a line of text to a sequence of embeddings. 5 | """ 6 | from onmt.inputters.text_dataset import text_sort_key, TextDataReader 7 | 8 | str2reader = {"text": TextDataReader} 9 | str2sortkey = {'text': text_sort_key} 10 | 11 | 12 | __all__ = ['Dataset', 'load_old_vocab', 'get_fields', 'DataReaderBase', 13 | 'filter_example', 'old_style_vocab', 14 | 'build_vocab', 'OrderedIterator', 15 | 'text_sort_key', 'TextDataReader'] 16 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/inputters/datareader_base.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | 4 | # several data readers need optional dependencies. There's no 5 | # appropriate builtin exception 6 | class MissingDependencyException(Exception): 7 | pass 8 | 9 | 10 | class DataReaderBase(object): 11 | """Read data from file system and yield as dicts. 12 | 13 | Raises: 14 | onmt.inputters.datareader_base.MissingDependencyException: A number 15 | of DataReaders need specific additional packages. 16 | If any are missing, this will be raised. 17 | """ 18 | 19 | @classmethod 20 | def from_opt(cls, opt): 21 | """Alternative constructor. 22 | 23 | Args: 24 | opt (argparse.Namespace): The parsed arguments. 25 | """ 26 | print("DataReaderBase from_opt") 27 | return cls() 28 | 29 | @classmethod 30 | def _read_file(cls, path): 31 | """Line-by-line read a file as bytes.""" 32 | print("DataReaderBase _read_file") 33 | with open(path, "rb") as f: 34 | for line in f: 35 | yield line 36 | 37 | @staticmethod 38 | def _raise_missing_dep(*missing_deps): 39 | """Raise missing dep exception with standard error message.""" 40 | raise MissingDependencyException( 41 | "Could not create reader. Be sure to install " 42 | "the following dependencies: " + ", ".join(missing_deps)) 43 | 44 | def read(self, data, side, src_dir): 45 | """Read data from file system and yield as dicts.""" 46 | raise NotImplementedError() 47 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/models/__init__.py: -------------------------------------------------------------------------------- 1 | """Module defining models.""" 2 | 3 | __all__ = ["build_model_saver", "ModelSaver", "NMTModel"] 4 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/models/model.py: -------------------------------------------------------------------------------- 1 | """ Onmt NMT Model base class definition """ 2 | import torch.nn as nn 3 | 4 | 5 | class NMTModel(nn.Module): 6 | """ 7 | Core trainable object in OpenNMT. Implements a trainable interface 8 | for a simple, generic encoder + decoder model. 
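    Training-time usage, as a sketch (shapes follow ``forward()`` below)::

        dec_out, attns = model(src, tgt, lengths)
        # dec_out: (tgt_len - 1, batch, hidden); the last target position is excluded
        # from the decoder input, so the outputs align with tgt[1:] for the loss.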
9 | 10 | Args: 11 | encoder (onmt.encoders.EncoderBase): an encoder object 12 | decoder (onmt.decoders.DecoderBase): a decoder object 13 | """ 14 | 15 | def __init__(self, encoder, decoder): 16 | super(NMTModel, self).__init__() 17 | print("NMTModel") 18 | self.encoder = encoder 19 | self.decoder = decoder 20 | 21 | def forward(self, src, tgt, lengths, bptt=False, with_align=False): 22 | """Forward propagate a `src` and `tgt` pair for training. 23 | Possible initialized with a beginning decoder state. 24 | 25 | Args: 26 | src (Tensor): A source sequence passed to encoder. 27 | typically for inputs this will be a padded `LongTensor` 28 | of size ``(len, batch, features)``. However, may be an 29 | image or other generic input depending on encoder. 30 | tgt (LongTensor): A target sequence passed to decoder. 31 | Size ``(tgt_len, batch, features)``. 32 | lengths(LongTensor): The src lengths, pre-padding ``(batch,)``. 33 | bptt (Boolean): A flag indicating if truncated bptt is set. 34 | If reset then init_state 35 | with_align (Boolean): A flag indicating whether output alignment, 36 | Only valid for transformer decoder. 37 | 38 | Returns: 39 | (FloatTensor, dict[str, FloatTensor]): 40 | 41 | * decoder output ``(tgt_len, batch, hidden)`` 42 | * dictionary attention dists of ``(tgt_len, batch, src_len)`` 43 | """ 44 | dec_in = tgt[:-1] # exclude last target from inputs 45 | 46 | enc_state, memory_bank, lengths = self.encoder(src, lengths) 47 | 48 | if bptt is False: 49 | self.decoder.init_state(src, memory_bank, enc_state) 50 | dec_out, attns = self.decoder(dec_in, memory_bank, 51 | memory_lengths=lengths, 52 | with_align=with_align) 53 | return dec_out, attns 54 | 55 | def update_dropout(self, dropout): 56 | self.encoder.update_dropout(dropout) 57 | self.decoder.update_dropout(dropout) 58 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/models/stacked_rnn.py: -------------------------------------------------------------------------------- 1 | """ Implementation of ONMT RNN for Input Feeding Decoding """ 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | class StackedLSTM(nn.Module): 7 | """ 8 | Our own implementation of stacked LSTM. 9 | Needed for the decoder, because we do input feeding. 10 | """ 11 | 12 | def __init__(self, num_layers, input_size, rnn_size, dropout): 13 | super(StackedLSTM, self).__init__() 14 | self.dropout = nn.Dropout(dropout) 15 | self.num_layers = num_layers 16 | self.layers = nn.ModuleList() 17 | 18 | for _ in range(num_layers): 19 | self.layers.append(nn.LSTMCell(input_size, rnn_size)) 20 | input_size = rnn_size 21 | 22 | def forward(self, input_feed, hidden): 23 | h_0, c_0 = hidden 24 | h_1, c_1 = [], [] 25 | for i, layer in enumerate(self.layers): 26 | h_1_i, c_1_i = layer(input_feed, (h_0[i], c_0[i])) 27 | input_feed = h_1_i 28 | if i + 1 != self.num_layers: 29 | input_feed = self.dropout(input_feed) 30 | h_1 += [h_1_i] 31 | c_1 += [c_1_i] 32 | 33 | h_1 = torch.stack(h_1) 34 | c_1 = torch.stack(c_1) 35 | 36 | return input_feed, (h_1, c_1) 37 | 38 | 39 | class StackedGRU(nn.Module): 40 | """ 41 | Our own implementation of stacked GRU. 42 | Needed for the decoder, because we do input feeding. 
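    "Input feeding" means the attentional output of the previous time step is
    concatenated to the current target embedding, so decoding is unrolled one step at a
    time with per-layer ``nn.GRUCell`` modules instead of a single fused multi-layer
    ``nn.GRU``. A per-step sketch with assumed sizes ``emb_dim`` and ``rnn_size``::

        rnn = StackedGRU(num_layers=2, input_size=emb_dim + rnn_size,
                         rnn_size=rnn_size, dropout=0.3)
        hidden = (torch.zeros(2, batch, rnn_size),)           # 1-tuple of stacked states
        out_t, hidden = rnn(torch.cat([emb_t, ctx_prev], 1), hidden)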
43 | """ 44 | 45 | def __init__(self, num_layers, input_size, rnn_size, dropout): 46 | super(StackedGRU, self).__init__() 47 | self.dropout = nn.Dropout(dropout) 48 | self.num_layers = num_layers 49 | self.layers = nn.ModuleList() 50 | 51 | for _ in range(num_layers): 52 | self.layers.append(nn.GRUCell(input_size, rnn_size)) 53 | input_size = rnn_size 54 | 55 | def forward(self, input_feed, hidden): 56 | h_1 = [] 57 | for i, layer in enumerate(self.layers): 58 | h_1_i = layer(input_feed, hidden[0][i]) 59 | input_feed = h_1_i 60 | if i + 1 != self.num_layers: 61 | input_feed = self.dropout(input_feed) 62 | h_1 += [h_1_i] 63 | 64 | h_1 = torch.stack(h_1) 65 | return input_feed, (h_1,) 66 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/modules/__init__.py: -------------------------------------------------------------------------------- 1 | """ Attention and normalization modules """ 2 | 3 | import onmt.modules.source_noise # noqa 4 | 5 | __all__ = ["Elementwise", "context_gate_factory", "ContextGate", 6 | "GlobalAttention", "ConvMultiStepAttention", "CopyGenerator", 7 | "CopyGeneratorLoss", "CopyGeneratorLossCompute", 8 | "MultiHeadedAttention", "Embeddings", "PositionalEncoding", 9 | "WeightNormConv2d", "AverageAttention"] 10 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/modules/average_attn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Average Attention module.""" 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from onmt.modules.position_ffn import PositionwiseFeedForward 8 | 9 | 10 | class AverageAttention(nn.Module): 11 | """ 12 | Average Attention module from 13 | "Accelerating Neural Transformer via an Average Attention Network" 14 | :cite:`DBLP:journals/corr/abs-1805-00631`. 
15 | 16 | Args: 17 | model_dim (int): the dimension of keys/values/queries, 18 | must be divisible by head_count 19 | dropout (float): dropout parameter 20 | """ 21 | 22 | def __init__(self, model_dim, dropout=0.1, aan_useffn=False): 23 | self.model_dim = model_dim 24 | self.aan_useffn = aan_useffn 25 | super(AverageAttention, self).__init__() 26 | if aan_useffn: 27 | self.average_layer = PositionwiseFeedForward(model_dim, model_dim, 28 | dropout) 29 | self.gating_layer = nn.Linear(model_dim * 2, model_dim * 2) 30 | 31 | def cumulative_average_mask(self, batch_size, inputs_len, device): 32 | """ 33 | Builds the mask to compute the cumulative average as described in 34 | :cite:`DBLP:journals/corr/abs-1805-00631` -- Figure 3 35 | 36 | Args: 37 | batch_size (int): batch size 38 | inputs_len (int): length of the inputs 39 | 40 | Returns: 41 | (FloatTensor): 42 | 43 | * A Tensor of shape ``(batch_size, input_len, input_len)`` 44 | """ 45 | 46 | triangle = torch.tril(torch.ones(inputs_len, inputs_len, 47 | dtype=torch.float, device=device)) 48 | weights = torch.ones(1, inputs_len, dtype=torch.float, device=device) \ 49 | / torch.arange(1, inputs_len + 1, dtype=torch.float, device=device) 50 | mask = triangle * weights.transpose(0, 1) 51 | 52 | return mask.unsqueeze(0).expand(batch_size, inputs_len, inputs_len) 53 | 54 | def cumulative_average(self, inputs, mask_or_step, 55 | layer_cache=None, step=None): 56 | """ 57 | Computes the cumulative average as described in 58 | :cite:`DBLP:journals/corr/abs-1805-00631` -- Equations (1) (5) (6) 59 | 60 | Args: 61 | inputs (FloatTensor): sequence to average 62 | ``(batch_size, input_len, dimension)`` 63 | mask_or_step: if cache is set, this is assumed 64 | to be the current step of the 65 | dynamic decoding. Otherwise, it is the mask matrix 66 | used to compute the cumulative average. 67 | layer_cache: a dictionary containing the cumulative average 68 | of the previous step. 69 | 70 | Returns: 71 | a tensor of the same shape and type as ``inputs``. 
72 | """ 73 | 74 | if layer_cache is not None: 75 | step = mask_or_step 76 | average_attention = (inputs + step * 77 | layer_cache["prev_g"]) / (step + 1) 78 | layer_cache["prev_g"] = average_attention 79 | return average_attention 80 | else: 81 | mask = mask_or_step 82 | return torch.matmul(mask.to(inputs.dtype), inputs) 83 | 84 | def forward(self, inputs, mask=None, layer_cache=None, step=None): 85 | """ 86 | Args: 87 | inputs (FloatTensor): ``(batch_size, input_len, model_dim)`` 88 | 89 | Returns: 90 | (FloatTensor, FloatTensor): 91 | 92 | * gating_outputs ``(batch_size, input_len, model_dim)`` 93 | * average_outputs average attention 94 | ``(batch_size, input_len, model_dim)`` 95 | """ 96 | 97 | batch_size = inputs.size(0) 98 | inputs_len = inputs.size(1) 99 | average_outputs = self.cumulative_average( 100 | inputs, self.cumulative_average_mask(batch_size, 101 | inputs_len, inputs.device) 102 | if layer_cache is None else step, layer_cache=layer_cache) 103 | if self.aan_useffn: 104 | average_outputs = self.average_layer(average_outputs) 105 | gating_outputs = self.gating_layer(torch.cat((inputs, 106 | average_outputs), -1)) 107 | input_gate, forget_gate = torch.chunk(gating_outputs, 2, dim=2) 108 | gating_outputs = torch.sigmoid(input_gate) * inputs + \ 109 | torch.sigmoid(forget_gate) * average_outputs 110 | 111 | return gating_outputs, average_outputs 112 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/modules/conv_multi_step_attention.py: -------------------------------------------------------------------------------- 1 | """ Multi Step Attention for CNN """ 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from onmt.utils.misc import aeq 6 | 7 | 8 | SCALE_WEIGHT = 0.5 ** 0.5 9 | 10 | 11 | def seq_linear(linear, x): 12 | """ linear transform for 3-d tensor """ 13 | batch, hidden_size, length, _ = x.size() 14 | h = linear(torch.transpose(x, 1, 2).contiguous().view( 15 | batch * length, hidden_size)) 16 | return torch.transpose(h.view(batch, length, hidden_size, 1), 1, 2) 17 | 18 | 19 | class ConvMultiStepAttention(nn.Module): 20 | """ 21 | Conv attention takes a key matrix, a value matrix and a query vector. 22 | Attention weight is calculated by key matrix with the query vector 23 | and sum on the value matrix. And the same operation is applied 24 | in each decode conv layer. 
25 | """ 26 | 27 | def __init__(self, input_size): 28 | super(ConvMultiStepAttention, self).__init__() 29 | self.linear_in = nn.Linear(input_size, input_size) 30 | self.mask = None 31 | 32 | def apply_mask(self, mask): 33 | """ Apply mask """ 34 | self.mask = mask 35 | 36 | def forward(self, base_target_emb, input_from_dec, encoder_out_top, 37 | encoder_out_combine): 38 | """ 39 | Args: 40 | base_target_emb: target emb tensor 41 | input_from_dec: output of decode conv 42 | encoder_out_top: the key matrix for calculation of attetion weight, 43 | which is the top output of encode conv 44 | encoder_out_combine: 45 | the value matrix for the attention-weighted sum, 46 | which is the combination of base emb and top output of encode 47 | """ 48 | 49 | # checks 50 | # batch, channel, height, width = base_target_emb.size() 51 | batch, _, height, _ = base_target_emb.size() 52 | # batch_, channel_, height_, width_ = input_from_dec.size() 53 | batch_, _, height_, _ = input_from_dec.size() 54 | aeq(batch, batch_) 55 | aeq(height, height_) 56 | 57 | # enc_batch, enc_channel, enc_height = encoder_out_top.size() 58 | enc_batch, _, enc_height = encoder_out_top.size() 59 | # enc_batch_, enc_channel_, enc_height_ = encoder_out_combine.size() 60 | enc_batch_, _, enc_height_ = encoder_out_combine.size() 61 | 62 | aeq(enc_batch, enc_batch_) 63 | aeq(enc_height, enc_height_) 64 | 65 | preatt = seq_linear(self.linear_in, input_from_dec) 66 | target = (base_target_emb + preatt) * SCALE_WEIGHT 67 | target = torch.squeeze(target, 3) 68 | target = torch.transpose(target, 1, 2) 69 | pre_attn = torch.bmm(target, encoder_out_top) 70 | 71 | if self.mask is not None: 72 | pre_attn.data.masked_fill_(self.mask, -float('inf')) 73 | 74 | attn = F.softmax(pre_attn, dim=2) 75 | 76 | context_output = torch.bmm( 77 | attn, torch.transpose(encoder_out_combine, 1, 2)) 78 | context_output = torch.transpose( 79 | torch.unsqueeze(context_output, 3), 1, 2) 80 | return context_output, attn 81 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/modules/gate.py: -------------------------------------------------------------------------------- 1 | """ ContextGate module """ 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | def context_gate_factory(gate_type, embeddings_size, decoder_size, 7 | attention_size, output_size): 8 | """Returns the correct ContextGate class""" 9 | 10 | gate_types = {'source': SourceContextGate, 11 | 'target': TargetContextGate, 12 | 'both': BothContextGate} 13 | 14 | assert gate_type in gate_types, "Not valid ContextGate type: {0}".format( 15 | gate_type) 16 | return gate_types[gate_type](embeddings_size, decoder_size, attention_size, 17 | output_size) 18 | 19 | 20 | class ContextGate(nn.Module): 21 | """ 22 | Context gate is a decoder module that takes as input the previous word 23 | embedding, the current decoder state and the attention state, and 24 | produces a gate. 25 | The gate can be used to select the input from the target side context 26 | (decoder state), from the source context (attention state) or both. 
27 | """ 28 | 29 | def __init__(self, embeddings_size, decoder_size, 30 | attention_size, output_size): 31 | super(ContextGate, self).__init__() 32 | input_size = embeddings_size + decoder_size + attention_size 33 | self.gate = nn.Linear(input_size, output_size, bias=True) 34 | self.sig = nn.Sigmoid() 35 | self.source_proj = nn.Linear(attention_size, output_size) 36 | self.target_proj = nn.Linear(embeddings_size + decoder_size, 37 | output_size) 38 | 39 | def forward(self, prev_emb, dec_state, attn_state): 40 | input_tensor = torch.cat((prev_emb, dec_state, attn_state), dim=1) 41 | z = self.sig(self.gate(input_tensor)) 42 | proj_source = self.source_proj(attn_state) 43 | proj_target = self.target_proj( 44 | torch.cat((prev_emb, dec_state), dim=1)) 45 | return z, proj_source, proj_target 46 | 47 | 48 | class SourceContextGate(nn.Module): 49 | """Apply the context gate only to the source context""" 50 | 51 | def __init__(self, embeddings_size, decoder_size, 52 | attention_size, output_size): 53 | super(SourceContextGate, self).__init__() 54 | self.context_gate = ContextGate(embeddings_size, decoder_size, 55 | attention_size, output_size) 56 | self.tanh = nn.Tanh() 57 | 58 | def forward(self, prev_emb, dec_state, attn_state): 59 | z, source, target = self.context_gate( 60 | prev_emb, dec_state, attn_state) 61 | return self.tanh(target + z * source) 62 | 63 | 64 | class TargetContextGate(nn.Module): 65 | """Apply the context gate only to the target context""" 66 | 67 | def __init__(self, embeddings_size, decoder_size, 68 | attention_size, output_size): 69 | super(TargetContextGate, self).__init__() 70 | self.context_gate = ContextGate(embeddings_size, decoder_size, 71 | attention_size, output_size) 72 | self.tanh = nn.Tanh() 73 | 74 | def forward(self, prev_emb, dec_state, attn_state): 75 | z, source, target = self.context_gate(prev_emb, dec_state, attn_state) 76 | return self.tanh(z * target + source) 77 | 78 | 79 | class BothContextGate(nn.Module): 80 | """Apply the context gate to both contexts""" 81 | 82 | def __init__(self, embeddings_size, decoder_size, 83 | attention_size, output_size): 84 | super(BothContextGate, self).__init__() 85 | self.context_gate = ContextGate(embeddings_size, decoder_size, 86 | attention_size, output_size) 87 | self.tanh = nn.Tanh() 88 | 89 | def forward(self, prev_emb, dec_state, attn_state): 90 | z, source, target = self.context_gate(prev_emb, dec_state, attn_state) 91 | return self.tanh((1. - z) * target + z * source) 92 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/modules/position_ffn.py: -------------------------------------------------------------------------------- 1 | """Position feed-forward network from "Attention is All You Need".""" 2 | 3 | import torch.nn as nn 4 | 5 | 6 | class PositionwiseFeedForward(nn.Module): 7 | """ A two-layer Feed-Forward-Network with residual layer norm. 8 | 9 | Args: 10 | d_model (int): the size of input for the first-layer of the FFN. 11 | d_ff (int): the hidden layer size of the second-layer 12 | of the FNN. 13 | dropout (float): dropout probability in :math:`[0, 1)`. 
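    With the pre-norm residual connection used in ``forward()`` below, the layer
    computes::

        FFN(x) = x + Dropout(W_2 @ Dropout(ReLU(W_1 @ LayerNorm(x))))

    where ``W_1`` maps ``d_model -> d_ff`` and ``W_2`` maps ``d_ff -> d_model``.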
14 | """ 15 | 16 | def __init__(self, d_model, d_ff, dropout=0.1): 17 | super(PositionwiseFeedForward, self).__init__() 18 | self.w_1 = nn.Linear(d_model, d_ff) 19 | self.w_2 = nn.Linear(d_ff, d_model) 20 | self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) 21 | self.dropout_1 = nn.Dropout(dropout) 22 | self.relu = nn.ReLU() 23 | self.dropout_2 = nn.Dropout(dropout) 24 | 25 | def forward(self, x): 26 | """Layer definition. 27 | 28 | Args: 29 | x: ``(batch_size, input_len, model_dim)`` 30 | 31 | Returns: 32 | (FloatTensor): Output ``(batch_size, input_len, model_dim)``. 33 | """ 34 | 35 | inter = self.dropout_1(self.relu(self.w_1(self.layer_norm(x)))) 36 | output = self.dropout_2(self.w_2(inter)) 37 | return output + x 38 | 39 | def update_dropout(self, dropout): 40 | self.dropout_1.p = dropout 41 | self.dropout_2.p = dropout 42 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/modules/sparse_activations.py: -------------------------------------------------------------------------------- 1 | """ 2 | An implementation of sparsemax (Martins & Astudillo, 2016). See 3 | :cite:`DBLP:journals/corr/MartinsA16` for detailed description. 4 | 5 | By Ben Peters and Vlad Niculae 6 | """ 7 | 8 | import torch 9 | import torch.nn as nn 10 | from torch.autograd import Function 11 | 12 | 13 | def _make_ix_like(input, dim=0): 14 | d = input.size(dim) 15 | rho = torch.arange(1, d + 1, device=input.device, dtype=input.dtype) 16 | view = [1] * input.dim() 17 | view[0] = -1 18 | return rho.view(view).transpose(0, dim) 19 | 20 | 21 | def _threshold_and_support(input, dim=0): 22 | """Sparsemax building block: compute the threshold 23 | 24 | Args: 25 | input: any dimension 26 | dim: dimension along which to apply the sparsemax 27 | 28 | Returns: 29 | the threshold value 30 | """ 31 | 32 | input_srt, _ = torch.sort(input, descending=True, dim=dim) 33 | input_cumsum = input_srt.cumsum(dim) - 1 34 | rhos = _make_ix_like(input, dim) 35 | support = rhos * input_srt > input_cumsum 36 | 37 | support_size = support.sum(dim=dim).unsqueeze(dim) 38 | tau = input_cumsum.gather(dim, support_size - 1) 39 | tau /= support_size.to(input.dtype) 40 | return tau, support_size 41 | 42 | 43 | class SparsemaxFunction(Function): 44 | 45 | @staticmethod 46 | def forward(ctx, input, dim=0): 47 | """sparsemax: normalizing sparse transform (a la softmax) 48 | 49 | Parameters: 50 | input (Tensor): any shape 51 | dim: dimension along which to apply sparsemax 52 | 53 | Returns: 54 | output (Tensor): same shape as input 55 | """ 56 | ctx.dim = dim 57 | max_val, _ = input.max(dim=dim, keepdim=True) 58 | input -= max_val # same numerical stability trick as for softmax 59 | tau, supp_size = _threshold_and_support(input, dim=dim) 60 | output = torch.clamp(input - tau, min=0) 61 | ctx.save_for_backward(supp_size, output) 62 | return output 63 | 64 | @staticmethod 65 | def backward(ctx, grad_output): 66 | supp_size, output = ctx.saved_tensors 67 | dim = ctx.dim 68 | grad_input = grad_output.clone() 69 | grad_input[output == 0] = 0 70 | 71 | v_hat = grad_input.sum(dim=dim) / supp_size.to(output.dtype).squeeze() 72 | v_hat = v_hat.unsqueeze(dim) 73 | grad_input = torch.where(output != 0, grad_input - v_hat, grad_input) 74 | return grad_input, None 75 | 76 | 77 | sparsemax = SparsemaxFunction.apply 78 | 79 | 80 | class Sparsemax(nn.Module): 81 | 82 | def __init__(self, dim=0): 83 | self.dim = dim 84 | super(Sparsemax, self).__init__() 85 | 86 | def forward(self, input): 87 | return 
sparsemax(input, self.dim) 88 | 89 | 90 | class LogSparsemax(nn.Module): 91 | 92 | def __init__(self, dim=0): 93 | self.dim = dim 94 | super(LogSparsemax, self).__init__() 95 | 96 | def forward(self, input): 97 | return torch.log(sparsemax(input, self.dim)) 98 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/modules/sparse_losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from onmt.modules.sparse_activations import _threshold_and_support 4 | from onmt.utils.misc import aeq 5 | from torch.autograd import Function 6 | 7 | 8 | class SparsemaxLossFunction(Function): 9 | 10 | @staticmethod 11 | def forward(ctx, input, target): 12 | """ 13 | input (FloatTensor): ``(n, num_classes)``. 14 | target (LongTensor): ``(n,)``, the indices of the target classes 15 | """ 16 | input_batch, classes = input.size() 17 | target_batch = target.size(0) 18 | aeq(input_batch, target_batch) 19 | 20 | z_k = input.gather(1, target.unsqueeze(1)).squeeze() 21 | tau_z, support_size = _threshold_and_support(input, dim=1) 22 | support = input > tau_z 23 | x = torch.where( 24 | support, input**2 - tau_z**2, 25 | torch.tensor(0.0, device=input.device) 26 | ).sum(dim=1) 27 | ctx.save_for_backward(input, target, tau_z) 28 | # clamping necessary because of numerical errors: loss should be lower 29 | # bounded by zero, but negative values near zero are possible without 30 | # the clamp 31 | return torch.clamp(x / 2 - z_k + 0.5, min=0.0) 32 | 33 | @staticmethod 34 | def backward(ctx, grad_output): 35 | input, target, tau_z = ctx.saved_tensors 36 | sparsemax_out = torch.clamp(input - tau_z, min=0) 37 | delta = torch.zeros_like(sparsemax_out) 38 | delta.scatter_(1, target.unsqueeze(1), 1) 39 | return sparsemax_out - delta, None 40 | 41 | 42 | sparsemax_loss = SparsemaxLossFunction.apply 43 | 44 | 45 | class SparsemaxLoss(nn.Module): 46 | """ 47 | An implementation of sparsemax loss, first proposed in 48 | :cite:`DBLP:journals/corr/MartinsA16`. If using 49 | a sparse output layer, it is not possible to use negative log likelihood 50 | because the loss is infinite in the case the target is assigned zero 51 | probability. Inputs to SparsemaxLoss are arbitrary dense real-valued 52 | vectors (like in nn.CrossEntropyLoss), not probability vectors (like in 53 | nn.NLLLoss). 
54 | """ 55 | 56 | def __init__(self, weight=None, ignore_index=-100, 57 | reduction='elementwise_mean'): 58 | assert reduction in ['elementwise_mean', 'sum', 'none'] 59 | self.reduction = reduction 60 | self.weight = weight 61 | self.ignore_index = ignore_index 62 | super(SparsemaxLoss, self).__init__() 63 | 64 | def forward(self, input, target): 65 | loss = sparsemax_loss(input, target) 66 | if self.ignore_index >= 0: 67 | ignored_positions = target == self.ignore_index 68 | size = float((target.size(0) - ignored_positions.sum()).item()) 69 | loss.masked_fill_(ignored_positions, 0.0) 70 | else: 71 | size = float(target.size(0)) 72 | if self.reduction == 'sum': 73 | loss = loss.sum() 74 | elif self.reduction == 'elementwise_mean': 75 | loss = loss.sum() / size 76 | return loss 77 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/modules/structured_attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.cuda 3 | import torch.nn as nn 4 | 5 | 6 | class MatrixTree(nn.Module): 7 | """Implementation of the matrix-tree theorem for computing marginals 8 | of non-projective dependency parsing. This attention layer is used 9 | in the paper "Learning Structured Text Representations" 10 | :cite:`DBLP:journals/corr/LiuL17d`. 11 | """ 12 | 13 | def __init__(self, eps=1e-5): 14 | self.eps = eps 15 | super(MatrixTree, self).__init__() 16 | 17 | def forward(self, input): 18 | laplacian = input.exp() + self.eps 19 | output = input.clone() 20 | for b in range(input.size(0)): 21 | lap = laplacian[b].masked_fill( 22 | torch.eye(input.size(1), device=input.device).ne(0), 0) 23 | lap = -lap + torch.diag(lap.sum(0)) 24 | # store roots on diagonal 25 | lap[0] = input[b].diag().exp() 26 | inv_laplacian = lap.inverse() 27 | 28 | factor = inv_laplacian.diag().unsqueeze(1)\ 29 | .expand_as(input[b]).transpose(0, 1) 30 | term1 = input[b].exp().mul(factor).clone() 31 | term2 = input[b].exp().mul(inv_laplacian.transpose(0, 1)).clone() 32 | term1[:, 0] = 0 33 | term2[0] = 0 34 | output[b] = term1 - term2 35 | roots_output = input[b].diag().exp().mul( 36 | inv_laplacian.transpose(0, 1)[0]) 37 | output[b] = output[b] + torch.diag(roots_output) 38 | return output 39 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/modules/util_class.py: -------------------------------------------------------------------------------- 1 | """ Misc classes """ 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | # At the moment this class is only used by embeddings.Embeddings look-up tables 7 | class Elementwise(nn.ModuleList): 8 | """ 9 | A simple network container. 10 | Parameters are a list of modules. 11 | Inputs are a 3d Tensor whose last dimension is the same length 12 | as the list. 13 | Outputs are the result of applying modules to inputs elementwise. 14 | An optional merge parameter allows the outputs to be reduced to a 15 | single Tensor. 
16 | """ 17 | 18 | def __init__(self, merge=None, *args): 19 | assert merge in [None, 'first', 'concat', 'sum', 'mlp'] 20 | self.merge = merge 21 | super(Elementwise, self).__init__(*args) 22 | 23 | def forward(self, inputs): 24 | inputs_ = [feat.squeeze(2) for feat in inputs.split(1, dim=2)] 25 | assert len(self) == len(inputs_) 26 | outputs = [f(x) for f, x in zip(self, inputs_)] 27 | if self.merge == 'first': 28 | return outputs[0] 29 | elif self.merge == 'concat' or self.merge == 'mlp': 30 | return torch.cat(outputs, 2) 31 | elif self.merge == 'sum': 32 | return sum(outputs) 33 | else: 34 | return outputs 35 | 36 | 37 | class Cast(nn.Module): 38 | """ 39 | Basic layer that casts its input to a specific data type. The same tensor 40 | is returned if the data type is already correct. 41 | """ 42 | 43 | def __init__(self, dtype): 44 | super(Cast, self).__init__() 45 | self._dtype = dtype 46 | 47 | def forward(self, x): 48 | return x.to(self._dtype) 49 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/translate/__init__.py: -------------------------------------------------------------------------------- 1 | """ Modules for translation """ 2 | from onmt.translate.greedy_search import GreedySearch 3 | 4 | __all__ = ['Translator', 'Translation', 'BeamSearch', 5 | 'GNMTGlobalScorer', 'TranslationBuilder', 6 | 'PenaltyBuilder', "DecodeStrategy", "GreedySearch"] 7 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/translate/penalties.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import torch 4 | 5 | 6 | class PenaltyBuilder(object): 7 | """Returns the Length and Coverage Penalty function for Beam Search. 8 | 9 | Args: 10 | length_pen (str): option name of length pen 11 | cov_pen (str): option name of cov pen 12 | 13 | Attributes: 14 | has_cov_pen (bool): Whether coverage penalty is None (applying it 15 | is a no-op). Note that the converse isn't true. Setting beta 16 | to 0 should force coverage length to be a no-op. 17 | has_len_pen (bool): Whether length penalty is None (applying it 18 | is a no-op). Note that the converse isn't true. Setting alpha 19 | to 1 should force length penalty to be a no-op. 20 | coverage_penalty (callable[[FloatTensor, float], FloatTensor]): 21 | Calculates the coverage penalty. 22 | length_penalty (callable[[int, float], float]): Calculates 23 | the length penalty. 
24 | """ 25 | 26 | def __init__(self, cov_pen, length_pen): 27 | self.has_cov_pen = not self._pen_is_none(cov_pen) 28 | self.coverage_penalty = self._coverage_penalty(cov_pen) 29 | self.has_len_pen = not self._pen_is_none(length_pen) 30 | self.length_penalty = self._length_penalty(length_pen) 31 | 32 | @staticmethod 33 | def _pen_is_none(pen): 34 | return pen == "none" or pen is None 35 | 36 | def _coverage_penalty(self, cov_pen): 37 | if cov_pen == "wu": 38 | return self.coverage_wu 39 | elif cov_pen == "summary": 40 | return self.coverage_summary 41 | elif self._pen_is_none(cov_pen): 42 | return self.coverage_none 43 | else: 44 | raise NotImplementedError("No '{:s}' coverage penalty.".format( 45 | cov_pen)) 46 | 47 | def _length_penalty(self, length_pen): 48 | if length_pen == "wu": 49 | return self.length_wu 50 | elif length_pen == "avg": 51 | return self.length_average 52 | elif self._pen_is_none(length_pen): 53 | return self.length_none 54 | else: 55 | raise NotImplementedError("No '{:s}' length penalty.".format( 56 | length_pen)) 57 | 58 | # Below are all the different penalty terms implemented so far. 59 | # Subtract coverage penalty from topk log probs. 60 | # Divide topk log probs by length penalty. 61 | 62 | def coverage_wu(self, cov, beta=0.): 63 | """GNMT coverage re-ranking score. 64 | 65 | See "Google's Neural Machine Translation System" :cite:`wu2016google`. 66 | ``cov`` is expected to be sized ``(*, seq_len)``, where ``*`` is 67 | probably ``batch_size x beam_size`` but could be several 68 | dimensions like ``(batch_size, beam_size)``. If ``cov`` is attention, 69 | then the ``seq_len`` axis probably sums to (almost) 1. 70 | """ 71 | 72 | penalty = -torch.min(cov, cov.clone().fill_(1.0)).log().sum(-1) 73 | return beta * penalty 74 | 75 | def coverage_summary(self, cov, beta=0.): 76 | """Our summary penalty.""" 77 | penalty = torch.max(cov, cov.clone().fill_(1.0)).sum(-1) 78 | penalty -= cov.size(-1) 79 | return beta * penalty 80 | 81 | def coverage_none(self, cov, beta=0.): 82 | """Returns zero as penalty""" 83 | none = torch.zeros((1,), device=cov.device, 84 | dtype=torch.float) 85 | if cov.dim() == 3: 86 | none = none.unsqueeze(0) 87 | return none 88 | 89 | def length_wu(self, cur_len, alpha=0.): 90 | """GNMT length re-ranking score. 91 | 92 | See "Google's Neural Machine Translation System" :cite:`wu2016google`. 
93 | """ 94 | 95 | return ((5 + cur_len) / 6.0) ** alpha 96 | 97 | def length_average(self, cur_len, alpha=0.): 98 | """Returns the current sequence length.""" 99 | return cur_len 100 | 101 | def length_none(self, cur_len, alpha=0.): 102 | """Returns unmodified scores.""" 103 | return 1.0 104 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """Module defining various utilities.""" 2 | 3 | __all__ = ["split_corpus", "aeq", "use_gpu", "set_random_seed", "ReportMgr", 4 | "build_report_manager", "Statistics", 5 | "MultipleOptimizer", "Optimizer", "AdaFactor", "EarlyStopping", 6 | "scorers_from_opts", "make_batch_align_matrix"] 7 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/utils/cnn_factory.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of "Convolutional Sequence to Sequence Learning" 3 | """ 4 | import onmt.modules 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as init 8 | 9 | SCALE_WEIGHT = 0.5 ** 0.5 10 | 11 | 12 | def shape_transform(x): 13 | """ Transform the size of the tensors to fit the conv input. """ 14 | return torch.unsqueeze(torch.transpose(x, 1, 2), 3) 15 | 16 | 17 | class GatedConv(nn.Module): 18 | """ Gated convolution for CNN class """ 19 | 20 | def __init__(self, input_size, width=3, dropout=0.2, nopad=False): 21 | super(GatedConv, self).__init__() 22 | self.conv = onmt.modules.WeightNormConv2d( 23 | input_size, 2 * input_size, kernel_size=(width, 1), stride=(1, 1), 24 | padding=(width // 2 * (1 - nopad), 0)) 25 | init.xavier_uniform_(self.conv.weight, gain=(4 * (1 - dropout))**0.5) 26 | self.dropout = nn.Dropout(dropout) 27 | 28 | def forward(self, x_var): 29 | x_var = self.dropout(x_var) 30 | x_var = self.conv(x_var) 31 | out, gate = x_var.split(int(x_var.size(1) / 2), 1) 32 | out = out * torch.sigmoid(gate) 33 | return out 34 | 35 | 36 | class StackedCNN(nn.Module): 37 | """ Stacked CNN class """ 38 | 39 | def __init__(self, num_layers, input_size, cnn_kernel_width=3, 40 | dropout=0.2): 41 | super(StackedCNN, self).__init__() 42 | self.dropout = dropout 43 | self.num_layers = num_layers 44 | self.layers = nn.ModuleList() 45 | for _ in range(num_layers): 46 | self.layers.append( 47 | GatedConv(input_size, cnn_kernel_width, dropout)) 48 | 49 | def forward(self, x): 50 | for conv in self.layers: 51 | x = x + conv(x) 52 | x *= SCALE_WEIGHT 53 | return x 54 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/utils/distributed.py: -------------------------------------------------------------------------------- 1 | """ Pytorch Distributed utils 2 | This piece of code was heavily inspired by the equivalent of Fairseq-py 3 | https://github.com/pytorch/fairseq 4 | """ 5 | 6 | 7 | from __future__ import print_function 8 | 9 | import math 10 | import pickle 11 | 12 | import torch.distributed 13 | from onmt.utils.logging import logger 14 | 15 | 16 | def is_master(opt, device_id): 17 | return opt.gpu_ranks[device_id] == 0 18 | 19 | 20 | def multi_init(opt, device_id): 21 | dist_init_method = 'tcp://{master_ip}:{master_port}'.format( 22 | master_ip=opt.master_ip, 23 | master_port=opt.master_port) 24 | dist_world_size = opt.world_size 25 | torch.distributed.init_process_group( 26 | backend=opt.gpu_backend,
init_method=dist_init_method, 27 | world_size=dist_world_size, rank=opt.gpu_ranks[device_id]) 28 | gpu_rank = torch.distributed.get_rank() 29 | if not is_master(opt, device_id): 30 | logger.disabled = True 31 | 32 | return gpu_rank 33 | 34 | 35 | def all_reduce_and_rescale_tensors(tensors, rescale_denom, 36 | buffer_size=10485760): 37 | """All-reduce and rescale tensors in chunks of the specified size. 38 | 39 | Args: 40 | tensors: list of Tensors to all-reduce 41 | rescale_denom: denominator for rescaling summed Tensors 42 | buffer_size: all-reduce chunk size in bytes 43 | """ 44 | # buffer size in bytes, determine equiv. # of elements based on data type 45 | buffer_t = tensors[0].new( 46 | math.ceil(buffer_size / tensors[0].element_size())).zero_() 47 | buffer = [] 48 | 49 | def all_reduce_buffer(): 50 | # copy tensors into buffer_t 51 | offset = 0 52 | for t in buffer: 53 | numel = t.numel() 54 | buffer_t[offset:offset+numel].copy_(t.view(-1)) 55 | offset += numel 56 | 57 | # all-reduce and rescale 58 | torch.distributed.all_reduce(buffer_t[:offset]) 59 | buffer_t.div_(rescale_denom) 60 | 61 | # copy all-reduced buffer back into tensors 62 | offset = 0 63 | for t in buffer: 64 | numel = t.numel() 65 | t.view(-1).copy_(buffer_t[offset:offset+numel]) 66 | offset += numel 67 | 68 | filled = 0 69 | for t in tensors: 70 | sz = t.numel() * t.element_size() 71 | if sz > buffer_size: 72 | # tensor is bigger than buffer, all-reduce and rescale directly 73 | torch.distributed.all_reduce(t) 74 | t.div_(rescale_denom) 75 | elif filled + sz > buffer_size: 76 | # buffer is full, all-reduce and replace buffer with grad 77 | all_reduce_buffer() 78 | buffer = [t] 79 | filled = sz 80 | else: 81 | # add tensor to buffer 82 | buffer.append(t) 83 | filled += sz 84 | 85 | if len(buffer) > 0: 86 | all_reduce_buffer() 87 | 88 | 89 | def all_gather_list(data, max_size=4096): 90 | """Gathers arbitrary data from all nodes into a list.""" 91 | world_size = torch.distributed.get_world_size() 92 | if not hasattr(all_gather_list, '_in_buffer') or \ 93 | max_size != all_gather_list._in_buffer.size(): 94 | all_gather_list._in_buffer = torch.cuda.ByteTensor(max_size) 95 | all_gather_list._out_buffers = [ 96 | torch.cuda.ByteTensor(max_size) 97 | for i in range(world_size) 98 | ] 99 | in_buffer = all_gather_list._in_buffer 100 | out_buffers = all_gather_list._out_buffers 101 | 102 | enc = pickle.dumps(data) 103 | enc_size = len(enc) 104 | if enc_size + 2 > max_size: 105 | raise ValueError( 106 | 'encoded data exceeds max_size: {}'.format(enc_size + 2)) 107 | assert max_size < 255*256 108 | in_buffer[0] = enc_size // 255 # this encoding works for max_size < 65k 109 | in_buffer[1] = enc_size % 255 110 | in_buffer[2:enc_size+2] = torch.ByteTensor(list(enc)) 111 | 112 | torch.distributed.all_gather(out_buffers, in_buffer.cuda()) 113 | 114 | results = [] 115 | for i in range(world_size): 116 | out_buffer = out_buffers[i] 117 | size = (255 * out_buffer[0].item()) + out_buffer[1].item() 118 | 119 | bytes_list = bytes(out_buffer[2:size+2].tolist()) 120 | result = pickle.loads(bytes_list) 121 | results.append(result) 122 | return results 123 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/utils/logging.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | 4 | import logging 5 | from logging.handlers import RotatingFileHandler 6 | 7 | logger = logging.getLogger() 
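# NOTE (added comment): this module-level ``logger`` is the shared root logger imported
# across onmt (e.g. in onmt/utils/distributed.py); ``init_logger`` below configures it by
# setting the level to INFO and attaching a console handler and, optionally, a plain or
# rotating file handler.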
8 | 9 | 10 | def init_logger(log_file=None, log_file_level=logging.NOTSET, rotate=False): 11 | log_format = logging.Formatter("[%(asctime)s %(levelname)s] %(message)s") 12 | logger = logging.getLogger() 13 | logger.setLevel(logging.INFO) 14 | 15 | console_handler = logging.StreamHandler() 16 | console_handler.setFormatter(log_format) 17 | logger.handlers = [console_handler] 18 | 19 | if log_file and log_file != '': 20 | if rotate: 21 | file_handler = RotatingFileHandler( 22 | log_file, maxBytes=1000000, backupCount=10) 23 | else: 24 | file_handler = logging.FileHandler(log_file) 25 | file_handler.setLevel(log_file_level) 26 | file_handler.setFormatter(log_format) 27 | logger.addHandler(file_handler) 28 | 29 | return logger 30 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/utils/rnn_factory.py: -------------------------------------------------------------------------------- 1 | """ 2 | RNN tools 3 | """ 4 | import onmt.models 5 | import torch.nn as nn 6 | 7 | 8 | def rnn_factory(rnn_type, **kwargs): 9 | """ RNN factory; use the PyTorch version when available. """ 10 | no_pack_padded_seq = False 11 | if rnn_type == "SRU": 12 | # SRU doesn't support PackedSequence. 13 | no_pack_padded_seq = True 14 | rnn = onmt.models.sru.SRU(**kwargs) 15 | else: 16 | rnn = getattr(nn, rnn_type)(**kwargs) 17 | return rnn, no_pack_padded_seq 18 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from os import path 3 | 4 | from setuptools import setup 5 | 6 | this_directory = path.abspath(path.dirname(__file__)) 7 | setup( 8 | install_requires=[ 9 | "six", 10 | "tqdm~=4.30.0", 11 | "torch>=1.4.0", 12 | "torchtext==0.4.0", 13 | "future", 14 | "configargparse", 15 | "tensorboard>=1.14", 16 | "pyonmttok==1.*;platform_system=='Linux'", 17 | "pyyaml", 18 | "sentencepiece", 19 | "MeCab" 20 | ], 21 | ) 22 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/translate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from onmt.bin.translate import main 3 | 4 | 5 | 6 | def translate(input_text): 7 | ''' 8 | :param input_text: raw text entered in the browser front-end text box 9 | :return: processed_text: processed text that can be displayed directly on the front end 10 | ''' 11 | 12 | def writetxt(path, text): 13 | with open(path, 'w', encoding='utf8') as f: 14 | f.write(str(text)) 15 | f.close() 16 | 17 | def readtxt(path): 18 | with open(path, 'r', encoding='utf8') as f: 19 | lines = f.readlines() 20 | lines = lines[0].replace('\n','') 21 | return lines 22 | 23 | writetxt('./data/src-test.txt', input_text) 24 | main() 25 | processed_text = readtxt('./data/pred.txt') 26 | 27 | return processed_text 28 | 29 | 30 | 31 | if __name__ == "__main__": 32 | input_text = '虽然 , 每 至 于 族 , 吾 见 其 难为 , 怵然 为戒 , 视 为 止 , 行 为迟 。' 33 | output = translate(input_text) 34 | print(output) -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/第九章 数字人文下的机器翻译.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第9章 数字人文下的机器翻译/第九章 数字人文下的机器翻译.pdf --------------------------------------------------------------------------------
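A note on the `translate()` wrapper in opennmt/translate.py above: it only moves text in and out of `data/src-test.txt` and `data/pred.txt`; the model and decoding options are read by `onmt.bin.translate.main()` from the command line, not from the function arguments. The sketch below is not part of the repository; it is a minimal driver assuming a trained checkpoint (the filename is a placeholder) and the standard `-model`/`-src`/`-output` translate options declared in onmt/opts.py. Run it from the opennmt directory so that the relative `./data` paths inside the wrapper resolve.

```python
# Hypothetical driver for opennmt/translate.py; checkpoint name is a placeholder.
import sys
from translate import translate

# onmt.bin.translate.main() parses sys.argv, so inject the decoding options here.
sys.argv = [
    "translate.py",
    "-model", "model/demo_step_10000.pt",  # placeholder: path to a trained checkpoint
    "-src", "data/src-test.txt",           # written by translate() before main() runs
    "-output", "data/pred.txt",            # read back by translate() after main() returns
]

print(translate("虽然 , 每 至 于 族 , 吾 见 其 难为 , 怵然 为戒 , 视 为 止 , 行 为迟 。"))
```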