├── README.md ├── 第1章 数字人文下的汉字处理 ├── README.md ├── 第一章 数字人文下的汉字处理.pdf └── 篆体字识别 │ ├── ALL-Font │ ├── 370-B-v2-1.ttf │ ├── BaiZhouZhuanShuJiaoHan-1.ttf │ ├── ChaoShiJiXiJiaoZhuanTiFan-1.ttf │ ├── ChaoShiJiXiYinZhuanTiFan-1.ttf │ ├── FZTJLSK.TTF │ ├── FangYuanYinZhangZhuanTi-2.ttf │ ├── HOT-HTenshoStd-L-2.otf │ └── hktenkokk-1.ttf │ ├── DeleteBlankIMG.py │ ├── TTF.py │ ├── TTF2IMG.py │ ├── VGG16-train.py │ └── vgg16.py ├── 第2章 数字人文下的汉语分词 ├── README.md └── 第二章 数字人文下的汉字分词.pdf ├── 第3章 数字人文下的词性自动标注 ├── Bi-LSTM-CRF │ ├── LSTM_CRF │ │ ├── data │ │ │ ├── test_data │ │ │ ├── train_data │ │ │ └── word2id.pkl │ │ ├── lstm_crf_data_helper.py │ │ ├── lstm_crf_main.py │ │ └── lstm_crf_model.py │ ├── README.md │ └── env4BiLSTM-CRF.txt ├── CRF │ ├── conlleval.py │ ├── crf_learn.exe │ ├── crf_test.exe │ ├── data │ │ ├── test.txt │ │ └── train.txt │ ├── libcrfpp.dll │ ├── readme.md │ └── template ├── HMM │ ├── HMM.py │ └── data │ │ ├── corpus.txt │ │ ├── output.txt │ │ └── test.txt ├── README.md └── 第三章 数字人文下的词性自动标注.pdf ├── 第4章 数字人文下的实体识别 ├── BERT-NER-pytorch_sample │ ├── LICENSE │ ├── README.md │ ├── bert.py │ ├── conlleval.py │ ├── printmodel.py │ ├── requirements.txt │ ├── run.pid │ ├── run.sh │ ├── run_ner.py │ ├── run_test.sh │ ├── settings.py │ └── train_data_cixing │ │ ├── test.txt │ │ ├── train.txt │ │ └── valid.txt ├── ChinsesNER-pytorch-master │ ├── .gitignore │ ├── README.md │ ├── data │ │ ├── dev │ │ ├── train │ │ └── trainset │ ├── data_manager.py │ ├── main.py │ ├── model.py │ ├── models │ │ ├── config.yml │ │ ├── data.pkl │ │ └── params.pkl │ └── utils.py ├── README.md ├── 数据预处理 │ ├── data │ │ └── filename.txt │ ├── data_charseq │ │ └── filename.txt │ ├── data_seq │ │ └── filename.txt │ ├── pro_ner.py │ └── train_test_divide.py └── 第四章 数字人文下的命名实体识别.pdf ├── 第5章 数字人文下的模型预训练 ├── README.md ├── pytorch_chinese_lm_pretrain │ ├── README.md │ ├── data │ │ ├── eval.txt │ │ └── train.txt │ ├── output │ │ └── README.md │ ├── run_bert.sh │ ├── run_bert_from_scratch.sh │ ├── run_language_model_bert.py │ ├── run_language_model_ernie.py │ ├── run_language_model_roberta.py │ └── run_roberta.sh ├── transfer.py └── 第五章 数字人文下的模型预训练.pdf ├── 第6章 数字人文下的知识图谱构建及应用 ├── FLASK │ ├── .idea │ │ ├── .gitignore │ │ ├── .name │ │ ├── falsk_test.iml │ │ ├── inspectionProfiles │ │ │ └── profiles_settings.xml │ │ ├── misc.xml │ │ ├── modules.xml │ │ └── vcs.xml │ ├── FindSim.py │ ├── Params.py │ ├── app.py │ ├── data │ │ ├── char_vocabs.txt │ │ └── stopword.txt │ ├── entity_extractor.py │ ├── https │ │ ├── default │ │ ├── https.conf │ │ └── nginx.conf │ ├── kbqa.py │ ├── model │ │ ├── NB.m │ │ ├── ch_ner_model.h5 │ │ └── tf.pkl │ ├── predict.py │ ├── search_answer.py │ ├── static │ │ ├── 0.png │ │ └── default.css │ └── templates │ │ ├── 0.png │ │ ├── result.html │ │ └── search.html └── README.md ├── 第7章 数字人文下的文本分类 ├── README.md ├── 循环神经网络_文本分类.py ├── 第七章 数字人文下的文本分类.pdf └── 非遗信息 全.xlsx ├── 第8章 数字人文下的文本聚类 ├── README.md ├── code │ ├── cluster │ │ ├── DBSCAN.py │ │ ├── H_C.py │ │ ├── Hierarchy_C.py │ │ ├── kmeans-all.py │ │ ├── kmeans.py │ │ ├── kmeans(余弦相似度) │ │ │ ├── __init__.py │ │ │ ├── basealgorithm.py │ │ │ ├── basefunction.py │ │ │ └── kmeans-cos.py │ │ ├── mean-shift.py │ │ ├── onehot.py │ │ └── pca.py │ ├── ex_entity.py │ ├── ex_key │ │ ├── basextract.py │ │ └── extraction-keywords.py │ ├── some_deal │ │ ├── data.py │ │ ├── divide.py │ │ ├── keywords.py │ │ ├── porpotion.py │ │ ├── shufa.py │ │ ├── test.py │ │ └── title_info.py │ ├── tsne_plot │ │ ├── 3D+tsne2维画图.py │ │ ├── heatmap.py │ │ ├── tsne.py │ │ └── ttt.py │ ├── 
vsm.py │ ├── word2vec │ │ ├── doc2vec.py │ │ └── word2vector.py │ └── 分词 │ │ └── 中科院分词.py ├── 第八章 数字人文下的文本聚类.pdf └── 非遗信息 全.xlsx └── 第9章 数字人文下的机器翻译 ├── README.md ├── opennmt ├── README.md ├── data │ ├── pred.txt │ ├── src-test.txt │ └── tgt-test.txt ├── model │ └── 说明.txt ├── onmt │ ├── .idea │ │ ├── misc.xml │ │ ├── modules.xml │ │ ├── onmt.iml │ │ ├── vcs.xml │ │ └── workspace.xml │ ├── __init__.py │ ├── bin │ │ ├── __init__.py │ │ ├── preprocess.py │ │ ├── train.py │ │ └── translate.py │ ├── decoders │ │ ├── __init__.py │ │ ├── cnn_decoder.py │ │ ├── decoder.py │ │ ├── ensemble.py │ │ └── transformer.py │ ├── encoders │ │ ├── __init__.py │ │ ├── cnn_encoder.py │ │ ├── encoder.py │ │ ├── ggnn_encoder.py │ │ ├── mean_encoder.py │ │ ├── rnn_encoder.py │ │ └── transformer.py │ ├── inputters │ │ ├── MakeToken.py │ │ ├── __init__.py │ │ ├── datareader_base.py │ │ ├── dataset_base.py │ │ ├── inputter.py │ │ └── text_dataset.py │ ├── model_builder.py │ ├── models │ │ ├── __init__.py │ │ ├── model.py │ │ ├── model_saver.py │ │ ├── sru.py │ │ └── stacked_rnn.py │ ├── modules │ │ ├── __init__.py │ │ ├── average_attn.py │ │ ├── conv_multi_step_attention.py │ │ ├── copy_generator.py │ │ ├── embeddings.py │ │ ├── gate.py │ │ ├── global_attention.py │ │ ├── multi_headed_attn.py │ │ ├── position_ffn.py │ │ ├── source_noise.py │ │ ├── sparse_activations.py │ │ ├── sparse_losses.py │ │ ├── structured_attention.py │ │ ├── util_class.py │ │ └── weight_norm.py │ ├── opts.py │ ├── train_single.py │ ├── trainer.py │ ├── translate │ │ ├── __init__.py │ │ ├── beam_search.py │ │ ├── decode_strategy.py │ │ ├── greedy_search.py │ │ ├── penalties.py │ │ ├── translation.py │ │ └── translator.py │ └── utils │ │ ├── __init__.py │ │ ├── alignment.py │ │ ├── cnn_factory.py │ │ ├── distributed.py │ │ ├── earlystopping.py │ │ ├── logging.py │ │ ├── loss.py │ │ ├── misc.py │ │ ├── optimizers.py │ │ ├── parse.py │ │ ├── report_manager.py │ │ ├── rnn_factory.py │ │ └── statistics.py ├── setup.py └── translate.py └── 第九章 数字人文下的机器翻译.pdf /README.md: -------------------------------------------------------------------------------- 1 | # Resources for Digital Humanities Tutorial 2 | 3 | 《数字人文教程》配套资源合集 4 | 5 | 内含: 6 | 7 | - 代码 8 | - ppt 9 | -------------------------------------------------------------------------------- /第1章 数字人文下的汉字处理/README.md: -------------------------------------------------------------------------------- 1 | 2 | # 此处内容为本书第一章篆体字自动识别项目源代码、PPT。 3 | 4 | 运行环境: 5 | 6 | Python3.7 7 | 8 | tensorflow=2.3 9 | 10 | pillow=8.2 11 | 12 | fonttools=4.24.4 13 | 14 | -------------------------------------------------------------------------------- /第1章 数字人文下的汉字处理/第一章 数字人文下的汉字处理.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第1章 数字人文下的汉字处理/第一章 数字人文下的汉字处理.pdf -------------------------------------------------------------------------------- /第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/370-B-v2-1.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/370-B-v2-1.ttf -------------------------------------------------------------------------------- /第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/BaiZhouZhuanShuJiaoHan-1.ttf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/BaiZhouZhuanShuJiaoHan-1.ttf -------------------------------------------------------------------------------- /第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/ChaoShiJiXiJiaoZhuanTiFan-1.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/ChaoShiJiXiJiaoZhuanTiFan-1.ttf -------------------------------------------------------------------------------- /第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/ChaoShiJiXiYinZhuanTiFan-1.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/ChaoShiJiXiYinZhuanTiFan-1.ttf -------------------------------------------------------------------------------- /第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/FZTJLSK.TTF: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/FZTJLSK.TTF -------------------------------------------------------------------------------- /第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/FangYuanYinZhangZhuanTi-2.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/FangYuanYinZhangZhuanTi-2.ttf -------------------------------------------------------------------------------- /第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/HOT-HTenshoStd-L-2.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/HOT-HTenshoStd-L-2.otf -------------------------------------------------------------------------------- /第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/hktenkokk-1.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第1章 数字人文下的汉字处理/篆体字识别/ALL-Font/hktenkokk-1.ttf -------------------------------------------------------------------------------- /第1章 数字人文下的汉字处理/篆体字识别/DeleteBlankIMG.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | from tqdm import tqdm 6 | 7 | 8 | def clean_blank(): 9 | # 解决全白图片的方案: 10 | blank_img_array = [255] * np.ones((30, 30, 3)) # 创建一个空白图片矩阵 11 | img_dir = "data" # 设置待清除空白图片的文件夹路径 12 | for each_cls in tqdm(os.listdir(img_dir), desc='正在清除空白图片'): 13 | dir_path = os.path.join(img_dir, each_cls) 14 | for each_img in os.listdir(dir_path): 15 | image_path = os.path.join(dir_path, each_img) 16 | # 加载图片,并将图片转成ndarray类型 17 | img_array = tf.keras.preprocessing.image.img_to_array(tf.keras.preprocessing.image.load_img(image_path)) 18 | # 判断每张图片是否是空白图片,若是,则删除。 19 | if (blank_img_array == img_array).all(): 20 | os.remove(image_path) 21 | else: 22 | continue 23 | 24 | 25 | if __name__ == '__main__': 26 | clean_blank() 27 | 
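Note: the chapter-1 pipeline renders seal-script images from the fonts in ALL-Font (TTF.py / TTF2IMG.py), removes blank renderings (DeleteBlankIMG.py above), and then trains a classifier with VGG16-train.py / vgg16.py, whose sources are not reproduced in this listing. The sketch below is only a minimal illustration of that last step under the TensorFlow 2.3 environment named in the chapter README; the "data" directory name, the 32×32 resize (tf.keras.applications.VGG16 requires inputs of at least 32×32, while the generated images are 30×30), the batch size, and the epoch count are assumptions, not the repository's actual training settings.

```python
# Hypothetical sketch only -- not the repository's VGG16-train.py / vgg16.py.
# Assumes the images written by TTF2IMG.py and cleaned by DeleteBlankIMG.py
# sit under "data/<character>/<n>.png" and are resized to 32x32, because
# tf.keras.applications.VGG16 requires spatial inputs of at least 32x32.
import tensorflow as tf

IMG_SIZE = (32, 32)   # assumed resize target; the generated images are 30x30
BATCH_SIZE = 64       # assumed

# Build labelled datasets from the class-per-folder layout produced by TTF2IMG.py.
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    "data", image_size=IMG_SIZE, batch_size=BATCH_SIZE,
    validation_split=0.1, subset="training", seed=42)
val_ds = tf.keras.preprocessing.image_dataset_from_directory(
    "data", image_size=IMG_SIZE, batch_size=BATCH_SIZE,
    validation_split=0.1, subset="validation", seed=42)
num_classes = len(train_ds.class_names)

# VGG16 trained from scratch (weights=None) with a small classification head.
base = tf.keras.applications.VGG16(weights=None, include_top=False,
                                   input_shape=IMG_SIZE + (3,))
model = tf.keras.Sequential([
    tf.keras.layers.experimental.preprocessing.Rescaling(1.0 / 255),
    base,
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(num_classes, activation="softmax"),
])
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])
model.fit(train_ds, validation_data=val_ds, epochs=10)  # epoch count assumed
```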
-------------------------------------------------------------------------------- /第1章 数字人文下的汉字处理/篆体字识别/TTF.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | from PIL import Image, ImageFont, ImageDraw 5 | from fontTools.ttLib import TTFont 6 | 7 | ''' 8 | 字体转图片 9 | ''' 10 | 11 | 12 | def char_to_img(all_chara, img_dir, uniMap, font, img_size): 13 | i = 0 14 | for chara in all_chara: 15 | # 判断是否存在该字 16 | if ord(chara) in uniMap: 17 | # 新建长宽为300像素,背景色为白色的画布对象 18 | im = Image.new("RGB", (img_size, img_size), "white") 19 | draw = ImageDraw.Draw(im) 20 | # 从画布的坐标(0, 0)处绘制黑色汉字文本 21 | draw.text((0, 0), chara, fill="#000", font=font) 22 | # 获取图像中非零区域边界并裁剪 23 | im = im.crop(im.getbbox()) 24 | # 保存汉字图像 25 | if not os.path.exists(img_dir + "/" + chara + "/"): 26 | os.mkdir(img_dir + "/" + chara + "/") 27 | save_path = img_dir + "/" + chara + "/" + str(len(os.listdir(img_dir + "/" + chara + "/"))) + ".png" 28 | im.save(save_path) 29 | 30 | 31 | if __name__ == '__main__': 32 | start = time.clock() 33 | TTF_DIR = "ALL-Font" # 存放.tff字体文件夹 34 | img_dir = "data" # 生成图片存储路径 35 | img_size = 30 # 生成图片存储路径 36 | 37 | # 判断是否存在文件夹,若否,则创建 38 | if not os.path.exists(img_dir): 39 | os.makedirs(img_dir) 40 | 41 | # 选取需要保存的汉字 42 | all_chara = [chr(i) for i in range(19968,26000)] 43 | 44 | # 遍历每一个.tff字体文件 45 | for each_font in os.listdir(TTF_DIR): 46 | ttf_path = TTF_DIR + "/" + each_font 47 | print("********" + ttf_path + "*****************") 48 | # 创建int型unicode编码与字符映射表 49 | fontmap = TTFont(ttf_path) 50 | uniMap = fontmap['cmap'].tables[0].ttFont.getBestCmap() 51 | # 加载并创建指定大小的字体对象 52 | font = ImageFont.truetype(ttf_path, img_size) 53 | char_to_img(all_chara, img_dir, uniMap, font, img_size) 54 | print('time spent: {}'.format(time.clock()-start)) -------------------------------------------------------------------------------- /第1章 数字人文下的汉字处理/篆体字识别/TTF2IMG.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | from PIL import Image, ImageFont, ImageDraw 5 | from fontTools.ttLib import TTFont 6 | 7 | ''' 8 | 字体转图片 9 | ''' 10 | 11 | 12 | def char_to_img(all_chara, img_dir, uniMap, font, img_size): 13 | i = 0 14 | for chara in all_chara: 15 | # 判断是否存在该字 16 | if ord(chara) in uniMap: 17 | # 新建长宽为300像素,背景色为白色的画布对象 18 | im = Image.new("RGB", (img_size, img_size), "white") 19 | draw = ImageDraw.Draw(im) 20 | # 从画布的坐标(0, 0)处绘制黑色汉字文本 21 | draw.text((0, 0), chara, fill="#000", font=font) 22 | # 获取图像中非零区域边界并裁剪 23 | im = im.crop(im.getbbox()) 24 | # 保存汉字图像 25 | if not os.path.exists(img_dir + "/" + chara + "/"): 26 | os.mkdir(img_dir + "/" + chara + "/") 27 | save_path = img_dir + "/" + chara + "/" + str(len(os.listdir(img_dir + "/" + chara + "/"))) + ".png" 28 | im.save(save_path) 29 | 30 | 31 | if __name__ == '__main__': 32 | start = time.clock() 33 | TTF_DIR = "ALL-Font" # 存放.tff字体文件夹 34 | img_dir = "data" # 生成图片存储路径 35 | img_size = 30 # 生成图片存储路径 36 | 37 | # 判断是否存在文件夹,若否,则创建 38 | if not os.path.exists(img_dir): 39 | os.makedirs(img_dir) 40 | 41 | # 选取需要保存的汉字 42 | all_chara = [chr(i) for i in range(19968,26000)] 43 | 44 | # 遍历每一个.tff字体文件 45 | for each_font in os.listdir(TTF_DIR): 46 | ttf_path = TTF_DIR + "/" + each_font 47 | print("********" + ttf_path + "*****************") 48 | # 创建int型unicode编码与字符映射表 49 | fontmap = TTFont(ttf_path) 50 | uniMap = fontmap['cmap'].tables[0].ttFont.getBestCmap() 51 | # 加载并创建指定大小的字体对象 52 | font = ImageFont.truetype(ttf_path, img_size) 53 | 
char_to_img(all_chara, img_dir, uniMap, font, img_size) 54 | print('time spent: {}'.format(time.clock()-start)) -------------------------------------------------------------------------------- /第2章 数字人文下的汉语分词/README.md: -------------------------------------------------------------------------------- 1 | ## 非物质文化遗产自动分词系统 2 | 此项目文件夹提供的是“第二章:数字人文下的汉语分词”中非物质文化遗产自动分词系统的源代码,系统功能及使用方法可参见教材第二章。 3 | ### 源码下载 4 | 由于源文件数量较多,因此请从百度云盘下载源代码: 5 | >下载链接:https://pan.baidu.com/s/14RAwSzgTBDI3asVUdASLGQ 6 | > 提取码:rqf0 7 | 8 | - 下载的源代码包含两部分: 9 | - (1)系统源码 10 | - (2)编译后的可执行文件 11 | ### 使用方式 12 | - #### 运行编译后的可执行文件(推荐): 13 | 首先进入下载的文件夹,再进入`dist(打包完成的)`,再进入`ICHAutoWordSegGUI`,找到并运行`ICHAutoWordSegGUI.exe`文件即可。 14 | - #### 运行系统源码(进阶): 15 | 1. 在conda中配置PyQT运行环境:本项目依赖的环境见项目文件夹中的`environment.yml`,配置方式请参考参考在“在Pycharm配置QtDesigner和PyUIC”的有关教程。 16 | 2. 在下载的文件夹中,找到并运行`ICHAutoWordSegGUI.py`模块即可。 -------------------------------------------------------------------------------- /第2章 数字人文下的汉语分词/第二章 数字人文下的汉字分词.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第2章 数字人文下的汉语分词/第二章 数字人文下的汉字分词.pdf -------------------------------------------------------------------------------- /第3章 数字人文下的词性自动标注/Bi-LSTM-CRF/LSTM_CRF/data/word2id.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第3章 数字人文下的词性自动标注/Bi-LSTM-CRF/LSTM_CRF/data/word2id.pkl -------------------------------------------------------------------------------- /第3章 数字人文下的词性自动标注/Bi-LSTM-CRF/LSTM_CRF/lstm_crf_data_helper.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | import pickle as pickle 3 | 4 | import numpy as np 5 | 6 | 7 | # 用于字标注向索引的转换,因为tensorflow中的crf接收的数据是数字形式的索引 8 | def tags2id(tags_list, tag2label): 9 | ''' 10 | :param tags_list: 11 | :param tag2label:字标注向索引映射的字典 12 | :return: 13 | ''' 14 | tags_label_list = [] 15 | for tags in tags_list: 16 | tags_label = [] 17 | for tag in tags: 18 | tags_label.append(tag2label[tag]) 19 | tags_label_list.append(tags_label) 20 | print('final tags2id') 21 | return tags_label_list 22 | 23 | 24 | # 用于获得数据 25 | def get_data(file_location): 26 | ''' 27 | :param file_location: 文件的存放路径,注意文件中的字和字标注是空格隔开的 28 | :return: 两个大的List,前面的list里面一个个小list存放的是一个个句子,后面的list里面一个个小list存放的是一个个句子对应的标签 29 | ''' 30 | sentences_list = [] 31 | tags_list = [] 32 | with open(file_location, 'r', encoding='utf-8') as fr: 33 | sentence_list = [] 34 | tag_list = [] 35 | for line in fr.readlines(): 36 | if line != '\n': 37 | [word, tag] = line.strip().split() 38 | sentence_list.append(word) 39 | tag_list.append(tag) 40 | else: 41 | sentences_list.append(sentence_list) 42 | tags_list.append(tag_list) 43 | sentence_list = [] 44 | tag_list = [] 45 | print('final get_data') 46 | return sentences_list, tags_list 47 | 48 | 49 | # 用于获得训练集中每个字对应的id,返回的是键为字值为id的一个字典 50 | # 注意这个dict只用于当前训练集,换训练集需要自己生成 51 | def get_word_id(file_location): 52 | with open(file_location, 'rb') as fr: 53 | word2id_dict = pickle.load(fr) 54 | print('final get_word_id') 55 | return word2id_dict 56 | 57 | 58 | # 用于初始化字向量,这里并没有通过word2vec获得,而是通过随机正太分布获得 59 | def random_embedding(word2id_dict, embedding_size): 60 | ''' 61 | :param word2id_dict: 用于获得总的字符个数 62 | :param embedding: 每个字的维度 63 | :return: 字向量组 64 | ''' 65 | embedding_mat = 
np.random.uniform(-0.25, 0.25, (len(word2id_dict), embedding_size)) 66 | embedding_mat = np.array(embedding_mat).astype(np.float32) 67 | print('final random_embedding') 68 | return embedding_mat 69 | 70 | 71 | # 获得一个句子中每个字对应的索引 72 | def sentence2id(sentences, word2id_dict): 73 | ''' 74 | :param sentences: 所有句子 75 | :param word2id_dict: 记录字和字对应索引的字典 76 | :return: 包含所有句子中每个字索引的List 77 | ''' 78 | sentences_id_list = [] 79 | for sentence in sentences: 80 | sentence_id_list = [] 81 | for word in sentence: 82 | if str(word).isdigit(): 83 | word = '' 84 | elif ('\u0041' <= word <= '\u005a') or ('\u0061' <= word <= '\u007a'): 85 | word = '' 86 | if word not in word2id_dict.keys(): 87 | word = '' 88 | sentence_id_list.append(word2id_dict[word]) 89 | sentences_id_list.append(sentence_id_list) 90 | print('final sentence2id') 91 | return sentences_id_list 92 | 93 | 94 | # 对句子和标签都可进行填充,同时获得每个序列长度的列表 95 | def padding_sentences(sentences_index, pad_mark=0): 96 | ''' 97 | :param sentences_index: 每个句子各个字或者字标注对应的索引 98 | :param pad_mark: 用什么进行填充,默认为用零进行填充 99 | :return: 填充后的各个句子或标注的索引和序列长度列表 100 | ''' 101 | sen_max_len = max(map(lambda x: len(x), sentences_index)) 102 | sen_index_list, sen_len_list = [], [] 103 | for sen_index in sentences_index: 104 | sen_index = list(sen_index) 105 | new_sentence_index = sen_index[:sen_max_len] + [pad_mark] * max(sen_max_len - len(sen_index), 0) 106 | sen_index_list.append(new_sentence_index) 107 | sen_len_list.append(min(len(sen_index), sen_max_len)) 108 | return np.array(sen_index_list), np.array(sen_len_list) 109 | 110 | 111 | # if __name__ == '__main__': 112 | # word2id = get_word_id('data/word2id.pkl') 113 | # for key, id in word2id.items(): 114 | # if id == 0: 115 | # print(key) 116 | -------------------------------------------------------------------------------- /第3章 数字人文下的词性自动标注/Bi-LSTM-CRF/LSTM_CRF/lstm_crf_model.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | import tensorflow as tf 3 | from tensorflow.contrib.crf import crf_log_likelihood 4 | from tensorflow.contrib.rnn import LSTMCell 5 | 6 | 7 | class BiLSTM_CRF(object): 8 | def __init__(self, hidden_dim, num_tags, input_x, sequence_lengths, dropout_pl, labels): 9 | self.hidden_dim = hidden_dim 10 | self.num_tags = num_tags 11 | self.input_x = input_x 12 | self.sequence_lengths = sequence_lengths 13 | self.dropout_pl = dropout_pl 14 | self.labels = labels 15 | 16 | # 建立模型,执行正向传播,返回正向传播得到的值 17 | def positive_propagation(self): 18 | with tf.variable_scope('lstm-crf'): 19 | cell_fw = LSTMCell(self.hidden_dim) 20 | cell_bw = LSTMCell(self.hidden_dim) 21 | # inputs(self.input_x)的shape通常是[batch_size, sequence_length, dim_embedding] 22 | # output_fw_seq和output_bw_seq的shape都是[batch_size, sequence_length, num_units] 23 | (output_fw_seq, output_bw_seq), _ = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, self.input_x, 24 | self.sequence_lengths, dtype=tf.float32) 25 | out_put = tf.concat([output_fw_seq, output_bw_seq], axis=-1) # 对正反向的输出进行合并 26 | out_put = tf.nn.dropout(out_put, self.dropout_pl) # 防止过拟合 27 | # 循环神经网络之后进行一次线性变换,用于把输出转换为crf_log_likelihood的接收格式,主要 28 | # 是把最后一维的维度转换为num_tags,以便于随后进行优化 29 | with tf.variable_scope('proj'): 30 | W = tf.get_variable(name='W', 31 | shape=[2 * self.hidden_dim, self.num_tags], 32 | initializer=tf.contrib.layers.xavier_initializer(), 33 | dtype=tf.float32 34 | ) 35 | b = tf.get_variable(name='b', 36 | shape=[self.num_tags], 37 | initializer=tf.zeros_initializer, 38 | dtype=tf.float32 39 | ) 40 | s = tf.shape(out_put) 
41 | # 正向传播的结果计算 42 | out_put = tf.reshape(out_put, [-1, 2 * self.hidden_dim]) # 就是一个维度变换 43 | pred = tf.matmul(out_put, W) + b # 进行线性变换 44 | # s[1]是所选取的最大句子长度 45 | logits = tf.reshape(pred, [-1, s[1], self.num_tags]) 46 | 47 | # CRF损失值的计算 48 | # transition_params是CRF的转换矩阵,会被自动计算出来 49 | # tag_indices:填入维度为[batch_size, max_seq_len]的矩阵,也就是Golden标签,注意这里的标签都是以索引方式表示的这个就是真实的标签序列了 50 | # sequence_lengths:维度为[batch_size]的向量,记录了每个序列的长度 51 | # inputs:unary potentials,也就是每个标签的预测概率值,这个值根据实际情况选择计算方法,CNN,RNN...都可以 52 | # crf_log_likelihood求的是CRF的损失值,牵扯到前向后向算法,会获得概率转移矩阵 53 | log_likelihood, transition_params = crf_log_likelihood(inputs=logits, tag_indices=self.labels, 54 | sequence_lengths=self.sequence_lengths) 55 | loss = -tf.reduce_mean(log_likelihood) 56 | return loss, transition_params, logits 57 | -------------------------------------------------------------------------------- /第3章 数字人文下的词性自动标注/Bi-LSTM-CRF/README.md: -------------------------------------------------------------------------------- 1 | # Bi-LSTM-CRF 2 | 利用Bi-LSTM-CRF实现词性标注 3 | 4 | ### 使用方式 5 | 1. 配置环境 6 | 本项目在`env4BiLSTM-CRF.txt`所列环境中测试通过,其中下列python版本和tensorflow版本为必须项: 7 | > Python == 3.7.6 8 | > tensorflow == 1.12.0 9 | 2. 首先进入`LSTM_CRF`目录,然后直接运行`lstm_crf_main.py`即可。 10 | -------------------------------------------------------------------------------- /第3章 数字人文下的词性自动标注/Bi-LSTM-CRF/env4BiLSTM-CRF.txt: -------------------------------------------------------------------------------- 1 | Package Version 2 | ------------------- --------- 3 | absl-py 0.7.0 4 | astor 0.7.1 5 | astroid 2.3.3 6 | certifi 2020.6.20 7 | cycler 0.10.0 8 | Cython 0.29.15 9 | gast 0.2.2 10 | grpcio 1.16.1 11 | h5py 2.9.0 12 | isort 4.3.21 13 | joblib 0.16.0 14 | Keras-Applications 1.0.6 15 | Keras-Preprocessing 1.0.5 16 | kiwisolver 1.1.0 17 | lazy-object-proxy 1.4.3 18 | Markdown 3.0.1 19 | matplotlib 3.2.1 20 | mccabe 0.6.1 21 | mkl-fft 1.0.10 22 | mkl-random 1.0.2 23 | numpy 1.15.4 24 | pandas 0.25.3 25 | patsy 0.5.1 26 | pip 18.1 27 | protobuf 3.6.1 28 | pylint 2.4.4 29 | pyparsing 2.4.7 30 | python-dateutil 2.8.1 31 | pytz 2019.3 32 | scikit-learn 0.21.3 33 | scipy 1.2.1 34 | seaborn 0.9.0 35 | setuptools 40.6.3 36 | six 1.12.0 37 | statsmodels 0.10.1 38 | tensorboard 1.12.2 39 | tensorflow 1.12.0 40 | termcolor 1.1.0 41 | tornado 6.0.3 42 | typed-ast 1.4.0 43 | Werkzeug 1.0.1 44 | wheel 0.32.3 45 | wrapt 1.11.2 46 | -------------------------------------------------------------------------------- /第3章 数字人文下的词性自动标注/CRF/crf_learn.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第3章 数字人文下的词性自动标注/CRF/crf_learn.exe -------------------------------------------------------------------------------- /第3章 数字人文下的词性自动标注/CRF/crf_test.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第3章 数字人文下的词性自动标注/CRF/crf_test.exe -------------------------------------------------------------------------------- /第3章 数字人文下的词性自动标注/CRF/libcrfpp.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第3章 数字人文下的词性自动标注/CRF/libcrfpp.dll -------------------------------------------------------------------------------- /第3章 
数字人文下的词性自动标注/CRF/readme.md: -------------------------------------------------------------------------------- 1 | #### 使用CRF实现词性标注的步骤 2 | 以Window10为例: 3 | - 1.打开终端,并进入CRF项目文件夹 4 | - 同时按Win+R打开“运行”窗口。 5 | - 输入`cmd`并按`确定`。 6 | - 在终端窗口中,输入`cd`+`空格`+`/d`+`空格`+`CRF文件夹绝对路径`,如`cd /d code-for-digital-humanities-tutorial\第三章 数字人文下的词性自动标注\CRF` 7 | 8 | 9 | - 2.依次在终端中输入下述的CRF运行指令,即可实现基于CRF的词性标注模型的训练、测试、性能评估。 10 | 11 | #### CRF运行指令 12 | 依次在终端命令行输入下列命令,按Enter执行。 13 | 14 | 1.训练模型 15 | > crf_learn -p 8 template data/train.txt model 16 | 17 | 2.测试模型 18 | > crf_test -m model data/test.txt >output.txt 19 | 20 | 3.评估模型在测试集上的效果 21 | > python conlleval.py < output.txt >prf.txt -------------------------------------------------------------------------------- /第3章 数字人文下的词性自动标注/CRF/template: -------------------------------------------------------------------------------- 1 | # Unigram 2 | U00:%x[-2,0] 3 | U01:%x[-1,0] 4 | U02:%x[0,0] 5 | U03:%x[1,0] 6 | U04:%x[2,0] 7 | U05:%x[-1,0]/%x[0,0] 8 | U06:%x[0,0]/%x[1,0] 9 | 10 | # Bigram 11 | B 12 | -------------------------------------------------------------------------------- /第3章 数字人文下的词性自动标注/HMM/data/output.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第3章 数字人文下的词性自动标注/HMM/data/output.txt -------------------------------------------------------------------------------- /第3章 数字人文下的词性自动标注/README.md: -------------------------------------------------------------------------------- 1 | ## 词性标注代码 2 | 3 | 本文件夹包含将BiLSTM-CRF,CRF,HMM三种序列标注模型用于词性标注任务的方式 4 | -------------------------------------------------------------------------------- /第3章 数字人文下的词性自动标注/第三章 数字人文下的词性自动标注.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第3章 数字人文下的词性自动标注/第三章 数字人文下的词性自动标注.pdf -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/BERT-NER-pytorch_sample/README.md: -------------------------------------------------------------------------------- 1 | 9 | # BERT NER 10 | 11 | :bangbang: Model without Extra `X` label checkout branch [experiment](https://github.com/kamalkraj/BERT-NER/tree/experiment) 12 | 13 | Use google BERT to do CoNLL-2003 NER ! 
14 | 15 | 16 | # Requirements 17 | 18 | - `python3` 19 | - `pip3 install -r requirements.txt` 20 | 21 | # Run 22 | 23 | CUDA_VISIBLE_DEVICES=2 nohup python run_ner.py --data_dir=data/data_0/ --bert_model=/home/pgrad/pretrain_models/hflroberta/ --task_name=ner --output_dir=out/bert-base --max_seq_length=128 --do_train --train_batch_size=220 --num_train_epochs=3 --do_eval --warmup_proportion=0.4 > hflrobertabase.log 24 | 25 | 26 | # Result 27 | 28 | ### Validation Data 29 | ``` 30 | precision recall f1-score support 31 | 32 | MISC 0.9407 0.9304 0.9355 273 33 | LOC 0.9650 0.9881 0.9764 419 34 | PER 0.9844 0.9783 0.9813 322 35 | ORG 0.9794 0.9852 0.9822 337 36 | 37 | avg / total 0.9683 0.9734 0.9708 1351 38 | ``` 39 | ### Test Data 40 | ``` 41 | precision recall f1-score support 42 | 43 | ORG 0.9152 0.9073 0.9113 464 44 | PER 0.9767 0.9692 0.9730 260 45 | LOC 0.9397 0.9263 0.9330 353 46 | MISC 0.8276 0.9014 0.8629 213 47 | 48 | avg / total 0.9198 0.9240 0.9217 1290 49 | ``` 50 | 51 | ## Pretrained model download from [here](https://drive.google.com/file/d/1UKE2UVFStXZFtXFgZObGg5mo_MzW-ZoC/view?usp=sharing) 52 | 53 | # Inference 54 | 55 | ```python 56 | from bert import Ner 57 | 58 | model = Ner("out/") 59 | 60 | output = model.predict("Steve went to Paris") 61 | 62 | print(output) 63 | # { 64 | # "Steve": { 65 | # "tag": "B-PER", 66 | # "confidence": 0.999879002571106 67 | # }, 68 | # "went": { 69 | # "tag": "O", 70 | # "confidence": 0.9968552589416504 71 | # }, 72 | # "to": { 73 | # "tag": "O", 74 | # "confidence": 0.9996656179428101 75 | # }, 76 | # "Paris": { 77 | # "tag": "B-LOC", 78 | # "confidence": 0.999504804611206 79 | # } 80 | # } 81 | 82 | ``` 83 | 84 | 85 | ### Tensorflow version 86 | 87 | - https://github.com/kyzhouhzau/BERT-NER 88 | -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/BERT-NER-pytorch_sample/bert.py: -------------------------------------------------------------------------------- 1 | """BERT NER Inference.""" 2 | 3 | from __future__ import absolute_import, division, print_function 4 | 5 | import json 6 | import os 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | from nltk import word_tokenize 11 | from pytorch_pretrained_bert.modeling import (CONFIG_NAME, WEIGHTS_NAME, 12 | BertConfig, 13 | BertForTokenClassification) 14 | from pytorch_pretrained_bert.tokenization import BertTokenizer 15 | 16 | 17 | class Ner: 18 | 19 | def __init__(self,model_dir: str): 20 | self.model , self.tokenizer, self.model_config = self.load_model(model_dir) 21 | self.label_map = self.model_config["label_map"] 22 | self.max_seq_length = self.model_config["max_seq_length"] 23 | self.label_map = {int(k):v for k,v in self.label_map.items()} 24 | self.model.eval() 25 | 26 | def load_model(self, model_dir: str, model_config: str = "model_config.json"): 27 | model_config = os.path.join(model_dir,model_config) 28 | model_config = json.load(open(model_config)) 29 | output_config_file = os.path.join(model_dir, CONFIG_NAME) 30 | output_model_file = os.path.join(model_dir, WEIGHTS_NAME) 31 | config = BertConfig(output_config_file) 32 | model = BertForTokenClassification(config, num_labels=model_config["num_labels"]) 33 | model.load_state_dict(torch.load(output_model_file)) 34 | tokenizer = BertTokenizer.from_pretrained(model_config["bert_model"],do_lower_case=False) 35 | return model, tokenizer, model_config 36 | 37 | def tokenize(self, text: str): 38 | """ tokenize input""" 39 | words = word_tokenize(text) 40 | tokens = [] 41 | 
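        # valid_positions marks the first WordPiece of each word with 1 and its
        # continuation pieces with 0, so that one label per original word can be
        # recovered from the sub-token predictions later in predict().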
valid_positions = [] 42 | for i,word in enumerate(words): 43 | token = self.tokenizer.tokenize(word) 44 | tokens.extend(token) 45 | for i in range(len(token)): 46 | if i == 0: 47 | valid_positions.append(1) 48 | else: 49 | valid_positions.append(0) 50 | return tokens, valid_positions 51 | 52 | def preprocess(self, text: str): 53 | """ preprocess """ 54 | tokens, valid_positions = self.tokenize(text) 55 | ## insert "[CLS]" 56 | tokens.insert(0,"[CLS]") 57 | ## insert "[SEP]" 58 | tokens.append("[SEP]") 59 | segment_ids = [] 60 | for i in range(len(tokens)): 61 | segment_ids.append(0) 62 | input_ids = self.tokenizer.convert_tokens_to_ids(tokens) 63 | input_mask = [1] * len(input_ids) 64 | while len(input_ids) < self.max_seq_length: 65 | input_ids.append(0) 66 | input_mask.append(0) 67 | segment_ids.append(0) 68 | return input_ids,input_mask,segment_ids,valid_positions 69 | 70 | def predict(self, text: str): 71 | input_ids,input_mask,segment_ids,valid_positions = self.preprocess(text) 72 | input_ids = torch.tensor([input_ids],dtype=torch.long) 73 | input_mask = torch.tensor([input_mask],dtype=torch.long) 74 | segment_ids = torch.tensor([segment_ids],dtype=torch.long) 75 | with torch.no_grad(): 76 | logits = self.model(input_ids, segment_ids, input_mask) 77 | logits = F.softmax(logits,dim=2) 78 | logits_label = torch.argmax(logits,dim=2) 79 | logits_label = logits_label.detach().cpu().numpy() 80 | # import ipdb; ipdb.set_trace() 81 | logits_confidence = [values[label].item() for values,label in zip(logits[0],logits_label[0])] 82 | 83 | logits_label = [logits_label[0][index] for index,i in enumerate(input_mask[0]) if i.item()==1] 84 | logits_label.pop(0) 85 | logits_label.pop() 86 | 87 | assert len(logits_label) == len(valid_positions) 88 | labels = [] 89 | for valid,label in zip(valid_positions,logits_label): 90 | if valid: 91 | labels.append(self.label_map[label]) 92 | words = word_tokenize(text) 93 | assert len(labels) == len(words) 94 | output = [word:{"tag":label,"confidence":confidence} for word,label,confidence in zip(words,labels,logits_confidence)] 95 | return output 96 | -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/BERT-NER-pytorch_sample/printmodel.py: -------------------------------------------------------------------------------- 1 | from transformers import BertTokenizer,BertModel 2 | 3 | tokenizer = BertTokenizer.from_pretrained("pretrain_models/sikuroberta_vocabtxt") 4 | model = BertModel.from_pretrained("pretrain_models/sikuroberta_vocabtxt") 5 | print(model) 6 | 7 | -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/BERT-NER-pytorch_sample/requirements.txt: -------------------------------------------------------------------------------- 1 | pytorch-pretrained-bert==0.6.1 2 | seqeval==0.0.5 3 | torch==1.0.1.post2 4 | tqdm==4.31.1 5 | nltk==3.4 -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/BERT-NER-pytorch_sample/run.pid: -------------------------------------------------------------------------------- 1 | 74651 2 | -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/BERT-NER-pytorch_sample/run.sh: -------------------------------------------------------------------------------- 1 | ### 2 | # @Author: your name 3 | # @Date: 2021-04-15 10:54:17 4 | # @LastEditTime: 2021-04-15 11:00:54 5 | # @LastEditors: Please set LastEditors 6 | # @Description: In User Settings Edit 7 | # @FilePath: 
/BERT-NER-Pytorch/run.sh 8 | ### 9 | CUDA_VISIBLE_DEVICES=1 python run_ner.py --data_dir=train_data_cixing/ \ 10 | --bert_model=pretrain_models/sikuroberta_vocabtxt/ \ 11 | --task_name=ner \ 12 | --output_dir=output/train_data_cixing_out/ \ 13 | --max_seq_length=128 \ 14 | --do_train --do_eval --eval_batch_size=64 --train_batch_size=64 --num_train_epochs 10 \ 15 | --warmup_proportion=0.4 > logsikubert0.log 2>&1 & echo $! > run.pid -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/BERT-NER-pytorch_sample/run_test.sh: -------------------------------------------------------------------------------- 1 | ### 2 | # @Author: your name 3 | # @Date: 2021-04-15 10:54:17 4 | # @LastEditTime: 2021-04-15 11:00:54 5 | # @LastEditors: Please set LastEditors 6 | # @Description: In User Settings Edit 7 | # @FilePath: /BERT-NER-Pytorch/run.sh 8 | ### 9 | CUDA_VISIBLE_DEVICES=1 python run_ner.py --data_dir=train_data_book/data_2/ \ 10 | --bert_model=pretrain_models/sikuroberta_vocabtxt/ \ 11 | --task_name=ner \ 12 | --output_dir=output/train_book_sikuroberta_vocabtxt \ 13 | --max_seq_length=128 \ 14 | --do_eval --eval_batch_size=64 --train_batch_size=64 --num_train_epochs 5 \ 15 | --warmup_proportion=0.4 > logsikubert4.log 2>&1 & echo $! > run.pid -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/BERT-NER-pytorch_sample/settings.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: your name 3 | Date: 2021-04-14 21:26:33 4 | LastEditTime: 2021-04-15 10:53:15 5 | LastEditors: Please set LastEditors 6 | Description: In User Settings Edit 7 | FilePath: /BERT-CRF-Pytorch/processors/settings.py 8 | ''' 9 | LABELS=["X","O",'B-w', 'I-w', 'E-w', 'S-w', 'B-a', 'I-a', 'E-a', 'S-a', 'B-n', 'I-n', 'E-n', 'S-n', 'B-v', 'I-v', 'E-v', 'S-v', 'B-r', 'I-r', 'E-r', 'S-r', 'B-y', 'I-y', 'E-y', 'S-y', 'B-ns', 'I-ns', 'E-ns', 'S-ns', 'B-nr', 'I-nr', 'E-nr', 'S-nr', 'B-c', 'I-c', 'E-c', 'S-c', 'B-d', 'I-d', 'E-d', 'S-d', 'B-u', 'I-u', 'E-u', 'S-u', 'B-p', 'I-p', 'E-p', 'S-p', 'B-t', 'I-t', 'E-t', 'S-t', 'B-f', 'I-f', 'E-f', 'S-f', 'B-m', 'I-m', 'E-m', 'S-m', 'B-vs', 'I-vs', 'E-vs', 'S-vs', 'B-q', 'I-q', 'E-q', 'S-q', 'B-j', 'I-j', 'E-j', 'S-j', 'B-vy', 'I-vy', 'E-vy', 'S-vy', 'B-nx', 'I-nx', 'E-nx', 'S-nx', 'B-zn', 'I-zn', 'E-zn', 'S-zn', 'B-i', 'I-i', 'E-i', 'S-i', 'B-x', 'I-x', 'E-x', 'S-x', 'B-vw', 'I-vw', 'E-vw', 'S-vw', 'B-zv', 'I-zv', 'E-zv', 'S-zv', 'B-za', 'I-za', 'E-za', 'S-za', 'B-s', 'I-s', 'E-s', 'S-s',"[CLS]","[SEP]"] 10 | 11 | #LABELS=["X","O",'nr-B', 'nr-I', 'nr-E', 'nr-S', 'n-B', 'n-I', 'n-E', 'n-S', 'w-B', 'w-I', 'w-E', 'w-S', 'ns-B', 'ns-I', 'ns-E', 'ns-S', 'u-B', 'u-I', 'u-E', 'u-S', 'v-B', 'v-I', 'v-E', 'v-S', 'p-B', 'p-I', 'p-E', 'p-S', 'nx-B', 'nx-I', 'nx-E', 'nx-S', 'd-B', 'd-I', 'd-E', 'd-S', 'r-B', 'r-I', 'r-E', 'r-S', 'a-B', 'a-I', 'a-E', 'a-S', 'c-B', 'c-I', 'c-E', 'c-S', 't-B', 't-I', 't-E', 't-S', 'm-B', 'm-I', 'm-E', 'm-S', 'q-B', 'q-I', 'q-E', 'q-S', 'y-B', 'y-I', 'y-E', 'y-S', 'j-B', 'j-I', 'j-E', 'j-S', 'nc-B', 'nc-I', 'nc-E', 'nc-S', 'nrx-B', 'nrx-I', 'nrx-E', 'nrx-S', 'f-B', 'f-I', 'f-E', 'f-S', 'gv-B', 'gv-I', 'gv-E', 'gv-S', 'i-B', 'i-I', 'i-E', 'i-S',"[CLS]","[SEP]"] 12 | #LABELS=["X","O",'B-T','I-T','E-T','S-T',"[CLS]","[SEP]"] 13 | #LABELS=["X","O",'B','N','E','S',"[CLS]","[SEP]"] 14 | #LABELS=["X","O",'B-A','I-A','E-A','S-A',"[CLS]","[SEP]"] 15 | #LABELS=["X","O",'B','I','E','S',"[CLS]","[SEP]"] 16 | 
#LABELS=["X","O",'B-BOOK','M-BOOK','E-BOOK','S-BOOK',"[CLS]","[SEP]"] 17 | 18 | -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/BERT-NER-pytorch_sample/train_data_cixing/test.txt: -------------------------------------------------------------------------------- 1 | 使 S-v 2 | 司 B-nr 3 | 徒 E-nr 4 | 禁 S-v 5 | 掠 B-nr 6 | 欒 I-nr 7 | 氏 E-nr 8 | 者 S-r 9 | , S-w 10 | 歸 S-v 11 | 所 S-r 12 | 取 S-v 13 | 焉 S-y 14 | , S-w 15 | 使 S-v 16 | 候 S-n 17 | 出 S-v 18 | 諸 S-j 19 | 轘 B-ns 20 | 轅 E-ns 21 | 。 S-w 22 | -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/BERT-NER-pytorch_sample/train_data_cixing/train.txt: -------------------------------------------------------------------------------- 1 | 太 B-n 2 | 子 E-n 3 | 曰 S-v 4 | : S-w 5 | 6 | 使 S-v 7 | 問 S-v 8 | 弦 B-nr 9 | 多 E-nr 10 | 以 S-p 11 | 琴 S-n 12 | , S-w 13 | 曰 S-v 14 | : S-w 15 | 16 | 十 B-t 17 | 二 I-t 18 | 月 E-t 19 | 甲 B-t 20 | 戌 E-t 21 | , S-w 22 | 晉 S-ns 23 | 作 S-v 24 | 六 S-m 25 | 軍 S-n 26 | 。 S-w 27 | 28 | -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/BERT-NER-pytorch_sample/train_data_cixing/valid.txt: -------------------------------------------------------------------------------- 1 | 使 S-v 2 | 司 B-nr 3 | 徒 E-nr 4 | 禁 S-v 5 | 掠 B-nr 6 | 欒 I-nr 7 | 氏 E-nr 8 | 者 S-r 9 | , S-w 10 | 歸 S-v 11 | 所 S-r 12 | 取 S-v 13 | 焉 S-y 14 | , S-w 15 | 使 S-v 16 | 候 S-n 17 | 出 S-v 18 | 諸 S-j 19 | 轘 B-ns 20 | 轅 E-ns 21 | 。 S-w 22 | -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/ChinsesNER-pytorch-master/.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | __pycache__ -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/ChinsesNER-pytorch-master/README.md: -------------------------------------------------------------------------------- 1 | # ChinsesNER-pytorch 2 | 3 | ### train 4 | 5 | setp 1: edit **models/config.yml** 6 | 7 | embedding_size: 100 8 | hidden_size: 128 9 | model_path: models/ 10 | batch_size: 20 11 | dropout: 0.5 12 | tags: 13 | - ORG 14 | - PER 15 | 16 | step 2: train 17 | 18 | python3 main.py train 19 | or 20 | cn = ChineseNER("train") 21 | cn.train() 22 | 23 | ... 
24 | epoch [4] |██████ | 154/591 25 | loss 0.46 26 | evaluation 27 | ORG recall 1.00 precision 1.00 f1 1.00 28 | -------------------------------------------------- 29 | epoch [4] |██████ | 155/591 30 | loss 1.47 31 | evaluation 32 | ORG recall 0.92 precision 0.92 f1 0.92 33 | -------------------------------------------------- 34 | epoch [4] |██████ | 156/591 35 | loss 0.46 36 | evaluation 37 | ORG recall 0.94 precision 1.00 f1 0.97 38 | 39 | ### predict 40 | 41 | python3 main.py predict 42 | or 43 | cn = ChineseNER("predict") 44 | cn.predict() 45 | 46 | 请输入文本: 海利装饰材料有限公司 47 | [{'start': 0, 'stop': 10, 'word': '海利装饰材料有限公司', 'type': 'ORG'}] 48 | 49 | ### REFERENCES 50 | - [Log-Linear Models, MEMMs, and CRFs](http://www.cs.columbia.edu/~mcollins/crf.pdf) 51 | - [Neural Architectures for Named Entity Recognition](https://arxiv.org/pdf/1603.01360.pdf) 52 | -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/ChinsesNER-pytorch-master/models/config.yml: -------------------------------------------------------------------------------- 1 | embedding_size: 100 2 | hidden_size: 128 3 | model_path: models/ 4 | batch_size: 20 5 | dropout: 0.5 6 | tags: 7 | - ORG 8 | - PER -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/ChinsesNER-pytorch-master/models/data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第4章 数字人文下的实体识别/ChinsesNER-pytorch-master/models/data.pkl -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/ChinsesNER-pytorch-master/models/params.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第4章 数字人文下的实体识别/ChinsesNER-pytorch-master/models/params.pkl -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/ChinsesNER-pytorch-master/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | ''' 3 | @Author: yanwii 4 | @Date: 2018-11-07 13:52:12 5 | ''' 6 | 7 | def format_result(result, text, tag): 8 | entities = [] 9 | for i in result: 10 | begin, end = i 11 | entities.append({ 12 | "start":begin, 13 | "stop":end + 1, 14 | "word":text[begin:end+1], 15 | "type":tag 16 | }) 17 | return entities 18 | 19 | def get_tags(path, tag, tag_map): 20 | begin_tag = tag_map.get("B-" + tag) 21 | mid_tag = tag_map.get("I-" + tag) 22 | end_tag = tag_map.get("E-" + tag) 23 | single_tag = tag_map.get("S") 24 | o_tag = tag_map.get("O") 25 | begin = -1 26 | end = 0 27 | tags = [] 28 | last_tag = 0 29 | for index, tag in enumerate(path): 30 | if tag == begin_tag and index == 0: 31 | begin = 0 32 | elif tag == begin_tag: 33 | begin = index 34 | elif tag == end_tag and last_tag in [mid_tag, begin_tag] and begin > -1: 35 | end = index 36 | tags.append([begin, end]) 37 | elif tag == o_tag or tag == single_tag: 38 | begin = -1 39 | last_tag = tag 40 | return tags 41 | 42 | def f1_score(tar_path, pre_path, tag, tag_map): 43 | origin = 0. 44 | found = 0. 45 | right = 0. 
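    # origin = number of gold entity spans, found = number of predicted spans,
    # right = number of predicted spans that exactly match a gold span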
46 | for fetch in zip(tar_path, pre_path): 47 | tar, pre = fetch 48 | tar_tags = get_tags(tar, tag, tag_map) 49 | pre_tags = get_tags(pre, tag, tag_map) 50 | 51 | origin += len(tar_tags) 52 | found += len(pre_tags) 53 | 54 | for p_tag in pre_tags: 55 | if p_tag in tar_tags: 56 | right += 1 57 | 58 | recall = 0. if origin == 0 else (right / origin) 59 | precision = 0. if found == 0 else (right / found) 60 | f1 = 0. if recall+precision == 0 else (2*precision*recall)/(precision + recall) 61 | print("\t{}\trecall {:.2f}\tprecision {:.2f}\tf1 {:.2f}".format(tag, recall, precision, f1)) 62 | return recall, precision, f1 -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## 命名实体识别代码 3 | 4 | 该部分为本书第四章节对应的源代码,包含数据预处理和使用LSTM-CRF与BERT实现命名实体识别的代码实现 5 | 6 | 7 | ## 数据预处理模块 8 | 9 | 1.将预处理的数据放入data文件夹下,其格式需与filename.txt保持一致。 10 | 2.运行pro_ner.py将数据转为BIOES标注格式 11 | 3.运行train_test_divide.py划分训练集与测试集 12 | 13 | 14 | ## 命名实体识别模块 15 | 16 | 1.BILSTM-CRF代码见ChinsesNER-pytorch-master文件夹 17 | 2.BERT代码见BERT-NER-pytorch_sample文件夹 18 | -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/数据预处理/data/filename.txt: -------------------------------------------------------------------------------- 1 | 軒轅/nr 乃/d 修/v 德/n 振/v 兵/nrx ,/w 治/v 五氣/n ,/w 藝/v 五種/nrx ,/w 撫/v 萬民/nrx ,/w 度/v 四方/n ,/w 教/v 熊羆/nrx 貔貅/nrx 貙虎/nrx ,/w 以/p 與/p 炎帝/nr 戰/v 於/p 阪泉/ns 之/u 野/n 。/w -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/数据预处理/data_charseq/filename.txt: -------------------------------------------------------------------------------- 1 | 軒 B-nr 2 | 轅 E-nr 3 | 乃 O 4 | 修 O 5 | 德 O 6 | 振 O 7 | 兵 O 8 | , O 9 | 治 O 10 | 五 O 11 | 氣 O 12 | , O 13 | 藝 O 14 | 五 O 15 | 種 O 16 | , O 17 | 撫 O 18 | 萬 O 19 | 民 O 20 | , O 21 | 度 O 22 | 四 O 23 | 方 O 24 | , O 25 | 教 O 26 | 熊 O 27 | 羆 O 28 | 貔 O 29 | 貅 O 30 | 貙 O 31 | 虎 O 32 | , O 33 | 以 O 34 | 與 O 35 | 炎 B-nr 36 | 帝 E-nr 37 | 戰 O 38 | 於 O 39 | 阪 B-ns 40 | 泉 E-ns 41 | 之 O 42 | 野 O 43 | 。 O 44 | 45 | -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/数据预处理/data_seq/filename.txt: -------------------------------------------------------------------------------- 1 | 軒轅 nr 2 | 乃 d 3 | 修 v 4 | 德 n 5 | 振 v 6 | 兵 nrx 7 | , w 8 | 治 v 9 | 五氣 n 10 | , w 11 | 藝 v 12 | 五種 nrx 13 | , w 14 | 撫 v 15 | 萬民 nrx 16 | , w 17 | 度 v 18 | 四方 n 19 | , w 20 | 教 v 21 | 熊羆 nrx 22 | 貔貅 nrx 23 | 貙虎 nrx 24 | , w 25 | 以 p 26 | 與 p 27 | 炎帝 nr 28 | 戰 v 29 | 於 p 30 | 阪泉 ns 31 | 之 u 32 | 野 n 33 | 。 w 34 | 35 | -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/数据预处理/pro_ner.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from string import punctuation 4 | 5 | from tqdm import tqdm 6 | from zhon import hanzi 7 | 8 | punc = hanzi.punctuation + punctuation 9 | 10 | 11 | def word_pos2word_seq(filepath, resultfolder='data_seq'): 12 | """ 13 | word/tag转换为word\ttag 14 | word指代词,tag指代词性标签 15 | (不带BIES) 16 | """ 17 | if resultfolder=='data_seq': 18 | if not os.path.exists('data_seq'):os.makedirs('data_seq') # 创建输出结果文件夹 19 | data_name = os.path.split(filepath)[1][:-4] # 获取当前输入数据文件文件名(不含前面的文件夹路径和最后的.txt) 20 | with open(filepath, 'rt', encoding='utf8')as f: 21 | with open('{}/{}.txt'.format(resultfolder, data_name), 'w', encoding='utf8') as r: 22 | for line in 
tqdm(f.readlines()): # 遍历读取每一行数据 23 | if line == '\n': # 若该行为空行则跳过 24 | # r.write('\n') 25 | continue 26 | content_lst = line.strip('\n\r').strip(' ') # 去除每行末尾空格 27 | content_lst = re.sub(' ', ' ', content_lst).split(' ')# 去除连续多余的空格为1个并按照空格拆分为列表: [word/tag, word2/tag2, ……] 28 | 29 | char_tag_lst = [c.split('/') for c in content_lst] 30 | char_lst = [c[0] for c in char_tag_lst] # word列表 31 | tag_lst = [c[1] for c in char_tag_lst] # tag列表 32 | 33 | for char, tag in zip(char_lst, tag_lst): 34 | r.write(char + '\t' + tag + '\n') 35 | r.write('\n') # 每行结束之后增加一个空行用于区分不同行转换出的序列 36 | 37 | 38 | def word_seq2char_seq(filepath,resultfolder='data_charseq'): 39 | """ 40 | word\ttag转换为char\ttag 41 | (带BIES) 42 | """ 43 | if resultfolder=='data_charseq': 44 | if not os.path.exists('data_charseq'): 45 | os.makedirs('data_charseq') 46 | data_name = os.path.split(filepath)[1] #[:-4] # 数据文件名 47 | sep_char = ' ' # 生成的文件 word tag中的分隔符 48 | 49 | with open(filepath, 'rt', encoding='utf-8-sig')as f: 50 | with open('{}/{}'.format(resultfolder,data_name), 'w', encoding='utf-8')as r: 51 | for line in tqdm(f.readlines()): # 遍历读取每行 word\t tag\n 52 | if line == '\n': 53 | # r.write(' \n') # crf_learn并不认可数据中使用’\n’作为sentence间的分割符(空行),但能够识别‘space(空格)\n’的空行分隔符。 54 | r.write('\n') # bert_ner_pytorch的断句 使用’\n’作为sentence间的分割符(空行) 55 | continue 56 | word_tag_lst = line.strip('\n').split('\t') 57 | word = word_tag_lst[0] # word 58 | tag = word_tag_lst[1] # tag 59 | 60 | char_lst = list(word) # 每行单个字组成的列表 61 | tag_lst = [] 62 | if tag not in ['nr', 'ns', 't']: # 本次识别的实体词性标签 63 | for char in word: 64 | r.write(char + sep_char +'O\n') # 将非此次需要识别的标签认定为O 65 | continue 66 | 67 | if len(word) == 1: # 单个字组成的实体,用S-tag表示 68 | # char_lst.append(word) 69 | tag_lst.append('S-' + tag) 70 | elif len(word) == 2: # 双字实体 71 | # char_lst.extend([word[0],word[1]]) 72 | tag_lst.extend(['B-' + tag, 'E-' + tag]) 73 | else: # 三字以上实体 74 | for id, char in enumerate(word): 75 | # char_lst.append(char) 76 | if id == 0: 77 | tag_lst.append('B-' + tag) 78 | elif id < len(word) - 1: 79 | tag_lst.append('I-' + tag) 80 | else: 81 | tag_lst.append('E-' + tag) 82 | for char, tag in zip(char_lst, tag_lst): 83 | r.write(char + sep_char + tag + '\n') 84 | 85 | 86 | def main(): 87 | word_pos2word_seq('data/filename.txt') 88 | word_seq2char_seq('data_seq/filename.txt') 89 | 90 | if __name__ == '__main__': 91 | main() 92 | -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/数据预处理/train_test_divide.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os import path 3 | 4 | from sklearn.model_selection import KFold 5 | from sklearn.model_selection import train_test_split 6 | from tqdm import tqdm 7 | 8 | 9 | def data_merge(folder, merged_txt='data_merged.txt'): 10 | """ 将多个txt的数据合并 """ 11 | if type(folder) == list: # 若folder为list,则汇总合并多个文件夹内所有文件的path 12 | paths = [] 13 | [paths.extend(path.join(folder_i, f) for f in os.listdir(folder_i)) for folder_i in folder] 14 | else: 15 | paths = [path.join(folder, f) for f in os.listdir(folder)] 16 | 17 | with open(merged_txt, 'w+', encoding='utf-8')as dmf: 18 | for p in tqdm(paths): 19 | with open(p, 'rt', encoding='utf-8')as pf: 20 | dmf.writelines(pf.readlines()) 21 | 22 | 23 | def load_data(filepaths=None, datafile=None): 24 | """ 加载文件夹内所有数据 """ 25 | if filepaths is None: 26 | filepaths = [datafile] 27 | data = [] 28 | [data.extend(open(path, 'rt', encoding='utf-8').readlines()) for path in filepaths] 29 | # random.shuffle(data) 
30 | return data 31 | 32 | 33 | def train_test_divide(data): 34 | """ 读取数据文件并划分为训练集、测试集 """ 35 | # random.shuffle(data) 36 | train_data, test_data = train_test_split(data, test_size=0.1) 37 | 38 | with open('train_data.txt', 'w+', encoding='utf-8')as tr: 39 | tr.write('\n'.join(train_data)) 40 | with open('test_data.txt', 'w+', encoding='utf-8')as te: 41 | te.write('\n'.join(test_data)) 42 | return train_data, test_data 43 | 44 | 45 | def train_test_divide_kfold(data, outputfolder='data'): 46 | """ 十折交叉验证划分数据训练集、测试集 """ 47 | [os.makedirs(outputfolder + os.sep + 'data_{}'.format(i)) for i in range(10) if not os.path.exists(outputfolder + os.sep + 'data_{}'.format(i))] 48 | 49 | kf = KFold(n_splits=10, shuffle=False) # shuffle 是否打乱数据,此处为否 50 | k = 0 51 | for train_index, test_index in kf.split(data): 52 | train_list = [data[tr].strip('\r\n') for tr in train_index] 53 | test_list = [data[te].strip('\r\n') for te in test_index] 54 | with open(outputfolder + os.sep + 'data_' + str(k) + os.sep + 'train.tsv', 'w+', encoding='utf-8')as tr: 55 | tr.write('\n'.join(train_list)) 56 | with open(outputfolder + os.sep + 'data_' + str(k) + os.sep + 'test.tsv', 'w+', encoding='utf-8')as te: 57 | te.write('\n'.join(test_list)) 58 | k += 1 59 | print('第{}折完成!'.format(k)) 60 | 61 | 62 | if __name__ == '__main__': 63 | file = '待划分文件路径(含文件名)' 64 | data = load_data(datafile=file) 65 | train_test_divide_kfold(data, '结果文件路径(含文件名)') -------------------------------------------------------------------------------- /第4章 数字人文下的实体识别/第四章 数字人文下的命名实体识别.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第4章 数字人文下的实体识别/第四章 数字人文下的命名实体识别.pdf -------------------------------------------------------------------------------- /第5章 数字人文下的模型预训练/README.md: -------------------------------------------------------------------------------- 1 | ## 语言模型预训练 2 | 3 | 本部分包含预训练BERT类模型和将bin模型转换为ckpt模型的代码 4 | 5 | 1.pytorch_chinese_lm_pretrain文件夹内包含bert类模型预训练的基础代码,可直接采用训练语句使用,也可通过修改sh文件夹中的内容使用。此处实现参照了transformers库的预训练预训练代码和中文模型预训练的github项目(https://github.com/zhusleep/pytorch_chinese_lm_pretrain) 6 | 7 | 2.transfer.py用于将预训练完成的bin文件转为ckpt格式,可供tensorflow框架加载。 8 | 9 | ## 建议运行环境 10 | ``` 11 | torch==1.6.0 12 | 13 | transformers==3.4.0 14 | 15 | 1.15.0<= tensorflow <2.0 16 | ``` 17 | -------------------------------------------------------------------------------- /第5章 数字人文下的模型预训练/pytorch_chinese_lm_pretrain/README.md: -------------------------------------------------------------------------------- 1 | # 基于pytorch的中文语言模型预训练 2 | 3 | 提供三种中文语言模型预训练的方法。预训练bert类模型对硬件的要求较高,建议在16G以上显存的设备上运行代码。 4 | 5 | ## bert-base-chinese 6 | 7 | (https://huggingface.co/bert-base-chinese) 8 | ​ 9 | 10 | 基于官方案例实现bert模型训练。 11 | 12 | https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling 13 | (本文使用的transformers版本为3.4.0) 14 | ``` 15 | python run_language_model_bert.py --output_dir=output --model_type=bert --model_name_or_path=bert-base-chinese --do_train --train_data_file=data/train.txt --do_eval --eval_data_file=data/eval.txt --mlm --per_device_train_batch_size=4 --save_total_limit=1 --num_train_epochs=5 16 | 17 | ``` 18 | 会自动从官网上下载bert-base-chinese模型来继续训练。 19 | 20 | ## roberta-wwm-ext 21 | 22 | (https://github.com/ymcui/Chinese-BERT-wwm) 23 | 24 | 25 | 要基于上面的代码run_language_model_roberta.py继续预训练roberta。还需要做两个改动。 26 | * 
下载roberta-wwm-ext到本地目录hflroberta,在config.json中修改“model_type”:"roberta"为"model_type":"bert"。 27 | * 对上面的run_language_modeling.py中的AutoModel和AutoTokenizer都进行替换为BertModel和BertTokenizer。 28 | 29 | 假设config.json已经改好,可以运行如下命令。 30 | ``` 31 | python run_language_model_roberta.py --output_dir=output --model_type=bert --model_name_or_path=hflroberta --do_train --train_data_file=data/train.txt --do_eval --eval_data_file=data/eval.txt --mlm --per_device_train_batch_size=4 --save_total_limit=1 --num_train_epochs=5 32 | ``` 33 | 34 | ### ernie 35 | https://github.com/nghuyong/ERNIE-Pytorch) 36 | 37 | ernie是百度发布的基于百度知道贴吧等中文语料结合实体预测等任务生成的预训练模型。这个模型的准确率在某些任务上要优于bert-base-chinese和roberta。如果基于ernie1.0模型做领域数据预训练的话只需要一步修改。 38 | 39 | * 下载ernie1.0到本地目录ernie,在config.json中增加字段"model_type":"bert"。 40 | 运行 41 | ``` 42 | python run_language_model_ernie.py --output_dir=output --model_type=bert --model_name_or_path=ernie --do_train --train_data_file=train.txt --do_eval --eval_data_file=eval.txt --mlm --per_device_train_batch_size=4 --save_total_limit=1 --num_train_epochs=5 43 | 44 | ``` 45 | -------------------------------------------------------------------------------- /第5章 数字人文下的模型预训练/pytorch_chinese_lm_pretrain/output/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /第5章 数字人文下的模型预训练/pytorch_chinese_lm_pretrain/run_bert.sh: -------------------------------------------------------------------------------- 1 | ### 2 | # @Author: your name 3 | # @Date: 2021-05-15 20:18:45 4 | # @LastEditTime: 2021-06-08 20:27:08 5 | # @LastEditors: Please set LastEditors 6 | # @Description: In User Settings Edit 7 | # @FilePath: /pytorch_chinese_lm_pretrain-master/run_bert.sh 8 | ### 9 | TRAIN_FILE='train.txt' 10 | TEST_FILE='eval.txt' 11 | PreTrain_Model='bert-base-chinese' 12 | mkdir -p log 13 | CUDA_VISIBLE_DEVICES=0,1 python run_language_model_bert.py \ 14 | --output_dir=output/$PreTrain_Model \ 15 | --model_type=bert \ 16 | --overwrite_output_dir \ 17 | --save_total_limit=3 \ 18 | --num_train_epochs=10 \ 19 | --learning_rate=5e-4 \ 20 | --local_rank=-1 \ 21 | --model_name_or_path=$PreTrain_Model \ 22 | --do_train \ 23 | --train_data_file=$TRAIN_FILE \ 24 | --do_eval \ 25 | --eval_data_file=$TEST_FILE \ 26 | --mlm \ 27 | --per_device_train_batch_size=32 \ 28 | > log/log_$PreTrain_Model.log 2>&1 & echo $! 
> log/run_$PreTrain_Model.pid 29 | -------------------------------------------------------------------------------- /第5章 数字人文下的模型预训练/pytorch_chinese_lm_pretrain/run_bert_from_scratch.sh: -------------------------------------------------------------------------------- 1 | ### 2 | # @Author: your name 3 | # @Date: 2021-05-14 20:57:35 4 | # @LastEditTime: 2021-05-30 14:55:08 5 | # @LastEditors: your name 6 | # @Description: In User Settings Edit 7 | # @FilePath: /pytorch_chinese_lm_pretrain-master/run_roberta.sh 8 | ### 9 | TRAIN_FILE='train.txt' 10 | TEST_FILE='eval.txt' 11 | PreTrain_Model='roberta-base' 12 | From_Scratch='/home/admin/zihe.zhu/pytorch_chinese_lm_pretrain-master/train_tokenizer/pretrained_models/' 13 | mkdir -p log 14 | CUDA_VISIBLE_DEVICES=0,1 nohup python full_copy.py \ 15 | --output_dir=output/$PreTrain_Model \ 16 | --model_type=bert \ 17 | --overwrite_output_dir \ 18 | --save_total_limit=3 \ 19 | --num_train_epochs=10 \ 20 | --learning_rate=5e-4 \ 21 | --local_rank=-1 \ 22 | --cache_dir=$From_Scratch \ 23 | --config_name=$From_Scratch \ 24 | --tokenizer_name=$From_Scratch \ 25 | --do_train \ 26 | --train_data_file=$TRAIN_FILE \ 27 | --do_eval \ 28 | --eval_data_file=$TEST_FILE \ 29 | --mlm \ 30 | --per_device_train_batch_size=32 \ 31 | > log/log_$PreTrain_Model.log 2>&1 & echo $! > log/run_$PreTrain_Model.pid -------------------------------------------------------------------------------- /第5章 数字人文下的模型预训练/pytorch_chinese_lm_pretrain/run_roberta.sh: -------------------------------------------------------------------------------- 1 | ### 2 | # @Author: your name 3 | # @Date: 2021-05-14 20:57:35 4 | # @LastEditTime: 2021-05-30 14:55:16 5 | # @LastEditors: your name 6 | # @Description: In User Settings Edit 7 | # @FilePath: /pytorch_chinese_lm_pretrain-master/run_roberta.sh 8 | ### 9 | TRAIN_FILE='train.txt' 10 | TEST_FILE='eval.txt' 11 | PreTrain_Model='roberta-base' 12 | mkdir -p log 13 | CUDA_VISIBLE_DEVICES=0,1 nohup python full_copy.py \ 14 | --output_dir=output/$PreTrain_Model \ 15 | --model_type=bert \ 16 | --overwrite_output_dir \ 17 | --save_total_limit=3 \ 18 | --num_train_epochs=10 \ 19 | --learning_rate=5e-4 \ 20 | --local_rank=-1 \ 21 | --model_name_or_path=$PreTrain_Model \ 22 | --do_train \ 23 | --train_data_file=$TRAIN_FILE \ 24 | --do_eval \ 25 | --eval_data_file=$TEST_FILE \ 26 | --mlm \ 27 | --per_device_train_batch_size=32 \ 28 | > log/log_$PreTrain_Model.log 2>&1 & echo $! > log/run_$PreTrain_Model.pid -------------------------------------------------------------------------------- /第5章 数字人文下的模型预训练/transfer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | """ 4 | Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint. 
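Parameter names are rewritten via var_map (e.g. LayerNorm weight/bias -> gamma/beta,
weight -> kernel) and the tensors listed in tensors_to_transpose are transposed,
so the weights can be restored as a TensorFlow 1.x BERT checkpoint.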
5 | """ 6 | 7 | import os 8 | 9 | import numpy as np 10 | import tensorflow as tf 11 | import torch 12 | from transformers import BertModel 13 | 14 | 15 | def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name: str): 16 | 17 | """ 18 | :param model:BertModel Pytorch model instance to be converted 19 | :param ckpt_dir: Tensorflow model directory 20 | :param model_name: model name 21 | :return: 22 | Currently supported Huggingface models: 23 | Y BertModel 24 | N BertForMaskedLM 25 | N BertForPreTraining 26 | N BertForMultipleChoice 27 | N BertForNextSentencePrediction 28 | N BertForSequenceClassification 29 | N BertForQuestionAnswering 30 | """ 31 | 32 | tensors_to_transpose = ("dense.weight", "attention.self.query", "attention.self.key", "attention.self.value") 33 | 34 | var_map = ( 35 | ("layer.", "layer_"), 36 | ("word_embeddings.weight", "word_embeddings"), 37 | ("position_embeddings.weight", "position_embeddings"), 38 | ("token_type_embeddings.weight", "token_type_embeddings"), 39 | (".", "/"), 40 | ("LayerNorm/weight", "LayerNorm/gamma"), 41 | ("LayerNorm/bias", "LayerNorm/beta"), 42 | ("weight", "kernel"), 43 | ) 44 | 45 | if not os.path.isdir(ckpt_dir): 46 | os.makedirs(ckpt_dir) 47 | 48 | state_dict = model.state_dict() 49 | 50 | def to_tf_var_name(name: str): 51 | for patt, repl in iter(var_map): 52 | name = name.replace(patt, repl) 53 | return "bert/{}".format(name) 54 | 55 | def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session): 56 | tf_dtype = tf.dtypes.as_dtype(tensor.dtype) 57 | tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer()) 58 | session.run(tf.variables_initializer([tf_var])) 59 | session.run(tf_var) 60 | return tf_var 61 | 62 | tf.reset_default_graph() 63 | with tf.Session() as session: 64 | for var_name in state_dict: 65 | tf_name = to_tf_var_name(var_name) 66 | torch_tensor = state_dict[var_name].numpy() 67 | if any([x in var_name for x in tensors_to_transpose]): 68 | torch_tensor = torch_tensor.T 69 | tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session) 70 | tf.keras.backend.set_value(tf_var, torch_tensor) 71 | tf_weight = session.run(tf_var) 72 | print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor))) 73 | 74 | saver = tf.train.Saver(tf.trainable_variables()) 75 | saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_").replace(".ckpt", "") + ".ckpt")) 76 | 77 | def convert(pytorch_bin_path: str, pytorch_bin_model: str, tf_ckpt_path: str, tf_ckpt_model: str): 78 | 79 | model = BertModel.from_pretrained( 80 | pretrained_model_name_or_path=pytorch_bin_path, 81 | state_dict=torch.load(os.path.join(pytorch_bin_path, pytorch_bin_model), map_location='cpu') 82 | ) 83 | 84 | convert_pytorch_checkpoint_to_tf(model=model, ckpt_dir=tf_ckpt_path, model_name=tf_ckpt_model) 85 | 86 | if __name__ == '__main__': 87 | bin_path = r'/home/admin/pretrain_models/sikuroberta_vocabtxt' 88 | bin_model = 'pytorch_model.bin' 89 | ckpt_path = r'/home/admin/pretrain_models/sikuroberta_vocabtxt_ckpt' 90 | ckpt_model = 'bert_model.ckpt' 91 | 92 | convert(bin_path, bin_model, ckpt_path, ckpt_model) 93 | 94 | -------------------------------------------------------------------------------- /第5章 数字人文下的模型预训练/第五章 数字人文下的模型预训练.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第5章 
数字人文下的模型预训练/第五章 数字人文下的模型预训练.pdf -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /workspace.xml -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/.idea/.name: -------------------------------------------------------------------------------- 1 | app.py -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/.idea/falsk_test.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 16 | -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/FindSim.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 _*- 3 | """ 4 | @Time:2021-05-10 20:07 5 | @Author:Veigar 6 | @File: FindSim.py 7 | @Github:https://github.com/veigaran 8 | """ 9 | #!/usr/bin/env python 10 | # -*- coding:utf-8 _*- 11 | 12 | import jieba 13 | 14 | from Params import Params 15 | 16 | 17 | class FindSim(Params): 18 | def __init__(self): 19 | super().__init__() 20 | self.result = {} 21 | 22 | def find_sim_words(self, question): 23 | """ 24 | 当全匹配失败时,就采用相似度计算来找相似的词 25 | :param question: 26 | :return: 27 | """ 28 | import re 29 | import string 30 | from gensim.models import KeyedVectors 31 | 32 | jieba.load_userdict(self.vocab_path) 33 | self.model = KeyedVectors.load_word2vec_format(self.word2vec_path, binary=False) 34 | 35 | sentence = re.sub("[{}]", re.escape(string.punctuation), question) 36 | sentence = re.sub("[,。‘’;:?、!【】]", " ", sentence) 37 | sentence = sentence.strip() 38 | 39 | words = [w.strip() for w in jieba.cut(sentence) if w.strip() not in self.stopwords and len(w.strip()) >= 2] 40 | 41 | alist = [] 42 | 43 | for word in words: 44 | temp = [self.medicine_entities, self.generic_entities, self.cate_entities, self.indication_entities] 45 | for i in range(len(temp)): 46 | flag = '' 47 | if i == 0: 48 | flag = "Medicine" 49 | elif i == 1: 50 | flag = "genericNameFormat" 51 | elif i == 2: 52 | flag = "list_cate" 53 | else: 54 | flag = "indications" 55 | scores = self.simCal(word, temp[i], flag) 56 | alist.extend(scores) 57 | temp1 = sorted(alist, key=lambda k: k[1], reverse=True) 58 | if temp1: 59 | self.result[temp1[0][2]] = [temp1[0][0]] 60 | 61 | def editDistanceDP(self, 
s1, s2): 62 | """ 63 | 采用DP方法计算编辑距离 64 | :param s1: 65 | :param s2: 66 | :return: 67 | """ 68 | m = len(s1) 69 | n = len(s2) 70 | solution = [[0 for j in range(n + 1)] for i in range(m + 1)] 71 | for i in range(len(s2) + 1): 72 | solution[0][i] = i 73 | for i in range(len(s1) + 1): 74 | solution[i][0] = i 75 | 76 | for i in range(1, m + 1): 77 | for j in range(1, n + 1): 78 | if s1[i - 1] == s2[j - 1]: 79 | solution[i][j] = solution[i - 1][j - 1] 80 | else: 81 | solution[i][j] = 1 + min(solution[i][j - 1], min(solution[i - 1][j], 82 | solution[i - 1][j - 1])) 83 | return solution[m][n] 84 | 85 | def simCal(self, word, entities, flag): 86 | """ 87 | 计算词语和字典中的词的相似度 88 | 相同字符的个数/min(|A|,|B|) + 余弦相似度 89 | :param word: str 90 | :param entities:List 91 | :return: 92 | """ 93 | a = len(word) 94 | scores = [] 95 | for entity in entities: 96 | sim_num = 0 97 | b = len(entity) 98 | c = len(set(entity + word)) 99 | temp = [] 100 | for w in word: 101 | if w in entity: 102 | sim_num += 1 103 | if sim_num != 0: 104 | score1 = sim_num / c # overlap score 105 | temp.append(score1) 106 | try: 107 | score2 = self.model.similarity(word, entity) # 余弦相似度分数 108 | temp.append(score2) 109 | except: 110 | pass 111 | score3 = 1 - self.editDistanceDP(word, entity) / (a + b) # 编辑距离分数 112 | if score3: 113 | temp.append(score3) 114 | 115 | score = sum(temp) / len(temp) 116 | if score >= 0.7: 117 | scores.append((entity, score, flag)) 118 | 119 | scores.sort(key=lambda k: k[1], reverse=True) 120 | return scores 121 | -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/Params.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 _*- 3 | """ 4 | @Time:2021-05-10 20:07 5 | @Author:Veigar 6 | @File: Params.py 7 | @Github:https://github.com/veigaran 8 | """ 9 | import os 10 | import pickle 11 | 12 | import ahocorasick 13 | import joblib 14 | 15 | 16 | class Params: 17 | def __init__(self): 18 | cur_dir = '/'.join(os.path.abspath(__file__).split('/')[:-1]) 19 | # 路径 20 | self.vocab_path = os.path.join(cur_dir, 'data/vocab.txt') 21 | self.stopwords_path = os.path.join(cur_dir, 'data/stop_words.utf8') 22 | self.word2vec_path = r'F:\A文档\python学习\Competition\Medication\代码\data\merge_sgns_bigram_char300.txt' # os.path.join(cur_dir, 'data/merge_sgns_bigram_char300.txt') 23 | self.stopwords = [w.strip() for w in open(self.stopwords_path, 'r', encoding='utf8') if w.strip()] 24 | 25 | # 意图分类模型文件 26 | self.tfidf_path = os.path.join(cur_dir, 'model/tf.pkl') 27 | self.nb_test_path = os.path.join(cur_dir, 'model/SVM.m') # 测试nb模型 28 | self.tfidf_model = pickle.load(open(self.tfidf_path, "rb")) 29 | self.nb_model = joblib.load(self.nb_test_path) 30 | 31 | self.person_path = os.path.join(cur_dir, 'data/人物.txt') 32 | self.alias_path = os.path.join(cur_dir, 'data/别名.txt') 33 | self.surname_path = os.path.join(cur_dir, 'data/姓氏.txt') 34 | self.country_path = os.path.join(cur_dir, 'data/国家.txt') 35 | self.school_path = os.path.join(cur_dir, 'data/学派.txt') 36 | self.rank_path = os.path.join(cur_dir, 'data/等级.txt') 37 | self.field_path = os.path.join(cur_dir, 'data/领域.txt') 38 | 39 | self.person_entities = [w.strip() for w in open(self.person_path, encoding='utf8') if w.strip()] 40 | self.alias_entities = [w.strip() for w in open(self.alias_path, encoding='utf8') if w.strip()] 41 | self.surname_entities = [w.strip() for w in open(self.surname_path, encoding='utf8') if w.strip()] 42 | self.country_entities = 
[w.strip() for w in open(self.country_path, encoding='utf8') if w.strip()] 43 | self.school_entities = [w.strip() for w in open(self.school_path, encoding='utf8') if w.strip()] 44 | self.rank_entities = [w.strip() for w in open(self.rank_path, encoding='utf8') if w.strip()] 45 | self.field_entities = [w.strip() for w in open(self.field_path, encoding='utf8') if w.strip()] 46 | 47 | # 构造领域actree 48 | self.person_tree = self.build_actree(list(set(self.person_entities))) 49 | self.alias_tree = self.build_actree(list(set(self.alias_entities))) 50 | self.surname_tree = self.build_actree(list(set(self.surname_entities))) 51 | self.country_tree = self.build_actree(list(set(self.country_entities))) 52 | self.school_tree = self.build_actree(list(set(self.school_entities))) 53 | self.rank_tree = self.build_actree(list(set(self.rank_entities))) 54 | self.field_tree = self.build_actree(list(set(self.field_entities))) 55 | 56 | self.name_qwds = ['英文名是什么', '通用名是什么', '一般叫什么', '哪些名字', '什么名字'] 57 | self.country_qwds = ['国家是什么', '国家', '属于哪个国家'] 58 | self.children_qwds = ['子女有哪些', '子女是谁', '儿子是谁', '孩子有哪些', '孩子是谁'] 59 | self.father_qwds = ['父亲是谁', '爸爸是谁', '父亲', '爸爸', '爸爸是什么名字'] 60 | 61 | def build_actree(self, wordlist): 62 | """ 63 | 构造actree,加速过滤 64 | :param wordlist: 65 | :return: 66 | """ 67 | actree = ahocorasick.Automaton() 68 | # 向树中添加单词 69 | for index, word in enumerate(wordlist): 70 | actree.add_word(word, (index, word)) 71 | actree.make_automaton() 72 | return actree 73 | -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request, render_template 2 | 3 | import kbqa 4 | 5 | app = Flask(__name__, static_url_path='') 6 | 7 | 8 | @app.route('/') 9 | def hello_world(): 10 | return render_template('search.html') 11 | 12 | 13 | @app.route('/wstmsearch', methods=['GET', 'POST']) 14 | def wstm_search(): 15 | answer = str 16 | if request.method == 'POST': 17 | # 取出待搜索keyword 18 | keyword = request.form['keyword'] 19 | handler = kbqa.KBQA() 20 | # question = input("用户:") 21 | question = keyword 22 | answer = handler.qa_main(question) 23 | print('ok') 24 | print("AI机器人:", answer) 25 | print("*" * 50) 26 | 27 | return render_template('result.html', search_result=answer, keyword=question) 28 | return render_template('search.html') 29 | 30 | 31 | 32 | if __name__ == '__main__': 33 | app.run() 34 | # app = Flask(__name__) 35 | # app.config['SERVER_NAME'] = 'veiagra.top' 36 | # app.run(debug=True, host='0.0.0.0', port=443, 37 | # ssl_context=('./etc/nginx/ssl_certs/veiagra.pem', '/etc/nginx/ssl_certs/veiagra.key')) 38 | -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/https/default: -------------------------------------------------------------------------------- 1 | ## 2 | # You should look at the following URL's in order to grasp a solid understanding 3 | # of Nginx configuration files in order to fully unleash the power of Nginx. 4 | # http://wiki.nginx.org/Pitfalls 5 | # http://wiki.nginx.org/QuickStart 6 | # http://wiki.nginx.org/Configuration 7 | # 8 | # Generally, you will want to move this file somewhere, and start with a clean 9 | # file but keep this around for reference. Or just disable in sites-enabled. 10 | # 11 | # Please see /usr/share/doc/nginx-doc/examples/ for more detailed examples. 
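# 补充说明:本文件是问答服务部署时的 Nginx 站点配置示例,将 80 端口的请求通过 proxy_pass 反向代理到本机 5000 端口的 gunicorn/Flask 应用;下文定义的 upstream flask(127.0.0.1:5000/5001)可在多实例部署时用于负载均衡。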
12 | ## 13 | 14 | # Default server configuration 15 | # 16 | 17 | upstream flask { 18 | server 127.0.0.1:5000; 19 | server 127.0.0.1:5001; 20 | } 21 | 22 | server { 23 | listen 80 default_server; 24 | listen [::]:80 default_server; 25 | root /var/www/html; 26 | 27 | # Add index.php to the list if you are using PHP 28 | index index.html index.htm index.nginx-debian.html; 29 | 30 | server_name www.veiagra.top; 31 | location / { 32 | # 请求转发到gunicorn服务器 33 | proxy_pass http://127.0.0.1:5000; 34 | # 请求转发到多个gunicorn服务器 35 | # proxy_pass http://flask; 36 | # 设置请求头,并将头信息传递给服务器端 37 | proxy_set_header Host $host; 38 | # 设置请求头,传递原始请求ip给 gunicorn 服务器 39 | proxy_set_header X-Real-IP $remote_addr; 40 | } 41 | 42 | #location / { 43 | # First attempt to serve request as file, then 44 | # as directory, then fall back to displaying a 404. 45 | try_files $uri $uri/ =404; 46 | #} 47 | 48 | # SSL configuration 49 | # 50 | # listen 443 ssl default_server; 51 | # listen [::]:443 ssl default_server; 52 | # 53 | # Note: You should disable gzip for SSL traffic. 54 | # See: https://bugs.debian.org/773332 55 | # 56 | # Read up on ssl_ciphers to ensure a secure configuration. 57 | # See: https://bugs.debian.org/765782 58 | # 59 | # Self signed certs generated by the ssl-cert package 60 | # Don't use them in a production server! 61 | # 62 | # include snippets/snakeoil.conf; 63 | # pass the PHP scripts to FastCGI server listening on 127.0.0.1:9000 64 | # 65 | #location ~ \.php$ { 66 | # include snippets/fastcgi-php.conf; 67 | # 68 | # # With php7.0-cgi alone: 69 | # fastcgi_pass 127.0.0.1:9000; 70 | # # With php7.0-fpm: 71 | # fastcgi_pass unix:/run/php/php7.0-fpm.sock; 72 | #} 73 | 74 | # deny access to .htaccess files, if Apache's document root 75 | # concurs with nginx's one 76 | # 77 | #location ~ /\.ht { 78 | # deny all; 79 | #} 80 | } 81 | 82 | 83 | # Virtual Host configuration for example.com 84 | # 85 | # You can move that to a different file under sites-available/ and symlink that 86 | # to sites-enabled/ to enable it. 
87 | # 88 | #server { 89 | # listen 80; 90 | # listen [::]:80; 91 | # 92 | # server_name example.com; 93 | # 94 | # root /var/www/example.com; 95 | # index index.html; 96 | # 97 | # location / { 98 | # try_files $uri $uri/ =404; 99 | # } 100 | #} 101 | -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/https/https.conf: -------------------------------------------------------------------------------- 1 | upstream veiagra_api 2 | { 3 | ip_hash; 4 | server 127.0.0.1:5000; 5 | server 127.0.0.1:5001; 6 | } 7 | server { 8 | listen *:80; 9 | listen [::]:80; 10 | listen *:443 ssl; 11 | listen [::]:443 ssl; 12 | server_name www.veiagra.top; 13 | ssl_certificate /etc/nginx/ssl_certs/veiagra.pem; 14 | ssl_certificate_key /etc/nginx/ssl_certs/veiagra.key; 15 | #ssl_session_cache shared:SSL:1m; 16 | ssl_session_timeout 5m; 17 | ssl_ciphers HIGH:!aNULL:!MD5; 18 | ssl_prefer_server_ciphers on; 19 | location / { 20 | # 请求转发到gunicorn服务器 21 | proxy_pass http://127.0.0.1:5000; 22 | # 请求转发到多个gunicorn服务器 23 | # proxy_pass http://flask; 24 | # 设置请求头,并将头信息传递给服务器端 25 | proxy_set_header Host $host; 26 | # 设置请求头,传递原始请求ip给 gunicorn 服务器 27 | proxy_set_header X-Real-IP $remote_addr; 28 | } 29 | } 30 | 31 | server { 32 | listen 80; 33 | server_name www.veiagra.top; # 域名 34 | # 强制跳转https 35 | rewrite ^(.*) https://$server_name$1 permanent; 36 | location / { 37 | # 请求转发到gunicorn服务器 38 | proxy_pass http://127.0.0.1:5000; 39 | # 请求转发到多个gunicorn服务器 40 | # proxy_pass http://flask; 41 | # 设置请求头,并将头信息传递给服务器端 42 | proxy_set_header Host $host; 43 | # 设置请求头,传递原始请求ip给 gunicorn 服务器 44 | proxy_set_header X-Real-IP $remote_addr; 45 | } 46 | } -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/https/nginx.conf: -------------------------------------------------------------------------------- 1 | 2 | #user nobody; 3 | worker_processes 1; 4 | 5 | #error_log logs/error.log; 6 | #error_log logs/error.log notice; 7 | #error_log logs/error.log info; 8 | 9 | #pid logs/nginx.pid; 10 | 11 | 12 | events { 13 | worker_connections 1024; 14 | } 15 | 16 | 17 | 18 | http { 19 | include mime.types; 20 | default_type application/octet-stream; 21 | 22 | #log_format main '$remote_addr - $remote_user [$time_local] "$request" ' 23 | # '$status $body_bytes_sent "$http_referer" ' 24 | # '"$http_user_agent" "$http_x_forwarded_for"'; 25 | 26 | #access_log logs/access.log main; 27 | 28 | sendfile on; 29 | #tcp_nopush on; 30 | 31 | #keepalive_timeout 0; 32 | keepalive_timeout 65; 33 | 34 | #gzip on; 35 | 36 | 37 | upstream flask { 38 | server 127.0.0.1:5000; 39 | server 127.0.0.1:5001; 40 | } 41 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 42 | 43 | server { 44 | listen 80 default_server; 45 | server_name www.veiagra.top; 46 | listen [::]:80 default_server; 47 | 48 | #charset koi8-r; 49 | #access_log logs/host.access.log main; 50 | 51 | #root /var/www/html; 52 | #index index.html index.htm index.nginx-debian.html; 53 | 54 | server_name _; 55 | 56 | location / { 57 | # 请求转发到gunicorn服务器 58 | #proxy_pass http://127.0.0.1:8000; 59 | # 请求转发到多个gunicorn服务器 60 | proxy_pass http://flask; 61 | # 设置请求头,并将头信息传递给服务器端 62 | #proxy_set_header Host $host; 63 | # 设置请求头,传递原始请求ip给 gunicorn 服务器 64 | #proxy_set_header X-Real-IP $remote_addr; 65 | 66 | } 67 | 68 | 69 | #error_page 404 /404.html; 70 | 71 | # redirect server error pages to the static page /50x.html 72 | # 73 | error_page 500 502 503 504 /50x.html; 74 | location = /50x.html { 75 | root html; 
76 | } 77 | 78 | # proxy the PHP scripts to Apache listening on 127.0.0.1:80 79 | # 80 | #location ~ \.php$ { 81 | # proxy_pass http://127.0.0.1; 82 | #} 83 | 84 | # pass the PHP scripts to FastCGI server listening on 127.0.0.1:9000 85 | # 86 | #location ~ \.php$ { 87 | # root html; 88 | # fastcgi_pass 127.0.0.1:9000; 89 | # fastcgi_index index.php; 90 | # fastcgi_param SCRIPT_FILENAME /scripts$fastcgi_script_name; 91 | # include fastcgi_params; 92 | #} 93 | 94 | # deny access to .htaccess files, if Apache's document root 95 | # concurs with nginx's one 96 | # 97 | #location ~ /\.ht { 98 | # deny all; 99 | #} 100 | } 101 | 102 | 103 | # another virtual host using mix of IP-, name-, and port-based configuration 104 | # 105 | #server { 106 | # listen 8000; 107 | # listen somename:8080; 108 | # server_name somename alias another.alias; 109 | 110 | # location / { 111 | # root html; 112 | # index index.html index.htm; 113 | # } 114 | #} 115 | 116 | 117 | # HTTPS server 118 | # 119 | #server { 120 | # listen 443 ssl; 121 | # server_name localhost; 122 | 123 | # ssl_certificate cert.pem; 124 | # ssl_certificate_key cert.key; 125 | 126 | # ssl_session_cache shared:SSL:1m; 127 | # ssl_session_timeout 5m; 128 | 129 | # ssl_ciphers HIGH:!aNULL:!MD5; 130 | # ssl_prefer_server_ciphers on; 131 | 132 | # location / { 133 | # root html; 134 | # index index.html index.htm; 135 | # } 136 | #} 137 | 138 | } 139 | -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/kbqa.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 _*- 3 | """ 4 | @Time:2021-05-10 20:09 5 | @Author:Veigar 6 | @File: kbqa.py 7 | @Github:https://github.com/veigaran 8 | """ 9 | 10 | 11 | class KBQA: 12 | def __init__(self): 13 | pass 14 | # self.extractor = EntityExtractor() 15 | # self.searcher = AnswerSearching() 16 | 17 | def qa_main(self, input_str): 18 | answer = "对不起,您的问题我不知道,我今后会努力改进的。" 19 | entities = self.extractor.extractor(input_str) 20 | if not entities: 21 | return answer 22 | sqls = self.searcher.question_parser(entities) 23 | final_answer = self.searcher.searching(sqls) 24 | if not final_answer: 25 | return answer 26 | else: 27 | return '\n'.join(final_answer) 28 | 29 | 30 | if __name__ == "__main__": 31 | handler = KBQA() 32 | while True: 33 | question = input("请输入:") 34 | if not question: 35 | break 36 | answer = handler.qa_main(question) 37 | print("", answer) 38 | print("*"*50) -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/model/NB.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第6章 数字人文下的知识图谱构建及应用/FLASK/model/NB.m -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/model/ch_ner_model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第6章 数字人文下的知识图谱构建及应用/FLASK/model/ch_ner_model.h5 -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/model/tf.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第6章 数字人文下的知识图谱构建及应用/FLASK/model/tf.pkl -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/predict.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 _*- 3 | """ 4 | @Time:2021-05-10 20:06 5 | @Author:Veigar 6 | @File: predict.py 7 | @Github:https://github.com/veigaran 8 | """ 9 | import pickle 10 | 11 | import jieba 12 | import torch 13 | 14 | 15 | def get_seg_features(string): 16 | """ 17 | 对句子分词,构造词的长度特征,为BIES格式, 18 | [对]对应的特征为[4], 不设为0,因为pad的id就是0 19 | [句子]对应的特征为[1,3], 20 | [中华人民]对应的特征为[1,2,2,3] 21 | """ 22 | seg_feature = [] 23 | 24 | for word in jieba.cut(string): 25 | if len(word) == 1: 26 | seg_feature.append(4) 27 | else: 28 | tmp = [2] * len(word) 29 | tmp[0] = 1 30 | tmp[-1] = 3 31 | seg_feature.extend(tmp) 32 | return seg_feature 33 | 34 | 35 | def prepare_dataset(sentences, char_to_id, tag_to_id, lower=False, test=False): 36 | """ 37 | 把文本型的样本和标签,转化为index,便于输入模型 38 | 需要在每个样本和标签前后加, 39 | 但由于pytorch-crf这个包里面会自动添加的转移概率, 40 | 所以我们不用在手动加入。 41 | """ 42 | 43 | def f(x): 44 | return x.lower() if lower else x 45 | 46 | data = [] 47 | for s in sentences: 48 | 49 | chars = [w[0] for w in s] 50 | tags = [w[-1] for w in s] 51 | 52 | """ 句子转化为index """ 53 | chars_idx = [char_to_id[f(c) if f(c) in char_to_id else ''] for c in chars] 54 | 55 | """ 对句子分词,构造词的长度特征 """ 56 | segs_idx = get_seg_features("".join(chars)) 57 | 58 | if not test: 59 | tags_idx = [tag_to_id[t] for t in tags] 60 | 61 | else: 62 | tags_idx = [tag_to_id[""] for _ in tags] 63 | 64 | assert len(chars_idx) == len(segs_idx) == len(tags_idx) 65 | data.append([chars, chars_idx, segs_idx, tags_idx]) 66 | 67 | return data 68 | 69 | 70 | def result_to_json(string, tags): 71 | """ 按规范的格式输出预测结果 """ 72 | item = {"string": string, "entities": []} 73 | entity_name = "" 74 | entity_start = 0 75 | idx = 0 76 | for char, tag in zip(string, tags): 77 | if tag[0] == "S": 78 | item["entities"].append({"word": char, "start": idx, "end": idx + 1, "type": tag[2:]}) 79 | elif tag[0] == "B": 80 | entity_name += char 81 | entity_start = idx 82 | elif tag[0] == "I": 83 | entity_name += char 84 | elif tag[0] == "E": 85 | entity_name += char 86 | item["entities"].append({"word": entity_name, "start": entity_start, "end": idx + 1, "type": tag[2:]}) 87 | entity_name = "" 88 | else: 89 | entity_name = "" 90 | entity_start = idx 91 | idx += 1 92 | return item 93 | 94 | 95 | def predict(input_str): 96 | map_file = r'./model/maps.pkl' 97 | with open(map_file, "rb") as f: 98 | char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f) 99 | 100 | """ 用cpu预测 """ 101 | model_file = r'./model/medical_ner.ckpt' 102 | model = torch.load(model_file, map_location="cpu") 103 | # model.eval() 104 | 105 | if not input_str: 106 | input_str = input("请输入文本: ") 107 | 108 | _, char_ids, seg_ids, _ = prepare_dataset([input_str], char_to_id, tag_to_id, test=True)[0] 109 | char_tensor = torch.LongTensor(char_ids).view(1, -1) 110 | seg_tensor = torch.LongTensor(seg_ids).view(1, -1) 111 | 112 | with torch.no_grad(): 113 | """ 得到维特比解码后的路径,并转换为标签 """ 114 | paths = model(char_tensor, seg_tensor) 115 | tags = [id_to_tag[idx] for idx in paths[0]] 116 | res = result_to_json(input_str, tags) 117 | entity_type = res["entities"][0]['type'] 118 | word = res["entities"][0]['word'] 119 | result = {} 120 | if entity_type == "DRU": 
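    # 补充注释:这里只取识别结果中的第一个实体;所加载的模型为 medical_ner.ckpt,当实体标签为 DRU 时,把实体词以 person 字段的形式写入 result 返回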
121 | result["person"] = [word] 122 | # pprint(result_to_json(input_str, tags)) 123 | print(entity_type, word, '\n', result) 124 | return result 125 | 126 | 127 | -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/static/0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第6章 数字人文下的知识图谱构建及应用/FLASK/static/0.png -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/static/default.css: -------------------------------------------------------------------------------- 1 | body{margin:0;padding:0;background:#ffe;font-size:14px;font-family:'微软雅黑','宋体',sans-serif;color:#231F20;overflow:auto} 2 | a {color:#000;font-size:14px;} 3 | #main{width:100%;} 4 | #wrap{position:relative;margin:0 auto;width:1100px;height:680px;margin-top:10px;} 5 | #text{width:400px;height:425px;left:60px;top:80px;position:absolute;} 6 | #code{display:none;font-size:16px;} 7 | #clock-box {position:absolute;left:60px;top:550px;font-size:28px;display:none;} 8 | #clock-box a {font-size:28px;text-decoration:none;} 9 | #clock{margin-left:48px;} 10 | #clock .digit {font-size:64px;} 11 | #canvas{margin:0 auto;width:1100px;height:680px;} 12 | #error{margin:0 auto;text-align:center;margin-top:60px;display:none;} 13 | .hand{cursor:pointer;} 14 | .say{margin-left:5px;} 15 | .space{margin-right:150px;} 16 | -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/templates/0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第6章 数字人文下的知识图谱构建及应用/FLASK/templates/0.png -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/templates/result.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 古汉语典籍问答系统-检索结果 6 | 64 | 65 | 66 | 67 |
82 | {{ search_result }}
86 | 87 | 88 | -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/FLASK/templates/search.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 古汉语典籍自动问答系统 6 | 64 | 65 | 66 | 67 | 68 | 70 | 71 | {##} 72 | 73 |
84 | 85 | -------------------------------------------------------------------------------- /第6章 数字人文下的知识图谱构建及应用/README.md: -------------------------------------------------------------------------------- 1 | ## 知识图谱自动构建与应用源代码 2 | 3 | 此处提供FLASK文件和检索模块,完整源代码和数据请参见本课题组另一项目的链接:https://github.com/veigaran/ZUOZHUAN_KBQA 4 | 5 | ## 建议运行环境 6 | ``` 7 | jieba==0.42.1 8 | 9 | py2neo 10 | 11 | ahocorasick 12 | 13 | flask 14 | ``` 15 | 16 | -------------------------------------------------------------------------------- /第7章 数字人文下的文本分类/README.md: -------------------------------------------------------------------------------- 1 | ## 文本分类源代码 2 | 3 | 此文件夹内为使用RNN对非物质文化遗产文本进行分类的源代码 4 | 5 | 文本分类实现可参考下列仓库,包括常见的机器学习算法,如naive bayes,SVM等,也包括深度学习算法,如CNN、RNN,BERT等 6 | 代码实现: 7 | https://github.com/veigaran/NLP_ROAD/tree/master/2-%E5%9F%BA%E4%BA%8E%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E7%9A%84%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB 8 | https://github.com/veigaran/NLP_ROAD/tree/master/4-%E6%B7%B1%E5%BA%A6%E5%AD%A6%E4%B9%A0%E4%B8%8B%E7%9A%84%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB 9 | -------------------------------------------------------------------------------- /第7章 数字人文下的文本分类/第七章 数字人文下的文本分类.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第7章 数字人文下的文本分类/第七章 数字人文下的文本分类.pdf -------------------------------------------------------------------------------- /第7章 数字人文下的文本分类/非遗信息 全.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第7章 数字人文下的文本分类/非遗信息 全.xlsx -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/README.md: -------------------------------------------------------------------------------- 1 | ## 文本聚类源代码 2 | 3 | 文件夹内为基于K-means的文本自动聚类源代码,使用方法请参照教材第八章内容。详细代码说明见[此仓库](https://github.com/veigaran/NLP_ROAD/tree/master/3-%E5%9F%BA%E4%BA%8E%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E7%9A%84%E6%96%87%E6%9C%AC%E8%81%9A%E7%B1%BB)。 4 | -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/cluster/DBSCAN.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn.cluster import DBSCAN 4 | 5 | 6 | def DBscan(X): 7 | ##产生随机数据的中心 8 | # centers = [[1, 1], [-1, -1], [1, -1]] 9 | ##产生的数据个数 10 | # n_samples = 750 11 | ##生产数据:此实验结果受cluster_std的影响,或者说受eps 和cluster_std差值影响 12 | # X, lables_true = make_blobs(n_samples=n_samples, centers=centers, cluster_std=0.4, 13 | # random_state=0) 14 | ##设置分层聚类函数 15 | db = DBSCAN(eps=0.5, min_samples=50) 16 | ##训练数据 17 | db.fit(X) 18 | ##初始化一个全是False的bool类型的数组 19 | core_samples_mask = np.zeros_like(db.labels_, dtype=bool) 20 | ''' 21 | 这里是关键点(针对这行代码:xy = X[class_member_mask & ~core_samples_mask]): 22 | db.core_sample_indices_ 表示的是某个点在寻找核心点集合的过程中暂时被标为噪声点的点(即周围点 23 | 小于min_samples),并不是最终的噪声点。在对核心点进行联通的过程中,这部分点会被进行重新归类(即标签 24 | 并不会是表示噪声点的-1),也可也这样理解,这些点不适合做核心点,但是会被包含在某个核心点的范围之内 25 | ''' 26 | core_samples_mask[db.core_sample_indices_] = True 27 | 28 | ##每个数据的分类 29 | lables = db.labels_ 30 | 31 | ##分类个数:lables中包含-1,表示噪声点 32 | n_clusters_ = len(np.unique(lables)) - (1 if -1 in lables else 0) 33 | 34 | ##绘图 35 | unique_labels = set(lables) 36 | ''' 37 | 1)np.linspace 返回[0,1]之间的len(unique_labels) 个数 38 | 2)plt.cm 
一个颜色映射模块 39 | 3)生成的每个colors包含4个值,分别是rgba 40 | 4)其实这行代码的意思就是生成4个可以和光谱对应的颜色值 41 | ''' 42 | colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels))) 43 | 44 | plt.figure(1) 45 | plt.clf() 46 | 47 | for k, col in zip(unique_labels, colors): 48 | ##-1表示噪声点,这里的k表示黑色 49 | if k == -1: 50 | col = 'k' 51 | 52 | ##生成一个True、False数组,lables == k 的设置成True 53 | class_member_mask = (lables == k) 54 | 55 | ##两个数组做&运算,找出即是核心点又等于分类k的值 markeredgecolor='k', 56 | xy = X[class_member_mask & core_samples_mask] 57 | plt.plot(xy[:, 0], xy[:, 1], 'o', c=col, markersize=14) 58 | ''' 59 | 1)~优先级最高,按位对core_samples_mask 求反,求出的是噪音点的位置 60 | 2)& 于运算之后,求出虽然刚开始是噪音点的位置,但是重新归类却属于k的点 61 | 3)对核心分类之后进行的扩展 62 | ''' 63 | xy = X[class_member_mask & ~core_samples_mask] 64 | plt.plot(xy[:, 0], xy[:, 1], 'o', c=col, markersize=6) 65 | 66 | plt.title('Estimated number of clusters: %d' % n_clusters_) 67 | print(n_clusters_) 68 | plt.show() 69 | 70 | def main(): 71 | X_w2v_100 = np.loadtxt("./Word2vector/w2v_sentence_vec_100D.txt") # word2vec_sentence_size=100 72 | DBscan(X_w2v_100) 73 | 74 | if __name__ == '__main__': 75 | main() -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/cluster/Hierarchy_C.py: -------------------------------------------------------------------------------- 1 | from itertools import cycle ##python自带的迭代器模块 2 | 3 | import jieba 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | from sklearn.cluster import AgglomerativeClustering 7 | from sklearn.cluster import MeanShift, estimate_bandwidth 8 | from sklearn.decomposition import PCA 9 | 10 | 11 | # 读取txt文档 12 | def read_txt(path): 13 | f = open(path, 'r', encoding='UTF-8') 14 | lines = f.readlines() 15 | f.close() 16 | return lines 17 | 18 | 19 | # onehot编码,返回np.array() 20 | # Reference https://blog.csdn.net/Dorothy_Xue/article/details/84641417 21 | def onehot(text): 22 | # 对原有文档用jieba分词,并建立字典 23 | data = [] 24 | words = [] 25 | for sentence in text: 26 | sentence = sentence.strip() 27 | seg_list = jieba.cut(sentence, cut_all=False) 28 | seg_list = '/'.join(seg_list) 29 | temp = seg_list.split('/') 30 | for word in temp: 31 | words.append(word) 32 | data.append(seg_list) 33 | dic = list(set(words)) # 去重 34 | 35 | # 手动onehot编码 36 | vector = [] 37 | for i in range(0, len(data)): 38 | temp = [] 39 | for j in range(0, len(dic)): 40 | if dic[j] in data[i].split('/'): 41 | temp.append(1) 42 | else: 43 | temp.append(0) 44 | temp = np.array(temp) 45 | vector.append(temp) 46 | length = len(vector) 47 | vector = np.array(vector) 48 | return vector 49 | 50 | 51 | # 设置聚类函数,X是二维列表,绘制聚类示意图 52 | # Reference https://www.cnblogs.com/lc1217/p/6963687.html 53 | def Hierarchy(X): 54 | linkages = ['ward', 'average', 'complete'] 55 | n_clusters_ = 6 56 | ac = AgglomerativeClustering(linkage=linkages[2], n_clusters=n_clusters_) 57 | # ac = DBSCAN(eps=0.1, min_samples=5) 58 | ##训练数据 59 | ac.fit(X) 60 | 61 | ##每个数据的分类 62 | lables = ac.labels_ 63 | 64 | ##绘图 65 | plt.figure(1) 66 | plt.clf() 67 | 68 | colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk') 69 | for k, col in zip(range(n_clusters_), colors): 70 | ##根据lables中的值是否等于k,重新组成一个True、False的数组 71 | my_members = lables == k 72 | ##X[my_members, 0] 取出my_members对应位置为True的值的横坐标 73 | plt.plot(X[my_members, 0], X[my_members, 1], col + '.') 74 | 75 | plt.title('Estimated number of clusters: %d' % n_clusters_) 76 | plt.show() 77 | 78 | 79 | def Mean_shift(X): 80 | # 产生随机数据的中心 81 | # centers = [[1, 1], [-1, -1], [1, -1]] 82 | # 产生的数据个数 83 | # n_samples=10000 84 | # 生产数据 85 | # X, _ = 
make_blobs(n_samples=n_samples, centers= centers, cluster_std=0.6,random_state =0) 86 | 87 | # 带宽,也就是以某个点为核心时的搜索半径 88 | bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=500) 89 | # 设置均值偏移函数 90 | ms = MeanShift(bandwidth=bandwidth, bin_seeding=True) 91 | # 训练数据 92 | ms.fit(X) 93 | # 每个点的标签 94 | labels = ms.labels_ 95 | print(labels) 96 | # 簇中心的点的集合 97 | cluster_centers = ms.cluster_centers_ 98 | # 总共的标签分类 99 | labels_unique = np.unique(labels) 100 | # 聚簇的个数,即分类的个数 101 | n_clusters_ = len(labels_unique) 102 | 103 | print("number of estimated clusters : %d" % n_clusters_) 104 | 105 | # 绘图 106 | plt.figure(1) 107 | plt.clf() 108 | 109 | colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk') 110 | for k, col in zip(range(n_clusters_), colors): 111 | # 根据lables中的值是否等于k,重新组成一个True、False的数组 112 | my_members = labels == k 113 | cluster_center = cluster_centers[k] 114 | # X[my_members, 0] 取出my_members对应位置为True的值的横坐标 115 | plt.plot(X[my_members, 0], X[my_members, 1], col + '.') 116 | plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=14) 117 | plt.title('Estimated number of clusters: %d' % n_clusters_) 118 | plt.show() 119 | 120 | 121 | def main(): 122 | path = "./title_info.txt" 123 | text = read_txt(path) 124 | vector = onehot(text) 125 | # 降维 PCA/SVD 126 | pca = PCA(n_components=2) # 降到2维 127 | pca.fit(vector) 128 | new_vector = pca.fit_transform(vector) 129 | # 层次聚类 130 | Hierarchy(new_vector) 131 | 132 | # 均值聚类 133 | # Mean_shift(new_vector) 134 | 135 | 136 | if __name__ == '__main__': 137 | main() 138 | -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/cluster/kmeans-all.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.spatial.distance import cdist 3 | from sklearn import metrics 4 | from sklearn.cluster import KMeans 5 | from sklearn.metrics import silhouette_score 6 | 7 | 8 | def kmeans(X): 9 | K = range(2, 11) 10 | meandistortions = [] 11 | Scores = [] # 存放轮廓系数 12 | CH=[] 13 | all = [] 14 | '''kmeans = KMeans(n_clusters=8) 15 | kmeans.fit(X) 16 | lables = kmeans.labels_ 17 | meandistortions.append(sum(np.min(cdist(X, kmeans.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0]) 18 | Scores.append( 19 | silhouette_score(X, lables, metric='euclidean')) # euclidean 欧氏距离 20 | CH.append(metrics.calinski_harabasz_score(X, lables)) 21 | print("标签:", lables) 22 | np.savetxt("D:\我\非遗\高维聚类结果\\400_6类_label.txt",lables) 23 | test_stat = {} 24 | l = lables.tolist() 25 | # print(l) 26 | for i in set(l): 27 | test_stat[i] = l.count(i) 28 | print(test_stat)''' 29 | for k in K: 30 | kmeans = KMeans(n_clusters=k) 31 | kmeans.fit(X) 32 | lables = kmeans.labels_ 33 | meandistortions.append(sum(np.min(cdist(X, kmeans.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0]) 34 | Scores.append( 35 | silhouette_score(X, lables, metric='euclidean')) # euclidean 欧氏距离 36 | CH.append(metrics.calinski_harabasz_score(X, lables)) 37 | print("标签:", lables) 38 | if(k==6): 39 | np.savetxt("D:\我\非遗\高维聚类结果\\tfidf\\6类_label3.txt", lables) 40 | elif(k==8): 41 | np.savetxt("D:\我\非遗\高维聚类结果\\tfidf\\8类_label3.txt", lables) 42 | elif(k==10): 43 | np.savetxt("D:\我\非遗\高维聚类结果\\tfidf\\10类_label3.txt", lables) 44 | test_stat = {} 45 | l = lables.tolist() 46 | # print(l) 47 | for i in set(l): 48 | test_stat[i] = l.count(i) 49 | print(test_stat) 50 | 51 | '''降维可视化 52 | tsne = TSNE(perplexity=30, n_components=2, init='pca') TSNE降维,降到2D 53 | data = 
tsne.fit_transform(X) 54 | 55 | x_min, x_max = np.min(data, 0), np.max(data, 0) 56 | data = (data - x_min) / (x_max - x_min) # 归一化 57 | 58 | plt.figure() 59 | for i in range(data.shape[0]): 60 | plt.text(data[i, 0], data[i, 1], str(lables[i]), 61 | color=plt.cm.Set1(lables[i] / 10.), 62 | fontdict={'weight': 'bold', 'size': 9}) 63 | plt.xticks([]) 64 | plt.yticks([]) 65 | plt.show()''' 66 | 67 | 68 | #np.save("D:\我\非遗\高维聚类结果\\100w\\number.txt", test_stat) 69 | print("轮廓系数:", Scores) 70 | #np.savetxt("D:\我\非遗\Word2vector\标签\轮廓系数_50.txt", Scores) 71 | print("成本函数:", meandistortions) 72 | #np.savetxt("D:\我\非遗\Word2vector\标签\成本函数_50.txt", meandistortions) 73 | #print("CH:", CH) 74 | #np.savetxt("D:\我\非遗\Word2vector\标签\CH_50.txt", CH) 75 | 76 | 77 | def main(): 78 | #X_w2v_50 = np.loadtxt("./Word2vector/w2v_sentence_vec_50D.txt") 79 | X_w2v_100 = np.loadtxt("./Word2vector/cbow-hn/w2v_sentence_vec_100D_cbow.txt") 80 | X_w2v_200 = np.loadtxt("./Word2vector/cbow-hn/w2v_sentence_vec_200D_cbow.txt") 81 | X_w2v_300 = np.loadtxt("./Word2vector/cbow-hn/w2v_sentence_vec_300D_cbow.txt") 82 | X_w2v_400 = np.loadtxt("./Word2vector/cbow-hn/w2v_sentence_vec_400D_cbow.txt") 83 | X_tfidf = np.loadtxt("words_tfidf2.txt") 84 | 85 | X_w2v_100_ns = np.loadtxt("./Word2vector/cbow-ns/w2v_sentence_vec_100D_ns.txt") 86 | X_w2v_200_ns = np.loadtxt("./Word2vector/cbow-ns/w2v_sentence_vec_200D_ns.txt") 87 | X_w2v_300_ns = np.loadtxt("./Word2vector/cbow-ns/w2v_sentence_vec_300D_ns.txt") 88 | X_w2v_400_ns = np.loadtxt("./Word2vector/cbow-ns/w2v_sentence_vec_400D_ns.txt") 89 | 90 | kmeans(X_tfidf) 91 | 92 | if __name__ == '__main__': 93 | main() -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/cluster/kmeans(余弦相似度)/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第8章 数字人文下的文本聚类/code/cluster/kmeans(余弦相似度)/__init__.py -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/cluster/kmeans(余弦相似度)/basealgorithm.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from cluster.basefunction import getCenter_mean 4 | from cluster.basefunction import similarity_cos, similarity_euclidean, similarity_manhattan 5 | 6 | ''' 7 | 此模块待填坑: 8 | 1.缺少层次聚类法函数 9 | 2.缺少报错信息raise 10 | ''' 11 | 12 | class kmeans(): 13 | def __init__(self,k=5,max_iteration=100000,min_center=0.001,select_decision="random",getcenter_function="mean",similarity_function="cos"): 14 | self.k = k 15 | self.max_iteration = max_iteration 16 | self.min_center = min_center 17 | self.select_decision = select_decision 18 | self.getcenter_function = getcenter_function 19 | self.similarity_function = similarity_function 20 | self.clustercenter = [[] for _ in range(k) ] 21 | self.clusterdata=[ [] for _ in range(k) ] 22 | pass 23 | 24 | def similarity(self,point_data): 25 | if(self.similarity_function=="cos"): 26 | result=[ similarity_cos(self.clustercenter[i],point_data) for i in range(self.k)] 27 | return result.index(max(result)) 28 | elif(self.similarity_function=="euclidean"): 29 | result = [similarity_euclidean(self.clustercenter[i], point_data) for i in range(self.k)] 30 | return result.index(min(result)) 31 | elif (self.similarity_function == "manhattan"): 32 | result = [similarity_manhattan(self.clustercenter[i], point_data) for i in range(self.k)] 33 | 
return result.index(min(result)) 34 | else: 35 | print("similarity 参数错误!") 36 | 37 | def getCenter(self,tmp_data,origin_center): 38 | if(self.getcenter_function=="mean"): 39 | return getCenter_mean(tmp_data,origin_center) 40 | else: 41 | print("similarity 参数错误!") 42 | 43 | def selected_point(self,data): 44 | if(self.select_decision=="random"): 45 | self.selected_random(data) 46 | else: 47 | print(" select_decision 参数错误!") 48 | 49 | def selected_random(self,data): 50 | tmp_set=set([]) 51 | if(len(data) 1)]) 82 | f.close() 83 | tmp_dict=model.extractKeyword(docs,10) 84 | w=open("C:/Users/sfe_williamsL/Desktop/毕业论文/keyword_10.txt","wt",encoding="utf-8") 85 | print(tmp_dict) 86 | for key in tmp_dict.keys(): 87 | w.write("\n".join(tmp_dict[key])+"\n") 88 | w.close() 89 | 90 | 91 | #word2vec计算 92 | doc_data = [] 93 | i = 0 94 | f = open("C:/Users/sfe_williamsL/Desktop/毕业论文/result_id.txt", "rt", encoding="utf-8") 95 | for line in f.readlines(): 96 | tmp_data = [] 97 | datas = line.split("\t") 98 | if (len(datas) < 2): 99 | continue 100 | docid = datas[0] 101 | content = datas[3] 102 | word_list =list(jieba.cut(content)) 103 | doc_data.append(word_list) 104 | i = i + 1 105 | f.close() 106 | wm=word2vect_model(path="C:/Users/sfe_williamsL/Desktop/毕业论文/data/word2vect_8",embedding_dim=8) 107 | wm.train(doc_data) 108 | ''' 109 | docs = {} 110 | model = tfidf() 111 | f = open("D:\我\非遗\高维聚类结果\\400维六类数据\\总.txt", "rt", encoding="utf-8") 112 | i=0 113 | for line in f.readlines(): 114 | if(not line.replace("\r","").replace("\n","")): 115 | continue 116 | i=i+1 117 | content = line.replace("\r","").replace("\n","") 118 | docs[i] = docs.get(i,content) 119 | f.close() 120 | print(docs) 121 | tmp_dict = model.extractKeyword(docs, 15) 122 | w = open("D:\我\非遗\高维聚类结果\\400维六类数据\\keyword_15.txt", "wt", encoding="utf-8") 123 | print(tmp_dict) 124 | for key in tmp_dict.keys(): 125 | w.write("\t".join(tmp_dict[key]) + "\n") 126 | w.close() -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/ex_key/extraction-keywords.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | 3 | import numpy 4 | import pandas 5 | from sklearn.feature_extraction.text import CountVectorizer 6 | from sklearn.feature_extraction.text import TfidfTransformer 7 | 8 | 9 | def extract(corpus): 10 | '''corpus = [] # 文档预料 空格连接 11 | 12 | # 读取预料 一行预料为一个文档 13 | path1 = "D:\我\非遗\高维聚类结果\\350_类11.txt" 14 | for line in open(path1, 'r', encoding="utf-8").readlines(): 15 | corpus.append(line.strip()) 16 | #print(corpus)''' 17 | corpus1 = ["我 来到 北京 清华大学", # 第一类文本切词后的结果,词之间以空格隔开 18 | "他 来到 了 网易 杭研 大厦", # 第二类文本的切词结果 19 | "小明 硕士 毕业 与 中国 科学院", # 第三类文本的切词结果 20 | "我 爱 北京 天安门"] 21 | 22 | contents = [ 23 | '我 是 中国 人。', 24 | '你 是 美国 人。', 25 | '他 叫 什么 名字?', 26 | '她 是 谁 啊?' 
27 | ] 28 | countVectorizer = CountVectorizer( 29 | '''min_df=0, 30 | token_pattern=r"\b\w+\b"''' 31 | ) # 增加了min_df=0参数,保留最小长度为0的分词,和token_pattern,设置分词的正则表达式。 32 | textVector = countVectorizer.fit_transform(corpus) 33 | transformer = TfidfTransformer(sublinear_tf=True) # 该类会统计每个词语的tf-idf权值 34 | tfidf = transformer.fit_transform(textVector) # .fit_transform()方法得到tf-idf矩阵 35 | weight = tfidf.toarray() 36 | #print(weight) 37 | word = countVectorizer.get_feature_names() 38 | #print(word) 39 | sort = numpy.argsort(weight, axis=1)[:, -10:] # 对tf-idf矩阵每行的值进行排序,输出对应索引,并取每行前五,得到sort,格式为numpy.ndarray 40 | keywords = pandas.Index(word)[sort].values 41 | tagDF = pandas.DataFrame({ 42 | 'tag1': keywords[:, 0], # 提取第一行,得到包含所有文档的第1个关键词的数组 43 | 'tag2': keywords[:, 1], # 提取第二行,得到包含所有文档的第2个关键词的数组 44 | 'tag3': keywords[:, 2], 45 | 'tag4': keywords[:, 3], 46 | 'tag5': keywords[:, 4], 47 | 'tag6': keywords[:, 5], 48 | 'tag7': keywords[:, 6], 49 | 'tag8': keywords[:, 7], 50 | 'tag9': keywords[:, 8], 51 | 'tag10': keywords[:, 9], 52 | }) 53 | tagDF.to_csv("D:\我\非遗\高维聚类结果\\400-10\\掉包keywords_10.txt",header=False,index=False) 54 | print(tagDF) 55 | 56 | def read(path): 57 | with codecs.open(path, 'r', 'utf8') as f: 58 | line = f.readlines() 59 | return line 60 | 61 | 62 | def corpus(data): 63 | final = [] 64 | for line in data: 65 | l = line.split(' ') 66 | res = [x.strip() for x in l if x.strip() != ''] 67 | cor = " ".join(res) 68 | final.append(cor) 69 | #print(final[1]) 70 | return final 71 | 72 | 73 | def main(): 74 | path = "D:\我\非遗\高维聚类结果\\400-10\\总.txt" 75 | data = read(path) 76 | final = corpus(data) 77 | #print(final) 78 | extract(final) 79 | #print(data) 80 | 81 | 82 | if __name__ == "__main__": 83 | main() -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/some_deal/data.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | 3 | import jieba 4 | import xlrd 5 | 6 | 7 | def readxls(path, col): 8 | xl = xlrd.open_workbook(path) 9 | sheet = xl.sheets()[0] 10 | data = list(sheet.col_values(col))[1:] 11 | return data 12 | 13 | 14 | def readtxt(path): 15 | with codecs.open(path, 'r', 'utf8') as f: 16 | line = f.readline() 17 | line.replace('\\u3000','') 18 | data = list(line) 19 | return data 20 | 21 | 22 | def uni(title, info): 23 | uni_lis = [] 24 | for i, j in zip(title, info): 25 | if i != '' and j != '': 26 | n = i+' '+j 27 | uni_lis.append(n) 28 | return uni_lis 29 | 30 | 31 | def writetxt(path, txt): 32 | with codecs.open(path, 'a', 'utf-8') as f: 33 | for i in txt: 34 | f.write('\t'+str(i)+'\n') 35 | 36 | 37 | def cutwords(data, stopwords): 38 | #分词 39 | word_lis = [] 40 | for line in data: 41 | slist = jieba.cut(line, cut_all=False) 42 | output = " ".join(slist) 43 | for key in output.split(' '): 44 | if key not in stopwords: 45 | word_lis.append(key) 46 | return word_lis 47 | 48 | 49 | def main(): 50 | path_xls = ".\非遗国家级.xlsx" 51 | path_txt = ".\info.txt" 52 | path_stopword = ".\停用词.txt" 53 | title = readxls(path_xls, 0) 54 | info = readxls(path_xls, 8) 55 | stopwords = readtxt(path_stopword) # 读取停用词 56 | data = uni(title, info) # 标题和详细信息结合 57 | #random.shuffle(result) 58 | #train_list = data[:int(len(data) * 0.9)] 59 | #test_list = data[int(len(data) * 0.9):] 60 | writetxt(path_txt, data) # 输出全部训练数据 61 | #train_data = cutwords(train_list, stopwords) # 分词 62 | #test_data = cutwords(test_list, stopwords) 63 | #writetxt("./cut_words.txt", cutwords(data, stopwords)) # 分词 64 | 65 | 66 | if 
__name__=="__main__": 67 | main() 68 | -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/some_deal/divide.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | 3 | import jieba 4 | import jieba.analyse 5 | import numpy as np 6 | 7 | data0 = [] 8 | data1 = [] 9 | data2 = [] 10 | data3 = [] 11 | data4 = [] 12 | data5 = [] 13 | data6 = [] 14 | data7 = [] 15 | data8 = [] 16 | data9 = [] 17 | 18 | def readtxt(path): 19 | with codecs.open(path, 'r', 'utf8') as f: 20 | line = f.readlines() 21 | return line 22 | 23 | 24 | def divide(line, lables): 25 | for i, j in zip(lables, range(len(line))): 26 | line[j] = line[j].strip('\n') 27 | #res = [x.strip() for x in line[j] if x.strip() != ''] 28 | if i == 0: 29 | data0.append(line[j]) 30 | elif i == 1: 31 | data1.append(line[j]) 32 | elif i == 2: 33 | data2.append(line[j]) 34 | elif i == 3: 35 | data3.append(line[j]) 36 | elif i == 4: 37 | data4.append(line[j]) 38 | else: 39 | data5.append(line[j]) 40 | 41 | 42 | #print(data0) 43 | 44 | def extract_kw(data): 45 | #print(str(data)) 46 | kw = jieba.analyse.extract_tags(str(data), topK=30, withWeight=False, allowPOS=()) 47 | print(kw) 48 | 49 | 50 | def write(data, path): 51 | with codecs.open(path, 'a', encoding='utf8') as f: 52 | for line in data: 53 | f.write(line+' '+'\n') 54 | f.write('\n') 55 | f.close() 56 | 57 | 58 | def main(): 59 | path="D:/我/非遗/cut_words_entity.txt" 60 | pathtxt="D:/我/非遗/title_info.txt" 61 | path_lables = "D:\我\非遗\高维聚类结果\\400-ns\\6类_label.txt" 62 | line = readtxt(pathtxt) 63 | #print(line) 64 | lables = np.loadtxt(path_lables) 65 | #print(lables) 66 | divide(line, lables) 67 | write(data0, "D:\我\非遗\高维聚类结果\\400-6\\类1.txt") 68 | write(data1, "D:\我\非遗\高维聚类结果\\400-6\\类2.txt") 69 | write(data2, "D:\我\非遗\高维聚类结果\\400-6\\类3.txt") 70 | write(data3, "D:\我\非遗\高维聚类结果\\400-6\\类4.txt") 71 | write(data4, "D:\我\非遗\高维聚类结果\\400-6\\类5.txt") 72 | write(data5, "D:\我\非遗\高维聚类结果\\400-6\\类6.txt") 73 | #write(data6, "D:\我\非遗\高维聚类结果\\400-10\\类7.txt") 74 | #write(data7, "D:\我\非遗\高维聚类结果\\400-10\\类8.txt") 75 | #write(data8, "D:\我\非遗\高维聚类结果\\400-10\\类9.txt") 76 | #write(data9, "D:\我\非遗\高维聚类结果\\400-10\\类10.txt") 77 | 78 | main() -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/some_deal/keywords.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | 3 | 4 | def readtxt(path): 5 | with codecs.open(path,'r',encoding='utf-8') as f: 6 | line=f.readlines() #line为一个列表 一行一个元素 7 | line = [x.strip('\r\n') for x in line] 8 | print(line) 9 | return line 10 | 11 | def trans(data): 12 | words=[] 13 | for line in data: 14 | line = line.split(',') 15 | with codecs.open(r'D:\我\非遗\高维聚类结果\400维十类\掉包keywords_15.txt', 'a', 'utf-8')as f: 16 | for i in line: 17 | f.write(str(i)+'\n') 18 | f.write('\n') 19 | 20 | 21 | 22 | 23 | def main(): 24 | path6 = r'D:\我\非遗\高维聚类结果\400维六类数据\掉包keywords_15.txt' 25 | path8 = r'D:\我\非遗\高维聚类结果\400维八类\掉包keywords_15.txt' 26 | path10 = r'D:\我\非遗\高维聚类结果\400维十类\掉包keywords_15.txt' 27 | data = readtxt(path10) 28 | trans(data) 29 | 30 | 31 | 32 | if __name__ == '__main__': 33 | main() -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/some_deal/porpotion.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | 3 | import xlrd 4 | from pylab import * 5 | 6 | mpl.rcParams['font.sans-serif'] = ['SimHei'] 7 
| 8 | 9 | # 读取类别txt 10 | def readtxt(path): 11 | with codecs.open(path, 'r', 'utf8') as f: 12 | line = f.readlines() 13 | return line 14 | 15 | # 读取excel 非遗信息全 16 | def readxls(path,col): 17 | xl=xlrd.open_workbook(path) 18 | sheet=xl.sheets()[0] 19 | data=list(sheet.col_values(col))[1:] 20 | return data 21 | 22 | # 读类别里标题信息 23 | def ex_title(data): 24 | ti=[] 25 | temp=[] 26 | for line in data: 27 | l = line.split(' ') 28 | ti.append(l[0]) 29 | #print(ti) 30 | return ti 31 | 32 | 33 | # 去除多余空字符 34 | def corpus(data): 35 | final = [] 36 | for line in data: 37 | l = line.strip(' ').split(' ') 38 | res = [x.strip() for x in l if x.strip() != '\xa0' or x.strip() != '\u3000' or x.strip() !='\ue81b'\ 39 | or x.strip() !=' '] 40 | cor = " ".join(res) 41 | final.append(cor) 42 | #print(final) 43 | return final 44 | 45 | 46 | # 计算各类数量 47 | def calculate(ti,dic): 48 | sum=0 49 | list=[] 50 | dic1 = {'民间文学':0,'传统音乐':0,'传统舞蹈':0,'传统戏剧':0,'曲艺':0,'传统体育、游艺与杂技':0,\ 51 | '传统美术':0,'传统技艺':0,'传统医药':0,'民俗':0} 52 | #print(dic1) 53 | for i in dic: 54 | for t in ti: 55 | if t == i: 56 | list.append(t) 57 | dic1[dic[i]]+=1 58 | #else: 59 | # print(t) 60 | for v in dic1.values(): 61 | sum=sum+v 62 | for i in ti: 63 | if i not in list: 64 | print(i) 65 | print(sum) 66 | print(dic1) 67 | #print(list) 68 | #print(len(list)) 69 | return dic1 70 | 71 | def writetxt(path,txt): 72 | with codecs.open(path,'a','utf-8') as f: 73 | for i in txt: 74 | f.write(str(i)+'\n') 75 | 76 | 77 | def main(): 78 | path1="D:\我\非遗\高维聚类结果\\400-10\\类2.txt" 79 | #path_cla = "D:\我\非遗\高维聚类结果\标题加类别.txt" 80 | path_xls = "D:\我\非遗\非遗初始语料\非遗国家级.xlsx" 81 | #path_list = "D:\我\非遗\高维聚类结果\\400维六类数据\类4结果.txt" 82 | #ti_cla = readtxt(path_cla) 83 | #print(ti_cla) 84 | title = readxls(path_xls, 0) 85 | classes = readxls(path_xls, 4) 86 | final_title=corpus(title) # 去除奇异字符 87 | dic=dict(zip(final_title, corpus(classes))) # 标题:类别 字典 88 | #print(dic) 89 | 90 | data = readtxt(path1) 91 | ti = corpus(ex_title(data)) 92 | #print(ti) 93 | dic_num=calculate(ti,dic) 94 | 95 | #writetxt(path_list,calculate(ti,dic)) 96 | 97 | 98 | if __name__ == '__main__': 99 | main() -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/some_deal/shufa.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | 3 | import xlrd 4 | 5 | 6 | def readxls(path): 7 | xl=xlrd.open_workbook(path) 8 | sheet=xl.sheets()[0] 9 | data=[] 10 | for i in range(1, sheet.nrows): 11 | data.append(sheet.row_values(i)) 12 | return data 13 | 14 | 15 | def form(data): 16 | for i in data: 17 | if i[1]!='': 18 | i[1] = '\t'+'SC:'+i[1] 19 | if i[2]!='': 20 | i[2] = '\t' + 'USE:' + i[2] 21 | if i[3]!='': 22 | i[3] = '\t' + 'UF:' + i[3] 23 | if i[4]!='': 24 | i[4] = '\t' + 'AD:' + i[4] 25 | if i[5] != '': 26 | i[5] = '\t' + 'NT:' + i[5] 27 | if i[6] != '': 28 | i[6] = '\t' + 'BT:' + i[6] 29 | if i[7] != '': 30 | i[7] = '\t' + 'RT:' + i[7] 31 | for j in data: 32 | for k in range(len(j)): 33 | j[k]=j[k].replace('/','\n\t ') 34 | return data 35 | 36 | def write(path,data): 37 | with codecs.open(path,'w',encoding='utf8') as f: 38 | for i in data: 39 | for j in i: 40 | if j != '': 41 | f.write(j+'\n') 42 | f.write('\n') 43 | 44 | 45 | def main(): 46 | path=r'C:\Users\lenovo\Desktop\情报语言学\书法.xlsx' 47 | path_put = r'C:\Users\lenovo\Desktop\情报语言学\书法.txt' 48 | data = readxls(path) 49 | write(path_put,form(data)) 50 | 51 | 52 | main() -------------------------------------------------------------------------------- /第8章 
数字人文下的文本聚类/code/some_deal/test.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | 3 | import jieba 4 | 5 | 6 | def readtxt(path): 7 | with codecs.open(path,'r',encoding='utf-8') as f: 8 | line=f.readlines() #line为一个列表 一行一个元素 9 | line = [x.strip('\r\n') for x in line] 10 | #print(line) 11 | return line 12 | 13 | 14 | # jieba分词(去除停用词后每行分词) 15 | def cut_words(data, stopwords): 16 | cut_word=[] 17 | words='' 18 | for line in data: 19 | jieba.load_userdict('./heritage_entity.txt') 20 | slist=jieba.cut(line,cut_all=False) 21 | #slist = [x.strip() for x in list(slist) if x.strip() != '\xa0' or x.strip() != '\u3000' or x.strip() !='\ue81b'\ 22 | # or x.strip() !=' '] 23 | for key in slist: 24 | if key not in stopwords and key !=' ' and key != '\xa0' and key !='\u3000' and key !='\ue81b': 25 | words+=key+' ' 26 | #output=" ".join(words) 27 | words+='\n' 28 | cut_word.append(words) 29 | return cut_word 30 | 31 | 32 | # 去除停用词、重复词词总量 33 | def wordlist(cut_word, stopwords): 34 | final=[] 35 | for line in data: 36 | slist=jieba.cut(line,cut_all=False) 37 | output=" ".join(list(slist)) 38 | for key in output.split(' '): 39 | if (key not in stopwords) and (key not in cut_word): 40 | final.append(key) 41 | return final 42 | 43 | 44 | def write(path,data): 45 | with codecs.open(path,'a','utf-8')as f: 46 | for i in data: 47 | f.write(str(i)) 48 | 49 | def corpus(data): 50 | final = [] 51 | for line in data: 52 | l = line.strip(' ').split(' ') 53 | res = [x.strip() for x in l if x.strip() != '\xa0' or x.strip() != '\u3000' or x.strip() !='\ue81b'\ 54 | or x.strip() !=' '] 55 | cor = " ".join(res) 56 | final.append(cor) 57 | #print(final) 58 | return final 59 | 60 | def main(): 61 | path1 = "./title_info.txt" 62 | path2 = "./cut_words.txt" 63 | path3 = "./CW_noplace.txt" # 基于实体词典分词 64 | stopwords_path = "./停用词.txt" 65 | data = readtxt(path1) 66 | stopwords = readtxt(stopwords_path) 67 | #stopwords = [x.replace('\r\n', '') for x in stopwords] 68 | 69 | #cut_word = cut_words(data, stopwords) 70 | cut_word_entity = cut_words(data, stopwords) 71 | print(cut_word_entity) 72 | #print(corpus(cut_word_entity)) 73 | #write(path2, cut_word) 74 | write(path3, cut_word_entity) 75 | 76 | 77 | if __name__=="__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/some_deal/title_info.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | 3 | import xlrd 4 | 5 | 6 | def readxls(path,col): 7 | xl=xlrd.open_workbook(path) 8 | sheet=xl.sheets()[0] 9 | data=list(sheet.col_values(col))[1:] 10 | return data 11 | 12 | def readtxt(path): 13 | with codecs.open(path,'r','utf8') as f: 14 | line=f.readline() 15 | data=list(line) 16 | return data 17 | 18 | #union 19 | def uni(title,info): 20 | uni_lis=[] 21 | n='' 22 | for i,j in zip(title,info): 23 | n=i+' '+j 24 | uni_lis.append(n) 25 | return uni_lis 26 | 27 | def writetxt(path,txt): 28 | with codecs.open(path,'a','utf-8') as f: 29 | for i in txt: 30 | f.write(str(i)+'\n') 31 | 32 | def main(): 33 | path_xls=".\非遗国家级.xlsx" 34 | path_txt=".\\title_info.txt" 35 | path_stopword=".\停用词.txt" 36 | #path_txt=r'C:\Users\lenovo\Desktop\非遗\title_info_onehot.txt' 37 | title=readxls(path_xls,0) 38 | info=readxls(path_xls,8) 39 | classes = readxls(path_xls,4) 40 | stopwords=readtxt(path_stopword) 41 | #data=uni(title,info) 42 | #ti_class = uni(title,classes) 43 | #dic=dict.fromkeys(title,classes) 44 | 
print(dict(zip(title,classes))) 45 | #writetxt(path_txt, data) 46 | #writetxt("D:\我\非遗\高维聚类结果\标题加类别.txt",ti_class) 47 | ''''#分词 48 | word_lis=[] 49 | for line in data: 50 | slist = jieba.cut(line, cut_all=False) 51 | output = " ".join(slist) 52 | for key in output.split(' '): 53 | if key not in stopwords: 54 | word_lis.append(key) 55 | 56 | # 参考官方文档运用sklearn.feature_extraction.text.TfidfVectorizer,将corpus文本转换为tfidf值的svm向量 57 | tfidfvec = TfidfVectorizer() 58 | cop_tfidf = tfidfvec.fit_transform(word_lis) 59 | weight = cop_tfidf.toarray() 60 | 61 | 62 | #降维 63 | X = np.array(weight) # 导入数据 64 | pca = PCA(n_components=2) # 降到2维 65 | pca.fit(X) # 训练 66 | newX = pca.fit_transform(X) # 降维后的数据 67 | # PCA(copy=True, n_components=2, whiten=False) 68 | # print(pca.explained_variance_ratio_) #输出贡献率 69 | print(newX) 70 | 71 | #层次聚类 72 | X = newX 73 | ##设置分层聚类函数 74 | linkages = ['ward', 'average', 'complete'] 75 | n_clusters_ = 6 76 | ac = AgglomerativeClustering(linkage=linkages[2], n_clusters=n_clusters_) 77 | ##训练数据 78 | ac.fit(X) 79 | 80 | ##每个数据的分类 81 | lables = ac.labels_ 82 | 83 | ##绘图 84 | plt.figure(1) 85 | plt.clf() 86 | 87 | colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk') 88 | for k, col in zip(range(n_clusters_), colors): 89 | # 根据lables中的值是否等于k,重新组成一个True、False的数组 90 | my_members = lables == k 91 | ##X[my_members, 0] 取出my_members对应位置为True的值的横坐标 92 | plt.plot(X[my_members, 0], X[my_members, 1], col + '.') 93 | 94 | plt.title('Estimated number of clusters: %d' % n_clusters_) 95 | plt.show()''' 96 | 97 | if __name__=="__main__": 98 | main() 99 | -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/tsne_plot/3D+tsne2维画图.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | from matplotlib.ticker import NullFormatter 6 | from sklearn import manifold, datasets 7 | 8 | # # Next line to silence pyflakes. This import is needed. 
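# NOTE: as written below, the sentence-vector matrix loaded from
# w2v_sentence_vec_100D.txt is immediately overwritten by make_s_curve(), so the
# figure shows the scikit-learn S-curve demo data rather than the loaded vectors.
# A minimal change to embed the loaded vectors instead (label path assumed, mirroring
# tsne.py in this folder) would be:
#
#   X = np.loadtxt("w2v_sentence_vec_100D.txt")
#   color = np.loadtxt("100-lables.txt")   # cluster labels, used only for colouring
#   # ...and skip the make_s_curve() call; the 3D S-curve subplot then no longer applies.
#
# Note also that sklearn.datasets.samples_generator has been removed in recent
# scikit-learn releases; make_s_curve is now imported directly from sklearn.datasets.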
9 | # Axes3D 10 | 11 | n_points = 2690 12 | # X是一个(1000, 3)的2维数据,color是一个(1000,)的1维数据 13 | X = np.loadtxt("D:\我\非遗\Word2vector\w2v_sentence_vec_100D.txt") 14 | X, color = datasets.samples_generator.make_s_curve(n_points, random_state=0) 15 | n_neighbors = 10 16 | n_components = 2 17 | 18 | fig = plt.figure(figsize=(8, 8)) 19 | # 创建了一个figure,标题为"Manifold Learning with 1000 points, 10 neighbors" 20 | plt.suptitle("Manifold Learning with %i points, %i neighbors" 21 | % (1000, n_neighbors), fontsize=14) 22 | 23 | 24 | '''绘制S曲线的3D图像''' 25 | ax = fig.add_subplot(211, projection='3d') 26 | ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=color, cmap=plt.cm.Spectral) 27 | ax.view_init(4, -72) # 初始化视角 28 | 29 | '''t-SNE''' 30 | t0 = time() 31 | tsne = manifold.TSNE(n_components=n_components, init='pca', random_state=0) 32 | Y = tsne.fit_transform(X) # 转换后的输出 33 | t1 = time() 34 | print("t-SNE: %.2g sec" % (t1 - t0)) # 算法用时 35 | ax = fig.add_subplot(2, 1, 2) 36 | plt.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral) 37 | plt.title("t-SNE (%.2g sec)" % (t1 - t0)) 38 | ax.xaxis.set_major_formatter(NullFormatter()) # 设置标签显示格式为空 39 | ax.yaxis.set_major_formatter(NullFormatter()) 40 | # plt.axis('tight') 41 | 42 | plt.show() -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/tsne_plot/heatmap.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import seaborn as sns 3 | from pylab import * 4 | 5 | mpl.rcParams['font.sans-serif'] = ['SimHei'] 6 | 7 | 8 | labels = ['民间文学', '传统音乐', '传统舞蹈', '传统戏剧', '曲艺', '传统体育、游艺与杂技', '传统美术', \ 9 | '传统技艺', '传统医药', '民俗'] 10 | classes = ['1', '2', '3', '4', '5', '6'] 11 | sum=[191,352,278,422,175,102,307,427,89,347] 12 | six = [927, 693, 235, 377, 209, 249] 13 | eight = [174, 141, 693, 920, 209, 198, 110, 245] 14 | 15 | def hotmap(X): 16 | '''for r,i in zip(range(len(eight)),X): 17 | print(i) 18 | for j in range(len(i)): 19 | i[j]=i[j]/eight[r]''' 20 | for i in X: 21 | print(i) 22 | for j,k in zip(range(len(i)),range(len(sum))): 23 | i[j]=i[j]/sum[k] 24 | #print(X) 25 | X = np.transpose(X) 26 | print(X) 27 | dt = pd.DataFrame(X, columns=['类1', '类2', '类3', '类4', '类5', '类6'], index=labels) 28 | print(dt) 29 | # pt = dt.pivot(index=labels,columns=classes,values=0) 30 | # cmap用matplotlib colormap 31 | ax = sns.heatmap(dt, cmap='YlGnBu') 32 | # rainbow为 matplotlib 的colormap名称 33 | ax.set_title('six classes heatmap') 34 | # ax.set_xlabel('classes') 35 | # ax.set_ylabel('') 36 | plt.show() 37 | 38 | 39 | def main(): 40 | w2v_six_class = [[82,127,55,182,31,31,118,158,28,115],[46,98,97,86,84,42,79,95,28,38],[7,31,25,45,2,8,16,31,11,59],[28,45,54,54,19,12,48,61,15,41],\ 41 | [13,4,12,38,6,6,16,29,3,82],[15,48,35,17,33,3,30,52,4,12]] 42 | w2v_eight = [[17,28,28,24,12,8,30,19,2,6],[12,23,13,8,30,2,14,31,3,5],[46,98,97,86,84,42,79,95,28,38],\ 43 | [87,127,52,179,27,31,121,157,28,111],[13,4,12,38,6,6,16,29,3,82],[6,25,17,39,2,7,13,25,11,53],\ 44 | [3,25,22,9,4,1,16,22,1,7],[7,23,37,39,10,5,18,48,13,45]] 45 | w2v_ten = [[12,23,13,8,30,2,14,31,3,5],[80,118,45,152,26,27,113,146,25,94],[46,98,97,86,84,42,79,95,28,38],\ 46 | [12,4,12,38,6,6,16,29,3,82],[7,19,18,40,0,8,9,18,2,52],[17,27,28,24,12,8,30,17,2,4],[3,25,22,9,3,1,16,21,1,7],\ 47 | [1,8,4,7,2,0,5,14,9,9],[0,1,0,0,1,0,0,1,0,0],[13,30,39,58,11,8,25,54,16,56]] 48 | hotmap(w2v_six_class) 49 | 50 | if __name__ == '__main__': 51 | main() 52 | 53 | -------------------------------------------------------------------------------- /第8章 
数字人文下的文本聚类/code/tsne_plot/tsne.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn.manifold import TSNE 4 | 5 | 6 | def plot_embedding(X, lables): 7 | tsne = TSNE(perplexity=30, n_components=2, init='pca') # TSNE降维,降到2D 8 | data = tsne.fit_transform(X) 9 | 10 | x_min, x_max = np.min(data, 0), np.max(data, 0) 11 | data = (data - x_min) / (x_max - x_min) 12 | 13 | plt.figure() 14 | for i in range(data.shape[0]): 15 | plt.text(data[i, 0], data[i, 1], str(lables[i]), 16 | color=plt.cm.Set1(lables[i] / 10.), 17 | fontdict={'weight': 'bold', 'size': 9}) 18 | plt.xticks([]) 19 | plt.yticks([]) 20 | plt.show() 21 | #plt.title(title) 22 | 23 | def main(): 24 | X = np.loadtxt("D:\我\非遗\Word2vector\w2v_sentence_vec_100D.txt") 25 | lables = np.loadtxt("D:\我\非遗\Word2vector\标签\\100-lables.txt") 26 | tsne = TSNE(perplexity=30, n_components=2, init='pca') # TSNE降维,降到2D 27 | data = tsne.fit_transform(X) 28 | print(data) 29 | plot_embedding(data, lables) 30 | 31 | 32 | main() 33 | ''' 34 | X = np.loadtxt("D:\我\非遗\Word2vector\w2v_sentence_vec_100D.txt") 35 | labels = np.loadtxt("D:\我\非遗\Word2vector\标签\\100-lables.txt") 36 | tsne = TSNE(perplexity=30, n_components=2, init='pca') # TSNE降维,降到2D 37 | data = tsne.fit_transform(X) 38 | plt.figure(1) 39 | plt.clf() 40 | colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk') 41 | for k, col in zip(range(6), colors): 42 | # 根据lables中的值是否等于k,重新组成一个True、False的数组 43 | my_members = labels == k 44 | # X[my_members, 0] 取出my_members对应位置为True的值的横坐标 45 | plt.plot(data[my_members, 0], data[my_members, 1], col + '.') 46 | 47 | #plt.title('Estimated number of clusters: %d' % n_clusters_) 48 | plt.show()''' 49 | -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/tsne_plot/ttt.py: -------------------------------------------------------------------------------- 1 | from itertools import cycle 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | from sklearn import manifold 6 | 7 | '''# read data 8 | path="D:\我\非遗\cut_words_entity.txt" 9 | num = [] 10 | with codecs.open(path, 'r', 'utf8') as f: 11 | line = f.readlines() 12 | for i in line: 13 | l=i.strip().replace('\n', '').split(' ') 14 | res = [x.strip() for x in l if x.strip() != ''] 15 | print(res) 16 | num.append(len(res)) 17 | print(max(num)) 18 | print(min(num)) 19 | print(np.average(num))''' 20 | X = np.loadtxt("D:\我\非遗\Word2vector\w2v_sentence_vec_50D.txt") 21 | y = np.loadtxt("D:\我\非遗\Word2vector\标签\lables_50.txt") 22 | tsne = manifold.TSNE(n_components=2, init='pca', random_state=501) 23 | X_tsne = tsne.fit_transform(X) 24 | 25 | print("Org data dimension is {}.\ 26 | Embedded data dimension is {}".format(X.shape[-1], X_tsne.shape[-1])) 27 | 28 | #嵌入空间可视化 29 | colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk') 30 | x_min, x_max = X_tsne.min(0), X_tsne.max(0) 31 | X_norm = (X_tsne - x_min) / (x_max - x_min) # 归一化 32 | plt.figure(figsize=(8, 8)) 33 | for i in range(X_norm.shape[0],): 34 | plt.text(X_norm[i, 0], X_norm[i, 1], str(y[i]), color=plt.cm.Set1(y[i] /20.), 35 | fontdict={'weight': 'bold', 'size': 9}) 36 | plt.xticks([]) 37 | plt.yticks([]) 38 | plt.show() 39 | ''' 40 | plt.figure(1) 41 | plt.clf() 42 | colors =['k','darkgrey','brown','r','peru','tan','gold','olive','y','sage','palegreen','g','c','deepskyblue','b','m','pink'] 43 | #colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk') 44 | for k, col in zip(range(6), colors): 45 | # 
根据lables中的值是否等于k,重新组成一个True、False的数组 46 | my_members = y == k 47 | # X[my_members, 0] 取出my_members对应位置为True的值的横坐标 48 | plt.plot(X_tsne[my_members, 0], X_tsne[my_members, 1], col + '.') 49 | plt.show()''' 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/vsm.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import math 3 | import os 4 | 5 | from sklearn.feature_extraction.text import CountVectorizer 6 | from sklearn.feature_extraction.text import TfidfTransformer 7 | 8 | path = r"C:\Users\lenovo\Desktop\信息检索系统\文摘" 9 | 10 | def readtxt(path): 11 | cate = [path +'\\'+ x for x in os.listdir(path)] 12 | print(cate) 13 | data = [] 14 | for f in cate: 15 | with codecs.open(f, 'r', 'utf8') as f: 16 | line = f.readlines() 17 | data.append(line) 18 | data_final=[] 19 | for i in range(len(data)): 20 | ll = '' 21 | for l in data[i]: 22 | l = l.replace('\r\n','').replace('.','').replace(',','').replace('"','').replace('--','').replace('\'','') 23 | ll = ll+l 24 | data_final.append(ll) 25 | print(data_final) 26 | return data_final 27 | 28 | 29 | if __name__ == "__main__": 30 | #corpus = [] # 文档预料 空格连接 31 | corpus = readtxt(path) 32 | # 读取预料 一行预料为一个文档 33 | #path1 = "D:\我\非遗\高维聚类结果\\350_类11.txt" 34 | #for line in open(path1, 'r', encoding="utf-8").readlines(): 35 | # corpus.append(line.strip()) 36 | # print corpus 37 | #time.sleep(5) 38 | 39 | # 将文本中的词语转换为词频矩阵 矩阵元素a[i][j] 表示j词在i类文本下的词频 40 | vectorizer = CountVectorizer( 41 | min_df=0, 42 | token_pattern=r"\b\w+\b" 43 | ) 44 | 45 | # 该类会统计每个词语的tf-idf权值 46 | transformer = TfidfTransformer() 47 | 48 | # 第一个fit_transform是计算tf-idf 第二个fit_transform是将文本转为词频矩阵 49 | tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus)) 50 | 51 | # 获取词袋模型中的所有词语 52 | word = vectorizer.get_feature_names() 53 | print(len(word)) 54 | # 将tf-idf矩阵抽取出来,元素w[i][j]表示j词在i类文本中的tf-idf权重 55 | weight = tfidf.toarray() 56 | print(weight) 57 | #path2 = "./words_tfidf2.txt" 58 | #result = codecs.open(path2, 'w', 'utf-8') 59 | #for j in range(len(word)): 60 | # result.write(word[j] + ' ') 61 | #result.write('\r\n\r\n') 62 | # 打印每类文本的tf-idf词语权重,第一个for遍历所有文本,第二个for便利某一类文本下的词语权重 63 | sum = 0 64 | sq1 = 0 65 | sq2 = 0 66 | for i in range(len(weight[0])): 67 | sum += weight[0][i] * weight[1][i] 68 | sq1 += pow(weight[0][i], 2) 69 | sq2 += pow(weight[1][i], 2) 70 | try: 71 | result = round(float(sum) / (math.sqrt(sq1) * math.sqrt(sq2)), 2) 72 | except ZeroDivisionError: 73 | result = 0.0 74 | print(result) 75 | 76 | #result.close() 77 | 78 | -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/word2vec/doc2vec.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | 3 | import gensim 4 | import numpy as np 5 | from gensim.models.doc2vec import Doc2Vec 6 | 7 | TaggededDocument = gensim.models.doc2vec.TaggedDocument 8 | 9 | 10 | def readtxt(path): 11 | data = [] 12 | with codecs.open(path,'r',encoding='utf-8') as f: 13 | doc = f.readlines() 14 | #for line in f.readlines(): 15 | #line = line.strip('\n') 16 | #data.append(line) 17 | return doc 18 | 19 | 20 | def train(x_train): 21 | # D2V参数解释: 22 | # min_count:忽略所有单词中单词频率小于这个值的单词。 23 | # window:窗口的尺寸。(句子中当前和预测单词之间的最大距离) 24 | # size:特征向量的维度 25 | # sample:高频词汇的随机降采样的配置阈值,默认为1e-3,范围是(0,1e-5)。 26 | # negative: 如果>0,则会采用negativesampling,用于设置多少个noise words(一般是5-20)。默认值是5。 27 | # workers:用于控制训练的并行数。 28 | model_dm = 
Doc2Vec(x_train, min_count=1, window=3, vector_size=160, sample=1e-3, negative=5, workers=4) 29 | # total_examples:统计句子数 30 | # epochs:在语料库上的迭代次数(epochs)。 31 | model_dm.train(x_train, total_examples=model_dm.corpus_count, epochs=70) 32 | model_dm.save('d2v_heritage_160.model') 33 | 34 | return model_dm 35 | 36 | def test(): 37 | model_dm = Doc2Vec.load("model/model_dm_wangyi") 38 | test_text = ['《', '舞林', '争霸' '》', '十强' '出炉', '复活', '舞者', '澳门', '踢馆'] 39 | inferred_vector_dm = model_dm.infer_vector(test_text) 40 | print(inferred_vector_dm) 41 | sims = model_dm.docvecs.most_similar([inferred_vector_dm], topn=10) 42 | 43 | return sims 44 | 45 | '''sims = test() 46 | for count, sim in sims: 47 | sentence = x_train[count] 48 | words = '' 49 | for word in sentence[0]: 50 | words = words + word + ' ' 51 | print(words, sim, len(sentence[0])) 52 | ''' 53 | 54 | 55 | def get_dataset(): 56 | with open(r'D:\我\非遗\cut_words_entity.txt', 'r', encoding='utf8') as f: 57 | docs = f.readlines() 58 | print(len(docs)) 59 | 60 | x_train = [] 61 | # y = np.concatenate(np.ones(len(docs))) 62 | for i, text in enumerate(docs): 63 | word_list = text.split(' ') 64 | l = len(word_list) 65 | word_list[l - 1] = word_list[l - 1].strip() 66 | document = TaggededDocument(word_list, tags=[i]) 67 | x_train.append(document) 68 | 69 | return x_train 70 | 71 | 72 | def main(): 73 | path = r'D:\我\非遗\cut_words_entity.txt' 74 | x_train = get_dataset() 75 | #print(x_train) 76 | train(x_train) 77 | #test() 78 | data = [] 79 | model = Doc2Vec.load('d2v_heritage_160.model') 80 | for i in range(2690): 81 | data.append(model.docvecs[i]) 82 | X = np.array(data) 83 | np.savetxt("./d2v_heritage_160.txt", X) 84 | # print(model.docvecs[10]) 85 | 86 | 87 | main() -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/word2vec/word2vector.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gensim.models import Word2Vec 3 | from gensim.models.word2vec import LineSentence 4 | 5 | 6 | # 训练word2vec模型 参数说明: 7 | # sentences: 包含句子的list,或迭代器 8 | # size: 词向量的维数,size越大需要越多的训练数据,同时能得到更好的模型 9 | # alpha: 初始学习速率,随着训练过程递减,最后降到 min_alpha 10 | # window: 上下文窗口大小,即预测当前这个词的时候最多使用距离为window大小的词 11 | # max_vocab_size: 词表大小,如果实际词的数量超过了这个值,过滤那些频率低的 12 | # workers: 并行度 13 | # iter: 训练轮数 14 | # sg=0 cbow,sg=1 skip-gram 15 | # hs=0 negative sampling, hs=1 hierarchy 16 | #sentences = word2vec.Text8Corpus(r'D:\我\非遗\cut_words_entity') 17 | #model.save('heritage.model') 保存模型 18 | # https://blog.csdn.net/laobai1015/article/details/86540813 参数解释 19 | 20 | def build_vec(list_sentence, model): 21 | list_vec_sentence = [] 22 | for sentence in list_sentence: # 每个sentence为一个list 23 | 24 | if len(sentence) > 1000: 25 | arrlists = [model[word] for word in sentence[0:1000]] 26 | x = np.average(arrlists, axis=0) 27 | else: 28 | arrlists = [model[word] for word in sentence] 29 | x = np.average(arrlists, axis=0) 30 | list_vec_sentence.append(x) 31 | return list_vec_sentence 32 | 33 | 34 | def main(): 35 | path = r'D:\我\非遗\cut_words_entity.txt' 36 | sentences = LineSentence(path) 37 | model = Word2Vec(sentences, sg=0, size=100, min_count=0) # sg=0 cbow,hs=0默认 negative sampling 38 | # model.wv.save_word2vec_format('heritage_word_100.bin', binary=True) 39 | model.save('heritage_ns_100.model') 40 | vec_sentence = build_vec(sentences, model) 41 | #print(vec_sentence) 42 | list_vec_sentence = [] 43 | for s in sentences: 44 | for word in s: 45 | arrlists = [model[word]] 46 | x = 
np.average(arrlists, axis=0) 47 | list_vec_sentence.append(x) 48 | np.savetxt("w2v_sentence_vec_100D_ns.txt", list_vec_sentence) 49 | ''' 50 | 51 | model = Word2Vec.load('./heritage.model') 52 | word = model.most_similar("赛龙舟") 53 | print(word) 54 | #print(model['']) 55 | ''' 56 | 57 | if __name__ == '__main__': 58 | main() -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/code/分词/中科院分词.py: -------------------------------------------------------------------------------- 1 | import pynlpir 2 | pynlpir.open() 3 | import codecs 4 | from ctypes import c_char_p 5 | 6 | 7 | def readtxt(path): 8 | with codecs.open(path,'r',encoding='utf-8') as f: 9 | line=f.readlines() #line为一个列表 一行一个元素 10 | line = [x.strip('\r\n') for x in line] 11 | #print(line) 12 | return line 13 | 14 | def cutwords(data,stopwords,en): 15 | words='' 16 | cut_word= [] 17 | for word in en: 18 | pynlpir.nlpir.ImportUserDict(c_char_p(word.encode())) 19 | for line in data: 20 | slist = pynlpir.segment(line, pos_tagging=False) 21 | print(slist) 22 | for key in slist: 23 | if key not in stopwords and key != ' ' and key != '\xa0' and key != '\u3000' and key != '\ue81b': 24 | words += key+' ' 25 | # output=" ".join(words) 26 | words += '\n' 27 | cut_word.append(words) 28 | print(cut_word) 29 | return cut_word 30 | 31 | 32 | def write(path,data): 33 | with codecs.open(path,'a','utf-8')as f: 34 | for i in data: 35 | f.write(str(i)) 36 | 37 | 38 | def main(): 39 | path1 = "D:\我\非遗\example.txt" 40 | stopwords_path = "D:\我\非遗\停用词.txt" 41 | path_entity = "D:\我\非遗\heritage_entity.txt" 42 | data = readtxt(path1) 43 | stopwords = readtxt(stopwords_path) 44 | entity = readtxt(path_entity) 45 | #print(entity) 46 | cutwords(data,stopwords,entity) 47 | 48 | 49 | if __name__ == '__main__': 50 | main() -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/第八章 数字人文下的文本聚类.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第8章 数字人文下的文本聚类/第八章 数字人文下的文本聚类.pdf -------------------------------------------------------------------------------- /第8章 数字人文下的文本聚类/非遗信息 全.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第8章 数字人文下的文本聚类/非遗信息 全.xlsx -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/README.md: -------------------------------------------------------------------------------- 1 | ## 机器自动翻译源代码 2 | 3 | 使用开源的opennmt进行古汉语到现代汉语和英语自动翻译的源代码,详细使用说明见教材第九章的内容。 4 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/README.md: -------------------------------------------------------------------------------- 1 | ## 一、使用方法 2 | 3 | 1. 下载本仓库 4 | 2. 从百度网盘下载模型文件,链接:https://pan.baidu.com/s/1md7mVEH46AmisIZXwRkKrQ 提取码:ckdx 5 | 3. 
运行translate.py,根据需求更改输入 6 | 7 | ## 二、文件说明 8 | 9 | ### 1、data 10 | 11 | 包含原始训练语料src-train.txt、tgt-train.txt及验证测试语料src-valid.txt、tgt-valid.txt、src-test_.txt、tgt-test.txt共6个文件; 12 | 13 | 其中src开头为古文语料,tgt为目标英文语料 14 | 15 | ### 2、model 16 | 17 | 已训练完毕的翻译模型model.pt,因模型文件过大,已上传到百度网盘,链接:https://pan.baidu.com/s/1md7mVEH46AmisIZXwRkKrQ 提取码:ckdx 18 | 19 | ### 3、omnt 20 | 21 | 项目依赖文件 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/data/pred.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/data/src-test.txt: -------------------------------------------------------------------------------- 1 | 虽然 , 每 至 于 族 , 吾 见 其 难为 , 怵然 为戒 , 视 为 止 , 行 为迟 。 -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/data/tgt-test.txt: -------------------------------------------------------------------------------- 1 | test -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/model/说明.txt: -------------------------------------------------------------------------------- 1 | 将从网盘下载的model.pt文件放在此文件夹内 -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/.idea/onmt.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/__init__.py: -------------------------------------------------------------------------------- 1 | """ Main entry point of the ONMT library """ 2 | from __future__ import division, print_function 3 | 4 | import sys 5 | 6 | import onmt.utils.optimizers 7 | 8 | onmt.utils.optimizers.Optim = onmt.utils.optimizers.Optimizer 9 | sys.modules["onmt.Optim"] = onmt.utils.optimizers 10 | 11 | # For Flake 12 | __all__ = [onmt.inputters, onmt.encoders, onmt.decoders, onmt.models, 13 | onmt.utils, onmt.modules, "Trainer"] 14 | 15 | __version__ = "1.1.1" 16 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/bin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第9章 数字人文下的机器翻译/opennmt/onmt/bin/__init__.py -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/bin/translate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 
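# Usage sketch for the workflow described in the README above (paths are assumptions
# based on that layout: the downloaded model.pt placed under model/ and the test
# sentences under data/; adjust names as needed). From the opennmt/ directory:
#
#   python translate.py -model model/model.pt \
#       -src data/src-test.txt -output data/pred.txt \
#       -replace_unk -verbose
#
# -model points at the trained checkpoint, -src at the pre-segmented classical-Chinese
# input, and -output at the file the translations are written to; the full set of
# options accepted here is defined in onmt/opts.py (translate_opts).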
from __future__ import unicode_literals 5 | 6 | import onmt.opts as opts 7 | from onmt.translate.translator import build_translator 8 | from onmt.utils.logging import init_logger 9 | from onmt.utils.misc import split_corpus 10 | from onmt.utils.parse import ArgumentParser 11 | 12 | 13 | def translate(opt): 14 | ArgumentParser.validate_translate_opts(opt) 15 | logger = init_logger(opt.log_file) 16 | 17 | translator = build_translator(opt, report_score=True) 18 | src_shards = split_corpus(opt.src, opt.shard_size) 19 | tgt_shards = split_corpus(opt.tgt, opt.shard_size) 20 | shard_pairs = zip(src_shards, tgt_shards) 21 | 22 | for i, (src_shard, tgt_shard) in enumerate(shard_pairs): 23 | logger.info("Translating shard %d." % i) 24 | translator.translate( 25 | src=src_shard, 26 | tgt=tgt_shard, 27 | src_dir=opt.src_dir, 28 | batch_size=opt.batch_size, 29 | batch_type=opt.batch_type, 30 | attn_debug=opt.attn_debug, 31 | align_debug=opt.align_debug 32 | ) 33 | 34 | 35 | def _get_parser(): 36 | parser = ArgumentParser(description='translate.py') 37 | 38 | opts.config_opts(parser) 39 | opts.translate_opts(parser) 40 | return parser 41 | 42 | 43 | def main(): 44 | parser = _get_parser() 45 | 46 | opt = parser.parse_args() 47 | translate(opt) 48 | 49 | 50 | if __name__ == "__main__": 51 | main() 52 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/decoders/__init__.py: -------------------------------------------------------------------------------- 1 | """Module defining decoders.""" 2 | from onmt.decoders.cnn_decoder import CNNDecoder 3 | from onmt.decoders.decoder import InputFeedRNNDecoder, \ 4 | StdRNNDecoder 5 | from onmt.decoders.transformer import TransformerDecoder 6 | 7 | str2dec = {"rnn": StdRNNDecoder, "ifrnn": InputFeedRNNDecoder, 8 | "cnn": CNNDecoder, "transformer": TransformerDecoder 9 | } 10 | 11 | __all__ = ["DecoderBase", "TransformerDecoder", "StdRNNDecoder", "CNNDecoder", 12 | "InputFeedRNNDecoder", "str2dec"] 13 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/encoders/__init__.py: -------------------------------------------------------------------------------- 1 | """Module defining encoders.""" 2 | from onmt.encoders.cnn_encoder import CNNEncoder 3 | from onmt.encoders.ggnn_encoder import GGNNEncoder 4 | from onmt.encoders.mean_encoder import MeanEncoder 5 | from onmt.encoders.rnn_encoder import RNNEncoder 6 | from onmt.encoders.transformer import TransformerEncoder 7 | 8 | str2enc = {"ggnn": GGNNEncoder, "rnn": RNNEncoder, "brnn": RNNEncoder, 9 | "cnn": CNNEncoder, "transformer": TransformerEncoder, 10 | "mean": MeanEncoder} 11 | 12 | __all__ = ["EncoderBase", "TransformerEncoder", "RNNEncoder", "CNNEncoder", 13 | "MeanEncoder", "str2enc"] 14 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/encoders/cnn_encoder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of "Convolutional Sequence to Sequence Learning" 3 | """ 4 | import torch.nn as nn 5 | 6 | from onmt.encoders.encoder import EncoderBase 7 | from onmt.utils.cnn_factory import shape_transform, StackedCNN 8 | 9 | SCALE_WEIGHT = 0.5 ** 0.5 10 | 11 | 12 | class CNNEncoder(EncoderBase): 13 | """Encoder based on "Convolutional Sequence to Sequence Learning" 14 | :cite:`DBLP:journals/corr/GehringAGYD17`. 
15 | """ 16 | 17 | def __init__(self, num_layers, hidden_size, 18 | cnn_kernel_width, dropout, embeddings): 19 | super(CNNEncoder, self).__init__() 20 | 21 | self.embeddings = embeddings 22 | input_size = embeddings.embedding_size 23 | self.linear = nn.Linear(input_size, hidden_size) 24 | self.cnn = StackedCNN(num_layers, hidden_size, 25 | cnn_kernel_width, dropout) 26 | 27 | @classmethod 28 | def from_opt(cls, opt, embeddings): 29 | """Alternate constructor.""" 30 | return cls( 31 | opt.enc_layers, 32 | opt.enc_rnn_size, 33 | opt.cnn_kernel_width, 34 | opt.dropout[0] if type(opt.dropout) is list else opt.dropout, 35 | embeddings) 36 | 37 | def forward(self, input, lengths=None, hidden=None): 38 | """See :class:`onmt.modules.EncoderBase.forward()`""" 39 | self._check_args(input, lengths, hidden) 40 | 41 | emb = self.embeddings(input) 42 | # s_len, batch, emb_dim = emb.size() 43 | 44 | emb = emb.transpose(0, 1).contiguous() 45 | emb_reshape = emb.view(emb.size(0) * emb.size(1), -1) 46 | emb_remap = self.linear(emb_reshape) 47 | emb_remap = emb_remap.view(emb.size(0), emb.size(1), -1) 48 | emb_remap = shape_transform(emb_remap) 49 | out = self.cnn(emb_remap) 50 | 51 | return emb_remap.squeeze(3).transpose(0, 1).contiguous(), \ 52 | out.squeeze(3).transpose(0, 1).contiguous(), lengths 53 | 54 | def update_dropout(self, dropout): 55 | self.cnn.dropout.p = dropout 56 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/encoders/encoder.py: -------------------------------------------------------------------------------- 1 | """Base class for encoders and generic multi encoders.""" 2 | 3 | import torch.nn as nn 4 | 5 | from onmt.utils.misc import aeq 6 | 7 | 8 | class EncoderBase(nn.Module): 9 | """ 10 | Base encoder class. Specifies the interface used by different encoder types 11 | and required by :class:`onmt.Models.NMTModel`. 12 | 13 | .. mermaid:: 14 | 15 | graph BT 16 | A[Input] 17 | subgraph RNN 18 | C[Pos 1] 19 | D[Pos 2] 20 | E[Pos N] 21 | end 22 | F[Memory_Bank] 23 | G[Final] 24 | A-->C 25 | A-->D 26 | A-->E 27 | C-->F 28 | D-->F 29 | E-->F 30 | E-->G 31 | """ 32 | 33 | @classmethod 34 | def from_opt(cls, opt, embeddings=None): 35 | raise NotImplementedError 36 | 37 | def _check_args(self, src, lengths=None, hidden=None): 38 | n_batch = src.size(1) 39 | if lengths is not None: 40 | n_batch_, = lengths.size() 41 | aeq(n_batch, n_batch_) 42 | 43 | def forward(self, src, lengths=None): 44 | """ 45 | Args: 46 | src (LongTensor): 47 | padded sequences of sparse indices ``(src_len, batch, nfeat)`` 48 | lengths (LongTensor): length of each sequence ``(batch,)`` 49 | 50 | 51 | Returns: 52 | (FloatTensor, FloatTensor): 53 | 54 | * final encoder state, used to initialize decoder 55 | * memory bank for attention, ``(src_len, batch, hidden)`` 56 | """ 57 | 58 | raise NotImplementedError 59 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/encoders/mean_encoder.py: -------------------------------------------------------------------------------- 1 | """Define a minimal encoder.""" 2 | import torch 3 | from onmt.encoders.encoder import EncoderBase 4 | from onmt.utils.misc import sequence_mask 5 | 6 | 7 | class MeanEncoder(EncoderBase): 8 | """A trivial non-recurrent encoder. Simply applies mean pooling. 
9 | 10 | Args: 11 | num_layers (int): number of replicated layers 12 | embeddings (onmt.modules.Embeddings): embedding module to use 13 | """ 14 | 15 | def __init__(self, num_layers, embeddings): 16 | super(MeanEncoder, self).__init__() 17 | self.num_layers = num_layers 18 | self.embeddings = embeddings 19 | 20 | @classmethod 21 | def from_opt(cls, opt, embeddings): 22 | """Alternate constructor.""" 23 | return cls( 24 | opt.enc_layers, 25 | embeddings) 26 | 27 | def forward(self, src, lengths=None): 28 | """See :func:`EncoderBase.forward()`""" 29 | self._check_args(src, lengths) 30 | 31 | emb = self.embeddings(src) 32 | _, batch, emb_dim = emb.size() 33 | 34 | if lengths is not None: 35 | # we avoid padding while mean pooling 36 | mask = sequence_mask(lengths).float() 37 | mask = mask / lengths.unsqueeze(1).float() 38 | mean = torch.bmm(mask.unsqueeze(1), emb.transpose(0, 1)).squeeze(1) 39 | else: 40 | mean = emb.mean(0) 41 | 42 | mean = mean.expand(self.num_layers, batch, emb_dim) 43 | memory_bank = emb 44 | encoder_final = (mean, mean) 45 | return encoder_final, memory_bank, lengths 46 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/encoders/rnn_encoder.py: -------------------------------------------------------------------------------- 1 | """Define RNN-based encoders.""" 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from onmt.encoders.encoder import EncoderBase 5 | from onmt.utils.rnn_factory import rnn_factory 6 | from torch.nn.utils.rnn import pack_padded_sequence as pack 7 | from torch.nn.utils.rnn import pad_packed_sequence as unpack 8 | 9 | 10 | class RNNEncoder(EncoderBase): 11 | """ A generic recurrent neural network encoder. 12 | 13 | Args: 14 | rnn_type (str): 15 | style of recurrent unit to use, one of [RNN, LSTM, GRU, SRU] 16 | bidirectional (bool) : use a bidirectional RNN 17 | num_layers (int) : number of stacked layers 18 | hidden_size (int) : hidden size of each layer 19 | dropout (float) : dropout value for :class:`torch.nn.Dropout` 20 | embeddings (onmt.modules.Embeddings): embedding module to use 21 | """ 22 | 23 | def __init__(self, rnn_type, bidirectional, num_layers, 24 | hidden_size, dropout=0.0, embeddings=None, 25 | use_bridge=False): 26 | super(RNNEncoder, self).__init__() 27 | assert embeddings is not None 28 | 29 | num_directions = 2 if bidirectional else 1 30 | assert hidden_size % num_directions == 0 31 | hidden_size = hidden_size // num_directions 32 | self.embeddings = embeddings 33 | 34 | self.rnn, self.no_pack_padded_seq = \ 35 | rnn_factory(rnn_type, 36 | input_size=embeddings.embedding_size, 37 | hidden_size=hidden_size, 38 | num_layers=num_layers, 39 | dropout=dropout, 40 | bidirectional=bidirectional) 41 | 42 | # Initialize the bridge layer 43 | self.use_bridge = use_bridge 44 | if self.use_bridge: 45 | self._initialize_bridge(rnn_type, 46 | hidden_size, 47 | num_layers) 48 | 49 | @classmethod 50 | def from_opt(cls, opt, embeddings): 51 | """Alternate constructor.""" 52 | return cls( 53 | opt.rnn_type, 54 | opt.brnn, 55 | opt.enc_layers, 56 | opt.enc_rnn_size, 57 | opt.dropout[0] if type(opt.dropout) is list else opt.dropout, 58 | embeddings, 59 | opt.bridge) 60 | 61 | def forward(self, src, lengths=None): 62 | """See :func:`EncoderBase.forward()`""" 63 | self._check_args(src, lengths) 64 | 65 | emb = self.embeddings(src) 66 | # s_len, batch, emb_dim = emb.size() 67 | 68 | packed_emb = emb 69 | if lengths is not None and not self.no_pack_padded_seq: 70 | # Lengths 
data is wrapped inside a Tensor. 71 | lengths_list = lengths.view(-1).tolist() 72 | packed_emb = pack(emb, lengths_list) 73 | 74 | memory_bank, encoder_final = self.rnn(packed_emb) 75 | 76 | if lengths is not None and not self.no_pack_padded_seq: 77 | memory_bank = unpack(memory_bank)[0] 78 | 79 | if self.use_bridge: 80 | encoder_final = self._bridge(encoder_final) 81 | return encoder_final, memory_bank, lengths 82 | 83 | def _initialize_bridge(self, rnn_type, 84 | hidden_size, 85 | num_layers): 86 | 87 | # LSTM has hidden and cell state, other only one 88 | number_of_states = 2 if rnn_type == "LSTM" else 1 89 | # Total number of states 90 | self.total_hidden_dim = hidden_size * num_layers 91 | 92 | # Build a linear layer for each 93 | self.bridge = nn.ModuleList([nn.Linear(self.total_hidden_dim, 94 | self.total_hidden_dim, 95 | bias=True) 96 | for _ in range(number_of_states)]) 97 | 98 | def _bridge(self, hidden): 99 | """Forward hidden state through bridge.""" 100 | def bottle_hidden(linear, states): 101 | """ 102 | Transform from 3D to 2D, apply linear and return initial size 103 | """ 104 | size = states.size() 105 | result = linear(states.view(-1, self.total_hidden_dim)) 106 | return F.relu(result).view(size) 107 | 108 | if isinstance(hidden, tuple): # LSTM 109 | outs = tuple([bottle_hidden(layer, hidden[ix]) 110 | for ix, layer in enumerate(self.bridge)]) 111 | else: 112 | outs = bottle_hidden(self.bridge[0], hidden) 113 | return outs 114 | 115 | def update_dropout(self, dropout): 116 | self.rnn.dropout = dropout 117 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/inputters/MakeToken.py: -------------------------------------------------------------------------------- 1 | import MeCab 2 | import sentencepiece as spm 3 | 4 | 5 | def korean_token(datatxt): 6 | m = MeCab.Tagger() 7 | delete_tag = ['BOS/EOS', 'JKS', 'JKC', 'JKG', 'JKO', 'JKB', 'JKV', 'JKQ', 'JX', 'JC'] 8 | 9 | def del_post_pos(sentence): 10 | tokens = sentence.split() # 원본 문장 띄어쓰기로 분리 11 | 12 | dict_list = [] 13 | 14 | for token in tokens: # 띄어쓰기로 분리된 각 토큰 {'단어':'형태소 태그'} 와 같이 딕셔너리 생성 15 | m.parse('') 16 | node = m.parseToNode(token) 17 | word_list = [] 18 | morph_list = [] 19 | 20 | while node: 21 | morphs = node.feature.split(',') 22 | word_list.append(node.surface) 23 | morph_list.append(morphs[0]) 24 | node = node.next 25 | 26 | dict_list.append(dict(zip(word_list, morph_list))) 27 | 28 | for dic in dict_list: # delete_tag에 해당하는 단어 쌍 지우기 (조사에 해당하는 단어 지우기) 29 | for key in list(dic.keys()): 30 | if dic[key] in delete_tag: 31 | del dic[key] 32 | 33 | combine_word = [''.join(list(dic.keys())) for dic in dict_list] # 형태소로 분리된 각 단어 합치기 34 | result = ' '.join(combine_word) # 띄어쓰기로 분리된 각 토큰 합치기 35 | 36 | return result # 온전한 문장을 반환 37 | 38 | data = open(datatxt,'r', encoding='utf-8') 39 | 40 | with open("data/kor.txt", "w", encoding='utf-8') as f: 41 | for row in data: 42 | f.write(del_post_pos(row)) 43 | f.write('\n') 44 | 45 | spm.SentencePieceTrainer.Train( 46 | '--input=data/kor.txt \ 47 | --model_prefix=data/korean_tok \ 48 | --vocab_size=100000 \ 49 | --hard_vocab_limit=false' 50 | ) 51 | 52 | def english_token(datatxt): 53 | data = open(datatxt,'r', encoding='utf-8') 54 | 55 | with open("data/eng.txt", "w", encoding='utf-8') as f: 56 | for row in data: 57 | f.write(row) 58 | f.write('\n') 59 | 60 | spm.SentencePieceTrainer.Train( 61 | '--input=data/eng.txt \ 62 | --model_prefix=data/english_tok \ 63 | --vocab_size=100000 \ 64 | --hard_vocab_limit=false' 65 
| ) 66 | 67 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/inputters/__init__.py: -------------------------------------------------------------------------------- 1 | """Module defining inputters. 2 | 3 | Inputters implement the logic of transforming raw data to vectorized inputs, 4 | e.g., from a line of text to a sequence of embeddings. 5 | """ 6 | from onmt.inputters.text_dataset import text_sort_key, TextDataReader 7 | 8 | str2reader = {"text": TextDataReader} 9 | str2sortkey = {'text': text_sort_key} 10 | 11 | 12 | __all__ = ['Dataset', 'load_old_vocab', 'get_fields', 'DataReaderBase', 13 | 'filter_example', 'old_style_vocab', 14 | 'build_vocab', 'OrderedIterator', 15 | 'text_sort_key', 'TextDataReader'] 16 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/inputters/datareader_base.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | 4 | # several data readers need optional dependencies. There's no 5 | # appropriate builtin exception 6 | class MissingDependencyException(Exception): 7 | pass 8 | 9 | 10 | class DataReaderBase(object): 11 | """Read data from file system and yield as dicts. 12 | 13 | Raises: 14 | onmt.inputters.datareader_base.MissingDependencyException: A number 15 | of DataReaders need specific additional packages. 16 | If any are missing, this will be raised. 17 | """ 18 | 19 | @classmethod 20 | def from_opt(cls, opt): 21 | """Alternative constructor. 22 | 23 | Args: 24 | opt (argparse.Namespace): The parsed arguments. 25 | """ 26 | print("DataReaderBase from_opt") 27 | return cls() 28 | 29 | @classmethod 30 | def _read_file(cls, path): 31 | """Line-by-line read a file as bytes.""" 32 | print("DataReaderBase _read_file") 33 | with open(path, "rb") as f: 34 | for line in f: 35 | yield line 36 | 37 | @staticmethod 38 | def _raise_missing_dep(*missing_deps): 39 | """Raise missing dep exception with standard error message.""" 40 | raise MissingDependencyException( 41 | "Could not create reader. Be sure to install " 42 | "the following dependencies: " + ", ".join(missing_deps)) 43 | 44 | def read(self, data, side, src_dir): 45 | """Read data from file system and yield as dicts.""" 46 | raise NotImplementedError() 47 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/models/__init__.py: -------------------------------------------------------------------------------- 1 | """Module defining models.""" 2 | 3 | __all__ = ["build_model_saver", "ModelSaver", "NMTModel"] 4 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/models/model.py: -------------------------------------------------------------------------------- 1 | """ Onmt NMT Model base class definition """ 2 | import torch.nn as nn 3 | 4 | 5 | class NMTModel(nn.Module): 6 | """ 7 | Core trainable object in OpenNMT. Implements a trainable interface 8 | for a simple, generic encoder + decoder model. 
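    Training-time usage, as a sketch (shapes follow ``forward()`` below)::

        dec_out, attns = model(src, tgt, lengths)
        # dec_out: (tgt_len - 1, batch, hidden); the last target position is excluded
        # from the decoder input, so the outputs align with tgt[1:] for the loss.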
9 | 10 | Args: 11 | encoder (onmt.encoders.EncoderBase): an encoder object 12 | decoder (onmt.decoders.DecoderBase): a decoder object 13 | """ 14 | 15 | def __init__(self, encoder, decoder): 16 | super(NMTModel, self).__init__() 17 | print("NMTModel") 18 | self.encoder = encoder 19 | self.decoder = decoder 20 | 21 | def forward(self, src, tgt, lengths, bptt=False, with_align=False): 22 | """Forward propagate a `src` and `tgt` pair for training. 23 | Possible initialized with a beginning decoder state. 24 | 25 | Args: 26 | src (Tensor): A source sequence passed to encoder. 27 | typically for inputs this will be a padded `LongTensor` 28 | of size ``(len, batch, features)``. However, may be an 29 | image or other generic input depending on encoder. 30 | tgt (LongTensor): A target sequence passed to decoder. 31 | Size ``(tgt_len, batch, features)``. 32 | lengths(LongTensor): The src lengths, pre-padding ``(batch,)``. 33 | bptt (Boolean): A flag indicating if truncated bptt is set. 34 | If reset then init_state 35 | with_align (Boolean): A flag indicating whether output alignment, 36 | Only valid for transformer decoder. 37 | 38 | Returns: 39 | (FloatTensor, dict[str, FloatTensor]): 40 | 41 | * decoder output ``(tgt_len, batch, hidden)`` 42 | * dictionary attention dists of ``(tgt_len, batch, src_len)`` 43 | """ 44 | dec_in = tgt[:-1] # exclude last target from inputs 45 | 46 | enc_state, memory_bank, lengths = self.encoder(src, lengths) 47 | 48 | if bptt is False: 49 | self.decoder.init_state(src, memory_bank, enc_state) 50 | dec_out, attns = self.decoder(dec_in, memory_bank, 51 | memory_lengths=lengths, 52 | with_align=with_align) 53 | return dec_out, attns 54 | 55 | def update_dropout(self, dropout): 56 | self.encoder.update_dropout(dropout) 57 | self.decoder.update_dropout(dropout) 58 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/models/stacked_rnn.py: -------------------------------------------------------------------------------- 1 | """ Implementation of ONMT RNN for Input Feeding Decoding """ 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | class StackedLSTM(nn.Module): 7 | """ 8 | Our own implementation of stacked LSTM. 9 | Needed for the decoder, because we do input feeding. 10 | """ 11 | 12 | def __init__(self, num_layers, input_size, rnn_size, dropout): 13 | super(StackedLSTM, self).__init__() 14 | self.dropout = nn.Dropout(dropout) 15 | self.num_layers = num_layers 16 | self.layers = nn.ModuleList() 17 | 18 | for _ in range(num_layers): 19 | self.layers.append(nn.LSTMCell(input_size, rnn_size)) 20 | input_size = rnn_size 21 | 22 | def forward(self, input_feed, hidden): 23 | h_0, c_0 = hidden 24 | h_1, c_1 = [], [] 25 | for i, layer in enumerate(self.layers): 26 | h_1_i, c_1_i = layer(input_feed, (h_0[i], c_0[i])) 27 | input_feed = h_1_i 28 | if i + 1 != self.num_layers: 29 | input_feed = self.dropout(input_feed) 30 | h_1 += [h_1_i] 31 | c_1 += [c_1_i] 32 | 33 | h_1 = torch.stack(h_1) 34 | c_1 = torch.stack(c_1) 35 | 36 | return input_feed, (h_1, c_1) 37 | 38 | 39 | class StackedGRU(nn.Module): 40 | """ 41 | Our own implementation of stacked GRU. 42 | Needed for the decoder, because we do input feeding. 
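    "Input feeding" means the attentional output of the previous time step is
    concatenated to the current target embedding, so decoding is unrolled one step at a
    time with per-layer ``nn.GRUCell`` modules instead of a single fused multi-layer
    ``nn.GRU``. A per-step sketch with assumed sizes ``emb_dim`` and ``rnn_size``::

        rnn = StackedGRU(num_layers=2, input_size=emb_dim + rnn_size,
                         rnn_size=rnn_size, dropout=0.3)
        hidden = (torch.zeros(2, batch, rnn_size),)           # 1-tuple of stacked states
        out_t, hidden = rnn(torch.cat([emb_t, ctx_prev], 1), hidden)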
43 | """ 44 | 45 | def __init__(self, num_layers, input_size, rnn_size, dropout): 46 | super(StackedGRU, self).__init__() 47 | self.dropout = nn.Dropout(dropout) 48 | self.num_layers = num_layers 49 | self.layers = nn.ModuleList() 50 | 51 | for _ in range(num_layers): 52 | self.layers.append(nn.GRUCell(input_size, rnn_size)) 53 | input_size = rnn_size 54 | 55 | def forward(self, input_feed, hidden): 56 | h_1 = [] 57 | for i, layer in enumerate(self.layers): 58 | h_1_i = layer(input_feed, hidden[0][i]) 59 | input_feed = h_1_i 60 | if i + 1 != self.num_layers: 61 | input_feed = self.dropout(input_feed) 62 | h_1 += [h_1_i] 63 | 64 | h_1 = torch.stack(h_1) 65 | return input_feed, (h_1,) 66 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/modules/__init__.py: -------------------------------------------------------------------------------- 1 | """ Attention and normalization modules """ 2 | 3 | import onmt.modules.source_noise # noqa 4 | 5 | __all__ = ["Elementwise", "context_gate_factory", "ContextGate", 6 | "GlobalAttention", "ConvMultiStepAttention", "CopyGenerator", 7 | "CopyGeneratorLoss", "CopyGeneratorLossCompute", 8 | "MultiHeadedAttention", "Embeddings", "PositionalEncoding", 9 | "WeightNormConv2d", "AverageAttention"] 10 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/modules/average_attn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Average Attention module.""" 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from onmt.modules.position_ffn import PositionwiseFeedForward 8 | 9 | 10 | class AverageAttention(nn.Module): 11 | """ 12 | Average Attention module from 13 | "Accelerating Neural Transformer via an Average Attention Network" 14 | :cite:`DBLP:journals/corr/abs-1805-00631`. 
15 | 16 | Args: 17 | model_dim (int): the dimension of keys/values/queries, 18 | must be divisible by head_count 19 | dropout (float): dropout parameter 20 | """ 21 | 22 | def __init__(self, model_dim, dropout=0.1, aan_useffn=False): 23 | self.model_dim = model_dim 24 | self.aan_useffn = aan_useffn 25 | super(AverageAttention, self).__init__() 26 | if aan_useffn: 27 | self.average_layer = PositionwiseFeedForward(model_dim, model_dim, 28 | dropout) 29 | self.gating_layer = nn.Linear(model_dim * 2, model_dim * 2) 30 | 31 | def cumulative_average_mask(self, batch_size, inputs_len, device): 32 | """ 33 | Builds the mask to compute the cumulative average as described in 34 | :cite:`DBLP:journals/corr/abs-1805-00631` -- Figure 3 35 | 36 | Args: 37 | batch_size (int): batch size 38 | inputs_len (int): length of the inputs 39 | 40 | Returns: 41 | (FloatTensor): 42 | 43 | * A Tensor of shape ``(batch_size, input_len, input_len)`` 44 | """ 45 | 46 | triangle = torch.tril(torch.ones(inputs_len, inputs_len, 47 | dtype=torch.float, device=device)) 48 | weights = torch.ones(1, inputs_len, dtype=torch.float, device=device) \ 49 | / torch.arange(1, inputs_len + 1, dtype=torch.float, device=device) 50 | mask = triangle * weights.transpose(0, 1) 51 | 52 | return mask.unsqueeze(0).expand(batch_size, inputs_len, inputs_len) 53 | 54 | def cumulative_average(self, inputs, mask_or_step, 55 | layer_cache=None, step=None): 56 | """ 57 | Computes the cumulative average as described in 58 | :cite:`DBLP:journals/corr/abs-1805-00631` -- Equations (1) (5) (6) 59 | 60 | Args: 61 | inputs (FloatTensor): sequence to average 62 | ``(batch_size, input_len, dimension)`` 63 | mask_or_step: if cache is set, this is assumed 64 | to be the current step of the 65 | dynamic decoding. Otherwise, it is the mask matrix 66 | used to compute the cumulative average. 67 | layer_cache: a dictionary containing the cumulative average 68 | of the previous step. 69 | 70 | Returns: 71 | a tensor of the same shape and type as ``inputs``. 
72 | """ 73 | 74 | if layer_cache is not None: 75 | step = mask_or_step 76 | average_attention = (inputs + step * 77 | layer_cache["prev_g"]) / (step + 1) 78 | layer_cache["prev_g"] = average_attention 79 | return average_attention 80 | else: 81 | mask = mask_or_step 82 | return torch.matmul(mask.to(inputs.dtype), inputs) 83 | 84 | def forward(self, inputs, mask=None, layer_cache=None, step=None): 85 | """ 86 | Args: 87 | inputs (FloatTensor): ``(batch_size, input_len, model_dim)`` 88 | 89 | Returns: 90 | (FloatTensor, FloatTensor): 91 | 92 | * gating_outputs ``(batch_size, input_len, model_dim)`` 93 | * average_outputs average attention 94 | ``(batch_size, input_len, model_dim)`` 95 | """ 96 | 97 | batch_size = inputs.size(0) 98 | inputs_len = inputs.size(1) 99 | average_outputs = self.cumulative_average( 100 | inputs, self.cumulative_average_mask(batch_size, 101 | inputs_len, inputs.device) 102 | if layer_cache is None else step, layer_cache=layer_cache) 103 | if self.aan_useffn: 104 | average_outputs = self.average_layer(average_outputs) 105 | gating_outputs = self.gating_layer(torch.cat((inputs, 106 | average_outputs), -1)) 107 | input_gate, forget_gate = torch.chunk(gating_outputs, 2, dim=2) 108 | gating_outputs = torch.sigmoid(input_gate) * inputs + \ 109 | torch.sigmoid(forget_gate) * average_outputs 110 | 111 | return gating_outputs, average_outputs 112 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/modules/conv_multi_step_attention.py: -------------------------------------------------------------------------------- 1 | """ Multi Step Attention for CNN """ 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from onmt.utils.misc import aeq 6 | 7 | 8 | SCALE_WEIGHT = 0.5 ** 0.5 9 | 10 | 11 | def seq_linear(linear, x): 12 | """ linear transform for 3-d tensor """ 13 | batch, hidden_size, length, _ = x.size() 14 | h = linear(torch.transpose(x, 1, 2).contiguous().view( 15 | batch * length, hidden_size)) 16 | return torch.transpose(h.view(batch, length, hidden_size, 1), 1, 2) 17 | 18 | 19 | class ConvMultiStepAttention(nn.Module): 20 | """ 21 | Conv attention takes a key matrix, a value matrix and a query vector. 22 | Attention weight is calculated by key matrix with the query vector 23 | and sum on the value matrix. And the same operation is applied 24 | in each decode conv layer. 
25 | """ 26 | 27 | def __init__(self, input_size): 28 | super(ConvMultiStepAttention, self).__init__() 29 | self.linear_in = nn.Linear(input_size, input_size) 30 | self.mask = None 31 | 32 | def apply_mask(self, mask): 33 | """ Apply mask """ 34 | self.mask = mask 35 | 36 | def forward(self, base_target_emb, input_from_dec, encoder_out_top, 37 | encoder_out_combine): 38 | """ 39 | Args: 40 | base_target_emb: target emb tensor 41 | input_from_dec: output of decode conv 42 | encoder_out_top: the key matrix for calculation of attetion weight, 43 | which is the top output of encode conv 44 | encoder_out_combine: 45 | the value matrix for the attention-weighted sum, 46 | which is the combination of base emb and top output of encode 47 | """ 48 | 49 | # checks 50 | # batch, channel, height, width = base_target_emb.size() 51 | batch, _, height, _ = base_target_emb.size() 52 | # batch_, channel_, height_, width_ = input_from_dec.size() 53 | batch_, _, height_, _ = input_from_dec.size() 54 | aeq(batch, batch_) 55 | aeq(height, height_) 56 | 57 | # enc_batch, enc_channel, enc_height = encoder_out_top.size() 58 | enc_batch, _, enc_height = encoder_out_top.size() 59 | # enc_batch_, enc_channel_, enc_height_ = encoder_out_combine.size() 60 | enc_batch_, _, enc_height_ = encoder_out_combine.size() 61 | 62 | aeq(enc_batch, enc_batch_) 63 | aeq(enc_height, enc_height_) 64 | 65 | preatt = seq_linear(self.linear_in, input_from_dec) 66 | target = (base_target_emb + preatt) * SCALE_WEIGHT 67 | target = torch.squeeze(target, 3) 68 | target = torch.transpose(target, 1, 2) 69 | pre_attn = torch.bmm(target, encoder_out_top) 70 | 71 | if self.mask is not None: 72 | pre_attn.data.masked_fill_(self.mask, -float('inf')) 73 | 74 | attn = F.softmax(pre_attn, dim=2) 75 | 76 | context_output = torch.bmm( 77 | attn, torch.transpose(encoder_out_combine, 1, 2)) 78 | context_output = torch.transpose( 79 | torch.unsqueeze(context_output, 3), 1, 2) 80 | return context_output, attn 81 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/modules/gate.py: -------------------------------------------------------------------------------- 1 | """ ContextGate module """ 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | def context_gate_factory(gate_type, embeddings_size, decoder_size, 7 | attention_size, output_size): 8 | """Returns the correct ContextGate class""" 9 | 10 | gate_types = {'source': SourceContextGate, 11 | 'target': TargetContextGate, 12 | 'both': BothContextGate} 13 | 14 | assert gate_type in gate_types, "Not valid ContextGate type: {0}".format( 15 | gate_type) 16 | return gate_types[gate_type](embeddings_size, decoder_size, attention_size, 17 | output_size) 18 | 19 | 20 | class ContextGate(nn.Module): 21 | """ 22 | Context gate is a decoder module that takes as input the previous word 23 | embedding, the current decoder state and the attention state, and 24 | produces a gate. 25 | The gate can be used to select the input from the target side context 26 | (decoder state), from the source context (attention state) or both. 
27 | """ 28 | 29 | def __init__(self, embeddings_size, decoder_size, 30 | attention_size, output_size): 31 | super(ContextGate, self).__init__() 32 | input_size = embeddings_size + decoder_size + attention_size 33 | self.gate = nn.Linear(input_size, output_size, bias=True) 34 | self.sig = nn.Sigmoid() 35 | self.source_proj = nn.Linear(attention_size, output_size) 36 | self.target_proj = nn.Linear(embeddings_size + decoder_size, 37 | output_size) 38 | 39 | def forward(self, prev_emb, dec_state, attn_state): 40 | input_tensor = torch.cat((prev_emb, dec_state, attn_state), dim=1) 41 | z = self.sig(self.gate(input_tensor)) 42 | proj_source = self.source_proj(attn_state) 43 | proj_target = self.target_proj( 44 | torch.cat((prev_emb, dec_state), dim=1)) 45 | return z, proj_source, proj_target 46 | 47 | 48 | class SourceContextGate(nn.Module): 49 | """Apply the context gate only to the source context""" 50 | 51 | def __init__(self, embeddings_size, decoder_size, 52 | attention_size, output_size): 53 | super(SourceContextGate, self).__init__() 54 | self.context_gate = ContextGate(embeddings_size, decoder_size, 55 | attention_size, output_size) 56 | self.tanh = nn.Tanh() 57 | 58 | def forward(self, prev_emb, dec_state, attn_state): 59 | z, source, target = self.context_gate( 60 | prev_emb, dec_state, attn_state) 61 | return self.tanh(target + z * source) 62 | 63 | 64 | class TargetContextGate(nn.Module): 65 | """Apply the context gate only to the target context""" 66 | 67 | def __init__(self, embeddings_size, decoder_size, 68 | attention_size, output_size): 69 | super(TargetContextGate, self).__init__() 70 | self.context_gate = ContextGate(embeddings_size, decoder_size, 71 | attention_size, output_size) 72 | self.tanh = nn.Tanh() 73 | 74 | def forward(self, prev_emb, dec_state, attn_state): 75 | z, source, target = self.context_gate(prev_emb, dec_state, attn_state) 76 | return self.tanh(z * target + source) 77 | 78 | 79 | class BothContextGate(nn.Module): 80 | """Apply the context gate to both contexts""" 81 | 82 | def __init__(self, embeddings_size, decoder_size, 83 | attention_size, output_size): 84 | super(BothContextGate, self).__init__() 85 | self.context_gate = ContextGate(embeddings_size, decoder_size, 86 | attention_size, output_size) 87 | self.tanh = nn.Tanh() 88 | 89 | def forward(self, prev_emb, dec_state, attn_state): 90 | z, source, target = self.context_gate(prev_emb, dec_state, attn_state) 91 | return self.tanh((1. - z) * target + z * source) 92 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/modules/position_ffn.py: -------------------------------------------------------------------------------- 1 | """Position feed-forward network from "Attention is All You Need".""" 2 | 3 | import torch.nn as nn 4 | 5 | 6 | class PositionwiseFeedForward(nn.Module): 7 | """ A two-layer Feed-Forward-Network with residual layer norm. 8 | 9 | Args: 10 | d_model (int): the size of input for the first-layer of the FFN. 11 | d_ff (int): the hidden layer size of the second-layer 12 | of the FNN. 13 | dropout (float): dropout probability in :math:`[0, 1)`. 
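    With the pre-norm residual connection used in ``forward()`` below, the layer
    computes::

        FFN(x) = x + Dropout(W_2 @ Dropout(ReLU(W_1 @ LayerNorm(x))))

    where ``W_1`` maps ``d_model -> d_ff`` and ``W_2`` maps ``d_ff -> d_model``.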
14 | """ 15 | 16 | def __init__(self, d_model, d_ff, dropout=0.1): 17 | super(PositionwiseFeedForward, self).__init__() 18 | self.w_1 = nn.Linear(d_model, d_ff) 19 | self.w_2 = nn.Linear(d_ff, d_model) 20 | self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) 21 | self.dropout_1 = nn.Dropout(dropout) 22 | self.relu = nn.ReLU() 23 | self.dropout_2 = nn.Dropout(dropout) 24 | 25 | def forward(self, x): 26 | """Layer definition. 27 | 28 | Args: 29 | x: ``(batch_size, input_len, model_dim)`` 30 | 31 | Returns: 32 | (FloatTensor): Output ``(batch_size, input_len, model_dim)``. 33 | """ 34 | 35 | inter = self.dropout_1(self.relu(self.w_1(self.layer_norm(x)))) 36 | output = self.dropout_2(self.w_2(inter)) 37 | return output + x 38 | 39 | def update_dropout(self, dropout): 40 | self.dropout_1.p = dropout 41 | self.dropout_2.p = dropout 42 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/modules/sparse_activations.py: -------------------------------------------------------------------------------- 1 | """ 2 | An implementation of sparsemax (Martins & Astudillo, 2016). See 3 | :cite:`DBLP:journals/corr/MartinsA16` for detailed description. 4 | 5 | By Ben Peters and Vlad Niculae 6 | """ 7 | 8 | import torch 9 | import torch.nn as nn 10 | from torch.autograd import Function 11 | 12 | 13 | def _make_ix_like(input, dim=0): 14 | d = input.size(dim) 15 | rho = torch.arange(1, d + 1, device=input.device, dtype=input.dtype) 16 | view = [1] * input.dim() 17 | view[0] = -1 18 | return rho.view(view).transpose(0, dim) 19 | 20 | 21 | def _threshold_and_support(input, dim=0): 22 | """Sparsemax building block: compute the threshold 23 | 24 | Args: 25 | input: any dimension 26 | dim: dimension along which to apply the sparsemax 27 | 28 | Returns: 29 | the threshold value 30 | """ 31 | 32 | input_srt, _ = torch.sort(input, descending=True, dim=dim) 33 | input_cumsum = input_srt.cumsum(dim) - 1 34 | rhos = _make_ix_like(input, dim) 35 | support = rhos * input_srt > input_cumsum 36 | 37 | support_size = support.sum(dim=dim).unsqueeze(dim) 38 | tau = input_cumsum.gather(dim, support_size - 1) 39 | tau /= support_size.to(input.dtype) 40 | return tau, support_size 41 | 42 | 43 | class SparsemaxFunction(Function): 44 | 45 | @staticmethod 46 | def forward(ctx, input, dim=0): 47 | """sparsemax: normalizing sparse transform (a la softmax) 48 | 49 | Parameters: 50 | input (Tensor): any shape 51 | dim: dimension along which to apply sparsemax 52 | 53 | Returns: 54 | output (Tensor): same shape as input 55 | """ 56 | ctx.dim = dim 57 | max_val, _ = input.max(dim=dim, keepdim=True) 58 | input -= max_val # same numerical stability trick as for softmax 59 | tau, supp_size = _threshold_and_support(input, dim=dim) 60 | output = torch.clamp(input - tau, min=0) 61 | ctx.save_for_backward(supp_size, output) 62 | return output 63 | 64 | @staticmethod 65 | def backward(ctx, grad_output): 66 | supp_size, output = ctx.saved_tensors 67 | dim = ctx.dim 68 | grad_input = grad_output.clone() 69 | grad_input[output == 0] = 0 70 | 71 | v_hat = grad_input.sum(dim=dim) / supp_size.to(output.dtype).squeeze() 72 | v_hat = v_hat.unsqueeze(dim) 73 | grad_input = torch.where(output != 0, grad_input - v_hat, grad_input) 74 | return grad_input, None 75 | 76 | 77 | sparsemax = SparsemaxFunction.apply 78 | 79 | 80 | class Sparsemax(nn.Module): 81 | 82 | def __init__(self, dim=0): 83 | self.dim = dim 84 | super(Sparsemax, self).__init__() 85 | 86 | def forward(self, input): 87 | return 
sparsemax(input, self.dim) 88 | 89 | 90 | class LogSparsemax(nn.Module): 91 | 92 | def __init__(self, dim=0): 93 | self.dim = dim 94 | super(LogSparsemax, self).__init__() 95 | 96 | def forward(self, input): 97 | return torch.log(sparsemax(input, self.dim)) 98 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/modules/sparse_losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from onmt.modules.sparse_activations import _threshold_and_support 4 | from onmt.utils.misc import aeq 5 | from torch.autograd import Function 6 | 7 | 8 | class SparsemaxLossFunction(Function): 9 | 10 | @staticmethod 11 | def forward(ctx, input, target): 12 | """ 13 | input (FloatTensor): ``(n, num_classes)``. 14 | target (LongTensor): ``(n,)``, the indices of the target classes 15 | """ 16 | input_batch, classes = input.size() 17 | target_batch = target.size(0) 18 | aeq(input_batch, target_batch) 19 | 20 | z_k = input.gather(1, target.unsqueeze(1)).squeeze() 21 | tau_z, support_size = _threshold_and_support(input, dim=1) 22 | support = input > tau_z 23 | x = torch.where( 24 | support, input**2 - tau_z**2, 25 | torch.tensor(0.0, device=input.device) 26 | ).sum(dim=1) 27 | ctx.save_for_backward(input, target, tau_z) 28 | # clamping necessary because of numerical errors: loss should be lower 29 | # bounded by zero, but negative values near zero are possible without 30 | # the clamp 31 | return torch.clamp(x / 2 - z_k + 0.5, min=0.0) 32 | 33 | @staticmethod 34 | def backward(ctx, grad_output): 35 | input, target, tau_z = ctx.saved_tensors 36 | sparsemax_out = torch.clamp(input - tau_z, min=0) 37 | delta = torch.zeros_like(sparsemax_out) 38 | delta.scatter_(1, target.unsqueeze(1), 1) 39 | return sparsemax_out - delta, None 40 | 41 | 42 | sparsemax_loss = SparsemaxLossFunction.apply 43 | 44 | 45 | class SparsemaxLoss(nn.Module): 46 | """ 47 | An implementation of sparsemax loss, first proposed in 48 | :cite:`DBLP:journals/corr/MartinsA16`. If using 49 | a sparse output layer, it is not possible to use negative log likelihood 50 | because the loss is infinite in the case the target is assigned zero 51 | probability. Inputs to SparsemaxLoss are arbitrary dense real-valued 52 | vectors (like in nn.CrossEntropyLoss), not probability vectors (like in 53 | nn.NLLLoss). 
54 | """ 55 | 56 | def __init__(self, weight=None, ignore_index=-100, 57 | reduction='elementwise_mean'): 58 | assert reduction in ['elementwise_mean', 'sum', 'none'] 59 | self.reduction = reduction 60 | self.weight = weight 61 | self.ignore_index = ignore_index 62 | super(SparsemaxLoss, self).__init__() 63 | 64 | def forward(self, input, target): 65 | loss = sparsemax_loss(input, target) 66 | if self.ignore_index >= 0: 67 | ignored_positions = target == self.ignore_index 68 | size = float((target.size(0) - ignored_positions.sum()).item()) 69 | loss.masked_fill_(ignored_positions, 0.0) 70 | else: 71 | size = float(target.size(0)) 72 | if self.reduction == 'sum': 73 | loss = loss.sum() 74 | elif self.reduction == 'elementwise_mean': 75 | loss = loss.sum() / size 76 | return loss 77 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/modules/structured_attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.cuda 3 | import torch.nn as nn 4 | 5 | 6 | class MatrixTree(nn.Module): 7 | """Implementation of the matrix-tree theorem for computing marginals 8 | of non-projective dependency parsing. This attention layer is used 9 | in the paper "Learning Structured Text Representations" 10 | :cite:`DBLP:journals/corr/LiuL17d`. 11 | """ 12 | 13 | def __init__(self, eps=1e-5): 14 | self.eps = eps 15 | super(MatrixTree, self).__init__() 16 | 17 | def forward(self, input): 18 | laplacian = input.exp() + self.eps 19 | output = input.clone() 20 | for b in range(input.size(0)): 21 | lap = laplacian[b].masked_fill( 22 | torch.eye(input.size(1), device=input.device).ne(0), 0) 23 | lap = -lap + torch.diag(lap.sum(0)) 24 | # store roots on diagonal 25 | lap[0] = input[b].diag().exp() 26 | inv_laplacian = lap.inverse() 27 | 28 | factor = inv_laplacian.diag().unsqueeze(1)\ 29 | .expand_as(input[b]).transpose(0, 1) 30 | term1 = input[b].exp().mul(factor).clone() 31 | term2 = input[b].exp().mul(inv_laplacian.transpose(0, 1)).clone() 32 | term1[:, 0] = 0 33 | term2[0] = 0 34 | output[b] = term1 - term2 35 | roots_output = input[b].diag().exp().mul( 36 | inv_laplacian.transpose(0, 1)[0]) 37 | output[b] = output[b] + torch.diag(roots_output) 38 | return output 39 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/modules/util_class.py: -------------------------------------------------------------------------------- 1 | """ Misc classes """ 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | # At the moment this class is only used by embeddings.Embeddings look-up tables 7 | class Elementwise(nn.ModuleList): 8 | """ 9 | A simple network container. 10 | Parameters are a list of modules. 11 | Inputs are a 3d Tensor whose last dimension is the same length 12 | as the list. 13 | Outputs are the result of applying modules to inputs elementwise. 14 | An optional merge parameter allows the outputs to be reduced to a 15 | single Tensor. 
16 | """ 17 | 18 | def __init__(self, merge=None, *args): 19 | assert merge in [None, 'first', 'concat', 'sum', 'mlp'] 20 | self.merge = merge 21 | super(Elementwise, self).__init__(*args) 22 | 23 | def forward(self, inputs): 24 | inputs_ = [feat.squeeze(2) for feat in inputs.split(1, dim=2)] 25 | assert len(self) == len(inputs_) 26 | outputs = [f(x) for f, x in zip(self, inputs_)] 27 | if self.merge == 'first': 28 | return outputs[0] 29 | elif self.merge == 'concat' or self.merge == 'mlp': 30 | return torch.cat(outputs, 2) 31 | elif self.merge == 'sum': 32 | return sum(outputs) 33 | else: 34 | return outputs 35 | 36 | 37 | class Cast(nn.Module): 38 | """ 39 | Basic layer that casts its input to a specific data type. The same tensor 40 | is returned if the data type is already correct. 41 | """ 42 | 43 | def __init__(self, dtype): 44 | super(Cast, self).__init__() 45 | self._dtype = dtype 46 | 47 | def forward(self, x): 48 | return x.to(self._dtype) 49 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/translate/__init__.py: -------------------------------------------------------------------------------- 1 | """ Modules for translation """ 2 | from onmt.translate.greedy_search import GreedySearch 3 | 4 | __all__ = ['Translator', 'Translation', 'BeamSearch', 5 | 'GNMTGlobalScorer', 'TranslationBuilder', 6 | 'PenaltyBuilder', "DecodeStrategy", "GreedySearch"] 7 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/translate/penalties.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import torch 4 | 5 | 6 | class PenaltyBuilder(object): 7 | """Returns the Length and Coverage Penalty function for Beam Search. 8 | 9 | Args: 10 | length_pen (str): option name of length pen 11 | cov_pen (str): option name of cov pen 12 | 13 | Attributes: 14 | has_cov_pen (bool): Whether coverage penalty is None (applying it 15 | is a no-op). Note that the converse isn't true. Setting beta 16 | to 0 should force coverage length to be a no-op. 17 | has_len_pen (bool): Whether length penalty is None (applying it 18 | is a no-op). Note that the converse isn't true. Setting alpha 19 | to 1 should force length penalty to be a no-op. 20 | coverage_penalty (callable[[FloatTensor, float], FloatTensor]): 21 | Calculates the coverage penalty. 22 | length_penalty (callable[[int, float], float]): Calculates 23 | the length penalty. 
24 | """ 25 | 26 | def __init__(self, cov_pen, length_pen): 27 | self.has_cov_pen = not self._pen_is_none(cov_pen) 28 | self.coverage_penalty = self._coverage_penalty(cov_pen) 29 | self.has_len_pen = not self._pen_is_none(length_pen) 30 | self.length_penalty = self._length_penalty(length_pen) 31 | 32 | @staticmethod 33 | def _pen_is_none(pen): 34 | return pen == "none" or pen is None 35 | 36 | def _coverage_penalty(self, cov_pen): 37 | if cov_pen == "wu": 38 | return self.coverage_wu 39 | elif cov_pen == "summary": 40 | return self.coverage_summary 41 | elif self._pen_is_none(cov_pen): 42 | return self.coverage_none 43 | else: 44 | raise NotImplementedError("No '{:s}' coverage penalty.".format( 45 | cov_pen)) 46 | 47 | def _length_penalty(self, length_pen): 48 | if length_pen == "wu": 49 | return self.length_wu 50 | elif length_pen == "avg": 51 | return self.length_average 52 | elif self._pen_is_none(length_pen): 53 | return self.length_none 54 | else: 55 | raise NotImplementedError("No '{:s}' length penalty.".format( 56 | length_pen)) 57 | 58 | # Below are all the different penalty terms implemented so far. 59 | # Subtract coverage penalty from topk log probs. 60 | # Divide topk log probs by length penalty. 61 | 62 | def coverage_wu(self, cov, beta=0.): 63 | """GNMT coverage re-ranking score. 64 | 65 | See "Google's Neural Machine Translation System" :cite:`wu2016google`. 66 | ``cov`` is expected to be sized ``(*, seq_len)``, where ``*`` is 67 | probably ``batch_size x beam_size`` but could be several 68 | dimensions like ``(batch_size, beam_size)``. If ``cov`` is attention, 69 | then the ``seq_len`` axis probably sums to (almost) 1. 70 | """ 71 | 72 | penalty = -torch.min(cov, cov.clone().fill_(1.0)).log().sum(-1) 73 | return beta * penalty 74 | 75 | def coverage_summary(self, cov, beta=0.): 76 | """Our summary penalty.""" 77 | penalty = torch.max(cov, cov.clone().fill_(1.0)).sum(-1) 78 | penalty -= cov.size(-1) 79 | return beta * penalty 80 | 81 | def coverage_none(self, cov, beta=0.): 82 | """Returns zero as penalty""" 83 | none = torch.zeros((1,), device=cov.device, 84 | dtype=torch.float) 85 | if cov.dim() == 3: 86 | none = none.unsqueeze(0) 87 | return none 88 | 89 | def length_wu(self, cur_len, alpha=0.): 90 | """GNMT length re-ranking score. 91 | 92 | See "Google's Neural Machine Translation System" :cite:`wu2016google`. 
93 | """ 94 | 95 | return ((5 + cur_len) / 6.0) ** alpha 96 | 97 | def length_average(self, cur_len, alpha=0.): 98 | """Returns the current sequence length.""" 99 | return cur_len 100 | 101 | def length_none(self, cur_len, alpha=0.): 102 | """Returns unmodified scores.""" 103 | return 1.0 104 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """Module defining various utilities.""" 2 | 3 | __all__ = ["split_corpus", "aeq", "use_gpu", "set_random_seed", "ReportMgr", 4 | "build_report_manager", "Statistics", 5 | "MultipleOptimizer", "Optimizer", "AdaFactor", "EarlyStopping", 6 | "scorers_from_opts", "make_batch_align_matrix"] 7 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/utils/cnn_factory.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of "Convolutional Sequence to Sequence Learning" 3 | """ 4 | import onmt.modules 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as init 8 | 9 | SCALE_WEIGHT = 0.5 ** 0.5 10 | 11 | 12 | def shape_transform(x): 13 | """ Transform the size of the tensors to fit the conv input. """ 14 | return torch.unsqueeze(torch.transpose(x, 1, 2), 3) 15 | 16 | 17 | class GatedConv(nn.Module): 18 | """ Gated convolution for CNN class """ 19 | 20 | def __init__(self, input_size, width=3, dropout=0.2, nopad=False): 21 | super(GatedConv, self).__init__() 22 | self.conv = onmt.modules.WeightNormConv2d( 23 | input_size, 2 * input_size, kernel_size=(width, 1), stride=(1, 1), 24 | padding=(width // 2 * (1 - nopad), 0)) 25 | init.xavier_uniform_(self.conv.weight, gain=(4 * (1 - dropout))**0.5) 26 | self.dropout = nn.Dropout(dropout) 27 | 28 | def forward(self, x_var): 29 | x_var = self.dropout(x_var) 30 | x_var = self.conv(x_var) 31 | out, gate = x_var.split(int(x_var.size(1) / 2), 1) 32 | out = out * torch.sigmoid(gate) 33 | return out 34 | 35 | 36 | class StackedCNN(nn.Module): 37 | """ Stacked CNN class """ 38 | 39 | def __init__(self, num_layers, input_size, cnn_kernel_width=3, 40 | dropout=0.2): 41 | super(StackedCNN, self).__init__() 42 | self.dropout = dropout 43 | self.num_layers = num_layers 44 | self.layers = nn.ModuleList() 45 | for _ in range(num_layers): 46 | self.layers.append( 47 | GatedConv(input_size, cnn_kernel_width, dropout)) 48 | 49 | def forward(self, x): 50 | for conv in self.layers: 51 | x = x + conv(x) 52 | x *= SCALE_WEIGHT 53 | return x 54 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/utils/distributed.py: -------------------------------------------------------------------------------- 1 | """ Pytorch Distributed utils 2 | This piece of code was heavily inspired by the equivalent of Fairseq-py 3 | https://github.com/pytorch/fairseq 4 | """ 5 | 6 | 7 | from __future__ import print_function 8 | 9 | import math 10 | import pickle 11 | 12 | import torch.distributed 13 | from onmt.utils.logging import logger 14 | 15 | 16 | def is_master(opt, device_id): 17 | return opt.gpu_ranks[device_id] == 0 18 | 19 | 20 | def multi_init(opt, device_id): 21 | dist_init_method = 'tcp://{master_ip}:{master_port}'.format( 22 | master_ip=opt.master_ip, 23 | master_port=opt.master_port) 24 | dist_world_size = opt.world_size 25 | torch.distributed.init_process_group( 26 | backend=opt.gpu_backend,
init_method=dist_init_method, 27 | world_size=dist_world_size, rank=opt.gpu_ranks[device_id]) 28 | gpu_rank = torch.distributed.get_rank() 29 | if not is_master(opt, device_id): 30 | logger.disabled = True 31 | 32 | return gpu_rank 33 | 34 | 35 | def all_reduce_and_rescale_tensors(tensors, rescale_denom, 36 | buffer_size=10485760): 37 | """All-reduce and rescale tensors in chunks of the specified size. 38 | 39 | Args: 40 | tensors: list of Tensors to all-reduce 41 | rescale_denom: denominator for rescaling summed Tensors 42 | buffer_size: all-reduce chunk size in bytes 43 | """ 44 | # buffer size in bytes, determine equiv. # of elements based on data type 45 | buffer_t = tensors[0].new( 46 | math.ceil(buffer_size / tensors[0].element_size())).zero_() 47 | buffer = [] 48 | 49 | def all_reduce_buffer(): 50 | # copy tensors into buffer_t 51 | offset = 0 52 | for t in buffer: 53 | numel = t.numel() 54 | buffer_t[offset:offset+numel].copy_(t.view(-1)) 55 | offset += numel 56 | 57 | # all-reduce and rescale 58 | torch.distributed.all_reduce(buffer_t[:offset]) 59 | buffer_t.div_(rescale_denom) 60 | 61 | # copy all-reduced buffer back into tensors 62 | offset = 0 63 | for t in buffer: 64 | numel = t.numel() 65 | t.view(-1).copy_(buffer_t[offset:offset+numel]) 66 | offset += numel 67 | 68 | filled = 0 69 | for t in tensors: 70 | sz = t.numel() * t.element_size() 71 | if sz > buffer_size: 72 | # tensor is bigger than buffer, all-reduce and rescale directly 73 | torch.distributed.all_reduce(t) 74 | t.div_(rescale_denom) 75 | elif filled + sz > buffer_size: 76 | # buffer is full, all-reduce and replace buffer with grad 77 | all_reduce_buffer() 78 | buffer = [t] 79 | filled = sz 80 | else: 81 | # add tensor to buffer 82 | buffer.append(t) 83 | filled += sz 84 | 85 | if len(buffer) > 0: 86 | all_reduce_buffer() 87 | 88 | 89 | def all_gather_list(data, max_size=4096): 90 | """Gathers arbitrary data from all nodes into a list.""" 91 | world_size = torch.distributed.get_world_size() 92 | if not hasattr(all_gather_list, '_in_buffer') or \ 93 | max_size != all_gather_list._in_buffer.size(): 94 | all_gather_list._in_buffer = torch.cuda.ByteTensor(max_size) 95 | all_gather_list._out_buffers = [ 96 | torch.cuda.ByteTensor(max_size) 97 | for i in range(world_size) 98 | ] 99 | in_buffer = all_gather_list._in_buffer 100 | out_buffers = all_gather_list._out_buffers 101 | 102 | enc = pickle.dumps(data) 103 | enc_size = len(enc) 104 | if enc_size + 2 > max_size: 105 | raise ValueError( 106 | 'encoded data exceeds max_size: {}'.format(enc_size + 2)) 107 | assert max_size < 255*256 108 | in_buffer[0] = enc_size // 255 # this encoding works for max_size < 65k 109 | in_buffer[1] = enc_size % 255 110 | in_buffer[2:enc_size+2] = torch.ByteTensor(list(enc)) 111 | 112 | torch.distributed.all_gather(out_buffers, in_buffer.cuda()) 113 | 114 | results = [] 115 | for i in range(world_size): 116 | out_buffer = out_buffers[i] 117 | size = (255 * out_buffer[0].item()) + out_buffer[1].item() 118 | 119 | bytes_list = bytes(out_buffer[2:size+2].tolist()) 120 | result = pickle.loads(bytes_list) 121 | results.append(result) 122 | return results 123 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/utils/logging.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | 4 | import logging 5 | from logging.handlers import RotatingFileHandler 6 | 7 | logger = logging.getLogger() 
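# NOTE (added comment): this module-level ``logger`` is the shared root logger imported
# across onmt (e.g. in onmt/utils/distributed.py); ``init_logger`` below configures it by
# setting the level to INFO and attaching a console handler and, optionally, a plain or
# rotating file handler.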
8 | 9 | 10 | def init_logger(log_file=None, log_file_level=logging.NOTSET, rotate=False): 11 | log_format = logging.Formatter("[%(asctime)s %(levelname)s] %(message)s") 12 | logger = logging.getLogger() 13 | logger.setLevel(logging.INFO) 14 | 15 | console_handler = logging.StreamHandler() 16 | console_handler.setFormatter(log_format) 17 | logger.handlers = [console_handler] 18 | 19 | if log_file and log_file != '': 20 | if rotate: 21 | file_handler = RotatingFileHandler( 22 | log_file, maxBytes=1000000, backupCount=10) 23 | else: 24 | file_handler = logging.FileHandler(log_file) 25 | file_handler.setLevel(log_file_level) 26 | file_handler.setFormatter(log_format) 27 | logger.addHandler(file_handler) 28 | 29 | return logger 30 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/onmt/utils/rnn_factory.py: -------------------------------------------------------------------------------- 1 | """ 2 | RNN tools 3 | """ 4 | import onmt.models 5 | import torch.nn as nn 6 | 7 | 8 | def rnn_factory(rnn_type, **kwargs): 9 | """ RNN factory; use the PyTorch version when available. """ 10 | no_pack_padded_seq = False 11 | if rnn_type == "SRU": 12 | # SRU doesn't support PackedSequence. 13 | no_pack_padded_seq = True 14 | rnn = onmt.models.sru.SRU(**kwargs) 15 | else: 16 | rnn = getattr(nn, rnn_type)(**kwargs) 17 | return rnn, no_pack_padded_seq 18 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from os import path 3 | 4 | from setuptools import setup 5 | 6 | this_directory = path.abspath(path.dirname(__file__)) 7 | setup( 8 | install_requires=[ 9 | "six", 10 | "tqdm~=4.30.0", 11 | "torch>=1.4.0", 12 | "torchtext==0.4.0", 13 | "future", 14 | "configargparse", 15 | "tensorboard>=1.14", 16 | "pyonmttok==1.*;platform_system=='Linux'", 17 | "pyyaml", 18 | "sentencepiece", 19 | "MeCab" 20 | ], 21 | ) 22 | -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/opennmt/translate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from onmt.bin.translate import main 3 | 4 | 5 | 6 | def translate(input_text): 7 | ''' 8 | :param input_text: raw text entered in the browser front-end text box 9 | :return: processed_text: processed text that can be displayed directly on the front end 10 | ''' 11 | 12 | def writetxt(path, text): 13 | with open(path, 'w', encoding='utf8') as f: 14 | f.write(str(text)) 15 | f.close() 16 | 17 | def readtxt(path): 18 | with open(path, 'r', encoding='utf8') as f: 19 | lines = f.readlines() 20 | lines = lines[0].replace('\n','') 21 | return lines 22 | 23 | writetxt('./data/src-test.txt', input_text) 24 | main() 25 | processed_text = readtxt('./data/pred.txt') 26 | 27 | return processed_text 28 | 29 | 30 | 31 | if __name__ == "__main__": 32 | input_text = '虽然 , 每 至 于 族 , 吾 见 其 难为 , 怵然 为戒 , 视 为 止 , 行 为迟 。' 33 | output = translate(input_text) 34 | print(output) -------------------------------------------------------------------------------- /第9章 数字人文下的机器翻译/第九章 数字人文下的机器翻译.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsc748NLP/code-for-digital-humanities-tutorial/6220c365ab5ff7df16f1f20a46cc928b034db36a/第9章 数字人文下的机器翻译/第九章 数字人文下的机器翻译.pdf --------------------------------------------------------------------------------
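A note on the `translate()` wrapper in opennmt/translate.py above: it only moves text in and out of `data/src-test.txt` and `data/pred.txt`; the model and decoding options are read by `onmt.bin.translate.main()` from the command line, not from the function arguments. The sketch below is not part of the repository; it is a minimal driver assuming a trained checkpoint (the filename is a placeholder) and the standard `-model`/`-src`/`-output` translate options declared in onmt/opts.py. Run it from the opennmt directory so that the relative `./data` paths inside the wrapper resolve.

```python
# Hypothetical driver for opennmt/translate.py; checkpoint name is a placeholder.
import sys
from translate import translate

# onmt.bin.translate.main() parses sys.argv, so inject the decoding options here.
sys.argv = [
    "translate.py",
    "-model", "model/demo_step_10000.pt",  # placeholder: path to a trained checkpoint
    "-src", "data/src-test.txt",           # written by translate() before main() runs
    "-output", "data/pred.txt",            # read back by translate() after main() returns
]

print(translate("虽然 , 每 至 于 族 , 吾 见 其 难为 , 怵然 为戒 , 视 为 止 , 行 为迟 。"))
```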