├── .gitignore
├── .idea
    └── .gitignore
├── README.md
├── run_chinese_ref.py
├── data
    ├── test.txt
    └── 新词挖掘.ipynb
└── run_mlm_wwm.py


/.gitignore:
--------------------------------------------------------------------------------
1 | *.xml
2 | *.iml


--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Datasource local storage ignored files
5 | /dataSources/
6 | /dataSources.local.xml
7 | # Editor-based HTTP Client requests
8 | /httpRequests/
9 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # 项目说明
 2 | 本项目是新词挖掘+预训练模型继续预训练：  
 3 | 
 4 | - 新词挖掘用到了两种方法，分别是
 5 |    - 基于频次的新词挖掘
 6 |    - 基于自由凝固度以及左右邻字熵的新词挖掘  
 7 | 
 8 | 详细请看./data/新词挖掘.ipynb  
 9 | 
10 | - 继续预训练模型代码用的是huggingface/transformers库的examples  
11 | 地址在https://github.com/huggingface/transformers/tree/master/examples/research_projects/mlm_wwm  
12 | 可以直接看链接的介绍，这里只不过修改了部分源码罢了    
13 | 
14 | > 更具体方法原理和效果，请看我的知乎博客https://zhuanlan.zhihu.com/p/414384344
15 | 
16 | # 环境
17 | python=3.6   
18 | ltp=4.1.5  
19 | torch=1.7  
20 | transformers=4.5.0  


--------------------------------------------------------------------------------
/run_chinese_ref.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import json
  3 | from typing import List
  4 | 
  5 | from ltp import LTP
  6 | from tqdm import tqdm
  7 | from transformers.models.bert.tokenization_bert import BertTokenizer
  8 | 
  9 | 
 10 | def _is_chinese_char(cp):
 11 |     """Checks whether CP is the codepoint of a CJK character."""
 12 |     # This defines a "chinese character" as anything in the CJK Unicode block:
 13 |     #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
 14 |     #
 15 |     # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
 16 |     # despite its name. The modern Korean Hangul alphabet is a different block,
 17 |     # as is Japanese Hiragana and Katakana. Those alphabets are used to write
 18 |     # space-separated words, so they are not treated specially and handled
 19 |     # like the all of the other languages.
 20 |     if (
 21 |             (cp >= 0x4E00 and cp <= 0x9FFF)
 22 |             or (cp >= 0x3400 and cp <= 0x4DBF)  #
 23 |             or (cp >= 0x20000 and cp <= 0x2A6DF)  #
 24 |             or (cp >= 0x2A700 and cp <= 0x2B73F)  #
 25 |             or (cp >= 0x2B740 and cp <= 0x2B81F)  #
 26 |             or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
 27 |             or (cp >= 0xF900 and cp <= 0xFAFF)
 28 |             or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
 29 |     ):  #
 30 |         return True
 31 | 
 32 |     return False
 33 | 
 34 | 
 35 | def is_chinese(word: str):
 36 |     # word like '180' or '身高' or '神'
 37 |     for char in word:
 38 |         char = ord(char)
 39 |         if not _is_chinese_char(char):
 40 |             return 0
 41 |     return 1
 42 | 
 43 | 
 44 | def get_chinese_word(tokens: List[str]):
 45 |     word_set = set()
 46 | 
 47 |     for token in tokens:
 48 |         chinese_word = len(token) > 1 and is_chinese(token)
 49 |         if chinese_word:
 50 |             word_set.add(token)
 51 |     word_list = list(word_set)
 52 |     return word_list
 53 | 
 54 | 
 55 | def add_sub_symbol(bert_tokens: List[str], chinese_word_set: set()):
 56 |     if not chinese_word_set:
 57 |         return bert_tokens
 58 |     max_word_len = max([len(w) for w in chinese_word_set])
 59 | 
 60 |     bert_word = bert_tokens
 61 |     start, end = 0, len(bert_word)
 62 |     while start < end:
 63 |         single_word = True
 64 |         if is_chinese(bert_word[start]):
 65 |             l = min(end - start, max_word_len)
 66 |             for i in range(l, 1, -1):
 67 |                 whole_word = "".join(bert_word[start: start + i])
 68 |                 if whole_word in chinese_word_set:
 69 |                     for j in range(start + 1, start + i):
 70 |                         bert_word[j] = "##" + bert_word[j]
 71 |                     start = start + i
 72 |                     single_word = False
 73 |                     break
 74 |         if single_word:
 75 |             start += 1
 76 |     return bert_word
 77 | 
 78 | 
 79 | def prepare_ref(lines: List[str], ltp_tokenizer: LTP, bert_tokenizer: BertTokenizer):
 80 |     ltp_res = []
 81 | 
 82 |     with tqdm(range(0, len(lines), 100), total=len(lines) / 100, desc="ltp_tokenizering") as pbar:
 83 |         for i in pbar:
 84 |             res = ltp_tokenizer.seg(lines[i: i + 100])[0]
 85 |             res = [get_chinese_word(r) for r in res]
 86 |             ltp_res.extend(res)
 87 |     assert len(ltp_res) == len(lines)
 88 | 
 89 |     bert_res = []
 90 |     with tqdm(range(0, len(lines), 100), total=len(lines) / 100, desc="bert tokenizering") as pbar:
 91 |         for i in pbar:
 92 |             res = bert_tokenizer(lines[i: i + 100], add_special_tokens=True, truncation=True, max_length=512)
 93 |             bert_res.extend(res["input_ids"])
 94 |     assert len(bert_res) == len(lines)
 95 | 
 96 |     ref_ids = []
 97 |     for input_ids, chinese_word in zip(bert_res, ltp_res):
 98 | 
 99 |         input_tokens = []
100 |         for id in input_ids:
101 |             token = bert_tokenizer._convert_id_to_token(id)
102 |             input_tokens.append(token)
103 |         input_tokens = add_sub_symbol(input_tokens, chinese_word)
104 |         ref_id = []
105 |         # We only save pos of chinese subwords start with ##, which mean is part of a whole word.
106 |         for i, token in enumerate(input_tokens):
107 |             if token[:2] == "##":
108 |                 clean_token = token[2:]
109 |                 # save chinese tokens' pos
110 |                 if len(clean_token) == 1 and _is_chinese_char(ord(clean_token)):
111 |                     ref_id.append(i)
112 |         ref_ids.append(ref_id)
113 |         #
114 |     assert len(ref_ids) == len(bert_res)
115 | 
116 |     return ref_ids
117 | 
118 | 
def main(args):
    """Build the whole-word-masking reference file for a Chinese corpus.

    Reads the corpus at ``args.file_name``, registers the mined new words from
    ``./data/new_words.txt`` with the LTP segmenter, computes the reference
    positions with ``prepare_ref`` and writes them to ``args.save_path`` as one
    JSON list per line.

    Args:
        args: Parsed command-line namespace; ``file_name``, ``bert`` and
            ``save_path`` are read here.
    """
    # For Chinese (Ro)Bert, the best result is from : RoBERTa-wwm-ext (https://github.com/ymcui/Chinese-BERT-wwm)
    # If we want to fine-tune these model, we have to use same tokenizer : LTP (https://github.com/HIT-SCIR/ltp)
    with open(args.file_name, "r", encoding="utf-8") as f:
        data = f.readlines()
    data = [line.strip() for line in data if len(line) > 0 and not line.isspace()]  # avoid delimiter like '\u2029'

    # NOTE(review): args.ltp is not used -- LTP() is constructed with its defaults.
    ltp_tokenizer = LTP()  # faster in GPU device

    # Add the mined new words to the segmenter's dictionary. Blank lines are
    # skipped so that no empty string is registered as a word.
    with open("./data/new_words.txt", "r", encoding="utf-8") as f:
        new_words = [word for word in (line.strip() for line in f) if word]
    ltp_tokenizer.add_words(words=new_words)

    bert_tokenizer = BertTokenizer.from_pretrained(args.bert)

    ref_ids = prepare_ref(data, ltp_tokenizer, bert_tokenizer)

    with open(args.save_path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(ref) + "\n" for ref in ref_ids)
141 | 
142 | 
if __name__ == "__main__":
    # (flag, default value, help text) of every string-valued command-line option.
    cli_options = (
        ("--file_name", "./resources/chinese-demo.txt", "file need process, same as training data in lm"),
        ("--ltp", "./resources/ltp", "resources for LTP tokenizer, usually a path"),
        ("--bert", "./resources/robert", "resources for Bert tokenizer"),
        ("--save_path", "./resources/ref.txt", "path to save res"),
    )

    parser = argparse.ArgumentParser(description="prepare_chinese_ref")
    for flag, default_value, help_text in cli_options:
        parser.add_argument(flag, type=str, default=default_value, help=help_text)

    args = parser.parse_args()
    main(args)
164 | 


--------------------------------------------------------------------------------
/data/test.txt:
--------------------------------------------------------------------------------
  1 | 自然语言处理 语音 编辑 上传视频
  2 | 本词条由“科普中国”科学百科词条编写与应用工作项目 审核 。
  3 | 自然语言处理( Natural Language Processing, NLP)是计算机科学领域与人工智能领域中的一个重要方向。它研究能实现人与计算机之间用自然语言进行有效通信的各种理论和方法。自然语言处理是一门融语言学、计算机科学、数学于一体的科学。因此，这一领域的研究将涉及自然语言，即人们日常使用的语言，所以它与语言学的研究有着密切的联系，但又有重要的区别。自然语言处理并不是一般地研究自然语言，而在于研制能有效地实现自然语言通信的计算机系统，特别是其中的软件系统。因而它是计算机科学的一部分 [1]  。
  4 | 自然语言处理主要应用于机器翻译、舆情监测、自动摘要、观点提取、文本分类、问题回答、文本语义对比、语音识别、中文OCR等方面 [2]  。
  5 | 中文名自然语言处理 [1] 外文名natural language processing [2] 适用领域计算机、人工智能缩    写NLP [1] 
  6 | 目录
  7 | 1 简介
  8 | 2 发展史
  9 | ▪ 早期自然语言处理
 10 | ▪ 统计自然语言处理
 11 | ▪ 神经网络自然语言处理
 12 | 3 关键概念和技术
 13 | ▪ 信息抽取（IE）
 14 | ▪ 自动文摘
 15 | ▪ 语音识别技术
 16 | ▪ Transformer 模型
 17 | ▪ 基于传统机器学习的自然语言处理技术
 18 | ▪ 基于深度学习的自然语言处理技术
 19 | 4 技术难点
 20 | ▪ 内容的有效界定
 21 | ▪ 消歧和模糊性
 22 | ▪ 有瑕疵的或不规范的输入
 23 | ▪ 语言行为与计划
 24 | 5 关联技术
 25 | ▪ 计算机科学
 26 | ▪ 互联网技术
 27 | ▪ 机器学习方法
 28 | 6 工具和平台
 29 | 7 研究热点
 30 | ▪ 预训练技术
 31 | ▪ 图神经网络技术
 32 | 8 未来展望
 33 | 简介编辑 语音
 34 | 语言是人类区别其他动物的本质特性。在所有生物中，只有人类才具有语言能力。人类的多种智能都与语言有着密切的关系。人类的逻辑思维以语言为形式，人类的绝大部分知识也是以语言文字的形式记载和流传下来的。因而，它也是人工智能的一个重要，甚至核心部分。
 35 | 用自然语言与计算机进行通信，这是人们长期以来所追求的。因为它既有明显的实际意义，同时也有重要的理论意义：人们可以用自己最习惯的语言来使用计算机，而无需再花大量的时间和精力去学习不很自然和习惯的各种计算机语言；人们也可通过它进一步了解人类的语言能力和智能的机制。
 36 | 自然语言处理是指利用人类交流所使用的自然语言与机器进行交互通讯的技术。通过人为的对自然语言的处理，使得计算机对其能够可读并理解。自然语言处理的相关研究始于人类对机器翻译的探索。虽然自然语言处理涉及语音、语法、语义、语用等多维度的操作，但简单而言，自然语言处理的基本任务是基于本体词典、词频统计、上下文语义分析等方式对待处理语料进行分词，形成以最小词性为单位，且富含语义的词项单元。 [3] 
 37 | 自然语言处理( Natural Language Processing, NLP)以语言为对象，利用计算机技术来分析、理解和处理自然语言的一门学科,即把计算机作为语言研究的强大工具，在计算机的支持下对语言信息进行定量化的研究,并提供可供人与计算机之间能共同使用的语言描写。包括自然语言理解( NaturalLanguage Understanding, NLU)和自然语言生成( Natural LanguageGeneration, NLG)两部分。它是典型边缘交叉学科，涉及到语言科学、计算机科学、数学、认知学、逻辑学等，关注计算机和人类(自然)语言之间的相互作用的领域。人们把用计算机处理自然语言的过程在不同时期或侧重点不同时又称为自然语言理解( Natural Language Understanding, NLU)、人类语言技术( Human Language Technology, HLT)、计算语言学Hl(Computational Linguistics)、计量语言学( QuantitativeLinguistics)、数理语言学( Mathematical Linguistics) [1]  。
 38 | 实现人机间自然语言通信意味着要使计算机既能理解自然语言文本的意义，也能以自然语言文本来表达给定的意图、思想等。前者称为自然语言理解，后者称为自然语言生成。因此，自然语言处理大体包括了自然语言理解和自然语言生成两个部分。历史上对自然语言理解研究得较多，而对自然语言生成研究得较少。但这种状况已有所改变。
 39 | 无论实现自然语言理解，还是自然语言生成，都远不如人们原来想象的那么简单，而是十分困难的。从现有的理论和技术现状看，通用的、高质量的自然语言处理系统，仍然是较长期的努力目标，但是针对一定应用，具有相当自然语言处理能力的实用系统已经出现，有些已商品化，甚至开始产业化。典型的例子有：多语种数据库和专家系统的自然语言接口、各种机器翻译系统、全文信息检索系统、自动文摘系统等。
 40 | 自然语言处理，即实现人机间自然语言通信，或实现自然语言理解和自然语言生成是十分困难的。造成困难的根本原因是自然语言文本和对话的各个层次上广泛存在的各种各样的歧义性或多义性（ambiguity）。
 41 | 自然语言的形式（字符串）与其意义之间是一种多对多的关系。其实这也正是自然语言的魅力所在。但从计算机处理的角度看，我们必须消除歧义，而且有人认为它正是自然语言理解中的中心问题，即要把带有潜在歧义的自然语言输入转换成某种无歧义的计算机内部表示。
 42 | 歧义现象的广泛存在使得消除它们需要大量的知识和推理，这就给基于语言学的方法、基于知识的方法带来了巨大的困难，因而以这些方法为主流的自然语言处理研究几十年来一方面在理论和方法方面取得了很多成就，但在能处理大规模真实文本的系统研制方面，成绩并不显著。研制的一些系统大多数是小规模的、研究性的演示系统。
 43 | 目前存在的问题有两个方面：一方面，迄今为止的语法都限于分析一个孤立的句子，上下文关系和谈话环境对本句的约束和影响还缺乏系统的研究，因此分析歧义、词语省略、代词所指、同一句话在不同场合或由不同的人说出来所具有的不同含义等问题，尚无明确规律可循，需要加强语用学的研究才能逐步解决。另一方面，人理解一个句子不是单凭语法，还运用了大量的有关知识，包括生活知识和专门知识，这些知识无法全部贮存在计算机里。因此一个书面理解系统只能建立在有限的词汇、句型和特定的主题范围内；计算机的贮存量和运转速度大大提高之后，才有可能适当扩大范围.
 44 | 以上存在的问题成为自然语言理解在机器翻译应用中的主要难题，这也就是当今机器翻译系统的译文质量离理想目标仍相差甚远的原因之一；而译文质量是机译系统成败的关键。中国数学家、语言学家周海中教授曾在经典论文《机器翻译五十年》中指出：要提高机译的质量，首先要解决的是语言本身问题而不是程序设计问题；单靠若干程序来做机译系统，肯定是无法提高机译质量的；另外在人类尚未明了大脑是如何进行语言的模糊识别和逻辑判断的情况下，机译要想达到“信、达、雅”的程度是不可能的。
 45 | 发展史编辑 语音
 46 | 最早的自然语言理解方面的研究工作是机器翻译 [4]  。1949年，美国人威弗首先提出了机器翻译设计方案 [1]  。其发展主要分为三个阶段。
 47 | 早期自然语言处理
 48 | 第一阶段(60~80年代):基于规则来建立词汇、句法语义分析、问答、聊天和机器翻译系统。好处是规则可以利用人类的内省知识，不依赖数据，可以快速起步；问题是覆盖面不足，像个玩具系统，规则管理和可扩展一直没有解决。 [5] 
 49 | 统计自然语言处理
 50 | 第二阶段(90年代开始)：基于统计的机器学习(ML)开始流行，很多NLP开始用基于统计的方法来做。主要思路是利用带标注的数据，基于人工定义的特征建立机器学习系统，并利用数据经过学习确定机器学习系统的参数。运行时利用这些学习得到的参数，对输入数据进行解码，得到输出。机器翻译、搜索引擎都是利用统计方法获得了成功。 [5] 
 51 | 神经网络自然语言处理
 52 | 第三阶段(2008年之后)：深度学习开始在语音和图像发挥威力。随之，NLP研究者开始把目光转向深度学习。先是把深度学习用于特征计算或者建立一个新的特征，然后在原有的统计学习框架下体验效果。比如，搜索引擎加入了深度学习的检索词和文档的相似度计算，以提升搜索的相关度。自2014年以来，人们尝试直接通过深度学习建模，进行端对端的训练。目前已在机器翻译、问答、阅读理解等领域取得了进展，出现了深度学习的热潮。 [5] 
 53 | 关键概念和技术编辑 语音
 54 | 信息抽取（IE）
 55 | 信息抽取是将嵌入在文本中的非结构化信息提取并转换为结构化数据的过程，从自然语言构成的语料中提取出命名实体之间的关系，是一种基于命名实体识别更深层次的研究。信息抽取的主要过程有三步：首先对非结构化的数据进行自动化处理，其次是针对性的抽取文本信息，最后对抽取的信息进行结构化表示。信息抽取最基本的工作是命名实体识别，而核心在于对实体关系的抽取。 [6] 
 56 | 自动文摘
 57 | 自动文摘是利用计算机按照某一规则自动地对文本信息进行提取、集合成简短摘要的一种信息压缩技术，旨在实现两个目标：首先使语言的简短，其次要保留重要信息。 [6] 
 58 | 语音识别技术
 59 | 语音识别技术就是让机器通过识别和理解过程把语音信号转变为相应的文本或命令的技术，也就是让机器听懂人类的语音，其目标是将人类语音中的词汇内容转化为计算机可读的数据。要做到这些，首先必须将连续的讲话分解为词、音素等单位，还需要建立一套理解语义的规则。语音识别技术从流程上讲有前端降噪、语音切割分帧、特征提取、状态匹配几个部分。而其框架可分成声学模型、语言模型和解码三个部分。 [7] 
 60 | Transformer 模型
 61 | Transformer 模型在2017 年，由Google 团队中首次提出。Transformer 是一种基于注意力机制来加速深度学习算法的模型，模型由一组编码器和一组解码器组成，编码器负责处理任意长度的输入并生成其表达，解码器负责把新表达转换为目的词。Transformer 模型利用注意力机制获取所有其他单词之间的关系，生成每个单词的新表示。Transformer 的优点是注意力机制能够在不考虑单词位置的情况下，直接捕捉句子中所有单词之间的关系。模型抛弃之前传统的encoder-decoder 模型必须结合RNN 或者CNN(Convolutional Neural Networks, CNN)的固有模式，使用全Attention 的结构代替了LSTM，减少计算量和提高并行效率的同时不损害最终的实验结果。但是此模型也存在缺陷。首先此模型计算量太大，其次还存在位置信息利用不明显的问题，无法捕获长距离的信息。 [8] 
 62 | 基于传统机器学习的自然语言处理技术
 63 | 自然语言处理可将处理任务进行分类，形成多个子任务，传统的机械学习方法可利用SVM（支持向量机模型）、
 64 | Markov（马尔科夫模型）、CRF（条件随机场模型）等方法对自然语言中多个子任务进行处理，进一步提高处理结果的精度。但是，从实际应用效果上来看，仍存在着以下不足：(1)传统机器学习训练模型的性能过于依赖训练集的质量，需要人工标注训练集，降低了训练效率。(2)传统机器学习模型中的训练集在不同领域应用会出现差异较大的应用效果，削弱了训练的适用性，暴露出学习方法单一的弊端。若想让训练数据集适用于多个不同领域，则要耗费大量人力资源进行人工标注。(3)在处理更高阶、更抽象的自然语言时，机器学习无法人工标注出来这些自然语言特征，使得传统机器学习只能学习预先制定的规则，而不能学规则之外的复杂语言特征。 [9] 
 65 | 基于深度学习的自然语言处理技术
 66 | 深度学习是机器学习的一大分支，在自然语言处理中需应用深度学习模型，如卷积神经网络、循环神经网络等，通过对生成的词向量进行学习，以完成自然语言分类、理解的过程。与传统的机器学习相比，基于深度学习的自然语言处理技术具备以下优势：(1)深度学习能够以词或句子的向量化为前提，不断学习语言特征，掌握更高层次、更加抽象的语言特征，满足大量特征工程的自然语言处理要求。(2)深度学习无需专家人工定义训练集，可通过神经网络自动学习高层次特征。 [9] 
 67 | 技术难点编辑 语音
 68 | 内容的有效界定
 69 | 日常生活中句子间的词汇通常是不会孤立存在的，需要将话语中的所有词语进行相互关联才能够表达出相应的含义,一旦形成特定的句子，词语间就会形成相应的界定关系。如果缺少有效的界定，内容就会变得模棱两可，无法进行有效的理解。例如他背着母亲和姐姐悄悄的出去玩了。这句话中如果不对介词“和”作出界定，就很容易形成母亲和姐姐两个人不知道他出去玩，或者是母亲不知道他和姐姐出去玩。
 70 | 消歧和模糊性
 71 | 词语和句子在不同情况下的运用往往具备多个含义,很容易产生模糊的概念或者是不同的想法，例如高山流水这个词具备多重含义，既可以表示自然环境,也能表达两者间的关系，甚至是形容乐曲的美妙,所以自然语言处理需要根据前后的内容进行界定,从中消除歧义和模糊性，表达出真正的意义。 [10] 
 72 | 有瑕疵的或不规范的输入
 73 | 例如语音处理时遇到外国口音或地方口音,或者在文本的处理中处理拼写,语法或者光学字符识别(OCR)的错误。
 74 | 语言行为与计划
 75 | 句子常常并不只是字面上的意思；例如，“你能把盐递过来吗”，一个好的回答应当是把盐递过去；在大多数上下文环境中，“能”将是糟糕的回答，虽说回答“不”或者“太远了我拿不到”也是可以接受的。再者，如果一门课程上一年没开设，对于提问“这门课程去年有多少学生没通过？”回答“去年没开这门课”要比回答“没人没通过”好。
 76 | 关联技术编辑 语音
 77 | 计算机科学
 78 | 自然语言处理的最初目的就是实现人和计算机的自然语言对话，计算机作为对话的一个主体是自然语言处理这个概念提出的先决条件。长久以来人们对于机器人应用于生活，成为重要生产力推动社会发展，尤其是使机器人拥有“人的智能”就充满了憧憬，自然语言处理作为人工智能领域的一个重要组成部分，对于推动机器人的真正智能化有标志性作用。近年来计算机性能在数据存储能力、处理速度等方面的大幅提升，为海量数据的处理、概率统计，为发现语言的规律、获得内在联系成为可能。 [11] 
 79 | 互联网技术
 80 | 互联网的出现使信息的传播更加便捷，依托于互联网技术出现的各种新媒体是信息已成为信息传播的主要途径，各种网络聊天软件增加了人们沟通交流的途径，这些以文字形式出现具有保存一定时间要求的信息带来了数据的爆炸式增长，为利用基于统计的自然语言处理提供了海量资源。依托于互联网技术，出现的开源平台，也是研究者们获取研究资源的重要途径。 [11] 
 81 | 机器学习方法
 82 | 机器学习是利用数据和经验改进计算机算法、优化计算机性能的多领域交叉学科，可以追溯到17 世纪的最小二乘法、马尔科夫链，但是其真正发展起来应该从20 世纪50 年代算起，经历了“有无知识的学习”的执行、基于图结构及逻辑结构进行系统描述、结合各种应用拓展到对多个概念学习三个阶段的发展，自20 世纪80 年代重要进入更新的、能够真正使计算机智能化的第四阶段。 [11] 
 83 | 利用半监督或无监督的机器学习方法对海量自然语言进行处理也与机器学习的发展历程相对应，大致可以分为两个阶段：基于离散性表示的线性模型的传统机器学习，基于连续性表示的非线性模型的深度学习。 [11] 
 84 | 深度学习是一种计算机自动学习算法，包括输入层、隐含层、输出层三部分，其中输入层是研究人员提供的大量数据，是算法的处理对象，隐含层的层数由实验人员确定，是算法对数据进行特征标记、发现其中规律、建立特征点间联系的过程，输出层则是研究人员可以得到的结果，一般来说输入层得到的数据越多，隐含层的层数越多，对数据的区分结果也就越好，但是带来的问题是计算量的加大、计算难度的提升，所幸计算机硬件在近年来取得飞跃。作为推动自然语言处理的最新动力，机器学习展现出了前所未有的优势： [11] 
 85 | （一）克服了语言特征人工标记的稀疏性的缺点，深度学习可以利用分布式向量对词做分类，词类标签、词义标签、依存关系等可以得到有效标记； [11] 
 86 | （二）克服了语言特征人工标记不完整的问题，人工的语言标记由于工作量的繁重，被遗漏的可能性很大，而高效率的计算机进行此项工作可以大大减少这种失误； [11] 
 87 | （三）克服了传统机器学习算法计算量大、计算时间长的问题，深度学习利用矩阵进行计算将计算量大幅压缩。 [11] 
 88 | 工具和平台编辑 语音
 89 | NLTK [12]  ：全面的python基础NLP库。
 90 | StanfordNLP [13]  ：学界常用的NLP算法库。
 91 | 中文NLP工具：THULAC [14]  、哈工大LTP [15]  、jieba分词 [16]  。
 92 | 研究热点编辑 语音
 93 | 预训练技术
 94 | 预训练思想的本质是模型参数不再随机初始化，而是通过语言模型进行训练。目前NLP 各项任务的解决思路是预训练加微调。预训练对于NLP任务有着巨大的提升帮助，而预训练语言模型也越来越多，从最初的Word2vec］、Glove到通用语言文本分类模型ULMFiT以及EMLo等。而当前最优秀的预训练语言模型是基于Transformer 模型构建。该模型是由Vaswani 等人提出的，其是一种完全基于Self-Attention 所构建的，是目前NLP领域最优秀的特征提取器，不但可以并行运算而且可以捕获长距离特征依赖。 [17] 
 95 | 当前影响最大的预训练语言模型是基于Transformer 的双向深度语言模型—BERT。BERT 是由多层双向Transformer 解码器构成，主要包括2 个不同大小的版本: 基础版本有12 层Transformer，每个Transformer 中的多头注意力层是12 个，隐藏层大小为768; 加强版有24 层Transformer，每个Transformer 中的多头注意力层是24 个，隐藏层大小为1 024。由此可见深而窄的模型效果要优于浅而宽的模型。目前BERT 在机器翻译、文本分类、文本相似性、阅读理解等多个任务中都有优异的表现。BERT 模型的训练方式包括2种：(1) 采用遮盖单词的方式。(2) 采用预测句子下一句的方式。 [17] 
 96 | 通过上述2 种方式训练得到通用语言模型，然后利用微调的方法进行下游任务，如文本分类、机器翻译等任务。较比以前的预训练模型，BERT 可以捕获真正意义上的双向上下文语义。但BERT 也有一定的缺点，既在训练模型时，使用大量的［MASK］会影响模型效果，而且每个批次只有15%的标记被预测，因此BERT 在训练时的收敛速度较慢。此外由于在预训练过程和生成过程不一致，导致在自然语言生成任务表现不佳，而且BERT 无法完成文档级别的NLP 任务，只适合于句子和段落级别的任务。 [17] 
 97 | XLNet是一种广义自回归的语言模型，是基于Transformer－XL 而构建的。Transformer 的缺点:(1) 字符之间的最大依赖距离受输入长度的限制。(2) 对于输入文本长度超过512 个字符时，每个段都是从头开始单独训练，因此使训练效率下降，影响模型性能。针对以上2 个缺点，Transformer－XL引入了2 个解决方法: 分割循环机制( Division Recurrence Mechanism) 和相对位置编码( Relative Positional Encoding) 。Transformer －XL 的测试速度更快，可以捕获更长的上下文长度。 [17] 
 98 | 无监督表征学习在NLP 领域取得了巨大成功，在这种理念下，很多研究者探索了不同的无监督预训练目标，而自回归语言建模和自编码语言是2 个最成功的预训练目标。而XLNet 是一种集合了自回归和自编码2 种方式的泛化自回归方法。XLNet不使用传统自回归模型中的固定前向或后向因式分解顺序，而使用一种随机排列自然语言预测某个位置可能出现的词，这种方式不仅可以使句子中的每个位置都能学习来自所有位置的语境信息，而且还可以构建双向语义，更好地获取上下文语义。由于XLNet 采用的是Transformer－XL，因此模型性能更优，尤其在包含长文本序列的任务中。 [17] 
 99 | 无论是BERT 还是XLNet 语言模型，在英文语料中表现都很优异，但在中文语料中效果一般，ERNIE则是以中文语料训练得出一种语言模型。ERNIE 是一种知识增强语义表示模型，其在语言推断、语义相似度、命名实体识别、文本分类等多个NLP 中文任务上都有优异表现。ERNIE 在处理中文语料时，通过对预测汉字进行建模，可以学习到更大语义单元的完整语义表示。ERNIE 模型内部核心是由Transformer 所构成。模型结构主要包括2 个模块，下层模块的文本编码器( T－Encoder) 主要负责捕获来自输入标记的基本词汇和句法信息，上层模块的知识编码器( KEncoder)负责从下层获取的知识信息集成到文本信息中，以便能够将标记和实体的异构信息表示成一个统一的特征空间中。 [17] 
100 | 图神经网络技术
101 | 图神经网络（Graph Neural Network）的研究主要是集中在相邻节点信息的传播与聚合上，从图神经网络的概念提出，到受深度学习中卷积神经网络的启发 [18]  。图神经网络对于非欧几里德数据在深度学习中的应用有着非常重要的地位，尤其是利用图结构在传统贝叶斯因果网络上可解释的特点，在定义深度神经网络关系可推理、因果可解释的问题上有很大的研究意义。如何利用深度学习方法对图结构的数据进行分析和推理吸引了非常多的研究和关注。 [18] 
102 | 通用的图神经网络推理过程可以通过图节点预表示、图节点采样、子图提取、子图特征融合、图神经网络的生成和训练子过程来表示，具体步骤如下： [18] 
103 | STEP1 图节点预表示： 通过图嵌入（Graph Embedding）的方法对图中每一个节点进行嵌入表示； [18] 
104 | STEP2 图节点采样：对图中每个节点或存在的节点对的正负样本进行采样； [18] 
105 | STEP3 子图提取：提取图中每一个节点的邻节点构建n 阶子图，其中n 表示第n 层的邻节点，从而形成通用的子图结构； [18] 
106 | STEP4 子图特征融合：对每一个输入神经网络的子图进行局部或全局的特征提取； [18] 
107 | STEP5 生成图神经网络和训练：定义网络的层数和输入输出的参数，并对图数据进行网络训练。 [18] 
108 | 1.图卷积神经网络
109 | 深度学习的流行与卷积神经网络的广泛适用性密不可分，图神经网络的研究中时间最长、研究成果最多的就是图卷积神经网络，从特征空间来看图卷积神经网络主要可以分为频域和空间域两个类型。 [18] 
110 | 频域的图卷积神经网络基于图信号处理问题,将图神经网络的卷积层定义为一个滤波器，即通过滤波器去除噪声信号从而得到输入信号的分类结果。实际问题中只能用于处理无向且边上无信息的图结构，将输入信号的图定义为可特征分解的拉普拉斯矩阵,归一化后的特征分解可以表示为通用结构其对角矩阵𝑨就是特征值的𝜆𝑖按序排列组成的特征矩阵。 [18] 
111 | 2.基于空间的图卷积神经网络
112 | 与深度学习中卷积神经网络对图像的像素点进行卷积运算类似，基于空间的图卷积神经网络通过计算中心单一节点与邻节点之间的卷积，来表示邻节点间信息的传递和聚合，作为特征域的新节点表示。 [18] 
113 | 未来展望编辑 语音
114 | 自然语言处理领域一直是基于规则和基于统计２种研究方法交替占据主导地位，２种研究都先后遇到瓶颈，基于规则和传统机器学习的方法到达一定阶段后就很难再取得更大的突破，直到计算能力和数据存储的提升才极大地促进了自然语言处理的发展。语音识别的突破使得深度学习技术变得非常普及。取得较大进展的还有机器翻译，谷歌翻译目前用深度神经网络技术将机器翻译提升到了新的高度，即使达不到人工翻译标准也足以应对大部分的需求。信息抽取也变得更加智能，能更好地理解复杂句子结构和实体间关系，抽取出正确的事实。深度学习推动了自然语言处理任务的进步，同时自然语言处理任务也为深度学习提供了广阔的应用前景，使得人们在算法设计上投入得更多。人工智能的进步会继续促进自然语言处理的发展，也使得自然语言处理面临着如下挑战： [19] 
115 | １）更优的算法。人工智能发展的三要素（数据、计算能力和算法）中，与自然语言处理研究者最相关的就是算法设计。深度学习已经在很多任务中表现出了强大的优势，但后向传播方式的合理性近期受到质疑。深度学习是通过大数据完成小任务的方法，重点在做归纳，学习效率是比较低的，而能否从小数据出发，分析出其蕴含的原理，从演绎的角度出发来完成多任务，是未来非常值得研究的方向。 [19] 
116 | ２）语言的深度分析。尽管深度学习很大程度上提升了自然语言处理的效果，但该领域是关于语言技术的科学，而不是寻找最好的机器学习方法，核心仍然是语言学问题。未来语言中的难题还需要关注语义理解，从大规模网络数据中，通过深入的语义分析，结合语言学理论，发现语义产生与理解的规律，研究数据背后隐藏的模式，扩充和完善已有的知识模型，使语义表示更加准确．语言理解需要理性与经验的结合，理性是先验的，而经验可以扩充知识，因此需要充分利用世界知识和语言学理论指导先进技术来理解语义。分布式词向量中隐含了部分语义信息，通过词向量的不同组合方式，能够表达出更丰富的语义，但词向量的语义作用仍未完全发挥，挖掘语言中的语义表示模式，并将语义用形式化语言完整准确地表示出来让计算机理解，是将来研究的重点任务。 [19] 
117 | ３）多学科的交叉。在理解语义的问题上，需要寻找一个合适的模型。在模型的探索中，需要充分借鉴语言哲学、认知科学和脑科学领域的研究成果，从认知的角度去发现语义的产生与理解，有可能会为语言理解建立更好的模型。在科技创新的今天，多学科的交叉可以更好地促进自然语言处理的发展。 [19] 
118 | 深度学习为自然语言处理带来了重大技术突破，它的广泛应用极大地改变了人们的日常生活。当深度学习和其他认知科学、语言学结合时，或许可以发挥出更大的威力，解决语义理解问题，带来真正的“智能”。 [19] 
119 | 尽管深度学习在NLP 各个任务中取得了巨大成功，但若大规模投入使用，仍然有许多研究难点需要克服。深度神经网络模型越大，使得模型训练时间延长，如何减小模型体积但同时保持模型性能不变是未来研究的一个方向。此外深度神经网络模型可解释性较差，在自然语言生成任务研究进展不大。但是，随着深度学习的不断研究深入，在不久的将来，NLP 领域将会取得更多研究成果和发展。 [17] 


--------------------------------------------------------------------------------
/run_mlm_wwm.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2020 The HuggingFace Team All rights reserved.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | """
 16 | Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...) with whole word masking on a
 17 | text file or a dataset.
 18 | 
 19 | Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
 20 | https://huggingface.co/models?filter=masked-lm
 21 | """
 22 | # You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments.
 23 | 
 24 | import json
 25 | import logging
 26 | import math
 27 | import os
 28 | import sys
 29 | from dataclasses import dataclass, field
 30 | from typing import Optional
 31 | 
 32 | from datasets import Dataset, load_dataset
 33 | 
 34 | import transformers
 35 | from transformers import (
 36 |     CONFIG_MAPPING,
 37 |     MODEL_FOR_MASKED_LM_MAPPING,
 38 |     AutoConfig,
 39 |     AutoModelForMaskedLM,
 40 |     AutoTokenizer,
 41 |     DataCollatorForWholeWordMask,
 42 |     HfArgumentParser,
 43 |     Trainer,
 44 |     TrainingArguments,
 45 |     set_seed,
 46 | )
 47 | from transformers.trainer_utils import get_last_checkpoint, is_main_process
 48 | 
 49 | 
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Config classes that have a masked-LM model registered, and their `model_type`
# strings; the latter are listed in the help text of ModelArguments.model_type.
MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
 53 | 
 54 | 
 55 | @dataclass
 56 | class ModelArguments:
 57 |     """
 58 |     Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
 59 |     """
 60 | 
 61 |     model_name_or_path: Optional[str] = field(
 62 |         default=None,
 63 |         metadata={
 64 |             "help": "The model checkpoint for weights initialization."
 65 |             "Don't set if you want to train a model from scratch."
 66 |         },
 67 |     )
 68 |     model_type: Optional[str] = field(
 69 |         default=None,
 70 |         metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
 71 |     )
 72 |     config_name: Optional[str] = field(
 73 |         default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
 74 |     )
 75 |     tokenizer_name: Optional[str] = field(
 76 |         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
 77 |     )
 78 |     cache_dir: Optional[str] = field(
 79 |         default=None,
 80 |         metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
 81 |     )
 82 |     use_fast_tokenizer: bool = field(
 83 |         default=True,
 84 |         metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
 85 |     )
 86 |     model_revision: str = field(
 87 |         default="main",
 88 |         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
 89 |     )
 90 |     use_auth_token: bool = field(
 91 |         default=False,
 92 |         metadata={
 93 |             "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
 94 |             "with private models)."
 95 |         },
 96 |     )
 97 | 
 98 | 
 99 | @dataclass
100 | class DataTrainingArguments:
101 |     """
102 |     Arguments pertaining to what data we are going to input our model for training and eval.
103 |     """
104 | 
105 |     dataset_name: Optional[str] = field(
106 |         default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
107 |     )
108 |     dataset_config_name: Optional[str] = field(
109 |         default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
110 |     )
111 |     train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
112 |     validation_file: Optional[str] = field(
113 |         default=None,
114 |         metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
115 |     )
116 |     train_ref_file: Optional[str] = field(
117 |         default=None,
118 |         metadata={"help": "An optional input train ref data file for whole word masking in Chinese."},
119 |     )
120 |     validation_ref_file: Optional[str] = field(
121 |         default=None,
122 |         metadata={"help": "An optional input validation ref data file for whole word masking in Chinese."},
123 |     )
124 |     overwrite_cache: bool = field(
125 |         default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
126 |     )
127 |     validation_split_percentage: Optional[int] = field(
128 |         default=5,
129 |         metadata={
130 |             "help": "The percentage of the train set used as validation set in case there's no validation split"
131 |         },
132 |     )
133 |     max_seq_length: Optional[int] = field(
134 |         default=None,
135 |         metadata={
136 |             "help": "The maximum total input sequence length after tokenization. Sequences longer "
137 |             "than this will be truncated. Default to the max input length of the model."
138 |         },
139 |     )
140 |     preprocessing_num_workers: Optional[int] = field(
141 |         default=None,
142 |         metadata={"help": "The number of processes to use for the preprocessing."},
143 |     )
144 |     mlm_probability: float = field(
145 |         default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
146 |     )
147 |     pad_to_max_length: bool = field(
148 |         default=False,
149 |         metadata={
150 |             "help": "Whether to pad all samples to `max_seq_length`. "
151 |             "If False, will pad the samples dynamically when batching to the maximum length in the batch."
152 |         },
153 |     )
154 | 
155 |     def __post_init__(self):
156 |         if self.train_file is not None:
157 |             extension = self.train_file.split(".")[-1]
158 |             assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file."
159 |         if self.validation_file is not None:
160 |             extension = self.validation_file.split(".")[-1]
161 |             assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
162 | 
163 | 
def add_chinese_references(dataset, ref_file):
    """Return a copy of ``dataset`` with an extra ``chinese_ref`` column.

    ``ref_file`` is read as UTF-8 text holding one JSON value per line; empty
    and whitespace-only lines are ignored. The number of parsed references must
    equal ``len(dataset)``, otherwise an AssertionError is raised. All existing
    columns are copied into a dict, the references are added under the key
    "chinese_ref", and a new ``Dataset`` is built from that dict.
    """
    with open(ref_file, "r", encoding="utf-8") as handle:
        raw_lines = handle.read().splitlines()

    refs = []
    for raw in raw_lines:
        if raw and not raw.isspace():
            refs.append(json.loads(raw))
    assert len(dataset) == len(refs)

    columns = {}
    for column in dataset.column_names:
        columns[column] = dataset[column]
    columns["chinese_ref"] = refs
    return Dataset.from_dict(columns)
172 | 
173 | 
174 | def main():
175 |     # See all possible arguments in src/transformers/training_args.py
176 |     # or by passing the --help flag to this script.
177 |     # We now keep distinct sets of args, for a cleaner separation of concerns.
178 | 
179 |     parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
180 |     if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
181 |         # If we pass only one argument to the script and it's the path to a json file,
182 |         # let's parse it to get our arguments.
183 |         model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
184 |     else:
185 |         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
186 | 
187 |     # Detecting last checkpoint.
188 |     last_checkpoint = None
189 |     if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
190 |         last_checkpoint = get_last_checkpoint(training_args.output_dir)
191 |         if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
192 |             raise ValueError(
193 |                 f"Output directory ({training_args.output_dir}) already exists and is not empty. "
194 |                 "Use --overwrite_output_dir to overcome."
195 |             )
196 |         elif last_checkpoint is not None:
197 |             logger.info(
198 |                 f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
199 |                 "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
200 |             )
201 | 
202 |     # Setup logging
203 |     logging.basicConfig(
204 |         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
205 |         datefmt="%m/%d/%Y %H:%M:%S",
206 |         handlers=[logging.StreamHandler(sys.stdout)],
207 |     )
208 |     logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
209 | 
210 |     # Log on each process the small summary:
211 |     logger.warning(
212 |         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
213 |         + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
214 |     )
215 |     # Set the verbosity to info of the Transformers logger (on main process only):
216 |     if is_main_process(training_args.local_rank):
217 |         transformers.utils.logging.set_verbosity_info()
218 |         transformers.utils.logging.enable_default_handler()
219 |         transformers.utils.logging.enable_explicit_format()
220 |     logger.info("Training/evaluation parameters %s", training_args)
221 | 
222 |     # Set seed before initializing model.
223 |     set_seed(training_args.seed)
224 | 
225 |     # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
226 |     # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
227 |     # (the dataset will be downloaded automatically from the datasets Hub).
228 |     #
229 |     # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
230 |     # 'text' is found. You can easily tweak this behavior (see below).
231 |     #
232 |     # In distributed training, the load_dataset function guarantee that only one local process can concurrently
233 |     # download the dataset.
234 |     if data_args.dataset_name is not None:
235 |         # Downloading and loading a dataset from the hub.
236 |         datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
237 |         if "validation" not in datasets.keys():
238 |             datasets["validation"] = load_dataset(
239 |                 data_args.dataset_name,
240 |                 data_args.dataset_config_name,
241 |                 split=f"train[:{data_args.validation_split_percentage}%]",
242 |             )
243 |             datasets["train"] = load_dataset(
244 |                 data_args.dataset_name,
245 |                 data_args.dataset_config_name,
246 |                 split=f"train[{data_args.validation_split_percentage}%:]",
247 |             )
248 |     else:
249 |         data_files = {}
250 |         if data_args.train_file is not None:
251 |             data_files["train"] = data_args.train_file
252 |         if data_args.validation_file is not None:
253 |             data_files["validation"] = data_args.validation_file
254 |         extension = data_args.train_file.split(".")[-1]
255 |         if extension == "txt":
256 |             extension = "text"
257 |         datasets = load_dataset(extension, data_files=data_files)
258 |     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
259 |     # https://huggingface.co/docs/datasets/loading_datasets.html.
260 | 
261 |     # Load pretrained model and tokenizer
262 |     #
263 |     # Distributed training:
264 |     # The .from_pretrained methods guarantee that only one local process can concurrently
265 |     # download model & vocab.
266 |     config_kwargs = {
267 |         "cache_dir": model_args.cache_dir,
268 |         "revision": model_args.model_revision,
269 |         "use_auth_token": True if model_args.use_auth_token else None,
270 |     }
271 |     if model_args.config_name:
272 |         config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
273 |     elif model_args.model_name_or_path:
274 |         config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
275 |     else:
276 |         config = CONFIG_MAPPING[model_args.model_type]()
277 |         logger.warning("You are instantiating a new config instance from scratch.")
278 | 
279 |     tokenizer_kwargs = {
280 |         "cache_dir": model_args.cache_dir,
281 |         "use_fast": model_args.use_fast_tokenizer,
282 |         "revision": model_args.model_revision,
283 |         "use_auth_token": True if model_args.use_auth_token else None,
284 |     }
285 |     if model_args.tokenizer_name:
286 |         tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
287 |     elif model_args.model_name_or_path:
288 |         tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
289 |     else:
290 |         raise ValueError(
291 |             "You are instantiating a new tokenizer from scratch. This is not supported by this script."
292 |             "You can do it from another script, save it, and load it from here, using --tokenizer_name."
293 |         )
294 | 
295 |     if model_args.model_name_or_path:
296 |         model = AutoModelForMaskedLM.from_pretrained(
297 |             model_args.model_name_or_path,
298 |             from_tf=bool(".ckpt" in model_args.model_name_or_path),
299 |             config=config,
300 |             cache_dir=model_args.cache_dir,
301 |             revision=model_args.model_revision,
302 |             use_auth_token=True if model_args.use_auth_token else None,
303 |         )
304 |     else:
305 |         logger.info("Training new model from scratch")
306 |         model = AutoModelForMaskedLM.from_config(config)
307 | 
308 |     model.resize_token_embeddings(len(tokenizer))
309 | 
310 |     # Preprocessing the datasets.
311 |     # First we tokenize all the texts.
312 |     if training_args.do_train:
313 |         column_names = datasets["train"].column_names
314 |     else:
315 |         column_names = datasets["validation"].column_names
316 |     text_column_name = "text" if "text" in column_names else column_names[0]
317 | 
318 |     padding = "max_length" if data_args.pad_to_max_length else False
319 | 
320 |     def tokenize_function(examples):
321 |         # Remove empty lines
322 |         examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
323 |         return tokenizer(examples["text"], padding=padding, truncation=True, max_length=data_args.max_seq_length)
324 | 
325 |     tokenized_datasets = datasets.map(
326 |         tokenize_function,
327 |         batched=True,
328 |         num_proc=data_args.preprocessing_num_workers,
329 |         remove_columns=[text_column_name],
330 |         load_from_cache_file=not data_args.overwrite_cache,
331 |     )
332 | 
333 |     # Add the chinese references if provided
334 |     if data_args.train_ref_file is not None:
335 |         tokenized_datasets["train"] = add_chinese_references(tokenized_datasets["train"], data_args.train_ref_file)
336 |     if data_args.validation_ref_file is not None:
337 |         tokenized_datasets["validation"] = add_chinese_references(
338 |             tokenized_datasets["validation"], data_args.validation_ref_file
339 |         )
340 |     # If we have ref files, need to avoid it removed by trainer
341 |     has_ref = data_args.train_ref_file or data_args.validation_ref_file
342 |     if has_ref:
343 |         training_args.remove_unused_columns = False
344 | 
345 |     # Data collator
346 |     # This one will take care of randomly masking the tokens.
347 |     data_collator = DataCollatorForWholeWordMask(tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)
348 | 
349 |     # Initialize our Trainer
350 |     trainer = Trainer(
351 |         model=model,
352 |         args=training_args,
353 |         train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
354 |         eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
355 |         tokenizer=tokenizer,
356 |         data_collator=data_collator,
357 |     )
358 | 
359 |     # Training
360 |     if training_args.do_train:
361 |         if last_checkpoint is not None:
362 |             checkpoint = last_checkpoint
363 |         elif model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path):
364 |             checkpoint = model_args.model_name_or_path
365 |         else:
366 |             checkpoint = None
367 |         train_result = trainer.train(resume_from_checkpoint=checkpoint)
368 |         trainer.save_model()  # Saves the tokenizer too for easy upload
369 | 
370 |         output_train_file = os.path.join(training_args.output_dir, "train_results.txt")
371 |         if trainer.is_world_process_zero():
372 |             with open(output_train_file, "w") as writer:
373 |                 logger.info("***** Train results *****")
374 |                 for key, value in sorted(train_result.metrics.items()):
375 |                     logger.info(f"  {key} = {value}")
376 |                     writer.write(f"{key} = {value}\n")
377 | 
378 |             # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
379 |             trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json"))
380 | 
381 |     # Evaluation
382 |     results = {}
383 |     if training_args.do_eval:
384 |         logger.info("*** Evaluate ***")
385 | 
386 |         eval_output = trainer.evaluate()
387 | 
388 |         perplexity = math.exp(eval_output["eval_loss"])
389 |         results["perplexity"] = perplexity
390 | 
391 |         output_eval_file = os.path.join(training_args.output_dir, "eval_results_mlm_wwm.txt")
392 |         if trainer.is_world_process_zero():
393 |             with open(output_eval_file, "w") as writer:
394 |                 logger.info("***** Eval results *****")
395 |                 for key, value in sorted(results.items()):
396 |                     logger.info(f"  {key} = {value}")
397 |                     writer.write(f"{key} = {value}\n")
398 | 
399 |     return results
400 | 
401 | 
def _mp_fn(index):
    """Entry point used when the script is launched through xla_spawn (TPUs).

    Simply delegates to ``main()``; the return value of ``main()`` is discarded.

    Args:
        index: Not used by this function (presumably the process index
            supplied by the spawner -- confirm against the launcher).
    """
    # For xla_spawn (TPUs)
    main()
405 | 
406 | 
# Run the training/evaluation entry point only when this file is executed
# as a script (not when it is imported as a module).
if __name__ == "__main__":
    main()
409 | 


--------------------------------------------------------------------------------
/data/新词挖掘.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# 新词挖掘"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {
 13 |     "ExecuteTime": {
 14 |      "end_time": "2021-09-24T02:28:23.426088Z",
 15 |      "start_time": "2021-09-24T02:28:23.418927Z"
 16 |     }
 17 |    },
 18 |    "source": [
 19 |     "用于挖掘特定领域场景的新词，这里提供两种方法，分别是**基于频次**和**基于自由凝固度以及左右邻字熵**"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "markdown",
 24 |    "metadata": {},
 25 |    "source": [
 26 |     "## 基于频次的新词挖掘"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "code",
 31 |    "execution_count": null,
 32 |    "metadata": {
 33 |     "ExecuteTime": {
 34 |      "end_time": "2021-09-24T09:21:23.516012Z",
 35 |      "start_time": "2021-09-24T09:21:20.971619Z"
 36 |     }
 37 |    },
 38 |    "outputs": [],
 39 |    "source": [
 40 |     "from tqdm import tqdm\n",
 41 |     "from ltp import LTP\n",
 42 |     "\n",
 43 |     "ltp = LTP()\n",
 44 |     "with open(\"./test.txt\", \"r\", encoding = \"utf-8\") as f:\n",
 45 |     "    texts = f.readlines()[:100000]  # 因语料太大，所以这里只用了前10W条做新词发现\n",
 46 |     "    with tqdm(range(0, len(texts), 1000)) as pbar:\n",
 47 |     "        words = []\n",
 48 |     "        for i in pbar:\n",
 49 |     "            words.extend([word for text in ltp.seg(texts[i:i+1000])[0] for word in text])\n",
 50 |     "print(len(words))"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": null,
 56 |    "metadata": {
 57 |     "ExecuteTime": {
 58 |      "end_time": "2021-09-24T09:29:05.179009Z",
 59 |      "start_time": "2021-09-24T09:29:03.411788Z"
 60 |     },
 61 |     "scrolled": true
 62 |    },
 63 |    "outputs": [],
 64 |    "source": [
 65 |     "from ltp import LTP\n",
 66 |     "ltp = LTP()\n",
 67 |     "\n",
 68 |     "text = '''\n",
 69 |     "自然语言处理( Natural Language Processing, NLP)是计算机科学领域与人工智能领域中的一个重要方向。它研究能实现人与计算机之间用自然语言进行有效通信的各种理论和方法。自然语言处理是一门融语言学、计算机科学、数学于一体的科学。因此，这一领域的研究将涉及自然语言，即人们日常使用的语言，所以它与语言学的研究有着密切的联系，但又有重要的区别。自然语言处理并不是一般地研究自然语言，而在于研制能有效地实现自然语言通信的计算机系统，特别是其中的软件系统。因而它是计算机科学的一部分。\n",
 70 |     "'''\n"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": null,
 76 |    "metadata": {
 77 |     "ExecuteTime": {
 78 |      "end_time": "2021-09-24T09:29:15.099171Z",
 79 |      "start_time": "2021-09-24T09:29:15.048454Z"
 80 |     }
 81 |    },
 82 |    "outputs": [],
 83 |    "source": [
 84 |     "print(ltp.seg([text])[0])"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": null,
 90 |    "metadata": {
 91 |     "ExecuteTime": {
 92 |      "end_time": "2021-09-24T09:32:28.634031Z",
 93 |      "start_time": "2021-09-24T09:32:28.327963Z"
 94 |     }
 95 |    },
 96 |    "outputs": [],
 97 |    "source": [
 98 |     "def get_chinese_words(file_path):\n",
 99 |     "    with open(file_path, \"r\", encoding = \"utf-8\") as f:\n",
100 |     "        return [line.split()[0] for line in f.readlines()]\n",
101 |     "\n",
102 |     "CH_DICT = set(get_chinese_words(\"chinese_words.txt\"))"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": null,
108 |    "metadata": {
109 |     "ExecuteTime": {
110 |      "end_time": "2021-09-24T09:32:32.021425Z",
111 |      "start_time": "2021-09-24T09:32:31.989504Z"
112 |     }
113 |    },
114 |    "outputs": [],
115 |    "source": [
116 |     "import re\n",
117 |     "\n",
118 |     "unigram_freq, bigram_freq = {},{}\n",
119 |     "for i in range(len(words)-1):\n",
120 |     "    if words[i] not in CH_DICT and not re.search(\"[^\\u4e00-\\u9fa5]\",words[i]): \n",
121 |     "        if words[i] in unigram_freq: # 一阶计数\n",
122 |     "            unigram_freq[words[i]] += 1\n",
123 |     "        else:\n",
124 |     "            unigram_freq[words[i]] = 1\n",
125 |     "    bigram = words[i]+words[i+1]\n",
126 |     "    if bigram not in CH_DICT and not re.search(\"[^\\u4e00-\\u9fa5]\",bigram): \n",
127 |     "        if bigram in bigram_freq:\n",
128 |     "            bigram_freq[bigram] += 1\n",
129 |     "        else:\n",
130 |     "            bigram_freq[bigram] = 1"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "code",
135 |    "execution_count": null,
136 |    "metadata": {
137 |     "ExecuteTime": {
138 |      "end_time": "2021-09-24T09:32:36.118237Z",
139 |      "start_time": "2021-09-24T09:32:36.107604Z"
140 |     }
141 |    },
142 |    "outputs": [],
143 |    "source": [
144 |     "unigram_freq_sorted = sorted(unigram_freq.items(), key = lambda d: d[1],reverse = True)\n",
145 |     "bigram_freq_sorted = sorted(bigram_freq.items(), key = lambda d: d[1],reverse = True)"
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "code",
150 |    "execution_count": null,
151 |    "metadata": {
152 |     "ExecuteTime": {
153 |      "end_time": "2021-09-24T09:39:54.276828Z",
154 |      "start_time": "2021-09-24T09:39:54.266340Z"
155 |     }
156 |    },
157 |    "outputs": [],
158 |    "source": [
159 |     "print(\"unigram:\\n\", unigram_freq_sorted[:100])\n",
160 |     "print(\"\\n\")\n",
161 |     "print(\"bigram:\\n\", bigram_freq_sorted[:100])"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "markdown",
166 |    "metadata": {},
167 |    "source": [
168 |     "## 基于自由凝固度以及左右邻字熵的新词挖掘"
169 |    ]
170 |   },
171 |   {
172 |    "cell_type": "markdown",
173 |    "metadata": {},
174 |    "source": [
175 |     "- 自由凝固度：表示一个字串的凝固程度。\n",
176 |     "<center>$$pmi(x,y) = \\log{\\frac{P(x,y)}{P(x)P(y)}}$$</center>\n",
177 |     "- 左邻字熵与右邻字熵：表示一个字串左右搭配的丰富性。\n",
178 |     "<center>$$entropy(w) = -\\sum_{i} P(x_i)\\log P(x_i)$$</center>"
179 |    ]
180 |   },
181 |   {
182 |    "cell_type": "markdown",
183 |    "metadata": {},
184 |    "source": [
185 |     "**实战流程**  \n",
186 |     "第一步：数据获取及预处理；词典获取；  \n",
187 |     "第二步：将数据进行切分获取所有切分出的候选单词，并且统计词频信息、候选新词左右出现的字的信息；  \n",
188 |     "第三步：根据第二步中的统计信息计算 pmi 值以及左右邻字熵；  \n",
189 |     "第四步：设定各指标的阈值，根据其值获取新词结果；  \n",
190 |     "第五步：根据一些规则过滤掉明显不正确的新词，得到最终的新词结果。"
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "markdown",
195 |    "metadata": {},
196 |    "source": [
197 |     "### 数据获取及预处理；词典获取"
198 |    ]
199 |   },
200 |   {
201 |    "cell_type": "code",
202 |    "execution_count": null,
203 |    "metadata": {
204 |     "ExecuteTime": {
205 |      "end_time": "2021-09-27T00:19:49.179030Z",
206 |      "start_time": "2021-09-27T00:19:43.722062Z"
207 |     }
208 |    },
209 |    "outputs": [],
210 |    "source": [
211 |     "# 读取数据\n",
212 |     "from tqdm import tqdm\n",
213 |     "import re\n",
214 |     "from ltp import LTP\n",
215 |     "ltp = LTP()\n",
216 |     "    \n",
217 |     "def preprocess_data(file_path):\n",
218 |     "    texts = []\n",
219 |     "    with open(file_path, \"r\", encoding = \"utf-8\") as f:\n",
220 |     "        lines = f.readlines()[:100000]\n",
221 |     "        with tqdm(lines, total=len(lines)) as pbar:\n",
222 |     "            for text in pbar:\n",
223 |     "                text = re.sub(\"[^\\u4e00-\\u9fa5。？．，！：]\", \"\", text.strip()) \n",
224 |     "    #             text_splited = re.split(\"[。？．，！：]\", text) \n",
225 |     "                text_splited = ltp.sent_split([text])  # 调用LTP进行分句\n",
226 |     "                texts.extend(text_splited)\n",
227 |     "            \n",
228 |     "    tmp = texts\n",
229 |     "    texts = []\n",
230 |     "    with tqdm(tmp, total=len(tmp), desc=\"filtering the null sentences\") as pbar:\n",
231 |     "        for text in pbar:\n",
232 |     "            if text is not \"\":\n",
233 |     "                texts.append(text)\n",
234 |     "    return texts"
235 |    ]
236 |   },
237 |   {
238 |    "cell_type": "code",
239 |    "execution_count": null,
240 |    "metadata": {
241 |     "ExecuteTime": {
242 |      "end_time": "2021-09-27T00:19:51.054557Z",
243 |      "start_time": "2021-09-27T00:19:51.015435Z"
244 |     }
245 |    },
246 |    "outputs": [],
247 |    "source": [
248 |     "texts = preprocess_data(\"test.txt\")  # 清洗文本并调用 LTP 进行分句"
249 |    ]
250 |   },
251 |   {
252 |    "cell_type": "code",
253 |    "execution_count": null,
254 |    "metadata": {
255 |     "ExecuteTime": {
256 |      "end_time": "2021-09-27T00:19:53.604507Z",
257 |      "start_time": "2021-09-27T00:19:53.340778Z"
258 |     }
259 |    },
260 |    "outputs": [],
261 |    "source": [
262 |     "# 获取已有的中文词典\n",
263 |     "def get_chinese_words(file_path):\n",
264 |     "    with open(file_path, \"r\", encoding = \"utf-8\") as f:\n",
265 |     "        return [line.split()[0] for line in f.readlines()]\n",
266 |     "\n",
267 |     "CH_DICT = set(get_chinese_words(\"chinese_words.txt\"))"
268 |    ]
269 |   },
270 |   {
271 |    "cell_type": "markdown",
272 |    "metadata": {},
273 |    "source": [
274 |     "### 将数据进行切分获取所有切分出的候选单词，并且统计词频信息、候选新词左右出现的字的信息"
275 |    ]
276 |   },
277 |   {
278 |    "cell_type": "markdown",
279 |    "metadata": {},
280 |    "source": [
281 |     "接下来需要对文本进行切分以及获取相关的频次信息，这里统一在一个函数中，主要逻辑如下：\n",
282 |     "- 对文本按照一定的长度范围进行切分，切分出所有成词的可能性，这里称之为字符串；\n",
283 |     "- 对所有切分出的字符串进行过滤，将长度大于等于 2 且不在词典 CH_DICT 中的字符串作为候选新词；\n",
284 |     "- 获取所有切分出的字符串的频次信息（在后续计算中需要用到一些字符串的频次信息）、候选新词词频信息、候选新词左右出现的字的统计信息。"
285 |    ]
286 |   },
287 |   {
288 |    "cell_type": "code",
289 |    "execution_count": null,
290 |    "metadata": {
291 |     "ExecuteTime": {
292 |      "end_time": "2021-09-27T00:19:53.913456Z",
293 |      "start_time": "2021-09-27T00:19:53.874699Z"
294 |     }
295 |    },
296 |    "outputs": [],
297 |    "source": [
298 |     "def get_candidate_wordsinfo(texts, max_word_len):\n",
299 |     "    '''\n",
300 |     "    texts：表示输入的所有文本\n",
301 |     "    max_word_len：表示最长的词长    \n",
302 |     "    '''\n",
303 |     "    # 四个词典均以单词为 key，分别以词频、候选新词词频、左字集合、右字集合为 value\n",
304 |     "    words_freq, candidate_words_freq, candidate_words_left_characters, candidate_words_right_characters = {},{},{},{}\n",
305 |     "    WORD_NUM = 0  # 统计所有可能的字符串频次\n",
306 |     "    with tqdm(texts, total=len(texts)) as pbar:\n",
307 |     "        for text in pbar:  # 遍历每个文本\n",
308 |     "            # word_indexes 中存储了所有可能的词汇的切分下标 (i,j) ，i 表示词汇的起始下标，j 表示结束下标，注意这里有包括了所有的字\n",
309 |     "            # word_indexes 的生成需要两层循环，第一层循环，遍历所有可能的起始下标 i；第二层循环，在给定 i 的情况下，遍历所有可能的结束下标 j\n",
310 |     "            word_indexes = [(i,j) for i in range(len(text)) for j in range(i + 1, i + 1 + max_word_len)]\n",
311 |     "            WORD_NUM += len(word_indexes)\n",
312 |     "            for index in word_indexes:  # 遍历所有词汇的下标\n",
313 |     "                word = text[index[0]:index[1]]  # 获取单词\n",
314 |     "                # 更新所有切分出的字符串的频次信息\n",
315 |     "                if word in words_freq:\n",
316 |     "                    words_freq[word] += 1\n",
317 |     "                else:\n",
318 |     "                    words_freq[word] = 1\n",
319 |     "                if len(word) >= 2 and word not in CH_DICT:  # 长度大于等于 2 的词以及不是词典中的词作为候选新词\n",
320 |     "                    # 更新候选新词词频\n",
321 |     "                    if word in candidate_words_freq:\n",
322 |     "                        candidate_words_freq[word] += 1\n",
323 |     "                    else:\n",
324 |     "                        candidate_words_freq[word] = 1\n",
325 |     "                    # 更新候选新词左字集合\n",
326 |     "                    if index[0] != 0:  # 当为文本中首个单词时无左字\n",
327 |     "                        if word in candidate_words_left_characters:\n",
328 |     "                            candidate_words_left_characters[word].append(text[index[0]-1])\n",
329 |     "                        else:\n",
330 |     "                            candidate_words_left_characters[word] = [text[index[0]-1]]\n",
331 |     "                    else:\n",
332 |     "                        if word in candidate_words_left_characters:\n",
333 |     "                            candidate_words_left_characters[word].append(len(candidate_words_left_characters[word]))\n",
334 |     "                        else:\n",
335 |     "                            candidate_words_left_characters[word] = [0]                    \n",
336 |     "                    # 更新候选新词右字集合\n",
337 |     "                    if index[1] < len(text)-1:  # 当为文本中末个单词时无右字\n",
338 |     "                        if word in candidate_words_right_characters:\n",
339 |     "                            candidate_words_right_characters[word].append(text[index[1]]) # \n",
340 |     "                        else:\n",
341 |     "                            candidate_words_right_characters[word] = [text[index[1]]]\n",
342 |     "                    else:\n",
343 |     "                        if word in candidate_words_right_characters:\n",
344 |     "                            candidate_words_right_characters[word].append(len(candidate_words_right_characters[word]))\n",
345 |     "                        else:\n",
346 |     "                            candidate_words_right_characters[word] = [0]\n",
347 |     "    return WORD_NUM, words_freq, candidate_words_freq, candidate_words_left_characters, candidate_words_right_characters"
348 |    ]
349 |   },
350 |   {
351 |    "cell_type": "code",
352 |    "execution_count": null,
353 |    "metadata": {
354 |     "ExecuteTime": {
355 |      "end_time": "2021-09-27T00:19:59.491903Z",
356 |      "start_time": "2021-09-27T00:19:59.163367Z"
357 |     }
358 |    },
359 |    "outputs": [],
360 |    "source": [
361 |     "WORD_NUM, words_freq, candidate_words_freq, candidate_words_left_characters, candidate_words_right_characters = \\\n",
362 |     "get_candidate_wordsinfo(texts = texts, max_word_len = 6)  # 字符串最长为 6"
363 |    ]
364 |   },
365 |   {
366 |    "cell_type": "markdown",
367 |    "metadata": {},
368 |    "source": [
369 |     "### 根据第二步中的统计信息计算 pmi 值以及左右邻字熵"
370 |    ]
371 |   },
372 |   {
373 |    "cell_type": "code",
374 |    "execution_count": null,
375 |    "metadata": {
376 |     "ExecuteTime": {
377 |      "end_time": "2021-09-27T00:20:00.492563Z",
378 |      "start_time": "2021-09-27T00:20:00.479023Z"
379 |     }
380 |    },
381 |    "outputs": [],
382 |    "source": [
383 |     "import math\n",
384 |     "\n",
385 |     "# 计算候选单词的 pmi 值\n",
386 |     "def compute_pmi(words_freq, candidate_words_freq):\n",
387 |     "    words_pmi = {}\n",
388 |     "    with tqdm(candidate_words_freq, total=len(candidate_words_freq), desc=\"Counting pmi\") as pbar:\n",
389 |     "        for word in pbar:\n",
390 |     "            # 首先，将某个候选单词按照不同的切分位置切分成两项，比如“电影院”可切分为“电”和“影院”以及“电影”和“院”\n",
391 |     "            bi_grams = [(word[0:i],word[i:]) for i in range(1,len(word))]\n",
392 |     "            # 对所有切分情况计算 pmi 值，取最大值作为当前候选词的最终 pmi 值\n",
393 |     "            # words_freq[bi_gram[0]]，words_freq[bi_gram[1]] 分别表示一个候选新词的前后两部分的出现频次\n",
394 |     "            words_pmi[word] = max(map(lambda bi_gram: math.log(\\\n",
395 |     "                words_freq[word]/(words_freq[bi_gram[0]]*words_freq[bi_gram[1]]/WORD_NUM)), bi_grams))\n",
396 |     "    return words_pmi"
397 |    ]
398 |   },
399 |   {
400 |    "cell_type": "markdown",
401 |    "metadata": {},
402 |    "source": [
403 |     "在下一步中，计算 pmi 值以及左右邻字熵。"
404 |    ]
405 |   },
406 |   {
407 |    "cell_type": "code",
408 |    "execution_count": null,
409 |    "metadata": {
410 |     "ExecuteTime": {
411 |      "end_time": "2021-09-27T00:20:01.586222Z",
412 |      "start_time": "2021-09-27T00:20:01.321617Z"
413 |     }
414 |    },
415 |    "outputs": [],
416 |    "source": [
417 |     "words_pmi = compute_pmi(words_freq, candidate_words_freq)\n",
418 |     "words_pmi"
419 |    ]
420 |   },
421 |   {
422 |    "cell_type": "code",
423 |    "execution_count": null,
424 |    "metadata": {
425 |     "ExecuteTime": {
426 |      "end_time": "2021-09-27T00:20:01.953164Z",
427 |      "start_time": "2021-09-27T00:20:01.946786Z"
428 |     }
429 |    },
430 |    "outputs": [],
431 |    "source": [
432 |     "from collections import Counter \n",
433 |     "\n",
434 |     "# 计算候选单词的邻字熵\n",
435 |     "def compute_entropy(candidate_words_characters):\n",
436 |     "    words_entropy = {}\n",
437 |     "    with tqdm(candidate_words_characters.items(), total=len(candidate_words_characters), desc=\"Counting entropy\") as pbar:\n",
438 |     "        for word, characters in pbar:\n",
439 |     "            character_freq = Counter(characters)  # 统计邻字的出现分布\n",
440 |     "            # 根据出现分布计算邻字熵\n",
441 |     "            words_entropy[word] = sum(map(lambda x: - x/len(characters) * math.log(x/len(characters)) , character_freq.values())) \n",
442 |     "    return words_entropy"
443 |    ]
444 |   },
445 |   {
446 |    "cell_type": "code",
447 |    "execution_count": null,
448 |    "metadata": {
449 |     "ExecuteTime": {
450 |      "end_time": "2021-09-27T00:20:02.894920Z",
451 |      "start_time": "2021-09-27T00:20:02.321323Z"
452 |     }
453 |    },
454 |    "outputs": [],
455 |    "source": [
456 |     "words_left_entropy = compute_entropy(candidate_words_left_characters)\n",
457 |     "words_right_entropy = compute_entropy(candidate_words_right_characters)"
458 |    ]
459 |   },
460 |   {
461 |    "cell_type": "markdown",
462 |    "metadata": {},
463 |    "source": [
464 |     "### 设定各指标的阈值，根据其值获取最终的新词结果"
465 |    ]
466 |   },
467 |   {
468 |    "cell_type": "code",
469 |    "execution_count": null,
470 |    "metadata": {
471 |     "ExecuteTime": {
472 |      "end_time": "2021-09-27T00:20:03.290762Z",
473 |      "start_time": "2021-09-27T00:20:03.284101Z"
474 |     }
475 |    },
476 |    "outputs": [],
477 |    "source": [
478 |     "# 根据各指标阈值获取最终的新词结果\n",
479 |     "def get_newwords(candidate_words_freq,\n",
480 |     "                               words_pmi,\n",
481 |     "                               words_left_entropy,\n",
482 |     "                               words_right_entropy,\n",
483 |     "                               words_freq_limit=4,\n",
484 |     "                               pmi_limit=5,\n",
485 |     "                               entropy_limit=1):\n",
486 |     "    # 在每一项指标中根据阈值进行筛选\n",
487 |     "    candidate_words = [k for k, v in candidate_words_freq.items() if v >= words_freq_limit]\n",
488 |     "    candidate_words_pmi = [k for k, v in words_pmi.items() if v >= pmi_limit]\n",
489 |     "    candidate_words_left_entropy = [k for k, v in words_left_entropy.items() if v >= entropy_limit]\n",
490 |     "    candidate_words_right_entropy = [k for k, v in words_right_entropy.items() if v >= entropy_limit]\n",
491 |     "    # 对筛选结果进行合并\n",
492 |     "    return list(set(candidate_words).intersection(candidate_words_pmi, candidate_words_left_entropy, candidate_words_right_entropy))"
493 |    ]
494 |   },
495 |   {
496 |    "cell_type": "code",
497 |    "execution_count": null,
498 |    "metadata": {
499 |     "ExecuteTime": {
500 |      "end_time": "2021-09-27T00:20:31.263048Z",
501 |      "start_time": "2021-09-27T00:20:31.225118Z"
502 |     }
503 |    },
504 |    "outputs": [],
505 |    "source": [
506 |     "# 可以不断调参数来达到想要的结果\n",
507 |     "new_words = get_newwords(candidate_words_freq,\n",
508 |     "                         words_pmi,\n",
509 |     "                         words_left_entropy,\n",
510 |     "                         words_right_entropy,\n",
511 |     "                         words_freq_limit= 2,\n",
512 |     "                         pmi_limit=3,\n",
513 |     "                         entropy_limit=1)\n",
514 |     "print(len(new_words))\n",
515 |     "new_words"
516 |    ]
517 |   },
518 |   {
519 |    "cell_type": "markdown",
520 |    "metadata": {},
521 |    "source": [
522 |     "### 过滤掉一些不正确的新词"
523 |    ]
524 |   },
525 |   {
526 |    "cell_type": "code",
527 |    "execution_count": null,
528 |    "metadata": {
529 |     "ExecuteTime": {
530 |      "end_time": "2021-09-27T00:20:37.356733Z",
531 |      "start_time": "2021-09-27T00:20:37.340186Z"
532 |     }
533 |    },
534 |    "outputs": [],
535 |    "source": [
536 |     "new_words1 = list(filter(lambda x: not re.search(\"[^\\u4e00-\\u9fa5]\", x), new_words))\n",
537 |     "new_words2 = list(filter(lambda x: not re.search(\"[了但里的和为是]\", x), new_words1))"
538 |    ]
539 |   },
540 |   {
541 |    "cell_type": "code",
542 |    "execution_count": null,
543 |    "metadata": {
544 |     "ExecuteTime": {
545 |      "end_time": "2021-09-27T00:20:47.240200Z",
546 |      "start_time": "2021-09-27T00:20:47.226105Z"
547 |     }
548 |    },
549 |    "outputs": [],
550 |    "source": [
551 |     "print(len(new_words2))\n",
552 |     "new_words2"
553 |    ]
554 |   },
555 |   {
556 |    "cell_type": "code",
557 |    "execution_count": null,
558 |    "metadata": {
559 |     "ExecuteTime": {
560 |      "end_time": "2021-09-27T00:20:38.513825Z",
561 |      "start_time": "2021-09-27T00:20:38.463910Z"
562 |     }
563 |    },
564 |    "outputs": [],
565 |    "source": [
566 |     "with open(\"new_words.txt\", \"\", encoding=\"utf-8\") as f:\n",
567 |     "    for new_word in new_words2:\n",
568 |     "        f.write(new_word + \"\\n\")"
569 |    ]
570 |   },
571 |   {
572 |    "cell_type": "code",
573 |    "execution_count": null,
574 |    "metadata": {},
575 |    "outputs": [],
576 |    "source": []
577 |   }
578 |  ],
579 |  "metadata": {
580 |   "kernelspec": {
581 |    "display_name": "torch1.7",
582 |    "language": "python",
583 |    "name": "torch1.7"
584 |   },
585 |   "language_info": {
586 |    "codemirror_mode": {
587 |     "name": "ipython",
588 |     "version": 3
589 |    },
590 |    "file_extension": ".py",
591 |    "mimetype": "text/x-python",
592 |    "name": "python",
593 |    "nbconvert_exporter": "python",
594 |    "pygments_lexer": "ipython3",
595 |    "version": "3.6.12"
596 |   },
597 |   "toc": {
598 |    "base_numbering": 1,
599 |    "nav_menu": {},
600 |    "number_sections": true,
601 |    "sideBar": true,
602 |    "skip_h1_title": false,
603 |    "title_cell": "Table of Contents",
604 |    "title_sidebar": "Contents",
605 |    "toc_cell": false,
606 |    "toc_position": {
607 |     "height": "calc(100% - 180px)",
608 |     "left": "10px",
609 |     "top": "150px",
610 |     "width": "321.733px"
611 |    },
612 |    "toc_section_display": true,
613 |    "toc_window_display": true
614 |   },
615 |   "varInspector": {
616 |    "cols": {
617 |     "lenName": 16,
618 |     "lenType": 16,
619 |     "lenVar": 40
620 |    },
621 |    "kernels_config": {
622 |     "python": {
623 |      "delete_cmd_postfix": "",
624 |      "delete_cmd_prefix": "del ",
625 |      "library": "var_list.py",
626 |      "varRefreshCmd": "print(var_dic_list())"
627 |     },
628 |     "r": {
629 |      "delete_cmd_postfix": ") ",
630 |      "delete_cmd_prefix": "rm(",
631 |      "library": "var_list.r",
632 |      "varRefreshCmd": "cat(var_dic_list()) "
633 |     }
634 |    },
635 |    "types_to_exclude": [
636 |     "module",
637 |     "function",
638 |     "builtin_function_or_method",
639 |     "instance",
640 |     "_Feature"
641 |    ],
642 |    "window_display": false
643 |   }
644 |  },
645 |  "nbformat": 4,
646 |  "nbformat_minor": 2
647 | }
648 | 


--------------------------------------------------------------------------------