├── .gitignore ├── 1.simpliedTextClassification ├── README.md ├── classify.py └── data │ └── train.tsv ├── 2.advancedTextClassification ├── 1.textClassificationWithRNN.py ├── 2.GloVe.py ├── 3.textClassificationWithCNN.py ├── 4.textClassificationWithRNNAndGlove.py ├── README.md ├── data │ ├── names │ │ ├── Arabic.txt │ │ ├── Chinese.txt │ │ ├── Czech.txt │ │ ├── Dutch.txt │ │ ├── English.txt │ │ ├── French.txt │ │ ├── German.txt │ │ ├── Greek.txt │ │ ├── Irish.txt │ │ ├── Italian.txt │ │ ├── Japanese.txt │ │ ├── Korean.txt │ │ ├── Polish.txt │ │ ├── Portuguese.txt │ │ ├── Russian.txt │ │ ├── Scottish.txt │ │ ├── Spanish.txt │ │ └── Vietnamese.txt │ ├── question-classif-data │ │ ├── TREC_10.label │ │ └── train_1000.label │ └── train.tsv └── result │ ├── plot.png │ ├── plot2.png │ └── plot3.png ├── 3.textMatching(ESIM) ├── README.md ├── papers │ ├── A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference.pdf │ ├── Enhanced LSTM for Natural Language Inference.pdf │ └── Sequential Attention-based Network for Noetic End-to-End Response Selection.pdf └── python │ ├── __init__.py │ ├── models │ ├── __init__.py │ └── esim.py │ ├── train_mnli.py │ └── util │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── logger.cpython-37.pyc │ └── parameters.cpython-37.pyc │ ├── blocks.py │ ├── data_processing.py │ ├── evaluate.py │ ├── logger.py │ └── parameters.py ├── 4.NER(LSTM+CRF) ├── LICENSE.txt ├── README.md ├── build_data.py ├── data │ └── test.txt ├── evaluate.py ├── makefile ├── model │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── base_model.cpython-37.pyc │ │ ├── config.cpython-37.pyc │ │ ├── data_utils.cpython-37.pyc │ │ ├── general_utils.cpython-37.pyc │ │ └── ner_model.cpython-37.pyc │ ├── base_model.py │ ├── config.py │ ├── data_utils.py │ ├── general_utils.py │ └── ner_model.py ├── requirements.txt └── train.py ├── 5.transformer ├── README.md ├── __pycache__ │ ├── attention.cpython-37.pyc │ ├── embedding.cpython-37.pyc │ ├── encoderdecoder.cpython-37.pyc │ ├── generator.cpython-37.pyc │ ├── multiHeadAttention.cpython-37.pyc │ ├── positionalEncoding.cpython-37.pyc │ ├── positionwiseFeedForward.cpython-37.pyc │ └── transformerModel.cpython-37.pyc ├── attention.py ├── embedding.py ├── encoderdecoder.py ├── generator.py ├── multiHeadAttention.py ├── positionalEncoding.py ├── positionwiseFeedForward.py └── transformerModel.py ├── IMG_0611.PNG └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | */.DS_Store 2 | .DS_Store 3 | */.idea 4 | .idea 5 | 6 | -------------------------------------------------------------------------------- /1.simpliedTextClassification/README.md: -------------------------------------------------------------------------------- 1 | ### 任务一:基于机器学习的文本分类 2 | 3 | 实现基于logistic的文本分类 4 | 5 | 1. 数据集:[Classify the sentiment of sentences from the Rotten Tomatoes dataset](https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews) 6 | 7 | 2. 知识点: 8 | 1. 文本特征表示:[词袋模型](https://jesseyule.github.io/naturallanguage/bow/content.html) 9 | 2. 分类器:[logistic回归](https://jesseyule.github.io/machinelearning/logisticRegression/content.html)、损失函数、[梯度下降](https://jesseyule.github.io/machinelearning/gradientDescent/content.html)、[特征选择](https://jesseyule.github.io/machinelearning/featureEngineering/content.html) 10 | 3. [交叉检验](https://jesseyule.github.io/machinelearning/crossValidation/content.html) 11 | 12 | 3. 实验: 13 | 1. 分析不同的特征、损失函数、学习率对最终分类性能的影响 14 | 2. 
shuffle 、batch、mini-batch 15 | 16 | 4. [问题简单分析](https://jesseyule.github.io/naturallanguage/simplifiedTextClassification/content.html) 17 | 18 | ### 代码说明 19 | 20 | ​ classify.py只针对部分数据进行分析,但是模型是完整应用了词袋模型以及logistic模型,在此代码的基础上,可以改进文本特征的表示方法,比如采用二元特征表示等等,另一方面logistic模型等等也可以进行相应改进。 21 | 22 | -------------------------------------------------------------------------------- /1.simpliedTextClassification/classify.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.linear_model import LogisticRegression 3 | import pandas as pd 4 | 5 | 6 | def classify(): 7 | df = pd.read_csv('data/train.tsv', header=0, delimiter='\t') 8 | df = df[0:63] # 为了简化只取第一句话作为例子 9 | x_train = df['Phrase'] 10 | y_train = df['Sentiment'] 11 | all = [] 12 | 13 | # 构建词袋 14 | for i in range(len(x_train)): 15 | all.append(x_train[i]) 16 | voc = set(all) # 删除重复数据 17 | 18 | x_train_idx = [] 19 | 20 | # 将文本转化为向量形式 21 | for i in range(len(x_train)): 22 | tmp = np.zeros(len(voc)) 23 | for j, word in enumerate(voc): # 将voc转为索引序列,同时列出数据和下标 24 | tmp[j] = x_train[i].count(word) # 计算词袋中的每个词在句子中出现的次数,填入向量中 25 | x_train_idx.append(tmp) 26 | x_train_id = np.array(x_train_idx) 27 | 28 | logist = LogisticRegression() 29 | logist.fit(x_train_id, y_train) 30 | x_test = x_train_id # 为了简化过程用回训练数据测试模型,实际上应该划分一个测试集 31 | predicted = logist.predict(x_test) 32 | print(np.mean(predicted == y_train)) 33 | 34 | 35 | classify() 36 | -------------------------------------------------------------------------------- /2.advancedTextClassification/1.textClassificationWithRNN.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals, print_function, division 2 | from io import open 3 | import glob 4 | import os 5 | import unicodedata 6 | import string 7 | import torch 8 | import torch.nn as nn 9 | import random 10 | import time 11 | import math 12 | import matplotlib.pyplot as plt 13 | import matplotlib.ticker as ticker 14 | 15 | # 第一步,数据预处理 16 | # 因为数据几乎都是罗马化的文本,所以要将其从unicode转化为ASCII 17 | 18 | 19 | def findFiles(path): 20 | return glob.glob(path) 21 | 22 | 23 | all_letters = string.ascii_letters + ".,;'" 24 | n_letters = len(all_letters) 25 | 26 | 27 | # 将unicode转成ASCII 28 | def unicodeToAscii(s): 29 | return ''.join( 30 | c for c in unicodedata.normalize('NFD', s) 31 | if unicodedata.category(c) != 'Mn' 32 | and c in all_letters 33 | ) 34 | 35 | 36 | category_lines = {} 37 | all_categories = [] 38 | 39 | 40 | def readLines(filename): 41 | lines = open(filename, encoding='utf-8').read().strip().split('\n') 42 | return [unicodeToAscii(line) for line in lines] 43 | 44 | 45 | for filename in findFiles('data/names/*.txt'): 46 | category = os.path.splitext(os.path.basename(filename))[0] 47 | all_categories.append(category) 48 | lines = readLines(filename) 49 | category_lines[category] = lines # 一个字典变量储存每一种语言及其对应的每一行文本(名字)列表的映射关系 50 | 51 | n_categories = len(all_categories) 52 | 53 | # 以上处理可得到category_line,它保存了语种-姓名列表,也有all_categories保存语种列表,以及n_categories表示语种数量 54 | 55 | # 下面的步骤就是把word embedding,因为是分析单词,所以主要通过独热编码表示字母,再根据单词长度构建相应维度的tensor 56 | # 注意,这里不同长度单词的矩阵维度也不同,额外的一维是batch的维度 57 | 58 | 59 | # 找到字母在字母表中的位置 60 | def letterToIndex(letter): 61 | return all_letters.find(letter) 62 | 63 | 64 | # 独热编码,调用letterToIndex,将字母转化为一个tensor 65 | def letterToTensor(letter): 66 | tensor = torch.zeros(1, n_letters) 67 | tensor[0][letterToIndex(letter)] = 1 68 | return tensor 69 | 70 | 71 | # 根据单词长度将一个个字母的tensor构建成表示一个单词的的tensor 72 | def 
lineToTensor(line): 73 | tensor = torch.zeros(len(line), 1, n_letters) 74 | for li, letter in enumerate(line): 75 | tensor[li][0][letterToIndex(letter)] = 1 76 | return tensor 77 | 78 | 79 | # 第二步,正式构建循环神经网络 80 | # 主要构建了一个线性隐层和一个线性输出层 81 | 82 | class RNN(nn.Module): 83 | def __init__(self, input_size, hidden_size, output_size): 84 | super(RNN, self).__init__() 85 | 86 | # 隐层包含正常神经元(i2o)和虚神经元(i2h) 87 | self.hidden_size = hidden_size 88 | 89 | self.i2h = nn.Linear(input_size + hidden_size, hidden_size) 90 | self.i2o = nn.Linear(input_size + hidden_size, output_size) 91 | self.softmax = nn.LogSoftmax(dim=1) 92 | 93 | def forward(self, input, hidden): 94 | combined = torch.cat((input, hidden), 1) # 输入包括当前输入和以前的隐藏状态 95 | hidden = self.i2h(combined) # 更新hidden层,留给下一次训练当作输入 96 | output = self.i2o(combined) # 隐层输出 97 | output = self.softmax(output) # 对隐层输出作softmax(即输出层激活函数) 98 | return output, hidden 99 | 100 | def initHidden(self): 101 | return torch.zeros(1, self.hidden_size) # 初始化虚神经元 102 | 103 | 104 | n_hidden = 128 105 | rnn = RNN(n_letters, n_hidden, n_categories) 106 | 107 | 108 | input = lineToTensor('Hofler') 109 | 110 | hidden = torch.zeros(1, n_hidden) 111 | 112 | output, next_hidden = rnn(input[0], hidden) 113 | 114 | 115 | # 第三步,构建一些辅助函数辅助训练 116 | # 以下函数主要分析输出结果对应哪种语言(只输出可能性最大的结果) 117 | 118 | 119 | def categoryFromOutput(output): 120 | top_n, top_i = output.topk(1) # topk函数可得到最大值在结果中的位置索引 121 | category_i = top_i[0].item() 122 | return all_categories[category_i], category_i 123 | 124 | 125 | # 以下函数是关于随机选择训练样本 126 | def randomChoice(l): 127 | return l[random.randint(0, len(l)-1)] 128 | 129 | 130 | def randomTrainingExample(): 131 | category = randomChoice(all_categories) # 在所有种类中随机选择一种 132 | line = randomChoice(category_lines[category]) # 在选中的语言中再随机选一个名字 133 | 134 | category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long) # 保存语种的index 135 | line_tensor = lineToTensor(line) # 把名字转化为tensor 136 | return category, line, category_tensor, line_tensor 137 | 138 | 139 | # 第四步,正式训练神经网络 140 | 141 | criterion = nn.NLLLoss() # 定义损失函数 142 | learning_rate = 0.005 143 | 144 | 145 | def train(category_tensor, line_tensor): 146 | hidden = rnn.initHidden() 147 | 148 | rnn.zero_grad() 149 | 150 | # 下面是训练一个单词的过程,注意这里是针对一个单词的一个个字符进行输入 151 | # 对RNN来说,完整的一次训练是完整输入一个单词的所有字符的过程 152 | for i in range(line_tensor.size()[0]): 153 | output, hidden = rnn(line_tensor[i], hidden) 154 | 155 | 156 | loss = criterion(output, category_tensor) 157 | loss.backward() 158 | 159 | # 将参数的梯度添加到其值中,乘以学习速率 160 | for p in rnn.parameters(): 161 | p.data.add_(-learning_rate, p.grad.data) 162 | 163 | return output, loss.item() 164 | 165 | 166 | n_iters = 100000 167 | print_every = 5000 168 | plot_every = 1000 169 | 170 | # 跟踪绘图的损失 171 | current_loss = 0 172 | all_losses = [] 173 | 174 | 175 | def timeSince(since): 176 | now = time.time() 177 | s = now - since 178 | m = math.floor(s / 60) 179 | s -= m * 60 180 | return '%dm %ds' % (m, s) 181 | 182 | start = time.time() 183 | 184 | 185 | # 正式进行批量训练,针对随机选择的大量单词训练RNN 186 | for iter in range(1, n_iters + 1): 187 | category, line, category_tensor, line_tensor = randomTrainingExample() 188 | output, loss = train(category_tensor, line_tensor) 189 | current_loss += loss 190 | 191 | # 打印迭代的编号,损失,名字和猜测 192 | if iter % print_every == 0: 193 | guess, guess_i = categoryFromOutput(output) 194 | print('guess: ', guess) 195 | print('category: ', category) 196 | correct = '✓' if guess == category else '✗ (%s)' % category 197 | print('%d %d%% (%s) %.4f %s / %s %s' % (iter, iter 
/ n_iters * 100, timeSince(start), loss, line, guess, correct)) 198 | 199 | # 将当前损失平均值添加到损失列表中 200 | if iter % plot_every == 0: 201 | all_losses.append(current_loss / plot_every) 202 | current_loss = 0 203 | 204 | plt.figure() 205 | plt.plot(all_losses) 206 | plt.show() 207 | -------------------------------------------------------------------------------- /2.advancedTextClassification/2.GloVe.py: -------------------------------------------------------------------------------- 1 | from gensim.models import KeyedVectors 2 | from gensim.scripts.glove2word2vec import glove2word2vec 3 | 4 | 5 | # 这里使用gensim工具包加载训练好的GloVe词向量,首先利用gensim把glove转换成方便gensim加载的word2vec格式 6 | # 网上下载的训练好的glove词向量 7 | glove_input_file = r'../../../glove/glove.42B.300d.txt' 8 | # 指定转化为word2vec格式后文件的名称 9 | word2vec_output_file = r'../../../glove/glove.42B.300d.word2vec.txt' 10 | # 转换操作,注意,这个操作只需要进行一次 11 | # glove2word2vec(glove_input_file, word2vec_output_file) 12 | 13 | # 加载模型 14 | glove_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False) 15 | 16 | # 获得单词cat的词向量 17 | cat_vec = glove_model['cat'] 18 | print(cat_vec) 19 | # 获得单词frog的最相似向量的词汇 20 | print(glove_model.most_similar('frog')) 21 | -------------------------------------------------------------------------------- /2.advancedTextClassification/3.textClassificationWithCNN.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import collections 3 | import math 4 | import numpy as np 5 | import os 6 | import random 7 | import tensorflow as tf 8 | import zipfile 9 | from matplotlib import pylab 10 | from six.moves import range 11 | from six.moves.urllib.request import urlretrieve 12 | import tensorflow as tf 13 | 14 | url = 'http://cogcomp.org/Data/QA/QC/' 15 | dir_name = 'data/question-classif-data' 16 | 17 | 18 | # 下载数据 19 | def maybe_download(dir_name, filename, expected_bytes): 20 | """Download a file if not present, and make sure it's the right size.""" 21 | 22 | if not os.path.exists(dir_name): 23 | os.mkdir(dir_name) 24 | 25 | if not os.path.exists(os.path.join(dir_name, filename)): 26 | filename, _ = urlretrieve(os.path.join(dir_name, filename)) 27 | 28 | print(os.path.join(dir_name, filename)) 29 | statinfo = os.stat(os.path.join(dir_name, filename)) 30 | if statinfo.st_size == expected_bytes: 31 | print('Found and verified %s' % os.path.join(dir_name, filename)) 32 | else: 33 | print(statinfo.st_size) 34 | raise Exception( 35 | 'Failed to verify ' + os.path.join(dir_name, filename) + '. Can you get to it with a browser?') 36 | return filename 37 | 38 | 39 | # 读取数据 40 | def read_data(filename): 41 | ''' 42 | Read data from a file with given filename 43 | Returns a list of strings where each string is a lower case word 44 | ''' 45 | global max_sent_length # 最大句子长度:33 46 | questions = [] 47 | labels = [] 48 | with open(filename, 'r', encoding='latin-1') as f: 49 | for row in f: 50 | row_str = row.split(":") 51 | lb, q = row_str[0], row_str[1] 52 | q = q.lower() 53 | labels.append(lb) 54 | questions.append(q.split()) 55 | if len(questions[-1]) > max_sent_length: # 检测每个句子最大的长度 56 | max_sent_length = len(questions[-1]) 57 | return questions, labels 58 | 59 | 60 | # 这里就是词袋模型,把文本转化为词向量 61 | def build_dataset(questions): 62 | words = [] 63 | data_list = [] 64 | count = [] 65 | 66 | # First create a large list with all the words in all the questions 67 | for d in questions: 68 | words.extend(d) 69 | print('%d Words found.' 
% len(words)) 70 | print('Found %d words in the vocabulary. ' % len(collections.Counter(words).most_common())) 71 | 72 | # Sort words by there frequency 73 | count.extend(collections.Counter(words).most_common()) 74 | 75 | # Create an ID for each word by giving the current length of the dictionary 76 | # And adding that item to the dictionary 77 | dictionary = dict() 78 | for word, _ in count: 79 | dictionary[word] = len(dictionary) 80 | 81 | # Traverse through all the text and 82 | # replace the string words with the ID 83 | # of the word found at that index 84 | for d in questions: 85 | data = list() 86 | for word in d: 87 | index = dictionary[word] 88 | data.append(index) 89 | 90 | data_list.append(data) 91 | 92 | reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 93 | 94 | return data_list, count, dictionary, reverse_dictionary 95 | 96 | 97 | class BatchGenerator(object): 98 | ''' 99 | Generates a batch of data 100 | ''' 101 | 102 | def __init__(self, batch_size, questions, labels): 103 | self.questions = questions 104 | self.labels = labels 105 | self.text_size = len(questions) 106 | self.batch_size = batch_size 107 | self.data_index = 0 108 | assert len(self.questions) == len(self.labels) 109 | 110 | def generate_batch(self): 111 | ''' 112 | Data generation function. This outputs two matrices 113 | inputs: a batch of questions where each question is a tensor of size 114 | [sent_length, vocabulary_size] with each word one-hot-encoded 115 | labels_ohe: one-hot-encoded labels corresponding to the questions in inputs 116 | ''' 117 | global sent_length, num_classes 118 | global dictionary, all_labels 119 | 120 | # Numpy arrays holding input and label data 121 | # 输入就是一个batch的句子,所以是一个batch*最大句子长度*单词长度 122 | inputs = np.zeros((self.batch_size, sent_length, vocabulary_size), dtype=np.float32) 123 | labels_ohe = np.zeros((self.batch_size, num_classes), dtype=np.float32) 124 | 125 | # When we reach the end of the dataset 126 | # start from beginning 127 | if self.data_index + self.batch_size >= self.text_size: 128 | self.data_index = 0 129 | 130 | # For each question in the dataset 131 | for qi, que in enumerate(self.questions[self.data_index:self.data_index + self.batch_size]): 132 | # For each word in the question 133 | for wi, word in enumerate(que): 134 | # Set the element at the word ID index to 1 135 | # this gives the one-hot-encoded vector of that word 136 | inputs[qi, wi, dictionary[word]] = 1.0 137 | 138 | # Set the index corrsponding to that particular class to 1 139 | labels_ohe[qi, all_labels.index(self.labels[self.data_index + qi])] = 1.0 140 | 141 | # Update the data index to get the next batch of data 142 | self.data_index = (self.data_index + self.batch_size) % self.text_size 143 | 144 | return inputs, labels_ohe 145 | 146 | def return_index(self): 147 | # Get the current index of data 148 | return self.data_index 149 | 150 | 151 | if __name__ == '__main__': 152 | 153 | filename = maybe_download(dir_name, 'train_1000.label', 60774) 154 | test_filename = maybe_download(dir_name, 'TREC_10.label', 23354) 155 | 156 | filenames = ['train_1000.label', 'TREC_10.label'] 157 | num_files = len(filenames) 158 | for i in range(len(filenames)): 159 | file_exists = os.path.isfile(os.path.join(dir_name, filenames[i])) 160 | assert file_exists 161 | print('Files found and verified.') 162 | 163 | max_sent_length = 0 164 | 165 | # Process train and Test data 166 | for i in range(num_files): 167 | print('\nProcessing file %s' % os.path.join(dir_name, filenames[i])) 168 | if i 
== 0: 169 | # Processing training data 170 | train_questions, train_labels = read_data(os.path.join(dir_name, filenames[i])) 171 | # Making sure we got all the questions and corresponding labels 172 | assert len(train_questions) == len(train_labels) 173 | elif i == 1: 174 | # Processing testing data 175 | test_questions, test_labels = read_data(os.path.join(dir_name, filenames[i])) 176 | # Making sure we got all the questions and corresponding labels. 177 | assert len(test_questions) == len(test_labels) 178 | 179 | # Print some data to see everything is okey 180 | for j in range(5): 181 | print('\tQuestion %d: %s' % (j, train_questions[j])) 182 | print('\tLabel %d: %s\n' % (j, train_labels[j])) 183 | 184 | print('Max Sentence Length: %d' % max_sent_length) 185 | print('\nNormalizing all sentences to same length') 186 | 187 | # 因为CNN不像RNN,需要确保每次输入(每个句子)的维度相等,也就是每次输入都是同样尺寸的二维矩阵 188 | # 填充每个句子,使每个句子长度相等 189 | for qi, que in enumerate(train_questions): 190 | for _ in range(max_sent_length - len(que)): 191 | que.append('PAD') 192 | assert len(que) == max_sent_length 193 | train_questions[qi] = que 194 | print('Train questions padded') 195 | 196 | # 填充每个句子,使每个句子长度相等 197 | for qi, que in enumerate(test_questions): 198 | for _ in range(max_sent_length - len(que)): 199 | que.append('PAD') 200 | assert len(que) == max_sent_length 201 | test_questions[qi] = que 202 | print('\nTest questions padded') 203 | 204 | # Printing a test question to see if everything is correct 205 | print('\nSample test question: %s', test_questions[0]) 206 | 207 | # Create a dataset with both train and test questions 208 | all_questions = list(train_questions) 209 | all_questions.extend(test_questions) 210 | 211 | # Use the above created dataset to build the vocabulary 212 | all_question_ind, count, dictionary, reverse_dictionary = build_dataset(all_questions) 213 | 214 | # Print some statistics about the processed data 215 | print('All words (count)', count[:5]) 216 | print('\n0th entry in dictionary: %s', reverse_dictionary[0]) 217 | print('\nSample data', all_question_ind[0]) 218 | print('\nSample data', all_question_ind[1]) 219 | print('\nVocabulary: ', len(dictionary)) 220 | vocabulary_size = len(dictionary) 221 | 222 | print('\nNumber of training questions: ', len(train_questions)) 223 | print('Number of testing questions: ', len(test_questions)) 224 | 225 | batch_size = 16 # We process 16 questions at a time 226 | sent_length = max_sent_length 227 | 228 | num_classes = 6 # Number of classes 229 | # All the types of question that are in the dataset 230 | all_labels = ['NUM', 'LOC', 'HUM', 'DESC', 'ENTY', 'ABBR'] 231 | 232 | # Test our batch generator 233 | sample_gen = BatchGenerator(batch_size, train_questions, train_labels) 234 | # Generate a single batch 235 | sample_batch_inputs, sample_batch_labels = sample_gen.generate_batch() 236 | # Generate another batch 237 | sample_batch_inputs_2, sample_batch_labels_2 = sample_gen.generate_batch() 238 | 239 | # Make sure that we infact have the question 0 as the 0th element of our batch 240 | assert np.all(np.asarray([dictionary[w] for w in train_questions[0]], dtype=np.int32) 241 | == np.argmax(sample_batch_inputs[0, :, :], axis=1)) 242 | 243 | # Print some data labels we obtained 244 | print('Sample batch labels') 245 | print(np.argmax(sample_batch_labels, axis=1)) 246 | print(np.argmax(sample_batch_labels_2, axis=1)) 247 | 248 | tf.reset_default_graph() 249 | 250 | batch_size = 32 251 | # Different filter sizes we use in a single convolution layer 252 | filter_sizes = 
[3, 5, 7] 253 | 254 | # inputs and labels 255 | sent_inputs = tf.placeholder(shape=[batch_size, sent_length, vocabulary_size], dtype=tf.float32, 256 | name='sentence_inputs') 257 | sent_labels = tf.placeholder(shape=[batch_size, num_classes], dtype=tf.float32, name='sentence_labels') 258 | 259 | # 3 filters with different context window sizes (3,5,7) 260 | # Each of this filter spans the full one-hot-encoded length of each word and the context window width 261 | 262 | # Weights of the first parallel layer 263 | w1 = tf.Variable(tf.truncated_normal([filter_sizes[0], vocabulary_size, 1], stddev=0.02, dtype=tf.float32), 264 | name='weights_1') 265 | b1 = tf.Variable(tf.random_uniform([1], 0, 0.01, dtype=tf.float32), name='bias_1') 266 | 267 | # Weights of the second parallel layer 268 | w2 = tf.Variable(tf.truncated_normal([filter_sizes[1], vocabulary_size, 1], stddev=0.02, dtype=tf.float32), 269 | name='weights_2') 270 | b2 = tf.Variable(tf.random_uniform([1], 0, 0.01, dtype=tf.float32), name='bias_2') 271 | 272 | # Weights of the third parallel layer 273 | w3 = tf.Variable(tf.truncated_normal([filter_sizes[2], vocabulary_size, 1], stddev=0.02, dtype=tf.float32), 274 | name='weights_3') 275 | b3 = tf.Variable(tf.random_uniform([1], 0, 0.01, dtype=tf.float32), name='bias_3') 276 | 277 | # Fully connected layer 278 | w_fc1 = tf.Variable(tf.truncated_normal([len(filter_sizes), num_classes], stddev=0.5, dtype=tf.float32), 279 | name='weights_fulcon_1') 280 | b_fc1 = tf.Variable(tf.random_uniform([num_classes], 0, 0.01, dtype=tf.float32), name='bias_fulcon_1') 281 | 282 | # Calculate the output for all the filters with a stride 1 283 | # We use relu activation as the activation function 284 | h1_1 = tf.nn.relu(tf.nn.conv1d(sent_inputs, w1, stride=1, padding='SAME') + b1) 285 | h1_2 = tf.nn.relu(tf.nn.conv1d(sent_inputs, w2, stride=1, padding='SAME') + b2) 286 | h1_3 = tf.nn.relu(tf.nn.conv1d(sent_inputs, w3, stride=1, padding='SAME') + b3) 287 | 288 | # Pooling over time operation 289 | 290 | # This is doing the max pooling. Thereare two options to do the max pooling 291 | # 1. Use tf.nn.max_pool operation on a tensor made by concatenating h1_1,h1_2,h1_3 and converting that tensor to 4D 292 | # (Because max_pool takes a tensor of rank >= 4 ) 293 | # 2. Do the max pooling separately for each filter output and combine them using tf.concat 294 | # (this is the one used in the code) 295 | 296 | h2_1 = tf.reduce_max(h1_1, axis=1) 297 | h2_2 = tf.reduce_max(h1_2, axis=1) 298 | h2_3 = tf.reduce_max(h1_3, axis=1) 299 | 300 | h2 = tf.concat([h2_1, h2_2, h2_3], axis=1) 301 | 302 | # Calculate the fully connected layer output (no activation) 303 | # Note: since h2 is 2d [batch_size,number of parallel filters] 304 | # reshaping the output is not required as it usually do in CNNs 305 | logits = tf.matmul(h2, w_fc1) + b_fc1 306 | 307 | # Loss (Cross-Entropy) 308 | loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=sent_labels, logits=logits)) 309 | 310 | # Momentum Optimizer 311 | optimizer = tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9).minimize(loss) 312 | 313 | predictions = tf.argmax(tf.nn.softmax(logits), axis=1) 314 | 315 | # With filter widths [3,5,7] and batch_size 32 the algorithm 316 | # achieves around ~90% accuracy on test dataset (50 epochs). 
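    # ---------------------------------------------------------------------
    # Illustrative addition (not part of the original script): the pooling
    # comments above describe two ways to do the max-over-time pooling, and
    # the code uses option 2 (per-filter tf.reduce_max + tf.concat). Below is
    # a minimal sketch of option 1, assuming TF 1.x and the h1_1/h1_2/h1_3
    # tensors of shape [batch_size, sent_length, 1] defined earlier. The
    # names h_stacked, h_pooled and h2_alt are introduced here purely for
    # illustration; h2_alt is not used by the rest of the script.
    h_stacked = tf.stack([h1_1, h1_2, h1_3], axis=3)  # [batch, sent_length, 1, 3]
    h_pooled = tf.nn.max_pool(h_stacked,
                              ksize=[1, sent_length, 1, 1],
                              strides=[1, 1, 1, 1],
                              padding='VALID')        # [batch, 1, 1, 3]
    # Reshape to [batch_size, num_parallel_filters]; this should hold the
    # same values as h2 computed above via reduce_max + concat.
    h2_alt = tf.reshape(h_pooled, [batch_size, len(filter_sizes)])
    # ---------------------------------------------------------------------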
317 | # From batch sizes [16,32,64] I found 32 to give best performance 318 | 319 | session = tf.InteractiveSession() 320 | 321 | num_steps = 50 # Number of epochs the algorithm runs for 322 | 323 | # Initialize all variables 324 | tf.global_variables_initializer().run() 325 | print('Initialized\n') 326 | 327 | # Define data batch generators for train and test data 328 | train_gen = BatchGenerator(batch_size, train_questions, train_labels) 329 | test_gen = BatchGenerator(batch_size, test_questions, test_labels) 330 | 331 | # How often do we compute the test accuracy 332 | test_interval = 1 333 | 334 | # Compute accuracy for a given set of predictions and labels 335 | def accuracy(labels, preds): 336 | return np.sum(np.argmax(labels, axis=1) == preds) / labels.shape[0] 337 | 338 | 339 | # Running the algorithm 340 | for step in range(num_steps): 341 | avg_loss = [] 342 | 343 | # A single traverse through the whole training set 344 | for tr_i in range((len(train_questions) // batch_size) - 1): 345 | # Get a batch of data 346 | tr_inputs, tr_labels = train_gen.generate_batch() 347 | # Optimize the network and compute the loss 348 | l, _ = session.run([loss, optimizer], feed_dict={sent_inputs: tr_inputs, sent_labels: tr_labels}) 349 | avg_loss.append(l) 350 | 351 | # Print average loss 352 | print('Train Loss at Epoch %d: %.2f' % (step, np.mean(avg_loss))) 353 | test_accuracy = [] 354 | 355 | # Compute the test accuracy 356 | if (step + 1) % test_interval == 0: 357 | for ts_i in range((len(test_questions) - 1) // batch_size): 358 | # Get a batch of test data 359 | ts_inputs, ts_labels = test_gen.generate_batch() 360 | # Get predictions for that batch 361 | preds = session.run(predictions, feed_dict={sent_inputs: ts_inputs, sent_labels: ts_labels}) 362 | # Compute test accuracy 363 | test_accuracy.append(accuracy(ts_labels, preds)) 364 | 365 | # Display the mean test accuracy 366 | print('Test accuracy at Epoch %d: %.3f' % (step, np.mean(test_accuracy) * 100.0)) 367 | 368 | 369 | -------------------------------------------------------------------------------- /2.advancedTextClassification/4.textClassificationWithRNNAndGlove.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals, print_function, division 2 | from io import open 3 | import glob 4 | import torch 5 | import torch.nn as nn 6 | import time 7 | import math 8 | import matplotlib.pyplot as plt 9 | from gensim.models import KeyedVectors 10 | import numpy as np 11 | import random 12 | 13 | 14 | # 第一步,加载GloVe 15 | 16 | def loadGlove(word2vec_file): 17 | glove_model = KeyedVectors.load_word2vec_format(word2vec_file, binary=False) 18 | return glove_model 19 | 20 | 21 | # 第二步,加载数据 22 | 23 | def findFiles(path): 24 | return glob.glob(path) 25 | 26 | 27 | def readLines(filename): 28 | lines = open(filename, encoding='utf-8').read().strip().split('\n') 29 | return [line for line in lines] 30 | 31 | 32 | # 第三步,正式构建循环神经网络 33 | # 主要构建了一个线性隐层和一个线性输出层 34 | 35 | class RNN(nn.Module): 36 | def __init__(self, input_size, hidden_size, output_size): 37 | super(RNN, self).__init__() 38 | 39 | # 隐层包含正常神经元(i2o)和虚神经元(i2h) 40 | self.hidden_size = hidden_size 41 | 42 | self.i2h = nn.Linear(input_size + hidden_size, hidden_size) 43 | self.i2o = nn.Linear(input_size + hidden_size, output_size) 44 | self.softmax = nn.LogSoftmax(dim=1) 45 | 46 | def forward(self, input, hidden): 47 | 48 | combined = torch.cat((input, hidden), 1) # 输入包括当前输入和以前的隐藏状态 49 | hidden = self.i2h(combined) # 
更新hidden层,留给下一次训练当作输入 50 | output = self.i2o(combined) # 隐层输出 51 | output = self.softmax(output) # 对隐层输出作softmax(即输出层激活函数) 52 | return output, hidden 53 | 54 | def initHidden(self): 55 | return torch.zeros(1, self.hidden_size) # 初始化虚神经元 56 | 57 | 58 | # 第四步,构建一些辅助训练的函数 59 | # 以下函数主要分析输出结果对应的sentiment得分 60 | 61 | def resultFromOutput(output): 62 | top_n, top_i = output.topk(1) # topk函数可得到最大值在结果中的位置索引 63 | return top_n, top_i 64 | 65 | 66 | # 第四步,正式训练神经网络 67 | 68 | def timeSince(since): 69 | now = time.time() 70 | s = now - since 71 | m = math.floor(s / 60) 72 | s -= m * 60 73 | return '%dm %ds' % (m, s) 74 | 75 | 76 | def trainModel(result, sentence): 77 | 78 | hidden = rnn.initHidden() 79 | 80 | model_output = torch.ones(1, 5) 81 | 82 | rnn.zero_grad() 83 | 84 | # 重新处理输入的分类结果,把其变为pytorch可处理的类型 85 | result = int(result) 86 | result = torch.tensor([result]).long() # 为什么这里要Long类型,明明其他都是int64 87 | 88 | # 重新处理输入的句子,主要把句子变为单词的列表,并且去掉单词之间的空格 89 | input_sentence = [] 90 | for char in sentence.split(' '): 91 | input_sentence.append(char.strip()) 92 | 93 | # 将句子分为一个个单词进行输入 94 | for k in range(len(input_sentence)): 95 | test = glove_model['test'] 96 | try: 97 | input_char = glove_model[input_sentence[k]] # 将单词转化为词向量 98 | input_char = np.mat(input_char) # 改变向量格式,把一维数组改为1*len(input)的二维矩阵,这是pytorch要求的输入格式 99 | input_char = torch.from_numpy(input_char).float() # 注意数据类型 100 | 101 | model_output, hidden = rnn(input_char, hidden) 102 | # print(model_output) 103 | # print(result) 104 | except Exception as e: 105 | input_char = torch.zeros(1, len(test)).float() # 假如GloVe中没有对应的单词,直接用全0向量代替 106 | model_output, hidden = rnn(input_char, hidden) 107 | # print(model_output) 108 | # print(result) 109 | # print(e) 110 | 111 | loss = criterion(model_output, result) 112 | loss.backward() 113 | 114 | # 将参数的梯度添加到其值中,乘以学习速率 115 | for p in rnn.parameters(): 116 | p.data.add_(-learning_rate, p.grad.data) 117 | 118 | return model_output, loss.item() 119 | 120 | 121 | if __name__ == '__main__': 122 | 123 | # 加载glove预训练词向量 124 | word2vec_file = r'../../../glove/glove.42B.300d.word2vec.txt' 125 | glove_model = loadGlove(word2vec_file) 126 | 127 | # 句子情感的可能输出数值 128 | result = [0, 1, 2, 3, 4] 129 | 130 | # 定义神经网络 131 | 132 | current_loss = 0 133 | all_losses = [] 134 | n_hidden = 128 135 | input_size = 300 136 | output_size = 5 137 | rnn = RNN(input_size, n_hidden, output_size) 138 | 139 | # 读取数据以及进行预处理 140 | 141 | lines = readLines('data/train.tsv') 142 | sentence = [] 143 | result = [] 144 | for i in range(len(lines)-1): 145 | sentence.append(lines[i+1][4:-1]) # 读取每一行数据中对应的文本的列 146 | result.append(lines[i+1][-1]) # 读取每一行数据中对应的分类结果的列 147 | 148 | # 正式训练,注意,这里只是按顺序抽取数据进行训练 149 | 150 | criterion = nn.NLLLoss() 151 | learning_rate = 0.001 152 | 153 | # 跟踪绘图的损失 154 | print_every = 5000 155 | plot_every = 1000 156 | 157 | start = time.time() 158 | 159 | iter_max = 100000 160 | 161 | for j in range(iter_max): 162 | 163 | rand_j = random.randint(1, len(lines)-2) # 随机训练 164 | 165 | output, loss = trainModel(result[rand_j], sentence[rand_j]) 166 | current_loss += loss 167 | try: 168 | # 打印迭代的编号,损失,名字和猜测 169 | if j % print_every == 0: 170 | guess, guess_i = resultFromOutput(output) 171 | check_guess = int(guess) 172 | check_guess = check_guess * (-1) 173 | check_result = int(result[rand_j]) 174 | print('guess: ', check_guess) 175 | print('result: ', check_result) 176 | correct = '✓' if check_guess == check_result else '✗ (%s)' % check_result 177 | print('%d (%s) %.4f %s / %s %s' % ( 178 | j, timeSince(start), loss, sentence[rand_j], 
check_guess, correct)) 179 | except Exception as e: 180 | print(e) 181 | print('rand_j: ', rand_j) 182 | print('output: ', output) 183 | continue 184 | 185 | # 将当前损失平均值添加到损失列表中 186 | if j % plot_every == 0: 187 | all_losses.append(current_loss / plot_every) 188 | current_loss = 0 189 | 190 | plt.figure() 191 | plt.plot(all_losses) 192 | plt.show() 193 | 194 | -------------------------------------------------------------------------------- /2.advancedTextClassification/README.md: -------------------------------------------------------------------------------- 1 | ### 任务二:基于深度学习的文本分类 2 | 3 | 用Pytorch重写《任务一》,实现CNN、RNN的文本分类 4 | 5 | 1. 参考论文 6 | 7 | 1. Convolutional Neural Networks for Sentence Classification 8 | 2. 9 | 10 | 2. word embedding 的方式初始化 11 | 12 | 3. 随机embedding的初始化方式 13 | 14 | 4. 用glove 预训练的embedding进行初始化 https://nlp.stanford.edu/projects/glove/ 15 | 16 | 5. 知识点: 17 | 18 | 1. [卷积神经网络]() 19 | 2. [循环神经网络]() 20 | 3. [word2vec]() 21 | 4. [GloVe]() 22 | 23 | 24 | 25 | ### 代码说明 26 | 27 | ​ 1.textClassificationWithRNN.py是利用Pytorch在文本分类中应用RNN的简单例子,2.GloVe.py说明了如何使用GloVe的预训练词向量,3.textClassificationWithCNN.py则是使用Tensorflow将CNN应用到文本分类中。 28 | 29 | ​ 3.textClassificationWithRNNAndGlove.py主要基于1和2两段代码,利用GloVe对原始数据进行转换,在输入到RNN中进行分析。在改写的过程中,主要需要注意的是数据格式,因为pytorch对输入数据的格式有很严格的要求(比如数据的维数),所以必须检查清楚避免出错,建议在理解第一第二个文件的代码的基础上,自行改写出第三个文件。 30 | 31 | ### 结果分析 32 | 33 | ![plot](result/plot.png) 34 | 35 | ​ 这是第一次训练的结果,从结果可以看出,其实模型训练效果并不算好,主要原因有以下几点: 36 | 37 | 1. 模型是按顺序训练数据,实际上应该进行随机抽取数据进行训练 38 | 39 | 2. 模型只有一层隐层,这也可能导致模型训练效果欠缺 40 | 41 | 3. GloVe缺失部分词向量,对这些词向量模型里都以全0向量代替,对模型的结果也可能造成影响 42 | 43 | 第二次训练我才用了随机选取数据进行训练,效果马上就上来了: 44 | 45 | ![plot2](result/plot2.png) 46 | 47 | ​ 可是另一个问题又来了,模型的输出出现NAN,在网上搜索之后发现可能是梯度爆炸(消失?)造成的,于是把学习率降低,再学习一次: 48 | 49 | ![plot3](result/plot3.png) 50 | 51 | ​ 第三次的训练效果是比较满意的,也避免了梯度爆炸的问题。 52 | 53 | ​ 基于上面的训练过程,可以看出随机训练模型对模型的学习非常重要。结合训练数据其实可以这样理解,我们按顺序训练数据,模型"学到"了这一批数据的规律,应用到下一批数据又不凑效了,所以模型的Loss没有明显的下降,但是随机学习数据的话,就避免了这个问题。从这个角度来说,所谓的学习对机器来说,或许本质上还是"记住"数据的潜在规律。 -------------------------------------------------------------------------------- /2.advancedTextClassification/data/names/Chinese.txt: -------------------------------------------------------------------------------- 1 | Ang 2 | Au-Yong 3 | Bai 4 | Ban 5 | Bao 6 | Bei 7 | Bian 8 | Bui 9 | Cai 10 | Cao 11 | Cen 12 | Chai 13 | Chaim 14 | Chan 15 | Chang 16 | Chao 17 | Che 18 | Chen 19 | Cheng 20 | Cheung 21 | Chew 22 | Chieu 23 | Chin 24 | Chong 25 | Chou 26 | Chu 27 | Cui 28 | Dai 29 | Deng 30 | Ding 31 | Dong 32 | Dou 33 | Duan 34 | Eng 35 | Fan 36 | Fei 37 | Feng 38 | Foong 39 | Fung 40 | Gan 41 | Gauk 42 | Geng 43 | Gim 44 | Gok 45 | Gong 46 | Guan 47 | Guang 48 | Guo 49 | Gwock 50 | Han 51 | Hang 52 | Hao 53 | Hew 54 | Hiu 55 | Hong 56 | Hor 57 | Hsiao 58 | Hua 59 | Huan 60 | Huang 61 | Hui 62 | Huie 63 | Huo 64 | Jia 65 | Jiang 66 | Jin 67 | Jing 68 | Joe 69 | Kang 70 | Kau 71 | Khoo 72 | Khu 73 | Kong 74 | Koo 75 | Kwan 76 | Kwei 77 | Kwong 78 | Lai 79 | Lam 80 | Lang 81 | Lau 82 | Law 83 | Lew 84 | Lian 85 | Liao 86 | Lim 87 | Lin 88 | Ling 89 | Liu 90 | Loh 91 | Long 92 | Loong 93 | Luo 94 | Mah 95 | Mai 96 | Mak 97 | Mao 98 | Mar 99 | Mei 100 | Meng 101 | Miao 102 | Min 103 | Ming 104 | Moy 105 | Mui 106 | Nie 107 | Niu 108 | Ou-Yang 109 | Ow-Yang 110 | Pan 111 | Pang 112 | Pei 113 | Peng 114 | Ping 115 | Qian 116 | Qin 117 | Qiu 118 | Quan 119 | Que 120 | Ran 121 | Rao 122 | Rong 123 | Ruan 124 | Sam 125 | Seah 126 | See 127 | Seow 128 | Seto 129 | Sha 130 | Shan 131 | Shang 132 | Shao 133 | Shaw 134 | She 135 | Shen 136 | Sheng 
137 | Shi 138 | Shu 139 | Shuai 140 | Shui 141 | Shum 142 | Siew 143 | Siu 144 | Song 145 | Sum 146 | Sun 147 | Sze 148 | Tan 149 | Tang 150 | Tao 151 | Teng 152 | Teoh 153 | Thean 154 | Thian 155 | Thien 156 | Tian 157 | Tong 158 | Tow 159 | Tsang 160 | Tse 161 | Tsen 162 | Tso 163 | Tze 164 | Wan 165 | Wang 166 | Wei 167 | Wen 168 | Weng 169 | Won 170 | Wong 171 | Woo 172 | Xiang 173 | Xiao 174 | Xie 175 | Xing 176 | Xue 177 | Xun 178 | Yan 179 | Yang 180 | Yao 181 | Yap 182 | Yau 183 | Yee 184 | Yep 185 | Yim 186 | Yin 187 | Ying 188 | Yong 189 | You 190 | Yuan 191 | Zang 192 | Zeng 193 | Zha 194 | Zhan 195 | Zhang 196 | Zhao 197 | Zhen 198 | Zheng 199 | Zhong 200 | Zhou 201 | Zhu 202 | Zhuo 203 | Zong 204 | Zou 205 | Bing 206 | Chi 207 | Chu 208 | Cong 209 | Cuan 210 | Dan 211 | Fei 212 | Feng 213 | Gai 214 | Gao 215 | Gou 216 | Guan 217 | Gui 218 | Guo 219 | Hong 220 | Hou 221 | Huan 222 | Jian 223 | Jiao 224 | Jin 225 | Jiu 226 | Juan 227 | Jue 228 | Kan 229 | Kuai 230 | Kuang 231 | Kui 232 | Lao 233 | Liang 234 | Lu: 235 | Luo 236 | Man 237 | Nao 238 | Pian 239 | Qiao 240 | Qing 241 | Qiu 242 | Rang 243 | Rui 244 | She 245 | Shi 246 | Shuo 247 | Sui 248 | Tai 249 | Wan 250 | Wei 251 | Xian 252 | Xie 253 | Xin 254 | Xing 255 | Xiong 256 | Xuan 257 | Yan 258 | Yin 259 | Ying 260 | Yuan 261 | Yue 262 | Yun 263 | Zha 264 | Zhai 265 | Zhang 266 | Zhi 267 | Zhuan 268 | Zhui 269 | -------------------------------------------------------------------------------- /2.advancedTextClassification/data/names/Czech.txt: -------------------------------------------------------------------------------- 1 | Abl 2 | Adsit 3 | Ajdrna 4 | Alt 5 | Antonowitsch 6 | Antonowitz 7 | Bacon 8 | Ballalatak 9 | Ballaltick 10 | Bartonova 11 | Bastl 12 | Baroch 13 | Benesch 14 | Betlach 15 | Biganska 16 | Bilek 17 | Blahut 18 | Blazek 19 | Blazek 20 | Blazejovsky 21 | Blecha 22 | Bleskan 23 | Blober 24 | Bock 25 | Bohac 26 | Bohunovsky 27 | Bolcar 28 | Borovka 29 | Borovski 30 | Borowski 31 | Borovsky 32 | Brabbery 33 | Brezovjak 34 | Brousil 35 | Bruckner 36 | Buchta 37 | Cablikova 38 | Camfrlova 39 | Cap 40 | Cerda 41 | Cermak 42 | Chermak 43 | Cermak 44 | Cernochova 45 | Cernohous 46 | Cerny 47 | Cerney 48 | Cerny 49 | Cerv 50 | Cervenka 51 | Chalupka 52 | Charlott 53 | Chemlik 54 | Chicken 55 | Chilar 56 | Chromy 57 | Cihak 58 | Clineburg 59 | Klineberg 60 | Cober 61 | Colling 62 | Cvacek 63 | Czabal 64 | Damell 65 | Demall 66 | Dehmel 67 | Dana 68 | Dejmal 69 | Dempko 70 | Demko 71 | Dinko 72 | Divoky 73 | Dolejsi 74 | Dolezal 75 | Doljs 76 | Dopita 77 | Drassal 78 | Driml 79 | Duyava 80 | Dvorak 81 | Dziadik 82 | Egr 83 | Entler 84 | Faltysek 85 | Faltejsek 86 | Fencl 87 | Fenyo 88 | Fillipova 89 | Finfera 90 | Finferovy 91 | Finke 92 | Fojtikova 93 | Fremut 94 | Friedrich 95 | Frierdich 96 | Fritsch 97 | Furtsch 98 | Gabrisova 99 | Gavalok 100 | Geier 101 | Georgijev 102 | Geryk 103 | Giersig 104 | Glatter 105 | Glockl 106 | Grabski 107 | Grozmanova 108 | Grulich 109 | Grygarova 110 | Hadash 111 | Hafernik 112 | Hajek 113 | Hajicek 114 | Hajkova 115 | Hana 116 | Hanek 117 | Hanek 118 | Hanika 119 | Hanusch 120 | Hanzlick 121 | Handzlik 122 | Hanzlik 123 | Harger 124 | Hartl 125 | Havlatova 126 | Havlice 127 | Hawlata 128 | Heidl 129 | Herback 130 | Herodes 131 | Hiorvst 132 | Hladky 133 | Hlavsa 134 | Hnizdil 135 | Hodowal 136 | Hodoval 137 | Holan 138 | Holub 139 | Homulka 140 | Hora 141 | Hovanec 142 | Hrabak 143 | Hradek 144 | Hrdy 145 | Hrula 146 | Hruska 147 | Hruskova 148 | Hudecek 149 | Husk 150 | 
Hynna 151 | Jaluvka 152 | Janca 153 | Janicek 154 | Jenicek 155 | Janacek 156 | Janick 157 | Janoch 158 | Janosik 159 | Janutka 160 | Jares 161 | Jarzembowski 162 | Jedlicka 163 | Jelinek 164 | Jindra 165 | Jirava 166 | Jirik 167 | Jirku 168 | Jirovy 169 | Jobst 170 | Jonas 171 | Kacirek 172 | Kafka 173 | Kafka 174 | Kaiser 175 | Kanak 176 | Kaplanek 177 | Kara 178 | Karlovsky 179 | Kasa 180 | Kasimor 181 | Kazimor 182 | Kazmier 183 | Katschker 184 | Kauphsman 185 | Kenzel 186 | Kerner 187 | Kesl 188 | Kessel 189 | Kessler 190 | Khork 191 | Kirchma 192 | Klein 193 | Klemper 194 | Klimes 195 | Kober 196 | Koberna 197 | Koci 198 | Kocian 199 | Kocian 200 | Kofron 201 | Kolacny 202 | Koliha 203 | Kolman 204 | Koma 205 | Komo 206 | Coma 207 | Konarik 208 | Kopp 209 | Kopecky 210 | Korandak 211 | Korycan 212 | Korycansky 213 | Kosko 214 | Kouba 215 | Kouba 216 | Koukal 217 | Koza 218 | Kozumplikova 219 | Kratschmar 220 | Krawiec 221 | Kreisinger 222 | Kremlacek 223 | Kremlicka 224 | Kreutschmer 225 | Krhovsky 226 | Krivan 227 | Krivolavy 228 | Kriz 229 | Kruessel 230 | Krupala 231 | Krytinar 232 | Kubin 233 | Kucera 234 | Kucharova 235 | Kudrna 236 | Kuffel 237 | Kupfel 238 | Kofel 239 | Kulhanek 240 | Kunik 241 | Kurtz 242 | Kusak 243 | Kvasnicka 244 | Lawa 245 | Linart 246 | Lind 247 | Lokay 248 | Loskot 249 | Ludwig 250 | Lynsmeier 251 | Macha 252 | Machacek 253 | Macikova 254 | Malafa 255 | Malec 256 | Malecha 257 | Maly 258 | Marek 259 | Marik 260 | Marik 261 | Markytan 262 | Matejka 263 | Matjeka 264 | Matocha 265 | Maxa/B 266 | Mayer 267 | Meier 268 | Merta 269 | Meszes 270 | Metjeka 271 | Michalovic 272 | Michalovicova 273 | Miksatkova 274 | Mojzis 275 | Mojjis 276 | Mozzis 277 | Molcan 278 | Monfort 279 | MonkoAustria 280 | Morava 281 | Morek 282 | Muchalon 283 | Mudra 284 | Muhlbauer 285 | Nadvornizch 286 | Nadwornik 287 | Navara 288 | Navratil 289 | Navratil 290 | Navrkal 291 | Nekuza 292 | Nemec 293 | Nemecek 294 | Nestrojil 295 | Netsch 296 | Neusser 297 | Neisser 298 | Naizer 299 | Novak 300 | Nowak 301 | Novotny 302 | Novy Novy 303 | Oborny 304 | Ocasek 305 | Ocaskova 306 | Oesterreicher 307 | Okenfuss 308 | Olbrich 309 | Ondrisek 310 | Opizka 311 | Opova 312 | Opp 313 | Osladil 314 | Ozimuk 315 | Pachr 316 | Palzewicz 317 | Panek 318 | Patril 319 | Pavlik 320 | Pavlicka 321 | Pavlu 322 | Pawlak 323 | Pear 324 | Peary 325 | Pech 326 | Peisar 327 | Paisar 328 | Paiser 329 | Perevuznik 330 | Perina 331 | Persein 332 | Petrezelka 333 | Petru 334 | Pesek 335 | Petersen 336 | Pfeifer 337 | Picha 338 | Pillar 339 | Pellar 340 | Piller 341 | Pinter 342 | Pitterman 343 | Planick 344 | Piskach 345 | Plisek 346 | Plisko 347 | Pokorny 348 | Ponec 349 | Ponec 350 | Prachar 351 | Praseta 352 | Prchal 353 | Prehatney 354 | Pretsch 355 | Prill 356 | Psik 357 | Pudel 358 | Purdes 359 | Quasninsky 360 | Raffel 361 | Rafaj1 362 | Ransom 363 | Rezac 364 | Riedel 365 | Riha 366 | Riha 367 | Ritchie 368 | Rozinek 369 | Ruba 370 | Ruda 371 | Rumisek 372 | Ruzicka 373 | Rypka 374 | Rebka 375 | Rzehak 376 | Sabol 377 | Safko 378 | Samz 379 | Sankovsky 380 | Sappe 381 | Sappe 382 | Sarna 383 | Satorie 384 | Savchak 385 | Svotak 386 | Swatchak 387 | Svocak 388 | Svotchak 389 | Schallom 390 | Schenk 391 | Schlantz 392 | Schmeiser 393 | Schneider 394 | Schmied 395 | Schubert 396 | Schwarz 397 | Schwartz 398 | Sedmik 399 | Sedmikova 400 | Seger 401 | Sekovora 402 | Semick 403 | Serak 404 | Sherak 405 | Shima 406 | Shula 407 | Siegl 408 | Silhan 409 | Simecek 410 | Simodines 411 | Simonek 412 | Sip 413 | 
Sitta 414 | Skala 415 | Skeril 416 | Skokan 417 | Skomicka 418 | Skwor 419 | Slapnickova 420 | Slejtr 421 | Slepicka 422 | Slepica 423 | Slezak 424 | Slivka 425 | Smith 426 | Snelker 427 | Sokolik 428 | Soucek 429 | Soukup 430 | Soukup 431 | Spicka 432 | Spoerl 433 | Sponer 434 | Srda 435 | Srpcikova 436 | Stangl 437 | Stanzel 438 | Stary 439 | Staska 440 | Stedronsky 441 | Stegon 442 | Sztegon 443 | Steinborn 444 | Stepan 445 | Stites 446 | Stluka 447 | Stotzky 448 | StrakaO 449 | Stramba 450 | Stupka 451 | Subertova 452 | Suchanka 453 | Sula 454 | Svejda 455 | Svejkovsky 456 | Svoboda 457 | Tejc 458 | Tikal 459 | Tykal 460 | Till 461 | Timpe 462 | Timpy 463 | Toman 464 | Tomanek 465 | Tomasek 466 | Tomes 467 | Trampotova 468 | Trampota 469 | Treblik 470 | Trnkova 471 | Uerling 472 | Uhlik 473 | Urbanek 474 | Urbanek1 475 | Urbanovska 476 | Urista 477 | Ustohal 478 | Vaca 479 | Vaculova 480 | Vavra 481 | Vejvoda 482 | Veverka 483 | Victor 484 | Vlach 485 | Vlach 486 | Vlasak 487 | Vlasek 488 | Volcik 489 | Voneve 490 | Votke 491 | Vozab 492 | Vrazel 493 | Vykruta 494 | Wykruta 495 | Waclauska 496 | Weichert 497 | Weineltk 498 | Weisener 499 | Wiesner 500 | Wizner 501 | Weiss 502 | Werlla 503 | Whitmire1 504 | Widerlechner 505 | Wilchek 506 | Wondracek 507 | Wood 508 | Zajicek 509 | Zak 510 | Zajicek 511 | Zaruba 512 | Zaruba 513 | Zelinka 514 | Zeman 515 | Zimola 516 | Zipperer 517 | Zitka 518 | Zoucha 519 | Zwolenksy 520 | -------------------------------------------------------------------------------- /2.advancedTextClassification/data/names/Dutch.txt: -------------------------------------------------------------------------------- 1 | Aalsburg 2 | Aalst 3 | Aarle 4 | Achteren 5 | Achthoven 6 | Adrichem 7 | Aggelen 8 | Agteren 9 | Agthoven 10 | Akkeren 11 | Aller 12 | Alphen 13 | Alst 14 | Altena 15 | Althuis 16 | Amelsvoort 17 | Amersvoort 18 | Amstel 19 | Andel 20 | Andringa 21 | Ankeren 22 | Antwerp 23 | Antwerpen 24 | Apeldoorn 25 | Arendonk 26 | Asch 27 | Assen 28 | Baarle 29 | Bokhoven 30 | Breda 31 | Bueren 32 | Buggenum 33 | Buiren 34 | Buren 35 | Can 36 | Cann 37 | Canne 38 | Daal 39 | Daalen 40 | Dael 41 | Daele 42 | Dale 43 | Dalen 44 | Laar 45 | Vliert 46 | Akker 47 | Andel 48 | Denend 49 | Aart 50 | Beek 51 | Berg 52 | Hout 53 | Laar 54 | See 55 | Stoep 56 | Veen 57 | Ven 58 | Venn 59 | Venne 60 | Vennen 61 | Zee 62 | Donk 63 | Haanraads 64 | Haanraats 65 | Haanrade 66 | Haanrath 67 | Haenraats 68 | Haenraets 69 | Hanraets 70 | Hassel 71 | Hautem 72 | Hautum 73 | Heel 74 | Herten 75 | Hofwegen 76 | Horn 77 | Hout 78 | Houte 79 | Houtem 80 | Houten 81 | Houttum 82 | Houtum 83 | Kan 84 | Kann 85 | Kanne 86 | Kappel 87 | Karl 88 | Kikkert 89 | Klein 90 | Klerk 91 | Klerken 92 | Klerks 93 | Klerkse 94 | Klerkx 95 | Klerx 96 | Kloet 97 | Kloeten 98 | Kloeter 99 | Koeman 100 | Koemans 101 | Kolen 102 | Kolijn 103 | Kollen 104 | Koning 105 | Kool 106 | Koole 107 | Koolen 108 | Kools 109 | Kouman 110 | Koumans 111 | Krantz 112 | Kranz 113 | Krusen 114 | Kuijpers 115 | Kuiper 116 | Kuipers 117 | Laar 118 | Langbroek 119 | Laren 120 | Lauwens 121 | Lauwers 122 | Leeuwenhoeck 123 | Leeuwenhoek 124 | Leeuwenhoek 125 | Lucas 126 | Lucassen 127 | Lyon 128 | Maas 129 | Maes 130 | Maessen 131 | Marquering 132 | Marqueringh 133 | Marquerink 134 | Mas 135 | Meeuwe 136 | Meeuwes 137 | Meeuwessen 138 | Meeuweszen 139 | Meeuwis 140 | Meeuwissen 141 | Meeuwsen 142 | Meisner 143 | Merckx 144 | Mertens 145 | Michel 146 | Middelburg 147 | Middlesworth 148 | Mohren 149 | Mooren 150 | Mulder 151 | 
Muyskens 152 | Nagel 153 | Nelissen 154 | Nifterick 155 | Nifterick 156 | Nifterik 157 | Nifterik 158 | Niftrik 159 | Niftrik 160 | Offermans 161 | Ogterop 162 | Ogtrop 163 | Oirschot 164 | Oirschotten 165 | Oomen 166 | Oorschot 167 | Oorschot 168 | Ophoven 169 | Otten 170 | Pander 171 | Panders 172 | Paulis 173 | Paulissen 174 | Peerenboom 175 | Peeters 176 | Peij 177 | Pender 178 | Penders 179 | Pennders 180 | Penner 181 | Penners 182 | Peter 183 | Peusen 184 | Pey 185 | Philips 186 | Prinsen 187 | Rademaker 188 | Rademakers 189 | Ramaaker 190 | Ramaker 191 | Ramakers 192 | Ramecker 193 | Rameckers 194 | Raske 195 | Reijnder 196 | Reijnders 197 | Reinder 198 | Reinders 199 | Reynder 200 | Reynders 201 | Richard 202 | Rietveld 203 | Rijnder 204 | Rijnders 205 | Robert 206 | Roggeveen 207 | Roijacker 208 | Roijackers 209 | Roijakker 210 | Roijakkers 211 | Romeijn 212 | Romeijnders 213 | Romeijnsen 214 | Romijn 215 | Romijnders 216 | Romijnsen 217 | Rompa 218 | Rompa 219 | Rompaeij 220 | Rompaey 221 | Rompaij 222 | Rompay 223 | Rompaye 224 | Rompu 225 | Rompuy 226 | Rooiakker 227 | Rooiakkers 228 | Rooijakker 229 | Rooijakkers 230 | Roosa 231 | Roosevelt 232 | Rossem 233 | Rossum 234 | Rumpade 235 | Rutten 236 | Ryskamp 237 | Samson 238 | Sanna 239 | Schenck 240 | Schermer 241 | Schneider 242 | Schneiders 243 | Schneijder 244 | Schneijders 245 | Schoonenburg 246 | Schoonraad 247 | Schoorel 248 | Schoorel 249 | Schoorl 250 | Schorel 251 | Schrijnemakers 252 | Schuyler 253 | Schwarzenberg 254 | Seeger 255 | Seegers 256 | Seelen 257 | Segers 258 | Segher 259 | Seghers 260 | Severijns 261 | Severins 262 | Sevriens 263 | Silje 264 | Simon 265 | Simonis 266 | Slootmaekers 267 | Smeets 268 | Smets 269 | Smit 270 | Smits 271 | Snaaijer 272 | Snaijer 273 | Sneiders 274 | Sneijder 275 | Sneijders 276 | Sneijer 277 | Sneijers 278 | Snell 279 | Snider 280 | Sniders 281 | Snijder 282 | Snijders 283 | Snyder 284 | Snyders 285 | Specht 286 | Spijker 287 | Spiker 288 | Ter Avest 289 | Teunissen 290 | Theunissen 291 | Tholberg 292 | Tillens 293 | Tunison 294 | Tunneson 295 | Vandale 296 | Vandroogenbroeck 297 | Vann 298 | -------------------------------------------------------------------------------- /2.advancedTextClassification/data/names/French.txt: -------------------------------------------------------------------------------- 1 | Abel 2 | Abraham 3 | Adam 4 | Albert 5 | Allard 6 | Archambault 7 | Armistead 8 | Arthur 9 | Augustin 10 | Babineaux 11 | Baudin 12 | Beauchene 13 | Beaulieu 14 | Beaumont 15 | Bélanger 16 | Bellamy 17 | Bellerose 18 | Belrose 19 | Berger 20 | Béringer 21 | Bernard 22 | Bertrand 23 | Bisset 24 | Bissette 25 | Blaise 26 | Blanc 27 | Blanchet 28 | Blanchett 29 | Bonfils 30 | Bonheur 31 | Bonhomme 32 | Bonnaire 33 | Bonnay 34 | Bonner 35 | Bonnet 36 | Borde 37 | Bordelon 38 | Bouchard 39 | Boucher 40 | Brisbois 41 | Brodeur 42 | Bureau 43 | Caron 44 | Cavey 45 | Chaput 46 | Charbonneau 47 | Charpentier 48 | Charron 49 | Chastain 50 | Chevalier 51 | Chevrolet 52 | Cloutier 53 | Colbert 54 | Comtois 55 | Cornett 56 | Coté 57 | Coupe 58 | Courtemanche 59 | Cousineau 60 | Couture 61 | Daniau 62 | D'aramitz 63 | Daviau 64 | David 65 | Deforest 66 | Degarmo 67 | Delacroix 68 | De la fontaine 69 | Deniau 70 | Deniaud 71 | Deniel 72 | Denis 73 | De sauveterre 74 | Deschamps 75 | Descoteaux 76 | Desjardins 77 | Desrochers 78 | Desrosiers 79 | Dubois 80 | Duchamps 81 | Dufort 82 | Dufour 83 | Duguay 84 | Dupond 85 | Dupont 86 | Durand 87 | Durant 88 | Duval 89 | Émile 90 | Eustis 91 | 
Fabian 92 | Fabre 93 | Fabron 94 | Faucher 95 | Faucheux 96 | Faure 97 | Favager 98 | Favre 99 | Favreau 100 | Fay 101 | Félix 102 | Firmin 103 | Fontaine 104 | Forest 105 | Forestier 106 | Fortier 107 | Foss 108 | Fournier 109 | Gage 110 | Gagne 111 | Gagnier 112 | Gagnon 113 | Garcon 114 | Gardinier 115 | Germain 116 | Géroux 117 | Giles 118 | Girard 119 | Giroux 120 | Glaisyer 121 | Gosse 122 | Gosselin 123 | Granger 124 | Guérin 125 | Guillory 126 | Hardy 127 | Harman 128 | Hébert 129 | Herbert 130 | Herriot 131 | Jacques 132 | Janvier 133 | Jordan 134 | Joubert 135 | Labelle 136 | Lachance 137 | Lachapelle 138 | Lamar 139 | Lambert 140 | Lane 141 | Langlais 142 | Langlois 143 | Lapointe 144 | Larue 145 | Laurent 146 | Lavigne 147 | Lavoie 148 | Leandres 149 | Lebeau 150 | Leblanc 151 | Leclair 152 | Leclerc 153 | Lécuyer 154 | Lefebvre 155 | Lefévre 156 | Lefurgey 157 | Legrand 158 | Lemaire 159 | Lémieux 160 | Leon 161 | Leroy 162 | Lesauvage 163 | Lestrange 164 | Lévêque 165 | Lévesque 166 | Linville 167 | Lyon 168 | Lyon 169 | Maçon 170 | Marchand 171 | Marie 172 | Marion 173 | Martel 174 | Martel 175 | Martin 176 | Masson 177 | Masson 178 | Mathieu 179 | Mercier 180 | Merle 181 | Michaud 182 | Michel 183 | Monet 184 | Monette 185 | Montagne 186 | Moreau 187 | Moulin 188 | Mullins 189 | Noel 190 | Oliver 191 | Olivier 192 | Page 193 | Paget 194 | Palomer 195 | Pan 196 | Pape 197 | Paquet 198 | Paquet 199 | Parent 200 | Paris 201 | Parris 202 | Pascal 203 | Patenaude 204 | Paternoster 205 | Paul 206 | Pelletier 207 | Perrault 208 | Perreault 209 | Perrot 210 | Petit 211 | Pettigrew 212 | Pierre 213 | Plamondon 214 | Plourde 215 | Poingdestre 216 | Poirier 217 | Porcher 218 | Poulin 219 | Proulx 220 | Renaud 221 | Rey 222 | Reyer 223 | Richard 224 | Richelieu 225 | Robert 226 | Roche 227 | Rome 228 | Romilly 229 | Rose 230 | Rousseau 231 | Roux 232 | Roy 233 | Royer 234 | Salomon 235 | Salvage 236 | Samson 237 | Samuel 238 | Sargent 239 | Sarkozi 240 | Sarkozy 241 | Sartre 242 | Sault 243 | Sauvage 244 | Sauvageau 245 | Sauvageon 246 | Sauvageot 247 | Sauveterre 248 | Savatier 249 | Segal 250 | Sergeant 251 | Séverin 252 | Simon 253 | Solomon 254 | Soucy 255 | St martin 256 | St pierre 257 | Tailler 258 | Tasse 259 | Thayer 260 | Thibault 261 | Thomas 262 | Tobias 263 | Tolbert 264 | Traver 265 | Travere 266 | Travers 267 | Traverse 268 | Travert 269 | Tremblay 270 | Tremble 271 | Victor 272 | Victors 273 | Villeneuve 274 | Vincent 275 | Vipond 276 | Voclain 277 | Yount 278 | -------------------------------------------------------------------------------- /2.advancedTextClassification/data/names/German.txt: -------------------------------------------------------------------------------- 1 | Abbing 2 | Abel 3 | Abeln 4 | Abt 5 | Achilles 6 | Achterberg 7 | Acker 8 | Ackermann 9 | Adam 10 | Adenauer 11 | Adler 12 | Adlersflügel 13 | Aeschelman 14 | Albert 15 | Albrecht 16 | Aleshire 17 | Aleshite 18 | Althaus 19 | Amsel 20 | Andres 21 | Armbrüster 22 | Armbruster 23 | Artz 24 | Aue 25 | Auer 26 | Augustin 27 | Aust 28 | Autenburg 29 | Auttenberg 30 | Baasch 31 | Bach 32 | Bachmeier 33 | Bäcker 34 | Bader 35 | Bähr 36 | Bambach 37 | Bauer 38 | Bauers 39 | Baum 40 | Baumann 41 | Baumbach 42 | Baumgärtner 43 | Baumgartner 44 | Baumhauer 45 | Bayer 46 | Beck 47 | Becke 48 | Beckenbauer 49 | Becker 50 | Beckert 51 | Behrend 52 | Behrends 53 | Beitel 54 | Beltz 55 | Benn 56 | Berg 57 | Berger 58 | Bergfalk 59 | Beringer 60 | Bernat 61 | Best 62 | Beutel 63 | Beyer 64 | Beyersdorf 65 | Bieber 
66 | Biermann 67 | Bischoffs 68 | Blau 69 | Blecher 70 | Bleier 71 | Blumenthal 72 | Blumstein 73 | Bocker 74 | Boehler 75 | Boer 76 | Boesch 77 | Böhler 78 | Böhm 79 | Böhme 80 | Böhmer 81 | Bohn 82 | Borchard 83 | Bösch 84 | Bosch 85 | Böttcher 86 | Brahms 87 | Brand 88 | Brandt 89 | Brant 90 | Brauer 91 | Braun 92 | Braune 93 | Breiner 94 | Breisacher 95 | Breitbarth 96 | Bretz 97 | Brinkerhoff 98 | Brodbeck 99 | Brose 100 | Brotz 101 | Bruhn 102 | Brun 103 | Brune 104 | Buchholz 105 | Buckholtz 106 | Buhr 107 | Bumgarner 108 | Burgstaller 109 | Busch 110 | Carver 111 | Chevrolet 112 | Cline 113 | Dahl 114 | Denzel 115 | Derrick 116 | Diefenbach 117 | Dieter 118 | Dietrich 119 | Dirchs 120 | Dittmar 121 | Dohman 122 | Drechsler 123 | Dreher 124 | Dreschner 125 | Dresdner 126 | Dressler 127 | Duerr 128 | Dunkle 129 | Dunst 130 | Dürr 131 | Eberhardt 132 | Ebner 133 | Ebner 134 | Eckstein 135 | Egger 136 | Eichel 137 | Eilerts 138 | Engel 139 | Enns 140 | Esser 141 | Essert 142 | Everhart 143 | Fabel 144 | Faerber 145 | Falk 146 | Falkenrath 147 | Färber 148 | Fashingbauer 149 | Faust 150 | Feigenbaum 151 | Feld 152 | Feldt 153 | Fenstermacher 154 | Fertig 155 | Fiedler 156 | Fischer 157 | Flater 158 | Fleischer 159 | Foerstner 160 | Forst 161 | Förstner 162 | Foth 163 | Frank 164 | Franke 165 | Frei 166 | Freud 167 | Freudenberger 168 | Freund 169 | Fried 170 | Friedrich 171 | Fromm 172 | Frost 173 | Fuchs 174 | Fuhrmann 175 | Fürst 176 | Fux 177 | Gabler 178 | Gaertner 179 | Garb 180 | Garber 181 | Gärtner 182 | Garver 183 | Gass 184 | Gehrig 185 | Gehring 186 | Geier 187 | Geiger 188 | Geisler 189 | Geissler 190 | Geiszler 191 | Gensch 192 | Gerber 193 | Gerhard 194 | Gerhardt 195 | Gerig 196 | Gerst 197 | Gerstle 198 | Gerver 199 | Giehl 200 | Giese 201 | Glöckner 202 | Goebel 203 | Goldschmidt 204 | Gorman 205 | Gott 206 | Gotti 207 | Gottlieb 208 | Gottschalk 209 | Graner 210 | Greenberg 211 | Groos 212 | Gros 213 | Gross 214 | Groß 215 | Große 216 | Grosse 217 | Größel 218 | Großel 219 | Großer 220 | Grosser 221 | Grosz 222 | Grünewald 223 | Günther 224 | Gunther 225 | Gutermuth 226 | Gwerder 227 | Haas 228 | Haase 229 | Haber 230 | Habich 231 | Habicht 232 | Hafner 233 | Hahn 234 | Hall 235 | Halle 236 | Harman 237 | Hartmann 238 | Hase 239 | Hasek 240 | Hasenkamp 241 | Hass 242 | Hauer 243 | Haupt 244 | Hausler 245 | Havener 246 | Heidrich 247 | Heinrich 248 | Heinrichs 249 | Heintze 250 | Hellewege 251 | Heppenheimer 252 | Herbert 253 | Hermann 254 | Herrmann 255 | Herschel 256 | Hertz 257 | Hildebrand 258 | Hinrichs 259 | Hintzen 260 | Hirsch 261 | Hoch 262 | Hochberg 263 | Hoefler 264 | Hofer 265 | Hoffman 266 | Hoffmann 267 | Höfler 268 | Hofmann 269 | Hofmeister 270 | Holst 271 | Holtzer 272 | Hölzer 273 | Holzer 274 | Holzknecht 275 | Holzmann 276 | Hoover 277 | Horn 278 | Horn 279 | Horowitz 280 | Houk 281 | Hüber 282 | Huber 283 | Huff 284 | Huffman 285 | Huffmann 286 | Hummel 287 | Hummel 288 | Hutmacher 289 | Ingersleben 290 | Jaeger 291 | Jäger 292 | Jager 293 | Jans 294 | Janson 295 | Janz 296 | Jollenbeck 297 | Jordan 298 | Jund 299 | Jung 300 | Junge 301 | Kahler 302 | Kaiser 303 | Kalb 304 | Kalbfleisch 305 | Kappel 306 | Karl 307 | Kaspar 308 | Kassmeyer 309 | Kästner 310 | Katz 311 | Kaube 312 | Käufer 313 | Kaufer 314 | Kauffmann 315 | Kaufman 316 | Keil 317 | Keller 318 | Kempf 319 | Kerner 320 | Kerper 321 | Kerwar 322 | Kerwer 323 | Kiefer 324 | Kiefer 325 | Kirchner 326 | Kistler 327 | Kistner 328 | Kleid 329 | Klein 330 | Klossner 331 | Knef 332 | Kneib 333 
| Kneller 334 | Knepp 335 | Knochenmus 336 | Knopf 337 | Knopp 338 | Koch 339 | Kock 340 | Koenig 341 | Koenigsmann 342 | Köhl 343 | Kohl 344 | Köhler 345 | Kohler 346 | Kolbe 347 | König 348 | Königsmann 349 | Kopp 350 | Kraemer 351 | Krämer 352 | Kramer 353 | Krantz 354 | Kranz 355 | Kraus 356 | Krause 357 | Krauss 358 | Krauß 359 | Krebs 360 | Kröger 361 | Kron 362 | Kruckel 363 | Krüger 364 | Krüger 365 | Kruger 366 | Kruse 367 | Kruse 368 | Küchler 369 | Kuhn 370 | Kundert 371 | Kunkel 372 | Kunkle 373 | Kuntz 374 | Kunze 375 | Kurzmann 376 | Laberenz 377 | Lafrentz 378 | Lafrenz 379 | Landau 380 | Lang 381 | Lange 382 | Langenberg 383 | Langer 384 | Larenz 385 | Laurenz 386 | Lauritz 387 | Lawerenz 388 | Lawrenz 389 | Lehmann 390 | Lehrer 391 | Leitner 392 | Leitz 393 | Leitzke 394 | Lenz 395 | Leverenz 396 | Lewerentz 397 | Lewerenz 398 | Lichtenberg 399 | Lieberenz 400 | Linden 401 | Loewe 402 | Lohrenz 403 | Lorentz 404 | Lorenz 405 | Lorenzen 406 | Loris 407 | Loritz 408 | Löwe 409 | Ludwig 410 | Luther 411 | Maas 412 | Maier 413 | Mandel 414 | Mann 415 | Markwardt 416 | Marquardt 417 | Marquering 418 | Marquerink 419 | Martell 420 | Martin 421 | Martz 422 | Mas 423 | Maurer 424 | Maus 425 | Mayer 426 | Meier 427 | Mein 428 | Meindl 429 | Meinhardt 430 | Meisner 431 | Meissner 432 | Melsbach 433 | Mendel 434 | Mendelsohn 435 | Mendelssohn 436 | Messer 437 | Messerli 438 | Messmann 439 | Messner 440 | Metz 441 | Metz 442 | Metzger 443 | Meyer 444 | Michel 445 | Mohren 446 | Möller 447 | Morgenstern 448 | Moser 449 | Mueller 450 | Muhlfeld 451 | Müller 452 | Nagel 453 | Neuman 454 | Neumann 455 | Nuremberg 456 | Nussbaum 457 | Nussenbaum 458 | Oberst 459 | Oelberg 460 | Ohme 461 | Oliver 462 | Oppenheimer 463 | Ott 464 | Otto 465 | Oursler 466 | Pahlke 467 | Papke 468 | Papp 469 | Paternoster 470 | Paul 471 | Paulis 472 | Pawlitzki 473 | Penzig 474 | Peter 475 | Peters 476 | Pfaff 477 | Pfenning 478 | Plank 479 | Pletcher 480 | Porsche 481 | Portner 482 | Prinz 483 | Protz 484 | Rademacher 485 | Rademaker 486 | Rapp 487 | Raske 488 | Raskob 489 | Raskop 490 | Raskoph 491 | Regenbogen 492 | Reier 493 | Reiher 494 | Reiter 495 | Rettig 496 | Reuter 497 | Reuter 498 | Richard 499 | Richter 500 | Rier 501 | Riese 502 | Ritter 503 | Rose 504 | Rosenberg 505 | Rosenberger 506 | Rosenfeld 507 | Rot 508 | Roth 509 | Rothbauer 510 | Rothenberg 511 | Rothschild 512 | Sachs 513 | Saller 514 | Saller 515 | Salomon 516 | Salzwedel 517 | Samuel 518 | Sander 519 | Sauber 520 | Schäfer 521 | Scheer 522 | Scheinberg 523 | Schenck 524 | Schermer 525 | Schindler 526 | Schirmer 527 | Schlender 528 | Schlimme 529 | Schlusser 530 | Schmeling 531 | Schmid 532 | Schmidt 533 | Schmitt 534 | Schmitz 535 | Schneider 536 | Schnoor 537 | Schnur 538 | Schoettmer 539 | Schräder 540 | Schrader 541 | Schreck 542 | Schreier 543 | Schröder 544 | Schröder 545 | Schroeder 546 | Schroeter 547 | Schröter 548 | Schubert 549 | Schuchard 550 | Schuchardt 551 | Schuchert 552 | Schuhart 553 | Schuhmacher 554 | Schuler 555 | Schult 556 | Schulte 557 | Schultes 558 | Schultheis 559 | Schultheiss 560 | Schultheiß 561 | Schultz 562 | Schultze 563 | Schulz 564 | Schulze 565 | Schumacher 566 | Schuster 567 | Schuttmann 568 | Schwangau 569 | Schwartz 570 | Schwarz 571 | Schwarzenegger 572 | Schwenke 573 | Schwinghammer 574 | Seelenfreund 575 | Seidel 576 | Senft 577 | Senft 578 | Sheinfeld 579 | Shriver 580 | Siegel 581 | Siegel 582 | Siekert 583 | Siemon 584 | Silverstein 585 | Simen 586 | Simmon 587 | Simon 588 | Simons 589 | 
Siskin 590 | Siskind 591 | Sitz 592 | Sitz 593 | Slusser 594 | Solberg 595 | Sommer 596 | Sommer 597 | Sommer 598 | Sommer 599 | Sonnen 600 | Sorg 601 | Sorge 602 | Spannagel 603 | Specht 604 | Spellmeyer 605 | Spitznogle 606 | Sponaugle 607 | Stark 608 | Stauss 609 | Steen 610 | Steffen 611 | Stein 612 | Steinmann 613 | Stenger 614 | Sternberg 615 | Steube 616 | Steuben 617 | Stieber 618 | Stoppelbein 619 | Stoppelbein 620 | Strand 621 | Straub 622 | Strobel 623 | Strohkirch 624 | Stroman 625 | Stuber 626 | Stueck 627 | Stumpf 628 | Sturm 629 | Suess 630 | Sulzbach 631 | Swango 632 | Switzer 633 | Tangeman 634 | Tanzer 635 | Teufel 636 | Tiedeman 637 | Tifft 638 | Tillens 639 | Tobias 640 | Tolkien 641 | Tresler 642 | Tritten 643 | Trumbauer 644 | Tschida 645 | Unkle 646 | Unruh 647 | Unterbrink 648 | Ursler 649 | Vann 650 | Van tonder 651 | Vieth 652 | Vogel 653 | Vogt 654 | Vogts 655 | Voigt 656 | Voigts 657 | Volk 658 | Voll 659 | Von brandt 660 | Von essen 661 | Von grimmelshausen 662 | Von ingersleben 663 | Vonnegut 664 | Von wegberg 665 | Voss 666 | Voß 667 | Wägner 668 | Wagner 669 | Wähner 670 | Wahner 671 | Waldfogel 672 | Waldvogel 673 | Walkenhorst 674 | Walter 675 | Walther 676 | Waltz 677 | Wang 678 | Warner 679 | Waxweiler 680 | Weber 681 | Wechsler 682 | Wedekind 683 | Weeber 684 | Wegener 685 | Wegner 686 | Wehner 687 | Wehunt 688 | Weigand 689 | Weiman 690 | Weiner 691 | Weiss 692 | Weiß 693 | Welter 694 | Wendel 695 | Wendell 696 | Werner 697 | Wernher 698 | West 699 | Westerberg 700 | Wetterman 701 | Wetzel 702 | Wexler 703 | Wieck 704 | Wiegand 705 | Wildgrube 706 | Winter 707 | Winther 708 | Winther 709 | Wirner 710 | Wirnhier 711 | Wirt 712 | Wirth 713 | Wolf 714 | Wolff 715 | Wolter 716 | Wörner 717 | Wörnhör 718 | Wruck 719 | Wyman 720 | Xylander 721 | Zellweger 722 | Zilberschlag 723 | Zimmerman 724 | Zimmermann 725 | -------------------------------------------------------------------------------- /2.advancedTextClassification/data/names/Greek.txt: -------------------------------------------------------------------------------- 1 | Adamidis 2 | Adamou 3 | Agelakos 4 | Akrivopoulos 5 | Alexandropoulos 6 | Anetakis 7 | Angelopoulos 8 | Antimisiaris 9 | Antipas 10 | Antonakos 11 | Antoniadis 12 | Antonopoulos 13 | Antonopoulos 14 | Antonopoulos 15 | Arvanitoyannis 16 | Avgerinos 17 | Banos 18 | Batsakis 19 | Bekyros 20 | Belesis 21 | Bertsimas 22 | Bilias 23 | Blades 24 | Bouloukos 25 | Brisimitzakis 26 | Bursinos 27 | Calogerakis 28 | Calpis 29 | Chellos 30 | Christakos 31 | Christodoulou 32 | Christou 33 | Chrysanthopoulos 34 | Chrysanthopoulos 35 | Comino 36 | Close 37 | Close 38 | Close 39 | Close 40 | Close 41 | Close 42 | Close 43 | Close 44 | Dalianis 45 | Danas 46 | Dasios 47 | Demakis 48 | Demarchis 49 | Demas 50 | Demetrious 51 | Dertilis 52 | Diakogeorgiou 53 | Dioletis 54 | Dounias 55 | Dritsas 56 | Drivakis 57 | Eatros 58 | Egonidis 59 | Eliopoulos 60 | Forakis 61 | Fotopoulos 62 | Fourakis 63 | Frangopoulos 64 | Galanopoulos 65 | Garofalis 66 | Gavril 67 | Gavrilopoulos 68 | Georgeakopoulos 69 | Geracimos 70 | Gianakopulos 71 | Giannakopoulos 72 | Giannakos 73 | Glynatsis 74 | Gomatos 75 | Grammatakakis 76 | Gravari 77 | Hadjiyianakies 78 | Hagias 79 | Haritopoulos 80 | Honjas 81 | Horiatis 82 | Houlis 83 | Jamussa 84 | Kaglantge 85 | Kalakos 86 | Kalogeria 87 | Kaloxylos 88 | Kanavos 89 | Kapsimalles 90 | Karahalios 91 | Karameros 92 | Karkampasis 93 | Karnoupakis 94 | Katsourinis 95 | Kefalas 96 | Kokkali 97 | Kokoris 98 | Kolovos 99 | Konstantatos 
100 | Kosmas 101 | Kotsilimbas 102 | Kotsiopoulos 103 | Kouches 104 | Koulaxizis 105 | Koumanidis 106 | Kourempes 107 | Kouretas 108 | Kouropoulos 109 | Kouros 110 | Koustoubos 111 | Koutsoubos 112 | Kreskas 113 | Kringos 114 | Kyritsis 115 | Laganas 116 | Leontarakis 117 | Letsos 118 | Liatos 119 | Lillis 120 | Lolos 121 | Louverdis 122 | Makricosta 123 | Malihoudis 124 | Maneates 125 | Manos 126 | Manoukarakis 127 | Matsoukis 128 | Mentis 129 | Mersinias 130 | Metrofanis 131 | Michalaras 132 | Milionis 133 | Missiakos 134 | Moraitopoulos 135 | Nikolaou 136 | Nomikos 137 | Paitakes 138 | Paloumbas 139 | Panayiotopoulos 140 | Panoulias 141 | Pantelakos 142 | Pantelas 143 | Papadelias 144 | Papadopulos 145 | Papageorge 146 | Papoutsis 147 | Pappayiorgas 148 | Paraskevopoulos 149 | Paraskos 150 | Paschalis 151 | Patrianakos 152 | Patselas 153 | Pefanis 154 | Petimezas 155 | Petrakis 156 | Pezos 157 | Phocas 158 | Pispinis 159 | Polites 160 | Polymenakou 161 | Poniros 162 | Protopsaltis 163 | Rallis 164 | Rigatos 165 | Rorris 166 | Rousses 167 | Ruvelas 168 | Sakelaris 169 | Sakellariou 170 | Samios 171 | Sardelis 172 | Sfakianos 173 | Sklavenitis 174 | Sortras 175 | Sotiris 176 | Spyridis 177 | Stamatas 178 | Stamatelos 179 | Stavropoulos 180 | Strilakos 181 | Stroggylis 182 | Tableriou 183 | Taflambas 184 | Tassioglou 185 | Telis 186 | Tsoumada 187 | Theofilopoulos 188 | Theohari 189 | Totolos 190 | Tourna 191 | Tsahalis 192 | Tsangaris 193 | Tselios 194 | Tsogas 195 | Vamvakidis 196 | Varvitsiotes 197 | Vassilikos 198 | Vassilopulos 199 | Vlahos 200 | Vourlis 201 | Xydis 202 | Zaloumi 203 | Zouvelekis 204 | -------------------------------------------------------------------------------- /2.advancedTextClassification/data/names/Irish.txt: -------------------------------------------------------------------------------- 1 | Adam 2 | Ahearn 3 | Aodh 4 | Aodha 5 | Aonghuis 6 | Aonghus 7 | Bhrighde 8 | Bradach 9 | Bradan 10 | Braden 11 | Brady 12 | Bran 13 | Brannon 14 | Brian 15 | Callaghan 16 | Caomh 17 | Carey 18 | Casey 19 | Cassidy 20 | Cathain 21 | Cathan 22 | Cathasach 23 | Ceallach 24 | Ceallachan 25 | Cearbhall 26 | Cennetig 27 | Ciardha 28 | Clark 29 | Cleirich 30 | Cleirigh 31 | Cnaimhin 32 | Coghlan 33 | Coilean 34 | Collins 35 | Colman 36 | Conall 37 | Conchobhar 38 | Conn 39 | Connell 40 | Connolly 41 | Cormac 42 | Corraidhin 43 | Cuidightheach 44 | Curran 45 | Dúbhshlaine 46 | Dalach 47 | Daly 48 | Damhain 49 | Damhan 50 | Delaney 51 | Desmond 52 | Devin 53 | Diarmaid 54 | Doherty 55 | Domhnall 56 | Donnchadh 57 | Donndubhan 58 | Donnell 59 | Donoghue 60 | Donovan 61 | Doyle 62 | Dubhain 63 | Dubhan 64 | Duncan 65 | Eoghan 66 | Eoin 67 | Eoin 68 | Faolan 69 | Farrell 70 | Fearghal 71 | Fergus 72 | Finn 73 | Finnegan 74 | Fionn 75 | Flanagan 76 | Flann 77 | Flynn 78 | Gallchobhar 79 | Gerald 80 | Giolla 81 | Gorman 82 | Hayden 83 | Ivor 84 | John 85 | Kavanagh 86 | Keefe 87 | Kelly 88 | Kennedy 89 | Lennon 90 | Login 91 | Macclelland 92 | Macdermott 93 | Maceachthighearna 94 | Macfarland 95 | Macghabhann 96 | Maciomhair 97 | Macshuibhne 98 | Madaidhin 99 | Madden 100 | Maguire 101 | Mahoney 102 | Maille 103 | Malone 104 | Manus 105 | Maolmhuaidh 106 | Mathghamhain 107 | Maurice 108 | Mcguire 109 | Mckay 110 | Mclain 111 | Mcmahon 112 | Mcnab 113 | Mcneil 114 | Meadhra 115 | Michael 116 | Milligan 117 | Mochan 118 | Mohan 119 | Molloy 120 | Monahan 121 | Mooney 122 | Muirchertach 123 | Mullen 124 | Mulryan 125 | Murchadh 126 | Murphy 127 | Names 128 | Naoimhin 129 | Naomhan 130 | 
Neil 131 | Neville 132 | Nevin 133 | Niadh 134 | Niall 135 | Nolan 136 | Nuallan 137 | O'Boyle 138 | O'Brien 139 | O'Byrne 140 | O'Donnell 141 | O'Hannagain 142 | O'Hannigain 143 | O'Keefe 144 | O'Mooney 145 | O'Neal 146 | O'Boyle 147 | O'Bree 148 | O'Brian 149 | O'Brien 150 | O'Callaghann 151 | O'Connell 152 | O'Connor 153 | O'Dell 154 | O'Doherty 155 | O'Donnell 156 | O'Donoghue 157 | O'Dowd 158 | O'Driscoll 159 | O'Gorman 160 | O'Grady 161 | O'Hagan 162 | O'Halloran 163 | O'Hanlon 164 | O'Hara 165 | O'Hare 166 | O'Kane 167 | O'Keefe 168 | O'Keeffe 169 | O'Kelly 170 | O'Leary 171 | O'Loughlin 172 | O'Mahoney 173 | O'Mahony 174 | O'Malley 175 | O'Meara 176 | O'Neal 177 | O'Neill 178 | O'Reilly 179 | O'Rourke 180 | O'Ryan 181 | O'Shea 182 | O'Sullivan 183 | O'Toole 184 | Patrick 185 | Peatain 186 | Pharlain 187 | Power 188 | Quigley 189 | Quinn 190 | Quirke 191 | Raghailligh 192 | Reagan 193 | Register 194 | Reilly 195 | Reynold 196 | Rhys 197 | Riagain 198 | Riagan 199 | Riain 200 | Rian 201 | Rinn 202 | Roach 203 | Rodagh 204 | Rory 205 | Ruadh 206 | Ruadhain 207 | Ruadhan 208 | Ruaidh 209 | Samuel 210 | Scolaidhe 211 | Seaghdha 212 | Sechnall 213 | Seighin 214 | Shannon 215 | Sheehy 216 | Simon 217 | Sioda 218 | Sloan 219 | Sluaghadhan 220 | Suaird 221 | Sullivan 222 | Tadhg 223 | Tadhgan 224 | Taidhg 225 | Teagan 226 | Teague 227 | Tighearnach 228 | Tracey 229 | Treasach 230 | Whalen 231 | Whelan 232 | William 233 | -------------------------------------------------------------------------------- /2.advancedTextClassification/data/names/Italian.txt: -------------------------------------------------------------------------------- 1 | Abandonato 2 | Abatangelo 3 | Abatantuono 4 | Abate 5 | Abategiovanni 6 | Abatescianni 7 | Abbà 8 | Abbadelli 9 | Abbascia 10 | Abbatangelo 11 | Abbatantuono 12 | Abbate 13 | Abbatelli 14 | Abbaticchio 15 | Abbiati 16 | Abbracciabene 17 | Abbracciabeni 18 | Abelli 19 | Abelló 20 | Abrami 21 | Abramo 22 | Acardi 23 | Accardi 24 | Accardo 25 | Acciai 26 | Acciaio 27 | Acciaioli 28 | Acconci 29 | Acconcio 30 | Accorsi 31 | Accorso 32 | Accosi 33 | Accursio 34 | Acerbi 35 | Acone 36 | Aconi 37 | Acqua 38 | Acquafredda 39 | Acquarone 40 | Acquati 41 | Adalardi 42 | Adami 43 | Adamo 44 | Adamoli 45 | Addario 46 | Adelardi 47 | Adessi 48 | Adimari 49 | Adriatico 50 | Affini 51 | Africani 52 | Africano 53 | Agani 54 | Aggi 55 | Aggio 56 | Agli 57 | Agnelli 58 | Agnellutti 59 | Agnusdei 60 | Agosti 61 | Agostini 62 | Agresta 63 | Agrioli 64 | Aiello 65 | Aiolfi 66 | Airaldi 67 | Airò 68 | Aita 69 | Ajello 70 | Alagona 71 | Alamanni 72 | Albanesi 73 | Albani 74 | Albano 75 | Alberghi 76 | Alberghini 77 | Alberici 78 | Alberighi 79 | Albero 80 | Albini 81 | Albricci 82 | Albrici 83 | Alcheri 84 | Aldebrandi 85 | Alderisi 86 | Alduino 87 | Alemagna 88 | Aleppo 89 | Alesci 90 | Alescio 91 | Alesi 92 | Alesini 93 | Alesio 94 | Alessandri 95 | Alessi 96 | Alfero 97 | Aliberti 98 | Alinari 99 | Aliprandi 100 | Allegri 101 | Allegro 102 | Alò 103 | Aloia 104 | Aloisi 105 | Altamura 106 | Altimari 107 | Altoviti 108 | Alunni 109 | Amadei 110 | Amadori 111 | Amalberti 112 | Amantea 113 | Amato 114 | Amatore 115 | Ambrogi 116 | Ambrosi 117 | Amello 118 | Amerighi 119 | Amoretto 120 | Angioli 121 | Ansaldi 122 | Anselmetti 123 | Anselmi 124 | Antonelli 125 | Antonini 126 | Antonino 127 | Aquila 128 | Aquino 129 | Arbore 130 | Ardiccioni 131 | Ardizzone 132 | Ardovini 133 | Arena 134 | Aringheri 135 | Arlotti 136 | Armani 137 | Armati 138 | Armonni 139 | Arnolfi 140 | Arnoni 141 
| Arrighetti 142 | Arrighi 143 | Arrigucci 144 | Aucciello 145 | Azzarà 146 | Baggi 147 | Baggio 148 | Baglio 149 | Bagni 150 | Bagnoli 151 | Balboni 152 | Baldi 153 | Baldini 154 | Baldinotti 155 | Baldovini 156 | Bandini 157 | Bandoni 158 | Barbieri 159 | Barone 160 | Barsetti 161 | Bartalotti 162 | Bartolomei 163 | Bartolomeo 164 | Barzetti 165 | Basile 166 | Bassanelli 167 | Bassani 168 | Bassi 169 | Basso 170 | Basurto 171 | Battaglia 172 | Bazzoli 173 | Bellandi 174 | Bellandini 175 | Bellincioni 176 | Bellini 177 | Bello 178 | Bellomi 179 | Belloni 180 | Belluomi 181 | Belmonte 182 | Bencivenni 183 | Benedetti 184 | Benenati 185 | Benetton 186 | Benini 187 | Benivieni 188 | Benvenuti 189 | Berardi 190 | Bergamaschi 191 | Berti 192 | Bertolini 193 | Biancardi 194 | Bianchi 195 | Bicchieri 196 | Biondi 197 | Biondo 198 | Boerio 199 | Bologna 200 | Bondesan 201 | Bonomo 202 | Borghi 203 | Borgnino 204 | Borgogni 205 | Bosco 206 | Bove 207 | Bovér 208 | Boveri 209 | Brambani 210 | Brambilla 211 | Breda 212 | Brioschi 213 | Brivio 214 | Brunetti 215 | Bruno 216 | Buffone 217 | Bulgarelli 218 | Bulgari 219 | Buonarroti 220 | Busto 221 | Caiazzo 222 | Caito 223 | Caivano 224 | Calabrese 225 | Calligaris 226 | Campana 227 | Campo 228 | Cantu 229 | Capello 230 | Capello 231 | Capello 232 | Capitani 233 | Carbone 234 | Carboni 235 | Carideo 236 | Carlevaro 237 | Caro 238 | Carracci 239 | Carrara 240 | Caruso 241 | Cassano 242 | Castro 243 | Catalano 244 | Cattaneo 245 | Cavalcante 246 | Cavallo 247 | Cingolani 248 | Cino 249 | Cipriani 250 | Cisternino 251 | Coiro 252 | Cola 253 | Colombera 254 | Colombo 255 | Columbo 256 | Como 257 | Como 258 | Confortola 259 | Conti 260 | Corna 261 | Corti 262 | Corvi 263 | Costa 264 | Costantini 265 | Costanzo 266 | Cracchiolo 267 | Cremaschi 268 | Cremona 269 | Cremonesi 270 | Crespo 271 | Croce 272 | Crocetti 273 | Cucinotta 274 | Cuocco 275 | Cuoco 276 | D'ambrosio 277 | Damiani 278 | D'amore 279 | D'angelo 280 | D'antonio 281 | De angelis 282 | De campo 283 | De felice 284 | De filippis 285 | De fiore 286 | De laurentis 287 | De luca 288 | De palma 289 | De rege 290 | De santis 291 | De vitis 292 | Di antonio 293 | Di caprio 294 | Di mercurio 295 | Dinapoli 296 | Dioli 297 | Di pasqua 298 | Di pietro 299 | Di stefano 300 | Donati 301 | D'onofrio 302 | Drago 303 | Durante 304 | Elena 305 | Episcopo 306 | Ermacora 307 | Esposito 308 | Evangelista 309 | Fabbri 310 | Fabbro 311 | Falco 312 | Faraldo 313 | Farina 314 | Farro 315 | Fattore 316 | Fausti 317 | Fava 318 | Favero 319 | Fermi 320 | Ferrara 321 | Ferrari 322 | Ferraro 323 | Ferrero 324 | Ferro 325 | Fierro 326 | Filippi 327 | Fini 328 | Fiore 329 | Fiscella 330 | Fiscella 331 | Fonda 332 | Fontana 333 | Fortunato 334 | Franco 335 | Franzese 336 | Furlan 337 | Gabrielli 338 | Gagliardi 339 | Gallo 340 | Ganza 341 | Garfagnini 342 | Garofalo 343 | Gaspari 344 | Gatti 345 | Genovese 346 | Gentile 347 | Germano 348 | Giannino 349 | Gimondi 350 | Giordano 351 | Gismondi 352 | Giùgovaz 353 | Giunta 354 | Goretti 355 | Gori 356 | Greco 357 | Grillo 358 | Grimaldi 359 | Gronchi 360 | Guarneri 361 | Guerra 362 | Guerriero 363 | Guidi 364 | Guttuso 365 | Idoni 366 | Innocenti 367 | Labriola 368 | Làconi 369 | Laganà 370 | Lagomarsìno 371 | Lagorio 372 | Laguardia 373 | Lama 374 | Lamberti 375 | Lamon 376 | Landi 377 | Lando 378 | Landolfi 379 | Laterza 380 | Laurito 381 | Lazzari 382 | Lecce 383 | Leccese 384 | Leggièri 385 | Lèmmi 386 | Leone 387 | Leoni 388 | Lippi 389 | Locatelli 390 | Lombardi 391 | 
Longo 392 | Lupo 393 | Luzzatto 394 | Maestri 395 | Magro 396 | Mancini 397 | Manco 398 | Mancuso 399 | Manfredi 400 | Manfredonia 401 | Mantovani 402 | Marchegiano 403 | Marchesi 404 | Marchetti 405 | Marchioni 406 | Marconi 407 | Mari 408 | Maria 409 | Mariani 410 | Marino 411 | Marmo 412 | Martelli 413 | Martinelli 414 | Masi 415 | Masin 416 | Mazza 417 | Merlo 418 | Messana 419 | Micheli 420 | Milani 421 | Milano 422 | Modugno 423 | Mondadori 424 | Mondo 425 | Montagna 426 | Montana 427 | Montanari 428 | Monte 429 | Monti 430 | Morandi 431 | Morello 432 | Moretti 433 | Morra 434 | Moschella 435 | Mosconi 436 | Motta 437 | Muggia 438 | Muraro 439 | Murgia 440 | Murtas 441 | Nacar 442 | Naggi 443 | Naggia 444 | Naldi 445 | Nana 446 | Nani 447 | Nanni 448 | Nannini 449 | Napoleoni 450 | Napoletani 451 | Napoliello 452 | Nardi 453 | Nardo 454 | Nardovino 455 | Nasato 456 | Nascimbene 457 | Nascimbeni 458 | Natale 459 | Nave 460 | Nazario 461 | Necchi 462 | Negri 463 | Negrini 464 | Nelli 465 | Nenci 466 | Nepi 467 | Neri 468 | Neroni 469 | Nervetti 470 | Nervi 471 | Nespola 472 | Nicastro 473 | Nicchi 474 | Nicodemo 475 | Nicolai 476 | Nicolosi 477 | Nicosia 478 | Nicotera 479 | Nieddu 480 | Nieri 481 | Nigro 482 | Nisi 483 | Nizzola 484 | Noschese 485 | Notaro 486 | Notoriano 487 | Oberti 488 | Oberto 489 | Ongaro 490 | Orlando 491 | Orsini 492 | Pace 493 | Padovan 494 | Padovano 495 | Pagani 496 | Pagano 497 | Palladino 498 | Palmisano 499 | Palumbo 500 | Panzavecchia 501 | Parisi 502 | Parma 503 | Parodi 504 | Parri 505 | Parrino 506 | Passerini 507 | Pastore 508 | Paternoster 509 | Pavesi 510 | Pavone 511 | Pavoni 512 | Pecora 513 | Pedrotti 514 | Pellegrino 515 | Perugia 516 | Pesaresi 517 | Pesaro 518 | Pesce 519 | Petri 520 | Pherigo 521 | Piazza 522 | Piccirillo 523 | Piccoli 524 | Pierno 525 | Pietri 526 | Pini 527 | Piovene 528 | Piraino 529 | Pisani 530 | Pittaluga 531 | Poggi 532 | Poggio 533 | Poletti 534 | Pontecorvo 535 | Portelli 536 | Porto 537 | Portoghese 538 | Potenza 539 | Pozzi 540 | Profeta 541 | Prosdocimi 542 | Provenza 543 | Provenzano 544 | Pugliese 545 | Quaranta 546 | Quattrocchi 547 | Ragno 548 | Raimondi 549 | Rais 550 | Rana 551 | Raneri 552 | Rao 553 | Rapallino 554 | Ratti 555 | Ravenna 556 | Ré 557 | Ricchetti 558 | Ricci 559 | Riggi 560 | Righi 561 | Rinaldi 562 | Riva 563 | Rizzo 564 | Robustelli 565 | Rocca 566 | Rocchi 567 | Rocco 568 | Roma 569 | Roma 570 | Romagna 571 | Romagnoli 572 | Romano 573 | Romano 574 | Romero 575 | Roncalli 576 | Ronchi 577 | Rosa 578 | Rossi 579 | Rossini 580 | Rotolo 581 | Rovigatti 582 | Ruggeri 583 | Russo 584 | Rustici 585 | Ruzzier 586 | Sabbadin 587 | Sacco 588 | Sala 589 | Salomon 590 | Salucci 591 | Salvaggi 592 | Salvai 593 | Salvail 594 | Salvatici 595 | Salvay 596 | Sanna 597 | Sansone 598 | Santini 599 | Santoro 600 | Sapienti 601 | Sarno 602 | Sarti 603 | Sartini 604 | Sarto 605 | Savona 606 | Scarpa 607 | Scarsi 608 | Scavo 609 | Sciacca 610 | Sciacchitano 611 | Sciarra 612 | Scordato 613 | Scotti 614 | Scutese 615 | Sebastiani 616 | Sebastino 617 | Segreti 618 | Selmone 619 | Selvaggio 620 | Serafin 621 | Serafini 622 | Serpico 623 | Sessa 624 | Sgro 625 | Siena 626 | Silvestri 627 | Sinagra 628 | Sinagra 629 | Soldati 630 | Somma 631 | Sordi 632 | Soriano 633 | Sorrentino 634 | Spada 635 | Spanò 636 | Sparacello 637 | Speziale 638 | Spini 639 | Stabile 640 | Stablum 641 | Stilo 642 | Sultana 643 | Tafani 644 | Tamàro 645 | Tamboia 646 | Tanzi 647 | Tarantino 648 | Taverna 649 | Tedesco 650 | Terranova 651 | 
Terzi 652 | Tessaro 653 | Testa 654 | Tiraboschi 655 | Tivoli 656 | Todaro 657 | Toloni 658 | Tornincasa 659 | Toselli 660 | Tosetti 661 | Tosi 662 | Tosto 663 | Trapani 664 | Traversa 665 | Traversi 666 | Traversini 667 | Traverso 668 | Trucco 669 | Trudu 670 | Tumicelli 671 | Turati 672 | Turchi 673 | Uberti 674 | Uccello 675 | Uggeri 676 | Ughi 677 | Ungaretti 678 | Ungaro 679 | Vacca 680 | Vaccaro 681 | Valenti 682 | Valentini 683 | Valerio 684 | Varano 685 | Ventimiglia 686 | Ventura 687 | Verona 688 | Veronesi 689 | Vescovi 690 | Vespa 691 | Vestri 692 | Vicario 693 | Vico 694 | Vigo 695 | Villa 696 | Vinci 697 | Vinci 698 | Viola 699 | Vitali 700 | Viteri 701 | Voltolini 702 | Zambrano 703 | Zanetti 704 | Zangari 705 | Zappa 706 | Zeni 707 | Zini 708 | Zino 709 | Zunino 710 | -------------------------------------------------------------------------------- /2.advancedTextClassification/data/names/Japanese.txt: -------------------------------------------------------------------------------- 1 | Abe 2 | Abukara 3 | Adachi 4 | Aida 5 | Aihara 6 | Aizawa 7 | Ajibana 8 | Akaike 9 | Akamatsu 10 | Akatsuka 11 | Akechi 12 | Akera 13 | Akimoto 14 | Akita 15 | Akiyama 16 | Akutagawa 17 | Amagawa 18 | Amaya 19 | Amori 20 | Anami 21 | Ando 22 | Anzai 23 | Aoki 24 | Arai 25 | Arakawa 26 | Araki 27 | Arakida 28 | Arato 29 | Arihyoshi 30 | Arishima 31 | Arita 32 | Ariwa 33 | Ariwara 34 | Asahara 35 | Asahi 36 | Asai 37 | Asano 38 | Asanuma 39 | Asari 40 | Ashia 41 | Ashida 42 | Ashikaga 43 | Asuhara 44 | Atshushi 45 | Ayabito 46 | Ayugai 47 | Baba 48 | Baisotei 49 | Bando 50 | Bunya 51 | Chiba 52 | Chikamatsu 53 | Chikanatsu 54 | Chino 55 | Chishu 56 | Choshi 57 | Daishi 58 | Dan 59 | Date 60 | Dazai 61 | Deguchi 62 | Deushi 63 | Doi 64 | Ebina 65 | Ebisawa 66 | Eda 67 | Egami 68 | Eguchi 69 | Ekiguchi 70 | Endo 71 | Endoso 72 | Enoki 73 | Enomoto 74 | Erizawa 75 | Eto 76 | Etsuko 77 | Ezakiya 78 | Fuchida 79 | Fugunaga 80 | Fujikage 81 | Fujimaki 82 | Fujimoto 83 | Fujioka 84 | Fujishima 85 | Fujita 86 | Fujiwara 87 | Fukao 88 | Fukayama 89 | Fukuda 90 | Fukumitsu 91 | Fukunaka 92 | Fukuoka 93 | Fukusaku 94 | Fukushima 95 | Fukuyama 96 | Fukuzawa 97 | Fumihiko 98 | Funabashi 99 | Funaki 100 | Funakoshi 101 | Furusawa 102 | Fuschida 103 | Fuse 104 | Futabatei 105 | Fuwa 106 | Gakusha 107 | Genda 108 | Genji 109 | Gensai 110 | Godo 111 | Goto 112 | Gushiken 113 | Hachirobei 114 | Haga 115 | Hagino 116 | Hagiwara 117 | Hama 118 | Hamacho 119 | Hamada 120 | Hamaguchi 121 | Hamamoto 122 | Hanabusa 123 | Hanari 124 | Handa 125 | Hara 126 | Harada 127 | Haruguchi 128 | Hasegawa 129 | Hasekura 130 | Hashimoto 131 | Hasimoto 132 | Hatakeda 133 | Hatakeyama 134 | Hatayama 135 | Hatoyama 136 | Hattori 137 | Hayakawa 138 | Hayami 139 | Hayashi 140 | Hayashida 141 | Hayata 142 | Hayuata 143 | Hida 144 | Hideaki 145 | Hideki 146 | Hideyoshi 147 | Higashikuni 148 | Higashiyama 149 | Higo 150 | Higoshi 151 | Higuchi 152 | Hike 153 | Hino 154 | Hira 155 | Hiraga 156 | Hiraki 157 | Hirano 158 | Hiranuma 159 | Hiraoka 160 | Hirase 161 | Hirasi 162 | Hirata 163 | Hiratasuka 164 | Hirayama 165 | Hiro 166 | Hirose 167 | Hirota 168 | Hiroyuki 169 | Hisamatsu 170 | Hishida 171 | Hishikawa 172 | Hitomi 173 | Hiyama 174 | Hohki 175 | Hojo 176 | Hokusai 177 | Honami 178 | Honda 179 | Hori 180 | Horigome 181 | Horigoshi 182 | Horiuchi 183 | Horri 184 | Hoshino 185 | Hosokawa 186 | Hosokaya 187 | Hotate 188 | Hotta 189 | Hyata 190 | Hyobanshi 191 | Ibi 192 | Ibu 193 | Ibuka 194 | Ichigawa 195 | Ichihara 196 | Ichikawa 197 | 
Ichimonji 198 | Ichiro 199 | Ichisada 200 | Ichiyusai 201 | Idane 202 | Iemochi 203 | Ienari 204 | Iesada 205 | Ieyasu 206 | Ieyoshi 207 | Igarashi 208 | Ihara 209 | Ii 210 | Iida 211 | Iijima 212 | Iitaka 213 | Ijichi 214 | Ijiri 215 | Ikeda 216 | Ikina 217 | Ikoma 218 | Imada 219 | Imagawa 220 | Imai 221 | Imaizumi 222 | Imamura 223 | Imoo 224 | Ina 225 | Inaba 226 | Inao 227 | Inihara 228 | Ino 229 | Inoguchi 230 | Inokuma 231 | Inoue 232 | Inouye 233 | Inukai 234 | Ippitsusai 235 | Irie 236 | Iriye 237 | Isayama 238 | Ise 239 | Iseki 240 | Iseya 241 | Ishibashi 242 | Ishida 243 | Ishiguro 244 | Ishihara 245 | Ishikawa 246 | Ishimaru 247 | Ishimura 248 | Ishinomori 249 | Ishiyama 250 | Isobe 251 | Isoda 252 | Isozaki 253 | Itagaki 254 | Itami 255 | Ito 256 | Itoh 257 | Iwahara 258 | Iwahashi 259 | Iwakura 260 | Iwasa 261 | Iwasaki 262 | Izumi 263 | Jimbo 264 | Jippensha 265 | Jo 266 | Joshuya 267 | Joshuyo 268 | Jukodo 269 | Jumonji 270 | Kada 271 | Kagabu 272 | Kagawa 273 | Kahae 274 | Kahaya 275 | Kaibara 276 | Kaima 277 | Kajahara 278 | Kajitani 279 | Kajiwara 280 | Kajiyama 281 | Kakinomoto 282 | Kakutama 283 | Kamachi 284 | Kamata 285 | Kaminaga 286 | Kamio 287 | Kamioka 288 | Kamisaka 289 | Kamo 290 | Kamon 291 | Kan 292 | Kanada 293 | Kanagaki 294 | Kanegawa 295 | Kaneko 296 | Kanesaka 297 | Kano 298 | Karamorita 299 | Karube 300 | Karubo 301 | Kasahara 302 | Kasai 303 | Kasamatsu 304 | Kasaya 305 | Kase 306 | Kashiwagi 307 | Kasuse 308 | Kataoka 309 | Katayama 310 | Katayanagi 311 | Kate 312 | Kato 313 | Katoaka 314 | Katsu 315 | Katsukawa 316 | Katsumata 317 | Katsura 318 | Katsushika 319 | Kawabata 320 | Kawachi 321 | Kawagichi 322 | Kawagishi 323 | Kawaguchi 324 | Kawai 325 | Kawaii 326 | Kawakami 327 | Kawamata 328 | Kawamura 329 | Kawasaki 330 | Kawasawa 331 | Kawashima 332 | Kawasie 333 | Kawatake 334 | Kawate 335 | Kawayama 336 | Kawazu 337 | Kaza 338 | Kazuyoshi 339 | Kenkyusha 340 | Kenmotsu 341 | Kentaro 342 | Ki 343 | Kido 344 | Kihara 345 | Kijimuta 346 | Kijmuta 347 | Kikkawa 348 | Kikuchi 349 | Kikugawa 350 | Kikui 351 | Kikutake 352 | Kimio 353 | Kimiyama 354 | Kimura 355 | Kinashita 356 | Kinoshita 357 | Kinugasa 358 | Kira 359 | Kishi 360 | Kiski 361 | Kita 362 | Kitabatake 363 | Kitagawa 364 | Kitamura 365 | Kitano 366 | Kitao 367 | Kitoaji 368 | Ko 369 | Kobayashi 370 | Kobi 371 | Kodama 372 | Koga 373 | Kogara 374 | Kogo 375 | Koguchi 376 | Koiso 377 | Koizumi 378 | Kojima 379 | Kokan 380 | Komagata 381 | Komatsu 382 | Komatsuzaki 383 | Komine 384 | Komiya 385 | Komon 386 | Komura 387 | Kon 388 | Konae 389 | Konda 390 | Kondo 391 | Konishi 392 | Kono 393 | Konoe 394 | Koruba 395 | Koshin 396 | Kotara 397 | Kotoku 398 | Koyama 399 | Koyanagi 400 | Kozu 401 | Kubo 402 | Kubota 403 | Kudara 404 | Kudo 405 | Kuga 406 | Kumagae 407 | Kumasaka 408 | Kunda 409 | Kunikida 410 | Kunisada 411 | Kuno 412 | Kunomasu 413 | Kuramochi 414 | Kuramoto 415 | Kurata 416 | Kurkawa 417 | Kurmochi 418 | Kuroda 419 | Kurofuji 420 | Kurogane 421 | Kurohiko 422 | Kuroki 423 | Kurosawa 424 | Kurusu 425 | Kusatsu 426 | Kusonoki 427 | Kusuhara 428 | Kusunoki 429 | Kuwabara 430 | Kwakami 431 | Kyubei 432 | Maeda 433 | Maehata 434 | Maeno 435 | Maita 436 | Makiguchi 437 | Makino 438 | Makioka 439 | Makuda 440 | Marubeni 441 | Marugo 442 | Marusa 443 | Maruya 444 | Maruyama 445 | Masanobu 446 | Masaoka 447 | Mashita 448 | Masoni 449 | Masudu 450 | Masuko 451 | Masuno 452 | Masuzoe 453 | Matano 454 | Matokai 455 | Matoke 456 | Matsuda 457 | Matsukata 458 | Matsuki 459 | Matsumara 460 | 
Matsumoto 461 | Matsumura 462 | Matsuo 463 | Matsuoka 464 | Matsura 465 | Matsushina 466 | Matsushita 467 | Matsuya 468 | Matsuzawa 469 | Mayuzumi 470 | Mazaki 471 | Mazawa 472 | Mazuka 473 | Mifune 474 | Mihashi 475 | Miki 476 | Mimasuya 477 | Minabuchi 478 | Minami 479 | Minamoto 480 | Minatoya 481 | Minobe 482 | Mishima 483 | Mitsubishi 484 | Mitsuharu 485 | Mitsui 486 | Mitsukuri 487 | Mitsuwa 488 | Mitsuya 489 | Mitzusaka 490 | Miura 491 | Miwa 492 | Miyagi 493 | Miyahara 494 | Miyajima 495 | Miyake 496 | Miyamae 497 | Miyamoto 498 | Miyazaki 499 | Miyazawa 500 | Miyoshi 501 | Mizoguchi 502 | Mizumaki 503 | Mizuno 504 | Mizutani 505 | Modegi 506 | Momotami 507 | Momotani 508 | Monomonoi 509 | Mori 510 | Moriguchi 511 | Morimoto 512 | Morinaga 513 | Morioka 514 | Morishita 515 | Morisue 516 | Morita 517 | Morri 518 | Moto 519 | Motoori 520 | Motoyoshi 521 | Munakata 522 | Munkata 523 | Muraguchi 524 | Murakami 525 | Muraoka 526 | Murasaki 527 | Murase 528 | Murata 529 | Murkami 530 | Muro 531 | Muruyama 532 | Mushanaokoji 533 | Mushashibo 534 | Muso 535 | Mutsu 536 | Nagahama 537 | Nagai 538 | Nagano 539 | Nagasawa 540 | Nagase 541 | Nagata 542 | Nagatsuka 543 | Nagumo 544 | Naito 545 | Nakada 546 | Nakadai 547 | Nakadan 548 | Nakae 549 | Nakagawa 550 | Nakahara 551 | Nakajima 552 | Nakamoto 553 | Nakamura 554 | Nakane 555 | Nakanishi 556 | Nakano 557 | Nakanoi 558 | Nakao 559 | Nakasato 560 | Nakasawa 561 | Nakasone 562 | Nakata 563 | Nakatoni 564 | Nakayama 565 | Nakazawa 566 | Namiki 567 | Nanami 568 | Narahashi 569 | Narato 570 | Narita 571 | Nataga 572 | Natsume 573 | Nawabe 574 | Nemoto 575 | Niijima 576 | Nijo 577 | Ninomiya 578 | Nishi 579 | Nishihara 580 | Nishikawa 581 | Nishimoto 582 | Nishimura 583 | Nishimuraya 584 | Nishio 585 | Nishiwaki 586 | Nitta 587 | Nobunaga 588 | Noda 589 | Nogi 590 | Noguchi 591 | Nogushi 592 | Nomura 593 | Nonomura 594 | Noro 595 | Nosaka 596 | Nose 597 | Nozaki 598 | Nozara 599 | Numajiri 600 | Numata 601 | Obata 602 | Obinata 603 | Obuchi 604 | Ochiai 605 | Ochida 606 | Odaka 607 | Ogata 608 | Ogiwara 609 | Ogura 610 | Ogyu 611 | Ohba 612 | Ohira 613 | Ohishi 614 | Ohka 615 | Ohmae 616 | Ohmiya 617 | Oichi 618 | Oinuma 619 | Oishi 620 | Okabe 621 | Okada 622 | Okakura 623 | Okamoto 624 | Okamura 625 | Okanao 626 | Okanaya 627 | Okano 628 | Okasawa 629 | Okawa 630 | Okazaki 631 | Okazawaya 632 | Okimasa 633 | Okimoto 634 | Okita 635 | Okubo 636 | Okuda 637 | Okui 638 | Okuma 639 | Okuma 640 | Okumura 641 | Okura 642 | Omori 643 | Omura 644 | Onishi 645 | Ono 646 | Onoda 647 | Onoe 648 | Onohara 649 | Ooka 650 | Osagawa 651 | Osaragi 652 | Oshima 653 | Oshin 654 | Ota 655 | Otaka 656 | Otake 657 | Otani 658 | Otomo 659 | Otsu 660 | Otsuka 661 | Ouchi 662 | Oyama 663 | Ozaki 664 | Ozawa 665 | Ozu 666 | Raikatuji 667 | Royama 668 | Ryusaki 669 | Sada 670 | Saeki 671 | Saga 672 | Saigo 673 | Saiki 674 | Saionji 675 | Saito 676 | Saitoh 677 | Saji 678 | Sakagami 679 | Sakai 680 | Sakakibara 681 | Sakamoto 682 | Sakanoue 683 | Sakata 684 | Sakiyurai 685 | Sakoda 686 | Sakubara 687 | Sakuraba 688 | Sakurai 689 | Sammiya 690 | Sanda 691 | Sanjo 692 | Sano 693 | Santo 694 | Saromi 695 | Sarumara 696 | Sasada 697 | Sasakawa 698 | Sasaki 699 | Sassa 700 | Satake 701 | Sato 702 | Satoh 703 | Satoya 704 | Sawamatsu 705 | Sawamura 706 | Sayuki 707 | Segawa 708 | Sekigawa 709 | Sekine 710 | Sekozawa 711 | Sen 712 | Senmatsu 713 | Seo 714 | Serizawa 715 | Shiba 716 | Shibaguchi 717 | Shibanuma 718 | Shibasaki 719 | Shibasawa 720 | Shibata 721 | Shibukji 722 | 
Shichirobei 723 | Shidehara 724 | Shiga 725 | Shiganori 726 | Shige 727 | Shigeki 728 | Shigemitsu 729 | Shigi 730 | Shikitei 731 | Shikuk 732 | Shima 733 | Shimada 734 | Shimakage 735 | Shimamura 736 | Shimanouchi 737 | Shimaoka 738 | Shimazaki 739 | Shimazu 740 | Shimedzu 741 | Shimizu 742 | Shimohira 743 | Shimon 744 | Shimura 745 | Shimuzu 746 | Shinko 747 | Shinozaki 748 | Shinozuka 749 | Shintaro 750 | Shiokawa 751 | Shiomi 752 | Shiomiya 753 | Shionoya 754 | Shiotani 755 | Shioya 756 | Shirahata 757 | Shirai 758 | Shiraishi 759 | Shirane 760 | Shirasu 761 | Shiratori 762 | Shirokawa 763 | Shiroyama 764 | Shiskikura 765 | Shizuma 766 | Shobo 767 | Shoda 768 | Shunji 769 | Shunsen 770 | Siagyo 771 | Soga 772 | Sohda 773 | Soho 774 | Soma 775 | Someya 776 | Sone 777 | Sonoda 778 | Soseki 779 | Sotomura 780 | Suenami 781 | Sugai 782 | Sugase 783 | Sugawara 784 | Sugihara 785 | Sugimura 786 | Sugisata 787 | Sugita 788 | Sugitani 789 | Sugiyama 790 | Sumitimo 791 | Sunada 792 | Suzambo 793 | Suzuki 794 | Tabuchi 795 | Tadeshi 796 | Tagawa 797 | Taguchi 798 | Taira 799 | Taka 800 | Takabe 801 | Takagaki 802 | Takagawa 803 | Takagi 804 | Takahama 805 | Takahashi 806 | Takaki 807 | Takamura 808 | Takano 809 | Takaoka 810 | Takara 811 | Takarabe 812 | Takashi 813 | Takashita 814 | Takasu 815 | Takasugi 816 | Takayama 817 | Takecare 818 | Takeda 819 | Takei 820 | Takekawa 821 | Takemago 822 | Takemitsu 823 | Takemura 824 | Takenouchi 825 | Takeshita 826 | Taketomo 827 | Takeuchi 828 | Takewaki 829 | Takimoto 830 | Takishida 831 | Takishita 832 | Takizawa 833 | Taku 834 | Takudo 835 | Takudome 836 | Tamazaki 837 | Tamura 838 | Tamuro 839 | Tanaka 840 | Tange 841 | Tani 842 | Taniguchi 843 | Tanizaki 844 | Tankoshitsu 845 | Tansho 846 | Tanuma 847 | Tarumi 848 | Tatenaka 849 | Tatsuko 850 | Tatsuno 851 | Tatsuya 852 | Tawaraya 853 | Tayama 854 | Temko 855 | Tenshin 856 | Terada 857 | Terajima 858 | Terakado 859 | Terauchi 860 | Teshigahara 861 | Teshima 862 | Tochikura 863 | Togo 864 | Tojo 865 | Tokaji 866 | Tokuda 867 | Tokudome 868 | Tokuoka 869 | Tomika 870 | Tomimoto 871 | Tomioka 872 | Tommii 873 | Tomonaga 874 | Tomori 875 | Tono 876 | Torii 877 | Torisei 878 | Toru 879 | Toshishai 880 | Toshitala 881 | Toshusai 882 | Toyama 883 | Toyoda 884 | Toyoshima 885 | Toyota 886 | Toyotomi 887 | Tsubouchi 888 | Tsucgimoto 889 | Tsuchie 890 | Tsuda 891 | Tsuji 892 | Tsujimoto 893 | Tsujimura 894 | Tsukada 895 | Tsukade 896 | Tsukahara 897 | Tsukamoto 898 | Tsukatani 899 | Tsukawaki 900 | Tsukehara 901 | Tsukioka 902 | Tsumemasa 903 | Tsumura 904 | Tsunoda 905 | Tsurimi 906 | Tsuruga 907 | Tsuruya 908 | Tsushima 909 | Tsutaya 910 | Tsutomu 911 | Uboshita 912 | Uchida 913 | Uchiyama 914 | Ueda 915 | Uehara 916 | Uemura 917 | Ueshima 918 | Uesugi 919 | Uetake 920 | Ugaki 921 | Ui 922 | Ukiyo 923 | Umari 924 | Umehara 925 | Umeki 926 | Uno 927 | Uoya 928 | Urogataya 929 | Usami 930 | Ushiba 931 | Utagawa 932 | Wakai 933 | Wakatsuki 934 | Watabe 935 | Watanabe 936 | Watari 937 | Watnabe 938 | Watoga 939 | Yakuta 940 | Yamabe 941 | Yamada 942 | Yamagata 943 | Yamaguchi 944 | Yamaguchiya 945 | Yamaha 946 | Yamahata 947 | Yamakage 948 | Yamakawa 949 | Yamakazi 950 | Yamamoto 951 | Yamamura 952 | Yamana 953 | Yamanaka 954 | Yamanouchi 955 | Yamanoue 956 | Yamaoka 957 | Yamashita 958 | Yamato 959 | Yamawaki 960 | Yamazaki 961 | Yamhata 962 | Yamura 963 | Yanagawa 964 | Yanagi 965 | Yanagimoto 966 | Yanagita 967 | Yano 968 | Yasuda 969 | Yasuhiro 970 | Yasui 971 | Yasujiro 972 | Yasukawa 973 | Yasutake 974 | 
Yoemon 975 | Yokokawa 976 | Yokoyama 977 | Yonai 978 | Yosano 979 | Yoshida 980 | Yoshifumi 981 | Yoshihara 982 | Yoshikawa 983 | Yoshimatsu 984 | Yoshinobu 985 | Yoshioka 986 | Yoshitomi 987 | Yoshizaki 988 | Yoshizawa 989 | Yuasa 990 | Yuhara 991 | Yunokawa 992 | -------------------------------------------------------------------------------- /2.advancedTextClassification/data/names/Korean.txt: -------------------------------------------------------------------------------- 1 | Ahn 2 | Baik 3 | Bang 4 | Byon 5 | Cha 6 | Chang 7 | Chi 8 | Chin 9 | Cho 10 | Choe 11 | Choi 12 | Chong 13 | Chou 14 | Chu 15 | Chun 16 | Chung 17 | Chweh 18 | Gil 19 | Gu 20 | Gwang 21 | Ha 22 | Han 23 | Ho 24 | Hong 25 | Hung 26 | Hwang 27 | Hyun 28 | Jang 29 | Jeon 30 | Jeong 31 | Jo 32 | Jon 33 | Jong 34 | Jung 35 | Kang 36 | Kim 37 | Ko 38 | Koo 39 | Ku 40 | Kwak 41 | Kwang 42 | Lee 43 | Li 44 | Lim 45 | Ma 46 | Mo 47 | Moon 48 | Nam 49 | Ngai 50 | Noh 51 | Oh 52 | Pae 53 | Pak 54 | Park 55 | Ra 56 | Rhee 57 | Rheem 58 | Ri 59 | Rim 60 | Ron 61 | Ryom 62 | Ryoo 63 | Ryu 64 | San 65 | Seo 66 | Seok 67 | Shim 68 | Shin 69 | Shon 70 | Si 71 | Sin 72 | So 73 | Son 74 | Song 75 | Sook 76 | Suh 77 | Suk 78 | Sun 79 | Sung 80 | Tsai 81 | Wang 82 | Woo 83 | Yang 84 | Yeo 85 | Yeon 86 | Yi 87 | Yim 88 | Yoo 89 | Yoon 90 | You 91 | Youj 92 | Youn 93 | Yu 94 | Yun 95 | -------------------------------------------------------------------------------- /2.advancedTextClassification/data/names/Polish.txt: -------------------------------------------------------------------------------- 1 | Adamczak 2 | Adamczyk 3 | Andrysiak 4 | Auttenberg 5 | Bartosz 6 | Bernard 7 | Bobienski 8 | Bosko 9 | Broż 10 | Brzezicki 11 | Budny 12 | Bukoski 13 | Bukowski 14 | Chlebek 15 | Chmiel 16 | Czajka 17 | Czajkowski 18 | Dubanowski 19 | Dubicki 20 | Dunajski 21 | Dziedzic 22 | Fabian 23 | Filipek 24 | Filipowski 25 | Gajos 26 | Gniewek 27 | Gomolka 28 | Gomulka 29 | Gorecki 30 | Górka 31 | Górski 32 | Grzeskiewicz 33 | Gwozdek 34 | Jagoda 35 | Janda 36 | Janowski 37 | Jaskolski 38 | Jaskulski 39 | Jedynak 40 | Jelen 41 | Jez 42 | Jordan 43 | Kaczka 44 | Kaluza 45 | Kamiński 46 | Kasprzak 47 | Kava 48 | Kedzierski 49 | Kijek 50 | Klimek 51 | Kosmatka 52 | Kowalczyk 53 | Kowalski 54 | Koziol 55 | Kozlow 56 | Kozlowski 57 | Krakowski 58 | Król 59 | Kumiega 60 | Lawniczak 61 | Lis 62 | Majewski 63 | Malinowski 64 | Maly 65 | Marek 66 | Marszałek 67 | Maslanka 68 | Mencher 69 | Miazga 70 | Michel 71 | Mikolajczak 72 | Mozdzierz 73 | Niemczyk 74 | Niemec 75 | Nosek 76 | Nowak 77 | Pakulski 78 | Pasternack 79 | Pasternak 80 | Paszek 81 | Piatek 82 | Piontek 83 | Pokorny 84 | Poplawski 85 | Róg 86 | Rudaski 87 | Rudawski 88 | Rusnak 89 | Rutkowski 90 | Sadowski 91 | Salomon 92 | Serafin 93 | Sienkiewicz 94 | Sierzant 95 | Sitko 96 | Skala 97 | Slaski 98 | Ślązak 99 | Ślusarczyk 100 | Ślusarski 101 | Smolák 102 | Sniegowski 103 | Sobol 104 | Sokal 105 | Sokolof 106 | Sokoloff 107 | Sokolofsky 108 | Sokolowski 109 | Sokolsky 110 | Sówka 111 | Stanek 112 | Starek 113 | Stawski 114 | Stolarz 115 | Szczepanski 116 | Szewc 117 | Szwarc 118 | Szweda 119 | Szwedko 120 | Walentowicz 121 | Warszawski 122 | Wawrzaszek 123 | Wiater 124 | Winograd 125 | Winogrodzki 126 | Wojda 127 | Wojewódka 128 | Wojewódzki 129 | Wronski 130 | Wyrick 131 | Wyrzyk 132 | Zabek 133 | Zawisza 134 | Zdunowski 135 | Zdunowski 136 | Zielinski 137 | Ziemniak 138 | Zientek 139 | Żuraw 140 | -------------------------------------------------------------------------------- 
/2.advancedTextClassification/data/names/Portuguese.txt: -------------------------------------------------------------------------------- 1 | Abreu 2 | Albuquerque 3 | Almeida 4 | Alves 5 | Araújo 6 | Araullo 7 | Barros 8 | Basurto 9 | Belo 10 | Cabral 11 | Campos 12 | Cardozo 13 | Castro 14 | Coelho 15 | Costa 16 | Crespo 17 | Cruz 18 | D'cruz 19 | D'cruze 20 | Delgado 21 | De santigo 22 | Duarte 23 | Estéves 24 | Fernandes 25 | Ferreira 26 | Ferreiro 27 | Ferro 28 | Fonseca 29 | Franco 30 | Freitas 31 | Garcia 32 | Gaspar 33 | Gomes 34 | Gouveia 35 | Guerra 36 | Henriques 37 | Lobo 38 | Machado 39 | Madeira 40 | Magalhães 41 | Maria 42 | Mata 43 | Mateus 44 | Matos 45 | Medeiros 46 | Melo 47 | Mendes 48 | Moreno 49 | Nunes 50 | Palmeiro 51 | Paredes 52 | Pereira 53 | Pinheiro 54 | Pinho 55 | Ramires 56 | Ribeiro 57 | Rios 58 | Rocha 59 | Rodrigues 60 | Romão 61 | Rosario 62 | Salazar 63 | Santana 64 | Santiago 65 | Santos 66 | Serafim 67 | Silva 68 | Silveira 69 | Simões 70 | Soares 71 | Souza 72 | Torres 73 | Vargas 74 | Ventura 75 | -------------------------------------------------------------------------------- /2.advancedTextClassification/data/names/Scottish.txt: -------------------------------------------------------------------------------- 1 | Smith 2 | Brown 3 | Wilson 4 | Campbell 5 | Stewart 6 | Thomson 7 | Robertson 8 | Anderson 9 | Macdonald 10 | Scott 11 | Reid 12 | Murray 13 | Taylor 14 | Clark 15 | Ross 16 | Watson 17 | Morrison 18 | Paterson 19 | Young 20 | Mitchell 21 | Walker 22 | Fraser 23 | Miller 24 | Mcdonald 25 | Gray 26 | Henderson 27 | Hamilton 28 | Johnston 29 | Duncan 30 | Graham 31 | Ferguson 32 | Kerr 33 | Davidson 34 | Bell 35 | Cameron 36 | Kelly 37 | Martin 38 | Hunter 39 | Allan 40 | Mackenzie 41 | Grant 42 | Simpson 43 | Mackay 44 | Mclean 45 | Macleod 46 | Black 47 | Russell 48 | Marshall 49 | Wallace 50 | Gibson 51 | Kennedy 52 | Gordon 53 | Burns 54 | Sutherland 55 | Stevenson 56 | Munro 57 | Milne 58 | Watt 59 | Murphy 60 | Craig 61 | Wood 62 | Muir 63 | Wright 64 | Mckenzie 65 | Ritchie 66 | Johnstone 67 | Sinclair 68 | White 69 | Mcmillan 70 | Williamson 71 | Dickson 72 | Hughes 73 | Cunningham 74 | Mckay 75 | Bruce 76 | Millar 77 | Crawford 78 | Mcintosh 79 | Douglas 80 | Docherty 81 | King 82 | Jones 83 | Boyle 84 | Fleming 85 | Mcgregor 86 | Aitken 87 | Christie 88 | Shaw 89 | Maclean 90 | Jamieson 91 | Mcintyre 92 | Hay 93 | Lindsay 94 | Alexander 95 | Ramsay 96 | Mccallum 97 | Whyte 98 | Jackson 99 | Mclaughlin 100 | Hill 101 | -------------------------------------------------------------------------------- /2.advancedTextClassification/data/names/Spanish.txt: -------------------------------------------------------------------------------- 1 | Abana 2 | Abano 3 | Abarca 4 | Abaroa 5 | Abascal 6 | Abasolo 7 | Abel 8 | Abelló 9 | Aberquero 10 | Abreu 11 | Acosta 12 | Agramunt 13 | Aiza 14 | Alamilla 15 | Albert 16 | Albuquerque 17 | Aldana 18 | Alfaro 19 | Alvarado 20 | Álvarez 21 | Alves 22 | Amador 23 | Andreu 24 | Antúnez 25 | Aqua 26 | Aquino 27 | Araújo 28 | Araullo 29 | Araya 30 | Arce 31 | Arechavaleta 32 | Arena 33 | Aritza 34 | Armando 35 | Arreola 36 | Arriola 37 | Asis 38 | Asturias 39 | Avana 40 | Azarola 41 | Banderas 42 | Barros 43 | Basurto 44 | Bautista 45 | Bello 46 | Belmonte 47 | Bengochea 48 | Benitez 49 | Bermúdez 50 | Blanco 51 | Blanxart 52 | Bolívar 53 | Bonaventura 54 | Bosque 55 | Bustillo 56 | Busto 57 | Bustos 58 | Cabello 59 | Cabrera 60 | Campo 61 | Campos 62 | Capello 63 | Cardona 64 | Caro 65 | Casales 66 | 
Castell 67 | Castellano 68 | Castillion 69 | Castillo 70 | Castro 71 | Chavarría 72 | Chavez 73 | Colón 74 | Costa 75 | Crespo 76 | Cruz 77 | Cuéllar 78 | Cuevas 79 | D'cruz 80 | D'cruze 81 | De la cruz 82 | De la fuente 83 | Del bosque 84 | De leon 85 | Delgado 86 | Del olmo 87 | De santigo 88 | Díaz 89 | Dominguez 90 | Duarte 91 | Durante 92 | Echevarría 93 | Echeverría 94 | Elizondo 95 | Escamilla 96 | Escárcega 97 | Escarrà 98 | Esparza 99 | Espina 100 | Espino 101 | Espinosa 102 | Espinoza 103 | Estévez 104 | Etxebarria 105 | Etxeberria 106 | Félix 107 | Fernández 108 | Ferrer 109 | Fierro 110 | Flores 111 | Fonseca 112 | Franco 113 | Fuentes 114 | Gallego 115 | Gallo 116 | García 117 | Garrastazu 118 | Garza 119 | Gaspar 120 | Gebara 121 | Gomez 122 | Gonzales 123 | Gonzalez 124 | Grec 125 | Guadarrama 126 | Guerra 127 | Guerrero 128 | Gutiérrez 129 | Gutierrez 130 | Hernandez 131 | Herrera 132 | Herrero 133 | Hierro 134 | Holguín 135 | Huerta 136 | Ibáñez 137 | Ibarra 138 | Iñíguez 139 | Iturburua 140 | Jaso 141 | Jasso 142 | Jimenez 143 | Jordà 144 | Juárez 145 | Lobo 146 | Lopez 147 | Losa 148 | Loyola 149 | Machado 150 | Macías 151 | Maradona 152 | María 153 | Marino 154 | Márquez 155 | Martell 156 | Martí 157 | Martínez 158 | Martinez 159 | Mas 160 | Mata 161 | Mateu 162 | Medina 163 | Melendez 164 | Méndez 165 | Mendoza 166 | Menendez 167 | Merlo 168 | Michel 169 | Mingo 170 | Moles 171 | Molina 172 | Montero 173 | Morales 174 | Moralez 175 | Moreno 176 | Narváez 177 | Nieves 178 | Noguerra 179 | Núñez 180 | Obando 181 | Ochoa 182 | Ojeda 183 | Ola 184 | Oleastro 185 | Olguin 186 | Oliver 187 | Olmos 188 | Oquendo 189 | Orellana 190 | Oriol 191 | Ortega 192 | Ortiz 193 | Palomo 194 | Paredes 195 | Pavia 196 | Peláez 197 | Peña 198 | Pérez 199 | Perez 200 | Petit 201 | Picasso 202 | Porra 203 | Porras 204 | Prieto 205 | Puerta 206 | Puga 207 | Puig 208 | Quinones 209 | Quintana 210 | Quirós 211 | Ramírez 212 | Ramos 213 | Rana 214 | Rendón 215 | Rey 216 | Reyes 217 | Rios 218 | Rivera 219 | Rivero 220 | Robledo 221 | Robles 222 | Rocha 223 | Rodríguez 224 | Rodriquez 225 | Roig 226 | Rojas 227 | Rojo 228 | Roldán 229 | Romà 230 | Romà 231 | Romero 232 | Rosa 233 | Rosales 234 | Rubio 235 | Ruiz 236 | Sala 237 | Salamanca 238 | Salazar 239 | Salcedo 240 | Salinas 241 | Sanchez 242 | Sandoval 243 | San nicolas 244 | Santana 245 | Santiago 246 | Santillian 247 | Santos 248 | Sastre 249 | Sepúlveda 250 | Sierra 251 | Silva 252 | Soler 253 | Solo 254 | Solos 255 | Soto 256 | Suárez 257 | Suero 258 | Tapia 259 | Terrazas 260 | Tomàs 261 | Torres 262 | Tos 263 | Tosell 264 | Toset 265 | Travieso 266 | Trujillo 267 | Ubina 268 | Urbina 269 | Ureña 270 | Valdez 271 | Valencia 272 | Varela 273 | Vargas 274 | Vásquez 275 | Vázquez 276 | Vega 277 | Vela 278 | Vela 279 | Velazquez 280 | Ventura 281 | Vicario 282 | Vilaró 283 | Villa 284 | Villalobos 285 | Villanueva 286 | Villaverde 287 | Viola 288 | Viteri 289 | Vivas 290 | Vives 291 | Ybarra 292 | Zabala 293 | Zambrano 294 | Zamorano 295 | Zapatero 296 | Zavala 297 | Zubizarreta 298 | Zuñiga 299 | -------------------------------------------------------------------------------- /2.advancedTextClassification/data/names/Vietnamese.txt: -------------------------------------------------------------------------------- 1 | Nguyen 2 | Tron 3 | Le 4 | Pham 5 | Huynh 6 | Hoang 7 | Phan 8 | Vu 9 | Vo 10 | Dang 11 | Bui 12 | Do 13 | Ho 14 | Ngo 15 | Duong 16 | Ly 17 | An 18 | an 19 | Bach 20 | Banh 21 | Cao 22 | Chau 23 | Chu 24 | Chung 25 | Chu 26 | 
Diep 27 | Doan 28 | Dam 29 | Dao 30 | Dinh 31 | Doan 32 | Giang 33 | Ha 34 | Han 35 | Kieu 36 | Kim 37 | La 38 | Lac 39 | Lam 40 | Lieu 41 | Luc 42 | Luong 43 | Luu 44 | Ma 45 | Mach 46 | Mai 47 | Nghiem 48 | Phi 49 | Pho 50 | Phung 51 | Quach 52 | Quang 53 | Quyen 54 | Ta 55 | Thach 56 | Thai 57 | Sai 58 | Thi 59 | Than 60 | Thao 61 | Thuy 62 | Tieu 63 | To 64 | Ton 65 | Tong 66 | Trang 67 | Trieu 68 | Trinh 69 | Truong 70 | Van 71 | Vinh 72 | Vuong 73 | Vuu 74 | -------------------------------------------------------------------------------- /2.advancedTextClassification/data/question-classif-data/train_1000.label: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JesseYule/NLPBeginner/175762fcbf40f84c6900dcf7453f865dffe0faf6/2.advancedTextClassification/data/question-classif-data/train_1000.label -------------------------------------------------------------------------------- /2.advancedTextClassification/result/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JesseYule/NLPBeginner/175762fcbf40f84c6900dcf7453f865dffe0faf6/2.advancedTextClassification/result/plot.png -------------------------------------------------------------------------------- /2.advancedTextClassification/result/plot2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JesseYule/NLPBeginner/175762fcbf40f84c6900dcf7453f865dffe0faf6/2.advancedTextClassification/result/plot2.png -------------------------------------------------------------------------------- /2.advancedTextClassification/result/plot3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JesseYule/NLPBeginner/175762fcbf40f84c6900dcf7453f865dffe0faf6/2.advancedTextClassification/result/plot3.png -------------------------------------------------------------------------------- /3.textMatching(ESIM)/README.md: -------------------------------------------------------------------------------- 1 | ### 任务三:基于注意力机制的文本匹配 2 | 3 | 输入两个句子判断,判断它们之间的关系 4 | 5 | 1. 数据集: 6 | 7 | * https://nlp.stanford.edu/projects/snli/ 8 | 9 | * https://www.nyu.edu/projects/bowman/multinli/ 10 | 11 | 2. 知识点: 12 | 13 | 1. [LSTM]() 14 | 2. [seq2seq]() 15 | 3. [注意力机制]() 16 | 4. 
[ESIM]() 17 | 18 | ### 代码说明 19 | 20 | ​ 本次实验主要是基于SNLI和MultiNLI这两个语料库进行的文本匹配,关于这两个语料库的说明和实验目的在网址中有详细介绍。 21 | 22 | ​ 实验主要是利用ESIM模型进行的文本匹配,关于ESIM模型的详细介绍分析在知识点中我也总结了,但是我仍然强烈建议阅读papers中的几篇论文,它们都介绍了如何在文本匹配问题中应用ESIM。 23 | 24 | ​ 另外,代码被我大幅度简化,主要是为了可以清晰地研究整个模型的思路,完整的代码请看: 25 | 26 | ​ 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /3.textMatching(ESIM)/papers/A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JesseYule/NLPBeginner/175762fcbf40f84c6900dcf7453f865dffe0faf6/3.textMatching(ESIM)/papers/A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference.pdf -------------------------------------------------------------------------------- /3.textMatching(ESIM)/papers/Enhanced LSTM for Natural Language Inference.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JesseYule/NLPBeginner/175762fcbf40f84c6900dcf7453f865dffe0faf6/3.textMatching(ESIM)/papers/Enhanced LSTM for Natural Language Inference.pdf -------------------------------------------------------------------------------- /3.textMatching(ESIM)/papers/Sequential Attention-based Network for Noetic End-to-End Response Selection.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JesseYule/NLPBeginner/175762fcbf40f84c6900dcf7453f865dffe0faf6/3.textMatching(ESIM)/papers/Sequential Attention-based Network for Noetic End-to-End Response Selection.pdf -------------------------------------------------------------------------------- /3.textMatching(ESIM)/python/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JesseYule/NLPBeginner/175762fcbf40f84c6900dcf7453f865dffe0faf6/3.textMatching(ESIM)/python/__init__.py -------------------------------------------------------------------------------- /3.textMatching(ESIM)/python/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JesseYule/NLPBeginner/175762fcbf40f84c6900dcf7453f865dffe0faf6/3.textMatching(ESIM)/python/models/__init__.py -------------------------------------------------------------------------------- /3.textMatching(ESIM)/python/models/esim.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from util import blocks 3 | 4 | 5 | class MyModel(object): 6 | def __init__(self, seq_length, emb_dim, hidden_dim, embeddings, emb_train): 7 | ## Define hyperparameters 8 | self.embedding_dim = emb_dim 9 | self.dim = hidden_dim 10 | self.sequence_length = seq_length 11 | 12 | ## Define the placeholders 13 | self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length]) 14 | self.hypothesis_x = tf.placeholder(tf.int32, [None, self.sequence_length]) 15 | self.y = tf.placeholder(tf.int32, [None]) 16 | self.keep_rate_ph = tf.placeholder(tf.float32, []) 17 | 18 | ## Define parameters 19 | self.E = tf.Variable(embeddings, trainable=emb_train) 20 | 21 | self.W_mlp = tf.Variable(tf.random_normal([self.dim * 8, self.dim], stddev=0.1)) 22 | self.b_mlp = tf.Variable(tf.random_normal([self.dim], stddev=0.1)) 23 | 24 | self.W_cl = tf.Variable(tf.random_normal([self.dim, 3], stddev=0.1)) 25 | self.b_cl = 
tf.Variable(tf.random_normal([3], stddev=0.1)) 26 | 27 | # Function for embedding lookup and dropout at embedding layer 28 | # dropout就是忽略部分特征检测器(让部分隐层节点值为0) 29 | def emb_drop(x): 30 | emb = tf.nn.embedding_lookup(self.E, x) 31 | emb_drop = tf.nn.dropout(emb, self.keep_rate_ph) 32 | return emb_drop 33 | 34 | # Get lengths of unpadded sentences 35 | prem_seq_lengths, mask_prem = blocks.length(self.premise_x) 36 | hyp_seq_lengths, mask_hyp = blocks.length(self.hypothesis_x) 37 | 38 | # ————————————————————————input encoding阶段——————————————————————————————- 39 | 40 | premise_in = emb_drop(self.premise_x) 41 | hypothesis_in = emb_drop(self.hypothesis_x) 42 | 43 | # 通过BiLSTM重新学习单词和上下文的关系 44 | premise_outs, c1 = blocks.biLSTM(premise_in, dim=self.dim, seq_len=prem_seq_lengths, name='premise') 45 | hypothesis_outs, c2 = blocks.biLSTM(hypothesis_in, dim=self.dim, seq_len=hyp_seq_lengths, name='hypothesis') 46 | print('premise_outs: ', premise_outs) 47 | 48 | premise_bi = tf.concat(premise_outs, axis=2) 49 | hypothesis_bi = tf.concat(hypothesis_outs, axis=2) 50 | 51 | premise_list = tf.unstack(premise_bi, axis=1) 52 | hypothesis_list = tf.unstack(hypothesis_bi, axis=1) 53 | print('hypothesis_list: ', hypothesis_list) 54 | 55 | # 注意力机制 56 | scores_all = [] 57 | premise_attn = [] 58 | alphas = [] 59 | 60 | for i in range(self.sequence_length): 61 | 62 | scores_i_list = [] 63 | for j in range(self.sequence_length): 64 | # 计算第一个句子(premise)的第i个单词和第二个句子所有单词的相似度(向量乘积) 65 | # 这里的score就是论文里面的e 66 | score_ij = tf.reduce_sum(tf.multiply(premise_list[i], hypothesis_list[j]), 1, keep_dims=True) 67 | scores_i_list.append(score_ij) 68 | 69 | scores_i = tf.stack(scores_i_list, axis=1) 70 | alpha_i = blocks.masked_softmax(scores_i, mask_hyp) # 通过softmax标准化转换成权重 71 | a_tilde_i = tf.reduce_sum(tf.multiply(alpha_i, hypothesis_bi), 1) # 这里就是用句子b的各个词向量根据权重去表示句子a的第i个词向量 72 | premise_attn.append(a_tilde_i) 73 | 74 | scores_all.append(scores_i) 75 | alphas.append(alpha_i) 76 | 77 | # 把scores的结构转为list 78 | scores_stack = tf.stack(scores_all, axis=2) 79 | scores_list = tf.unstack(scores_stack, axis=1) 80 | 81 | # 对句子b也重复上面的过程 82 | hypothesis_attn = [] 83 | betas = [] 84 | for j in range(self.sequence_length): 85 | scores_j = scores_list[j] 86 | beta_j = blocks.masked_softmax(scores_j, mask_prem) 87 | b_tilde_j = tf.reduce_sum(tf.multiply(beta_j, premise_bi), 1) 88 | hypothesis_attn.append(b_tilde_j) 89 | 90 | betas.append(beta_j) 91 | 92 | # Make attention-weighted sentence representations into one tensor, 93 | premise_attns = tf.stack(premise_attn, axis=1) 94 | hypothesis_attns = tf.stack(hypothesis_attn, axis=1) 95 | 96 | # For making attention plots, 97 | self.alpha_s = tf.stack(alphas, axis=2) 98 | self.beta_s = tf.stack(betas, axis=2) 99 | 100 | # Enhancement of local inference information 101 | # 下面就是分析差异的过程 102 | prem_diff = tf.subtract(premise_bi, premise_attns) 103 | prem_mul = tf.multiply(premise_bi, premise_attns) 104 | hyp_diff = tf.subtract(hypothesis_bi, hypothesis_attns) 105 | hyp_mul = tf.multiply(hypothesis_bi, hypothesis_attns) 106 | 107 | m_a = tf.concat([premise_bi, premise_attns, prem_diff, prem_mul], 2) 108 | m_b = tf.concat([hypothesis_bi, hypothesis_attns, hyp_diff, hyp_mul], 2) 109 | 110 | # Inference Composition 111 | # 用BiLSTM分析overall inference relationship between a premise and hypothesis 112 | v1_outs, c3 = blocks.biLSTM(m_a, dim=self.dim, seq_len=prem_seq_lengths, name='v1') 113 | v2_outs, c4 = blocks.biLSTM(m_b, dim=self.dim, seq_len=hyp_seq_lengths, name='v2') 114 | 115 | v1_bi = 
tf.concat(v1_outs, axis=2) 116 | v2_bi = tf.concat(v2_outs, axis=2) 117 | 118 | # Pooling Layer 119 | v_1_sum = tf.reduce_sum(v1_bi, 1) 120 | v_1_ave = tf.div(v_1_sum, tf.expand_dims(tf.cast(prem_seq_lengths, tf.float32), -1)) 121 | 122 | v_2_sum = tf.reduce_sum(v2_bi, 1) 123 | v_2_ave = tf.div(v_2_sum, tf.expand_dims(tf.cast(hyp_seq_lengths, tf.float32), -1)) 124 | 125 | v_1_max = tf.reduce_max(v1_bi, 1) 126 | v_2_max = tf.reduce_max(v2_bi, 1) 127 | 128 | v = tf.concat([v_1_ave, v_2_ave, v_1_max, v_2_max], 1) 129 | 130 | # 最后用MLP layer做分类 131 | h_mlp = tf.nn.tanh(tf.matmul(v, self.W_mlp) + self.b_mlp) 132 | 133 | # Dropout applied to classifier 134 | h_drop = tf.nn.dropout(h_mlp, self.keep_rate_ph) 135 | 136 | # Get prediction 137 | self.logits = tf.matmul(h_drop, self.W_cl) + self.b_cl 138 | 139 | # Define the cost function 140 | self.total_cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y, logits=self.logits)) 141 | -------------------------------------------------------------------------------- /3.textMatching(ESIM)/python/train_mnli.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import os 3 | import importlib 4 | import random 5 | from util import logger 6 | import util.parameters as params 7 | from util.data_processing import * 8 | from util.evaluate import * 9 | import numpy as np 10 | 11 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 12 | 13 | # parameters主要设置模型相关的参数 14 | # FIXED_PARAMETERS就是一个字典,需要就从里面取出参数 15 | FIXED_PARAMETERS = params.load_parameters() 16 | modname = FIXED_PARAMETERS["model_name"] 17 | logpath = os.path.join(FIXED_PARAMETERS["log_path"], modname) + ".log" 18 | logger = logger.Logger(logpath) 19 | 20 | # 选择用什么模型,比如ESIM、biLSTM 21 | model = FIXED_PARAMETERS["model_type"] 22 | 23 | module = importlib.import_module(".".join(['models', model])) 24 | MyModel = getattr(module, 'MyModel') 25 | 26 | # Logging parameter settings at each launch of training script 27 | # This will help ensure nothing goes awry in reloading a model and we consistently use the same hyperparameter settings. 
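# Note on FIXED_PARAMETERS: it is a plain dict built by util/parameters.py from the
# command-line flags, and the rest of this script simply reads entries out of it. The
# real defaults live in parameters.py, so the sketch below is only an illustrative guess:
# the keys are the ones this script actually uses further down, but every value here is a
# placeholder, not the project's actual setting.
#
#   FIXED_PARAMETERS = {
#       "model_type": "esim",                          # which class to import from models/
#       "model_name": "esim_demo",                     # basename for the .log / .p files
#       "log_path": "logs",
#       "training_mnli": "data/multinli_train.jsonl",
#       "test_matched": "data/multinli_dev_matched.jsonl",
#       "test_mismatched": "data/multinli_dev_mismatched.jsonl",
#       "embedding_data_path": "data/glove.txt",
#       "word_embedding_dim": 300,
#       "hidden_embedding_dim": 300,
#       "seq_length": 50,
#       "batch_size": 32,
#       "learning_rate": 0.0004,
#       "keep_rate": 0.5,
#       "emb_train": False,
#       "alpha": 0.0,
#   }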
28 | logger.Log("FIXED_PARAMETERS\n %s" % FIXED_PARAMETERS) 29 | 30 | 31 | # ————————————————————————————————读取数据———————————————————————————————————— 32 | 33 | logger.Log("Loading data") 34 | 35 | training_mnli = load_nli_data(FIXED_PARAMETERS["training_mnli"]) 36 | 37 | test_matched = load_nli_data(FIXED_PARAMETERS["test_matched"]) 38 | test_mismatched = load_nli_data(FIXED_PARAMETERS["test_mismatched"]) 39 | 40 | if 'temp.jsonl' in FIXED_PARAMETERS["test_matched"]: 41 | # Removing temporary empty file that was created in parameters.py 42 | os.remove(FIXED_PARAMETERS["test_matched"]) 43 | logger.Log("Created and removed empty file called temp.jsonl since test set is not available.") 44 | 45 | dictpath = os.path.join(FIXED_PARAMETERS["log_path"], modname) + ".p" 46 | 47 | 48 | # ————————————————————————————word embedding———————————————————————————————————— 49 | if not os.path.isfile(dictpath): 50 | logger.Log("Building dictionary") 51 | if FIXED_PARAMETERS["alpha"] == 0: 52 | word_indices = build_dictionary([training_mnli]) 53 | else: 54 | word_indices = build_dictionary([training_mnli]) 55 | 56 | logger.Log("Padding and indexifying sentences") 57 | sentences_to_padded_index_sequences(word_indices, [training_mnli, 58 | test_matched, test_mismatched]) 59 | pickle.dump(word_indices, open(dictpath, "wb")) 60 | else: 61 | logger.Log("Loading dictionary from %s" % (dictpath)) 62 | word_indices = pickle.load(open(dictpath, "rb")) # word_indices是一个词典,每个单词对应一个编号,编号总长度就是单词种类的数量 63 | logger.Log("Padding and indexifying sentences") 64 | # 将句子转换成向量表示 65 | sentences_to_padded_index_sequences(word_indices, [training_mnli, 66 | test_matched, test_mismatched]) 67 | 68 | logger.Log("Loading embeddings") 69 | loaded_embeddings = loadEmbedding_rand(FIXED_PARAMETERS["embedding_data_path"], word_indices) 70 | 71 | 72 | # ——————————————————————————————创建模型—————————————————————————————————————————— 73 | class modelClassifier: 74 | def __init__(self, seq_length): 75 | # Define hyperparameters 76 | self.learning_rate = FIXED_PARAMETERS["learning_rate"] 77 | self.display_epoch_freq = 1 78 | self.display_step_freq = 5 79 | self.embedding_dim = FIXED_PARAMETERS["word_embedding_dim"] 80 | self.dim = FIXED_PARAMETERS["hidden_embedding_dim"] 81 | self.batch_size = FIXED_PARAMETERS["batch_size"] 82 | self.emb_train = FIXED_PARAMETERS["emb_train"] 83 | self.keep_rate = FIXED_PARAMETERS["keep_rate"] 84 | self.sequence_length = FIXED_PARAMETERS["seq_length"] 85 | self.alpha = FIXED_PARAMETERS["alpha"] 86 | 87 | logger.Log("Building model from %s.py" %(model)) 88 | self.model = MyModel(seq_length=self.sequence_length, emb_dim=self.embedding_dim, 89 | hidden_dim=self.dim, embeddings=loaded_embeddings, 90 | emb_train=self.emb_train) 91 | 92 | # Perform gradient descent with Adam 93 | self.optimizer = tf.train.AdamOptimizer(self.learning_rate, beta1=0.9, beta2=0.999).minimize(self.model.total_cost) 94 | 95 | # Boolean stating that training has not been completed, 96 | self.completed = False 97 | 98 | # tf things: initialize variables and create placeholder for session 99 | logger.Log("Initializing variables") 100 | self.init = tf.global_variables_initializer() 101 | self.sess = None 102 | self.saver = tf.train.Saver() 103 | 104 | def get_minibatch(self, dataset, start_index, end_index): 105 | indices = range(start_index, end_index) 106 | premise_vectors = np.vstack([dataset[i]['sentence1_binary_parse_index_sequence'] for i in indices]) 107 | hypothesis_vectors = np.vstack([dataset[i]['sentence2_binary_parse_index_sequence'] for 
i in indices]) 108 | genres = [dataset[i]['genre'] for i in indices] 109 | labels = [dataset[i]['label'] for i in indices] 110 | return premise_vectors, hypothesis_vectors, labels, genres 111 | 112 | def classify(self, examples): 113 | # This classifies a list of examples 114 | total_batch = int(len(examples) / self.batch_size) 115 | logits = np.empty(3) 116 | genres = [] 117 | for i in range(total_batch): 118 | minibatch_premise_vectors, minibatch_hypothesis_vectors, minibatch_labels, minibatch_genres = self.get_minibatch(examples, 119 | self.batch_size * i, self.batch_size * (i + 1)) 120 | feed_dict = {self.model.premise_x: minibatch_premise_vectors, 121 | self.model.hypothesis_x: minibatch_hypothesis_vectors, 122 | self.model.y: minibatch_labels, 123 | self.model.keep_rate_ph: 1.0} 124 | genres += minibatch_genres 125 | logit, cost = self.sess.run([self.model.logits, self.model.total_cost], feed_dict) 126 | logits = np.vstack([logits, logit]) 127 | 128 | return genres, np.argmax(logits[1:], axis=1), cost 129 | 130 | def train(self, train_mnli): 131 | self.sess = tf.Session() 132 | self.sess.run(self.init) 133 | 134 | self.step = 0 135 | self.epoch = 0 136 | self.best_dev_mat = 0. 137 | self.best_mtrain_acc = 0. 138 | self.last_train_acc = [.001, .001, .001] 139 | self.best_step = 0 140 | 141 | # Training cycle 142 | logger.Log("Training...") 143 | 144 | while True: 145 | training_data = train_mnli 146 | random.shuffle(training_data) # 随机重排训练数据 147 | avg_cost = 0. 148 | total_batch = int(len(training_data) / self.batch_size) 149 | 150 | # Loop over all batches in epoch 151 | for i in range(total_batch): 152 | # Assemble a minibatch of the next B examples 153 | minibatch_premise_vectors, minibatch_hypothesis_vectors, minibatch_labels, minibatch_genres = self.get_minibatch( 154 | training_data, self.batch_size * i, self.batch_size * (i + 1)) 155 | 156 | # Run the optimizer to take a gradient step, and also fetch the value of the 157 | # cost function for logging 158 | # 上面通过minibatch函数从训练集中随机抽取数据,这里填入模型中 159 | feed_dict = {self.model.premise_x: minibatch_premise_vectors, 160 | self.model.hypothesis_x: minibatch_hypothesis_vectors, 161 | self.model.y: minibatch_labels, 162 | self.model.keep_rate_ph: self.keep_rate} 163 | 164 | # 正式训练,计算损失 165 | _, c = self.sess.run([self.optimizer, self.model.total_cost], feed_dict) 166 | 167 | # Since a single epoch can take a ages for larger models (ESIM), 168 | # we'll print accuracy every 50 steps 169 | # 这里的意思是每训练了50步,就检验一下当前的模型 170 | # 从代码可以看出主要是用train_mnli的前五千条数据放进模型里面训练,分析模型输出和实际结果的差异 171 | if self.step % self.display_step_freq == 0: 172 | 173 | mtrain_acc, mtrain_cost = evaluate_classifier(self.classify, train_mnli[0:5000], self.batch_size) 174 | 175 | logger.Log("Step: %i\t MultiNLI train acc: %f" % (self.step, mtrain_acc)) 176 | logger.Log("Step: %i\t MultiNLI train cost: %f" % (self.step, mtrain_cost)) 177 | 178 | self.step += 1 179 | 180 | # Compute average loss 181 | # 每次训练都返回一次损失c,所以计算平均损失就相加再除以总训练次数 182 | avg_cost += c / (total_batch * self.batch_size) 183 | 184 | # Display some statistics about the epoch 185 | if self.epoch % self.display_epoch_freq == 0: 186 | logger.Log("Epoch: %i\t Avg. 
Cost: %f" % (self.epoch + 1, avg_cost)) 187 | 188 | self.epoch += 1 189 | self.last_train_acc[(self.epoch % 5) - 1] = mtrain_acc 190 | 191 | # Early stopping 192 | progress = 1000 * (sum(self.last_train_acc) / (5 * min(self.last_train_acc)) - 1) 193 | 194 | # 训练次数超过30000次就停止训练 195 | if (progress < 0.1) or (self.step > self.best_step + 30000): 196 | logger.Log("MultiNLI Train accuracy: %s" % (self.best_mtrain_acc)) 197 | self.completed = True 198 | break 199 | 200 | 201 | classifier = modelClassifier(FIXED_PARAMETERS["seq_length"]) 202 | 203 | classifier.train(training_mnli) 204 | 205 | logger.Log("Acc on matched multiNLI dev-set: %s" 206 | % (evaluate_classifier(classifier.classify, test_matched, FIXED_PARAMETERS["batch_size"]))[0]) 207 | logger.Log("Acc on mismatched multiNLI dev-set: %s" 208 | % (evaluate_classifier(classifier.classify, test_mismatched, FIXED_PARAMETERS["batch_size"]))[0]) 209 | -------------------------------------------------------------------------------- /3.textMatching(ESIM)/python/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JesseYule/NLPBeginner/175762fcbf40f84c6900dcf7453f865dffe0faf6/3.textMatching(ESIM)/python/util/__init__.py -------------------------------------------------------------------------------- /3.textMatching(ESIM)/python/util/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JesseYule/NLPBeginner/175762fcbf40f84c6900dcf7453f865dffe0faf6/3.textMatching(ESIM)/python/util/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /3.textMatching(ESIM)/python/util/__pycache__/logger.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JesseYule/NLPBeginner/175762fcbf40f84c6900dcf7453f865dffe0faf6/3.textMatching(ESIM)/python/util/__pycache__/logger.cpython-37.pyc -------------------------------------------------------------------------------- /3.textMatching(ESIM)/python/util/__pycache__/parameters.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JesseYule/NLPBeginner/175762fcbf40f84c6900dcf7453f865dffe0faf6/3.textMatching(ESIM)/python/util/__pycache__/parameters.cpython-37.pyc -------------------------------------------------------------------------------- /3.textMatching(ESIM)/python/util/blocks.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Functions and components that can be slotted into tensorflow models. 4 | 5 | TODO: Write functions for various types of attention. 6 | 7 | """ 8 | 9 | import tensorflow as tf 10 | 11 | 12 | def length(sequence): 13 | """ 14 | Get true length of sequences (without padding), and mask for true-length in max-length. 15 | 16 | Input of shape: (batch_size, max_seq_length, hidden_dim) 17 | Output shapes, 18 | length: (batch_size) 19 | mask: (batch_size, max_seq_length, 1) 20 | """ 21 | populated = tf.sign(tf.abs(sequence)) 22 | length = tf.cast(tf.reduce_sum(populated, axis=1), tf.int32) 23 | mask = tf.cast(tf.expand_dims(populated, -1), tf.float32) 24 | return length, mask 25 | 26 | 27 | def biLSTM(inputs, dim, seq_len, name): 28 | """ 29 | A Bi-Directional LSTM layer. Returns forward and backward hidden states as a tuple, and cell states as a tuple. 
30 | 31 | Ouput of hidden states: [(batch_size, max_seq_length, hidden_dim), (batch_size, max_seq_length, hidden_dim)] 32 | Same shape for cell states. 33 | """ 34 | with tf.name_scope(name): 35 | with tf.variable_scope('forward' + name): 36 | lstm_fwd = tf.contrib.rnn.LSTMCell(num_units=dim) 37 | with tf.variable_scope('backward' + name): 38 | lstm_bwd = tf.contrib.rnn.LSTMCell(num_units=dim) 39 | 40 | hidden_states, cell_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=lstm_fwd, cell_bw=lstm_bwd, inputs=inputs, sequence_length=seq_len, dtype=tf.float32, scope=name) 41 | 42 | return hidden_states, cell_states 43 | 44 | 45 | def LSTM(inputs, dim, seq_len, name): 46 | """ 47 | An LSTM layer. Returns hidden states and cell states as a tuple. 48 | 49 | Ouput shape of hidden states: (batch_size, max_seq_length, hidden_dim) 50 | Same shape for cell states. 51 | """ 52 | with tf.name_scope(name): 53 | cell = tf.contrib.rnn.LSTMCell(num_units=dim) 54 | hidden_states, cell_states = tf.nn.dynamic_rnn(cell, inputs=inputs, sequence_length=seq_len, dtype=tf.float32, scope=name) 55 | 56 | return hidden_states, cell_states 57 | 58 | 59 | def last_output(output, true_length): 60 | """ 61 | To get the last hidden layer form a dynamically unrolled RNN. 62 | Input of shape (batch_size, max_seq_length, hidden_dim). 63 | 64 | true_length: Tensor of shape (batch_size). Such a tensor is given by the length() function. 65 | Output of shape (batch_size, hidden_dim). 66 | """ 67 | max_length = int(output.get_shape()[1]) 68 | length_mask = tf.expand_dims(tf.one_hot(true_length-1, max_length, on_value=1., off_value=0.), -1) 69 | last_output = tf.reduce_sum(tf.multiply(output, length_mask), 1) 70 | return last_output 71 | 72 | 73 | def masked_softmax(scores, mask): 74 | """ 75 | Used to calculcate a softmax score with true sequence length (without padding), rather than max-sequence length. 76 | 77 | Input shape: (batch_size, max_seq_length, hidden_dim). 78 | mask parameter: Tensor of shape (batch_size, max_seq_length). Such a mask is given by the length() function. 79 | """ 80 | numerator = tf.exp(tf.subtract(scores, tf.reduce_max(scores, 1, keep_dims=True))) * mask 81 | denominator = tf.reduce_sum(numerator, 1, keep_dims=True) 82 | weights = tf.div(numerator, denominator) 83 | return weights 84 | -------------------------------------------------------------------------------- /3.textMatching(ESIM)/python/util/data_processing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import re 3 | import random 4 | import json 5 | import collections 6 | import parameters as params 7 | import pickle 8 | 9 | FIXED_PARAMETERS = params.load_parameters() 10 | 11 | LABEL_MAP = { 12 | "entailment": 0, 13 | "neutral": 1, 14 | "contradiction": 2, 15 | "hidden": 0 16 | } 17 | 18 | PADDING = "" 19 | UNKNOWN = "" 20 | 21 | def load_nli_data(path, snli=False): 22 | """ 23 | Load MultiNLI or SNLI data. 24 | If the "snli" parameter is set to True, a genre label of snli will be assigned to the data. 
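# (Illustration added alongside this dump, not part of data_processing.py.) Each line of
# the MultiNLI/SNLI jsonl files is one JSON record; the abbreviated, made-up example below
# shows only the fields this loader touches, and it assumes the LABEL_MAP defined above.
import json

line = '{"gold_label": "entailment", "genre": "fiction", ' \
       '"sentence1_binary_parse": "( ( A man ) ( is ( playing guitar ) ) )", ' \
       '"sentence2_binary_parse": "( ( A man ) ( is ( making music ) ) )"}'
example = json.loads(line)
example["label"] = LABEL_MAP[example["gold_label"]]   # "entailment" -> 0
print(example["genre"], example["label"])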
25 | """ 26 | data = [] 27 | with open(path) as f: 28 | for line in f: 29 | loaded_example = json.loads(line) 30 | if loaded_example["gold_label"] not in LABEL_MAP: 31 | continue 32 | loaded_example["label"] = LABEL_MAP[loaded_example["gold_label"]] 33 | if snli: 34 | loaded_example["genre"] = "snli" 35 | data.append(loaded_example) 36 | random.seed(1) 37 | random.shuffle(data) 38 | return data 39 | 40 | 41 | def load_nli_data_genre(path, genre, snli=True): 42 | """ 43 | Load a specific genre's examples from MultiNLI, or load SNLI data and assign a "snli" genre to the examples. 44 | If the "snli" parameter is set to True, a genre label of snli will be assigned to the data. If set to true, it will overwrite the genre label for MultiNLI data. 45 | """ 46 | data = [] 47 | j = 0 48 | with open(path) as f: 49 | for line in f: 50 | loaded_example = json.loads(line) 51 | if loaded_example["gold_label"] not in LABEL_MAP: 52 | continue 53 | loaded_example["label"] = LABEL_MAP[loaded_example["gold_label"]] 54 | if snli: 55 | loaded_example["genre"] = "snli" 56 | if loaded_example["genre"] == genre: 57 | data.append(loaded_example) 58 | random.seed(1) 59 | random.shuffle(data) 60 | return data 61 | 62 | 63 | def tokenize(string): 64 | string = re.sub(r'\(|\)', '', string) 65 | return string.split() 66 | 67 | def build_dictionary(training_datasets): 68 | """ 69 | Extract vocabulary and build dictionary. 70 | """ 71 | word_counter = collections.Counter() 72 | for i, dataset in enumerate(training_datasets): 73 | for example in dataset: 74 | word_counter.update(tokenize(example['sentence1_binary_parse'])) # 计算句子中每个词在整个语料库中的出现次数 75 | word_counter.update(tokenize(example['sentence2_binary_parse'])) 76 | # 得到一个词典,每个单词对应在语料库中出现的次数 77 | 78 | vocabulary = set([word for word in word_counter]) 79 | vocabulary = list(vocabulary) 80 | vocabulary = [PADDING, UNKNOWN] + vocabulary 81 | 82 | word_indices = dict(zip(vocabulary, range(len(vocabulary)))) # 得到一个词典,每个单词对应一个编号,编号长度就是单词的数量 83 | return word_indices 84 | 85 | 86 | def sentences_to_padded_index_sequences(word_indices, datasets): 87 | """ 88 | Annotate datasets with feature vectors. Adding right-sided padding. 89 | """ 90 | for i, dataset in enumerate(datasets): 91 | for example in dataset: 92 | for sentence in ['sentence1_binary_parse', 'sentence2_binary_parse']: 93 | example[sentence + '_index_sequence'] = np.zeros((FIXED_PARAMETERS["seq_length"]), dtype=np.int32) 94 | # 创建全0向量,保持所有句子向量长度一致 95 | 96 | token_sequence = tokenize(example[sentence]) # 分割句子 97 | padding = FIXED_PARAMETERS["seq_length"] - len(token_sequence) # 计算句子长度和句子向量长度之差(为了padding) 98 | 99 | for i in range(FIXED_PARAMETERS["seq_length"]): 100 | # 之前确定了每个单词对应的index,现在就把之前创建的句子的全0向量替换成每个单词的index 101 | # 超出句子长度的部分就做padding 102 | if i >= len(token_sequence): 103 | index = word_indices[PADDING] 104 | else: 105 | if token_sequence[i] in word_indices: 106 | index = word_indices[token_sequence[i]] 107 | else: 108 | index = word_indices[UNKNOWN] 109 | example[sentence + '_index_sequence'][i] = index 110 | 111 | 112 | def loadEmbedding_zeros(path, word_indices): 113 | """ 114 | Load GloVe embeddings. Initializng OOV words to vector of zeros. 
115 | """ 116 | emb = np.zeros((len(word_indices), FIXED_PARAMETERS["word_embedding_dim"]), dtype='float32') 117 | 118 | with open(path, 'r') as f: 119 | for i, line in enumerate(f): 120 | if FIXED_PARAMETERS["embeddings_to_load"] != None: 121 | if i >= FIXED_PARAMETERS["embeddings_to_load"]: 122 | break 123 | 124 | s = line.split() 125 | if s[0] in word_indices: 126 | emb[word_indices[s[0]], :] = np.asarray(s[1:]) 127 | 128 | return emb 129 | 130 | 131 | def loadEmbedding_rand(path, word_indices): 132 | """ 133 | Load GloVe embeddings. Doing a random normal initialization for OOV words. 134 | """ 135 | n = len(word_indices) 136 | m = FIXED_PARAMETERS["word_embedding_dim"] 137 | emb = np.empty((n, m), dtype=np.float32) # emb的size为(单词向量长度)*(单词总数目) 138 | 139 | emb[:,:] = np.random.normal(size=(n,m)) 140 | 141 | # Explicitly assign embedding of to be zeros. 142 | # 最初的两个单词是PADDING和UNKNOW,所以设置为0 143 | emb[0:2, :] = np.zeros((1,m), dtype="float32") 144 | 145 | with open(path, 'r') as f: 146 | for i, line in enumerate(f): 147 | if FIXED_PARAMETERS["embeddings_to_load"] != None: 148 | if i >= FIXED_PARAMETERS["embeddings_to_load"]: 149 | break 150 | 151 | s = line.split() 152 | # 针对单词对应的glove向量,填入到emb中 153 | if s[0] in word_indices: 154 | emb[word_indices[s[0]], :] = np.asarray(s[1:]) 155 | 156 | return emb 157 | 158 | -------------------------------------------------------------------------------- /3.textMatching(ESIM)/python/util/evaluate.py: -------------------------------------------------------------------------------- 1 | def evaluate_classifier(classifier, eval_set, batch_size): 2 | """ 3 | Function to get accuracy and cost of the model, evaluated on a chosen dataset. 4 | 5 | classifier: the model's classfier, it should return genres, logit values, and cost for a given minibatch of the evaluation dataset 6 | eval_set: the chosen evaluation set, for eg. the dev-set 7 | batch_size: the size of minibatches. 8 | """ 9 | correct = 0 10 | # 为了验证模型,这里使用当前的模型对测试集eval_set进行检测,其中hypotheses储存了一系列的分类结果 11 | genres, hypotheses, cost = classifier(eval_set) 12 | cost = cost / batch_size 13 | full_batch = int(len(eval_set) / batch_size) * batch_size 14 | # 主要计算模型的每次输出和对应数据的实际分类结果是否相符,从而计算正确率 15 | for i in range(full_batch): 16 | hypothesis = hypotheses[i] 17 | if hypothesis == eval_set[i]['label']: 18 | correct += 1 19 | return correct / float(len(eval_set)), cost 20 | -------------------------------------------------------------------------------- /3.textMatching(ESIM)/python/util/logger.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import sys 3 | import json 4 | 5 | class Logger(object): 6 | """ 7 | A logging that doesn't leave logs open between writes, so as to allow AFS synchronization. 8 | """ 9 | 10 | # Level constants 11 | DEBUG = 0 12 | INFO = 1 13 | WARNING = 2 14 | ERROR = 3 15 | 16 | def __init__(self, log_path=None, json_log_path=None, min_print_level=0, min_file_level=0): 17 | """ 18 | log_path: The full path for the log file to write. The file will be appended to if it exists. 19 | min_print_level: Only messages with level above this level will be printed to stderr. 20 | min_file_level: Only messages with level above this level will be written to disk. 
21 | """ 22 | self.log_path = log_path 23 | self.json_log_path = json_log_path 24 | self.min_print_level = min_print_level 25 | self.min_file_level = min_file_level 26 | 27 | def Log(self, message, level=INFO): 28 | if level >= self.min_print_level: 29 | # Write to STDERR 30 | sys.stderr.write("[%i] %s\n" % (level, message)) 31 | if self.log_path and level >= self.min_file_level: 32 | # Write to the log file then close it 33 | with open(self.log_path, 'a') as f: 34 | datetime_string = datetime.datetime.now().strftime( 35 | "%y-%m-%d %H:%M:%S") 36 | f.write("%s [%i] %s\n" % (datetime_string, level, message)) 37 | 38 | def LogJSON(self, message_obj, level=INFO): 39 | if self.json_log_path and level >= self.min_file_level: 40 | with open(self.json_log_path, 'w') as f: 41 | print >>f, json.dumps(message_obj) 42 | else: 43 | sys.stderr.write('WARNING: No JSON log filename.') 44 | 45 | -------------------------------------------------------------------------------- /3.textMatching(ESIM)/python/util/parameters.py: -------------------------------------------------------------------------------- 1 | """ 2 | The hyperparameters for a model are defined here. Arguments like the type of model, model name, paths to data, logs etc. are also defined here. 3 | All paramters and arguments can be changed by calling flags in the command line. 4 | 5 | Required arguements are, 6 | model_type: which model you wish to train with. Valid model types: cbow, bilstm, and esim. 7 | model_name: the name assigned to the model being trained, this will prefix the name of the logs and checkpoint files. 8 | """ 9 | 10 | import argparse 11 | import io 12 | import os 13 | import json 14 | 15 | parser = argparse.ArgumentParser() 16 | 17 | models = ['esim','cbow', 'bilstm', 'lstm'] 18 | def types(s): 19 | options = [mod for mod in models if s in models] 20 | if len(options) == 1: 21 | return options[0] 22 | return s 23 | 24 | # Valid genres to train on. 25 | genres = ['travel', 'fiction', 'slate', 'telephone', 'government'] 26 | def subtypes(s): 27 | options = [mod for mod in genres if s in genres] 28 | if len(options) == 1: 29 | return options[0] 30 | return s 31 | 32 | parser.add_argument("model_type", choices=models, type=types, help="Give model type.") 33 | parser.add_argument("model_name", type=str, help="Give model name, this will name logs and checkpoints made. For example cbow, esim_test etc.") 34 | 35 | parser.add_argument("--datapath", type=str, default="../data") 36 | parser.add_argument("--ckptpath", type=str, default="../logs") 37 | parser.add_argument("--logpath", type=str, default="../logs") 38 | 39 | parser.add_argument("--emb_to_load", type=int, default=None, help="Number of embeddings to load. 
If None, all embeddings are loaded.") 40 | parser.add_argument("--learning_rate", type=float, default=0.0004, help="Learning rate for model") 41 | parser.add_argument("--keep_rate", type=float, default=0.5, help="Keep rate for dropout in the model") 42 | parser.add_argument("--seq_length", type=int, default=50, help="Max sequence length") 43 | parser.add_argument("--emb_train", action='store_true', help="Call if you want to make your word embeddings trainable.") 44 | 45 | parser.add_argument("--genre", type=str, help="Which genre to train on") 46 | parser.add_argument("--alpha", type=float, default=0., help="What percentage of SNLI data to use in training") 47 | 48 | parser.add_argument("--test", action='store_true', help="Call if you want to only test on the best checkpoint.") 49 | 50 | args = parser.parse_args() 51 | 52 | """ 53 | # Check if test sets are available. If not, create an empty file. 54 | test_matched = "{}/multinli_0.9/multinli_0.9_test_matched_unlabeled.jsonl".format(args.datapath) 55 | 56 | if os.path.isfile(test_matched): 57 | test_matched = "{}/multinli_0.9/multinli_0.9_test_matched_unlabeled.jsonl".format(args.datapath) 58 | test_mismatched = "{}/multinli_0.9/multinli_0.9_test_matched_unlabeled.jsonl".format(args.datapath) 59 | test_path = "{}/multinli_0.9/".format(args.datapath) 60 | else: 61 | test_path = "{}/multinli_0.9/".format(args.datapath) 62 | temp_file = os.path.join(test_path, "temp.jsonl") 63 | io.open(temp_file, "wb") 64 | test_matched = temp_file 65 | test_mismatched = temp_file 66 | """ 67 | # Check if test sets are available. If not, create an empty file. 68 | # test_matched = "{}/multinli_0.9/multinli_0.9_test_matched.jsonl".format(args.datapath) 69 | test_matched = "../../../../data/multinli_1.0/multinli_1.0_test_matched.jsonl".format(args.datapath) 70 | 71 | 72 | if os.path.isfile(test_matched): 73 | test_matched = "../../../../data/multinli_1.0/multinli_1.0_dev_matched.jsonl".format(args.datapath) #"{}/multinli_0.9/multinli_0.9_test_matched.jsonl".format(args.datapath) 74 | test_mismatched = "../../../../data/multinli_1.0/multinli_1.0_dev_mismatched.jsonl".format(args.datapath) #"{}/multinli_0.9/multinli_0.9_test_mismatched.jsonl".format(args.datapath) 75 | test_path = "../../../../data/multinli_1.0".format(args.datapath) 76 | else: 77 | test_path = "../../../../data/multinli_1.0".format(args.datapath) 78 | temp_file = os.path.join(test_path, "temp.jsonl") 79 | io.open(temp_file, "wb") 80 | test_matched = temp_file 81 | test_mismatched = temp_file 82 | 83 | 84 | def load_parameters(): 85 | FIXED_PARAMETERS = { 86 | "model_type": args.model_type, 87 | "model_name": args.model_name, 88 | "training_mnli": "../../../../data/multinli_1.0/multinli_1.0_train.jsonl".format(args.datapath), 89 | "dev_matched": "../../../../data/multinli_1.0/multinli_1.0_dev_matched.jsonl".format(args.datapath), 90 | "dev_mismatched": "../../../../data/multinli_1.0/multinli_1.0_dev_mismatched.jsonl".format(args.datapath), 91 | "test_matched": test_matched, 92 | "test_mismatched": test_mismatched, 93 | "training_snli": "../../../../data//snli_1.0/snli_1.0_train.jsonl".format(args.datapath), 94 | "dev_snli": "../../../../data//snli_1.0/snli_1.0_dev.jsonl".format(args.datapath), 95 | "test_snli": "../../../../data//snli_1.0/snli_1.0_test.jsonl".format(args.datapath), 96 | "embedding_data_path": "../../../glove/glove.42B.300d.txt".format(args.datapath), 97 | #"embedding_data_path": "{}/glove.6B.50d.txt".format(args.datapath), 98 | "log_path": "{}".format(args.logpath), 99 | 
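        # (Comment added alongside this dump, not in the original parameters.py.) These fixed
        # values are merged with the command-line flags defined above; a typical, hypothetical
        # invocation from the python/ directory would be:
        #     python train_mnli.py esim my_esim_run --learning_rate 0.0004 --emb_train
        # where "esim" is the model_type and "my_esim_run" prefixes the log and checkpoint names.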
"ckpt_path": "{}".format(args.ckptpath), 100 | "embeddings_to_load": args.emb_to_load, 101 | "word_embedding_dim": 300, 102 | "hidden_embedding_dim": 300, 103 | #"word_embedding_dim": 50, 104 | #"hidden_embedding_dim": 50, 105 | "seq_length": args.seq_length, 106 | "keep_rate": args.keep_rate, 107 | "batch_size": 32, 108 | "learning_rate": args.learning_rate, 109 | "emb_train": args.emb_train, 110 | "alpha": args.alpha, 111 | "genre": args.genre 112 | } 113 | 114 | return FIXED_PARAMETERS 115 | 116 | def train_or_test(): 117 | return args.test 118 | 119 | -------------------------------------------------------------------------------- /4.NER(LSTM+CRF)/LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | Copyright 2017 Guillaume Genthial 179 | 180 | Licensed under the Apache License, Version 2.0 (the "License"); 181 | you may not use this file except in compliance with the License. 182 | You may obtain a copy of the License at 183 | 184 | http://www.apache.org/licenses/LICENSE-2.0 185 | 186 | Unless required by applicable law or agreed to in writing, software 187 | distributed under the License is distributed on an "AS IS" BASIS, 188 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 189 | See the License for the specific language governing permissions and 190 | limitations under the License. -------------------------------------------------------------------------------- /4.NER(LSTM+CRF)/README.md: -------------------------------------------------------------------------------- 1 | ### 任务四:基于LSTM+CRF的命名实体识别 2 | 3 | 十分抱歉,因为最近学业繁忙,且目前研究内容暂未涉及到NER,所以这里未能给出详细的代码分析说明,也未能对模型做详细分析,日后一定补上。 4 | 5 | 1. 知识点: 6 | 7 | - [概率图模型]() 8 | - [隐马尔可夫模型]() 9 | - [Viterbi算法]() 10 | - [条件随机场]() 11 | 12 | ​ 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /4.NER(LSTM+CRF)/build_data.py: -------------------------------------------------------------------------------- 1 | from model.config import Config 2 | from model.data_utils import CoNLLDataset, get_vocabs, UNK, NUM, \ 3 | get_glove_vocab, write_vocab, load_vocab, get_char_vocab, \ 4 | export_trimmed_glove_vectors, get_processing_word 5 | 6 | 7 | def main(): 8 | """Procedure to build data 9 | 10 | You MUST RUN this procedure. It iterates over the whole dataset (train, 11 | dev and test) and extract the vocabularies in terms of words, tags, and 12 | characters. Having built the vocabularies it writes them in a file. 
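# (Illustration added alongside this dump, not part of build_data.py.) A toy sketch of the
# alignment this script sets up: each vocabulary word gets the id of its line in the vocab
# file, and row i of the trimmed GloVe matrix holds the vector for word i. The words,
# the dimension, and the vector values below are made up for demonstration.
import numpy as np

vocab = ["the", "dog", "barks"]                    # written to file, one word per line
word_to_id = {w: i for i, w in enumerate(vocab)}   # line number = word id
embeddings = np.zeros((len(vocab), 4))             # pretend dim_word = 4
embeddings[word_to_id["dog"]] = [0.1, -0.3, 0.2, 0.7]   # pretend GloVe vector for "dog"
print(embeddings[word_to_id["dog"]])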
The 13 | writing of vocabulary in a file assigns an id (the line #) to each word. 14 | It then extract the relevant GloVe vectors and stores them in a np array 15 | such that the i-th entry corresponds to the i-th word in the vocabulary. 16 | 17 | 18 | Args: 19 | config: (instance of Config) has attributes like hyper-params... 20 | 21 | """ 22 | # get config and processing of words 23 | config = Config(load=False) 24 | processing_word = get_processing_word(lowercase=True) 25 | 26 | # Generators 27 | dev = CoNLLDataset(config.filename_dev, processing_word) 28 | test = CoNLLDataset(config.filename_test, processing_word) 29 | train = CoNLLDataset(config.filename_train, processing_word) 30 | 31 | # Build Word and Tag vocab 32 | vocab_words, vocab_tags = get_vocabs([train, dev, test]) 33 | vocab_glove = get_glove_vocab(config.filename_glove) 34 | 35 | vocab = vocab_words & vocab_glove 36 | vocab.add(UNK) 37 | vocab.add(NUM) 38 | 39 | # Save vocab 40 | write_vocab(vocab, config.filename_words) 41 | write_vocab(vocab_tags, config.filename_tags) 42 | 43 | # Trim GloVe Vectors 44 | vocab = load_vocab(config.filename_words) 45 | export_trimmed_glove_vectors(vocab, config.filename_glove, 46 | config.filename_trimmed, config.dim_word) 47 | 48 | # Build and save char vocab 49 | train = CoNLLDataset(config.filename_train) 50 | vocab_chars = get_char_vocab(train) 51 | write_vocab(vocab_chars, config.filename_chars) 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /4.NER(LSTM+CRF)/data/test.txt: -------------------------------------------------------------------------------- 1 | Jean B-PER 2 | Pierre I-PER 3 | lives O 4 | in O 5 | New B-LOC 6 | York I-LOC 7 | . O 8 | 9 | The O 10 | European B-ORG 11 | Union I-ORG 12 | is O 13 | a O 14 | political O 15 | and O 16 | economic O 17 | union O 18 | 19 | A O 20 | French B-MISC 21 | American I-MISC 22 | actor O 23 | won O 24 | an O 25 | oscar O 26 | 27 | Jean B-PER 28 | Pierre I-PER 29 | lives O 30 | in O 31 | New B-LOC 32 | York I-LOC 33 | . O 34 | 35 | The O 36 | European B-ORG 37 | Union I-ORG 38 | is O 39 | a O 40 | political O 41 | and O 42 | economic O 43 | union O 44 | 45 | A O 46 | French B-MISC 47 | American I-MISC 48 | actor O 49 | won O 50 | an O 51 | oscar O 52 | 53 | Jean B-PER 54 | Pierre I-PER 55 | lives O 56 | in O 57 | New B-LOC 58 | York I-LOC 59 | . O 60 | 61 | The O 62 | European B-ORG 63 | Union I-ORG 64 | is O 65 | a O 66 | political O 67 | and O 68 | economic O 69 | union O 70 | 71 | A O 72 | French B-MISC 73 | American I-MISC 74 | actor O 75 | won O 76 | an O 77 | oscar O 78 | 79 | Jean B-PER 80 | Pierre I-PER 81 | lives O 82 | in O 83 | New B-LOC 84 | York I-LOC 85 | . 
O 86 | 87 | The O 88 | European B-ORG 89 | Union I-ORG 90 | is O 91 | a O 92 | political O 93 | and O 94 | economic O 95 | union O 96 | 97 | A O 98 | French B-MISC 99 | American I-MISC 100 | actor O 101 | won O 102 | an O 103 | oscar O 104 | -------------------------------------------------------------------------------- /4.NER(LSTM+CRF)/evaluate.py: -------------------------------------------------------------------------------- 1 | from model.data_utils import CoNLLDataset 2 | from model.ner_model import NERModel 3 | from model.config import Config 4 | 5 | 6 | def align_data(data): 7 | """Given dict with lists, creates aligned strings 8 | 9 | Adapted from Assignment 3 of CS224N 10 | 11 | Args: 12 | data: (dict) data["x"] = ["I", "love", "you"] 13 | (dict) data["y"] = ["O", "O", "O"] 14 | 15 | Returns: 16 | data_aligned: (dict) data_align["x"] = "I love you" 17 | data_align["y"] = "O O O " 18 | 19 | """ 20 | spacings = [max([len(seq[i]) for seq in data.values()]) 21 | for i in range(len(data[list(data.keys())[0]]))] 22 | data_aligned = dict() 23 | 24 | # for each entry, create aligned string 25 | for key, seq in data.items(): 26 | str_aligned = "" 27 | for token, spacing in zip(seq, spacings): 28 | str_aligned += token + " " * (spacing - len(token) + 1) 29 | 30 | data_aligned[key] = str_aligned 31 | 32 | return data_aligned 33 | 34 | 35 | 36 | def interactive_shell(model): 37 | """Creates interactive shell to play with model 38 | 39 | Args: 40 | model: instance of NERModel 41 | 42 | """ 43 | model.logger.info(""" 44 | This is an interactive mode. 45 | To exit, enter 'exit'. 46 | You can enter a sentence like 47 | input> I love Paris""") 48 | 49 | while True: 50 | try: 51 | # for python 2 52 | sentence = raw_input("input> ") 53 | except NameError: 54 | # for python 3 55 | sentence = input("input> ") 56 | 57 | words_raw = sentence.strip().split(" ") 58 | 59 | if words_raw == ["exit"]: 60 | break 61 | 62 | preds = model.predict(words_raw) 63 | to_print = align_data({"input": words_raw, "output": preds}) 64 | 65 | for key, seq in to_print.items(): 66 | model.logger.info(seq) 67 | 68 | 69 | def main(): 70 | # create instance of config 71 | config = Config() 72 | 73 | # build model 74 | model = NERModel(config) 75 | model.build() 76 | model.restore_session(config.dir_model) 77 | 78 | # create dataset 79 | test = CoNLLDataset(config.filename_test, config.processing_word, 80 | config.processing_tag, config.max_iter) 81 | 82 | # evaluate and interact 83 | model.evaluate(test) 84 | interactive_shell(model) 85 | 86 | 87 | if __name__ == "__main__": 88 | main() 89 | -------------------------------------------------------------------------------- /4.NER(LSTM+CRF)/makefile: -------------------------------------------------------------------------------- 1 | glove: 2 | wget -P ./data/ "http://nlp.stanford.edu/data/glove.6B.zip" 3 | unzip ./data/glove.6B.zip -d data/glove.6B/ 4 | rm ./data/glove.6B.zip 5 | 6 | run: 7 | python build_data.py 8 | python train.py 9 | python evaluate.py 10 | -------------------------------------------------------------------------------- /4.NER(LSTM+CRF)/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JesseYule/NLPBeginner/175762fcbf40f84c6900dcf7453f865dffe0faf6/4.NER(LSTM+CRF)/model/__init__.py -------------------------------------------------------------------------------- /4.NER(LSTM+CRF)/model/__pycache__/__init__.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/JesseYule/NLPBeginner/175762fcbf40f84c6900dcf7453f865dffe0faf6/4.NER(LSTM+CRF)/model/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /4.NER(LSTM+CRF)/model/__pycache__/base_model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JesseYule/NLPBeginner/175762fcbf40f84c6900dcf7453f865dffe0faf6/4.NER(LSTM+CRF)/model/__pycache__/base_model.cpython-37.pyc -------------------------------------------------------------------------------- /4.NER(LSTM+CRF)/model/__pycache__/config.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JesseYule/NLPBeginner/175762fcbf40f84c6900dcf7453f865dffe0faf6/4.NER(LSTM+CRF)/model/__pycache__/config.cpython-37.pyc -------------------------------------------------------------------------------- /4.NER(LSTM+CRF)/model/__pycache__/data_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JesseYule/NLPBeginner/175762fcbf40f84c6900dcf7453f865dffe0faf6/4.NER(LSTM+CRF)/model/__pycache__/data_utils.cpython-37.pyc -------------------------------------------------------------------------------- /4.NER(LSTM+CRF)/model/__pycache__/general_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JesseYule/NLPBeginner/175762fcbf40f84c6900dcf7453f865dffe0faf6/4.NER(LSTM+CRF)/model/__pycache__/general_utils.cpython-37.pyc -------------------------------------------------------------------------------- /4.NER(LSTM+CRF)/model/__pycache__/ner_model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JesseYule/NLPBeginner/175762fcbf40f84c6900dcf7453f865dffe0faf6/4.NER(LSTM+CRF)/model/__pycache__/ner_model.cpython-37.pyc -------------------------------------------------------------------------------- /4.NER(LSTM+CRF)/model/base_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | 4 | 5 | class BaseModel(object): 6 | """Generic class for general methods that are not specific to NER""" 7 | 8 | def __init__(self, config): 9 | """Defines self.config and self.logger 10 | 11 | Args: 12 | config: (Config instance) class with hyper parameters, 13 | vocab and embeddings 14 | 15 | """ 16 | self.config = config 17 | self.logger = config.logger 18 | self.sess = None 19 | self.saver = None 20 | 21 | 22 | def reinitialize_weights(self, scope_name): 23 | """Reinitializes the weights of a given layer""" 24 | variables = tf.contrib.framework.get_variables(scope_name) 25 | init = tf.variables_initializer(variables) 26 | self.sess.run(init) 27 | 28 | 29 | def add_train_op(self, lr_method, lr, loss, clip=-1): 30 | """Defines self.train_op that performs an update on a batch 31 | 32 | Args: 33 | lr_method: (string) sgd method, for example "adam" 34 | lr: (tf.placeholder) tf.float32, learning rate 35 | loss: (tensor) tf.float32 loss to minimize 36 | clip: (python float) clipping of gradient. 
If < 0, no clipping 37 | 38 | """ 39 | _lr_m = lr_method.lower() # lower to make sure 40 | 41 | with tf.variable_scope("train_step"): 42 | if _lr_m == 'adam': # sgd method 43 | optimizer = tf.train.AdamOptimizer(lr) 44 | elif _lr_m == 'adagrad': 45 | optimizer = tf.train.AdagradOptimizer(lr) 46 | elif _lr_m == 'sgd': 47 | optimizer = tf.train.GradientDescentOptimizer(lr) 48 | elif _lr_m == 'rmsprop': 49 | optimizer = tf.train.RMSPropOptimizer(lr) 50 | else: 51 | raise NotImplementedError("Unknown method {}".format(_lr_m)) 52 | 53 | if clip > 0: # gradient clipping if clip is positive 54 | grads, vs = zip(*optimizer.compute_gradients(loss)) 55 | grads, gnorm = tf.clip_by_global_norm(grads, clip) 56 | self.train_op = optimizer.apply_gradients(zip(grads, vs)) 57 | else: 58 | self.train_op = optimizer.minimize(loss) 59 | 60 | 61 | def initialize_session(self): 62 | """Defines self.sess and initialize the variables""" 63 | self.logger.info("Initializing tf session") 64 | self.sess = tf.Session() 65 | self.sess.run(tf.global_variables_initializer()) 66 | self.saver = tf.train.Saver() 67 | 68 | 69 | def restore_session(self, dir_model): 70 | """Reload weights into session 71 | 72 | Args: 73 | sess: tf.Session() 74 | dir_model: dir with weights 75 | 76 | """ 77 | self.logger.info("Reloading the latest trained model...") 78 | self.saver.restore(self.sess, dir_model) 79 | 80 | 81 | def save_session(self): 82 | """Saves session = weights""" 83 | if not os.path.exists(self.config.dir_model): 84 | os.makedirs(self.config.dir_model) 85 | self.saver.save(self.sess, self.config.dir_model) 86 | 87 | 88 | def close_session(self): 89 | """Closes the session""" 90 | self.sess.close() 91 | 92 | 93 | def add_summary(self): 94 | """Defines variables for Tensorboard 95 | 96 | Args: 97 | dir_output: (string) where the results are written 98 | 99 | """ 100 | self.merged = tf.summary.merge_all() 101 | self.file_writer = tf.summary.FileWriter(self.config.dir_output, 102 | self.sess.graph) 103 | 104 | 105 | def train(self, train, dev): 106 | """Performs training with early stopping and lr exponential decay 107 | 108 | Args: 109 | train: dataset that yields tuple of (sentences, tags) 110 | dev: dataset 111 | 112 | """ 113 | best_score = 0 114 | nepoch_no_imprv = 0 # for early stopping 115 | self.add_summary() # tensorboard 116 | 117 | for epoch in range(self.config.nepochs): 118 | self.logger.info("Epoch {:} out of {:}".format(epoch + 1, 119 | self.config.nepochs)) 120 | 121 | score = self.run_epoch(train, dev, epoch) 122 | self.config.lr *= self.config.lr_decay # decay learning rate 123 | 124 | # early stopping and saving best parameters 125 | if score >= best_score: 126 | nepoch_no_imprv = 0 127 | self.save_session() 128 | best_score = score 129 | self.logger.info("- new best score!") 130 | else: 131 | nepoch_no_imprv += 1 132 | if nepoch_no_imprv >= self.config.nepoch_no_imprv: 133 | self.logger.info("- early stopping {} epochs without "\ 134 | "improvement".format(nepoch_no_imprv)) 135 | break 136 | 137 | 138 | def evaluate(self, test): 139 | """Evaluate model on test set 140 | 141 | Args: 142 | test: instance of class Dataset 143 | 144 | """ 145 | self.logger.info("Testing model over test set") 146 | metrics = self.run_evaluate(test) 147 | msg = " - ".join(["{} {:04.2f}".format(k, v) 148 | for k, v in metrics.items()]) 149 | self.logger.info(msg) 150 | -------------------------------------------------------------------------------- /4.NER(LSTM+CRF)/model/config.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | from .general_utils import get_logger 5 | from .data_utils import get_trimmed_glove_vectors, load_vocab, \ 6 | get_processing_word 7 | 8 | 9 | class Config(): 10 | def __init__(self, load=True): 11 | """Initialize hyperparameters and load vocabs 12 | 13 | Args: 14 | load_embeddings: (bool) if True, load embeddings into 15 | np array, else None 16 | 17 | """ 18 | # directory for training outputs 19 | if not os.path.exists(self.dir_output): 20 | os.makedirs(self.dir_output) 21 | 22 | # create instance of logger 23 | self.logger = get_logger(self.path_log) 24 | 25 | # load if requested (default) 26 | if load: 27 | self.load() 28 | 29 | 30 | def load(self): 31 | """Loads vocabulary, processing functions and embeddings 32 | 33 | Supposes that build_data.py has been run successfully and that 34 | the corresponding files have been created (vocab and trimmed GloVe 35 | vectors) 36 | 37 | """ 38 | # 1. vocabulary 39 | self.vocab_words = load_vocab(self.filename_words) 40 | self.vocab_tags = load_vocab(self.filename_tags) 41 | self.vocab_chars = load_vocab(self.filename_chars) 42 | 43 | self.nwords = len(self.vocab_words) 44 | self.nchars = len(self.vocab_chars) 45 | self.ntags = len(self.vocab_tags) 46 | 47 | # 2. get processing functions that map str -> id 48 | self.processing_word = get_processing_word(self.vocab_words, 49 | self.vocab_chars, lowercase=True, chars=self.use_chars) 50 | self.processing_tag = get_processing_word(self.vocab_tags, 51 | lowercase=False, allow_unk=False) 52 | 53 | # 3. get pre-trained embeddings 54 | self.embeddings = (get_trimmed_glove_vectors(self.filename_trimmed) 55 | if self.use_pretrained else None) 56 | 57 | 58 | # general config 59 | dir_output = "results/test/" 60 | dir_model = dir_output + "model.weights/" 61 | path_log = dir_output + "log.txt" 62 | 63 | # embeddings 64 | dim_word = 300 65 | dim_char = 100 66 | 67 | # glove files 68 | filename_glove = "data/glove.6B/glove.6B.{}d.txt".format(dim_word) 69 | # trimmed embeddings (created from glove_filename with build_data.py) 70 | filename_trimmed = "data/glove.6B.{}d.trimmed.npz".format(dim_word) 71 | use_pretrained = True 72 | 73 | # dataset 74 | # filename_dev = "data/coNLL/eng/eng.testa.iob" 75 | # filename_test = "data/coNLL/eng/eng.testb.iob" 76 | # filename_train = "data/coNLL/eng/eng.train.iob" 77 | 78 | filename_dev = filename_test = filename_train = "data/test.txt" # test 79 | 80 | max_iter = None # if not None, max number of examples in Dataset 81 | 82 | # vocab (created from dataset with build_data.py) 83 | filename_words = "data/words.txt" 84 | filename_tags = "data/tags.txt" 85 | filename_chars = "data/chars.txt" 86 | 87 | # training 88 | train_embeddings = False 89 | nepochs = 15 90 | dropout = 0.5 91 | batch_size = 20 92 | lr_method = "adam" 93 | lr = 0.001 94 | lr_decay = 0.9 95 | clip = -1 # if negative, no clipping 96 | nepoch_no_imprv = 3 97 | 98 | # model hyperparameters 99 | hidden_size_char = 100 # lstm on chars 100 | hidden_size_lstm = 300 # lstm on word embeddings 101 | 102 | # NOTE: if both chars and crf, only 1.6x slower on GPU 103 | use_crf = True # if crf, training is 1.7x slower on CPU 104 | use_chars = True # if char embedding, training is 3.5x slower on CPU 105 | -------------------------------------------------------------------------------- /4.NER(LSTM+CRF)/model/data_utils.py: -------------------------------------------------------------------------------- 1 | import numpy 
as np 2 | import os 3 | 4 | 5 | # shared global variables to be imported from model also 6 | UNK = "$UNK$" 7 | NUM = "$NUM$" 8 | NONE = "O" 9 | 10 | 11 | # special error message 12 | class MyIOError(Exception): 13 | def __init__(self, filename): 14 | # custom error message 15 | message = """ 16 | ERROR: Unable to locate file {}. 17 | 18 | FIX: Have you tried running python build_data.py first? 19 | This will build vocab file from your train, test and dev sets and 20 | trimm your word vectors. 21 | """.format(filename) 22 | super(MyIOError, self).__init__(message) 23 | 24 | 25 | class CoNLLDataset(object): 26 | """Class that iterates over CoNLL Dataset 27 | 28 | __iter__ method yields a tuple (words, tags) 29 | words: list of raw words 30 | tags: list of raw tags 31 | 32 | If processing_word and processing_tag are not None, 33 | optional preprocessing is appplied 34 | 35 | Example: 36 | ```python 37 | data = CoNLLDataset(filename) 38 | for sentence, tags in data: 39 | pass 40 | ``` 41 | 42 | """ 43 | def __init__(self, filename, processing_word=None, processing_tag=None, 44 | max_iter=None): 45 | """ 46 | Args: 47 | filename: path to the file 48 | processing_words: (optional) function that takes a word as input 49 | processing_tags: (optional) function that takes a tag as input 50 | max_iter: (optional) max number of sentences to yield 51 | 52 | """ 53 | self.filename = filename 54 | self.processing_word = processing_word 55 | self.processing_tag = processing_tag 56 | self.max_iter = max_iter 57 | self.length = None 58 | 59 | 60 | def __iter__(self): 61 | niter = 0 62 | with open(self.filename) as f: 63 | words, tags = [], [] 64 | for line in f: 65 | line = line.strip() 66 | if (len(line) == 0 or line.startswith("-DOCSTART-")): 67 | if len(words) != 0: 68 | niter += 1 69 | if self.max_iter is not None and niter > self.max_iter: 70 | break 71 | yield words, tags 72 | words, tags = [], [] 73 | else: 74 | ls = line.split(' ') 75 | word, tag = ls[0],ls[1] 76 | if self.processing_word is not None: 77 | word = self.processing_word(word) 78 | if self.processing_tag is not None: 79 | tag = self.processing_tag(tag) 80 | words += [word] 81 | tags += [tag] 82 | 83 | 84 | def __len__(self): 85 | """Iterates once over the corpus to set and store length""" 86 | if self.length is None: 87 | self.length = 0 88 | for _ in self: 89 | self.length += 1 90 | 91 | return self.length 92 | 93 | 94 | def get_vocabs(datasets): 95 | """Build vocabulary from an iterable of datasets objects 96 | 97 | Args: 98 | datasets: a list of dataset objects 99 | 100 | Returns: 101 | a set of all the words in the dataset 102 | 103 | """ 104 | print("Building vocab...") 105 | vocab_words = set() 106 | vocab_tags = set() 107 | for dataset in datasets: 108 | for words, tags in dataset: 109 | vocab_words.update(words) 110 | vocab_tags.update(tags) 111 | print("- done. 
{} tokens".format(len(vocab_words))) 112 | return vocab_words, vocab_tags 113 | 114 | 115 | def get_char_vocab(dataset): 116 | """Build char vocabulary from an iterable of datasets objects 117 | 118 | Args: 119 | dataset: a iterator yielding tuples (sentence, tags) 120 | 121 | Returns: 122 | a set of all the characters in the dataset 123 | 124 | """ 125 | vocab_char = set() 126 | for words, _ in dataset: 127 | for word in words: 128 | vocab_char.update(word) 129 | 130 | return vocab_char 131 | 132 | 133 | def get_glove_vocab(filename): 134 | """Load vocab from file 135 | 136 | Args: 137 | filename: path to the glove vectors 138 | 139 | Returns: 140 | vocab: set() of strings 141 | """ 142 | print("Building vocab...") 143 | vocab = set() 144 | with open(filename) as f: 145 | for line in f: 146 | word = line.strip().split(' ')[0] 147 | vocab.add(word) 148 | print("- done. {} tokens".format(len(vocab))) 149 | return vocab 150 | 151 | 152 | def write_vocab(vocab, filename): 153 | """Writes a vocab to a file 154 | 155 | Writes one word per line. 156 | 157 | Args: 158 | vocab: iterable that yields word 159 | filename: path to vocab file 160 | 161 | Returns: 162 | write a word per line 163 | 164 | """ 165 | print("Writing vocab...") 166 | with open(filename, "w") as f: 167 | for i, word in enumerate(vocab): 168 | if i != len(vocab) - 1: 169 | f.write("{}\n".format(word)) 170 | else: 171 | f.write(word) 172 | print("- done. {} tokens".format(len(vocab))) 173 | 174 | 175 | def load_vocab(filename): 176 | """Loads vocab from a file 177 | 178 | Args: 179 | filename: (string) the format of the file must be one word per line. 180 | 181 | Returns: 182 | d: dict[word] = index 183 | 184 | """ 185 | try: 186 | d = dict() 187 | with open(filename) as f: 188 | for idx, word in enumerate(f): 189 | word = word.strip() 190 | d[word] = idx 191 | 192 | except IOError: 193 | raise MyIOError(filename) 194 | return d 195 | 196 | 197 | def export_trimmed_glove_vectors(vocab, glove_filename, trimmed_filename, dim): 198 | """Saves glove vectors in numpy array 199 | 200 | Args: 201 | vocab: dictionary vocab[word] = index 202 | glove_filename: a path to a glove file 203 | trimmed_filename: a path where to store a matrix in npy 204 | dim: (int) dimension of embeddings 205 | 206 | """ 207 | embeddings = np.zeros([len(vocab), dim]) 208 | with open(glove_filename) as f: 209 | for line in f: 210 | line = line.strip().split(' ') 211 | word = line[0] 212 | embedding = [float(x) for x in line[1:]] 213 | if word in vocab: 214 | word_idx = vocab[word] 215 | embeddings[word_idx] = np.asarray(embedding) 216 | 217 | np.savez_compressed(trimmed_filename, embeddings=embeddings) 218 | 219 | 220 | def get_trimmed_glove_vectors(filename): 221 | """ 222 | Args: 223 | filename: path to the npz file 224 | 225 | Returns: 226 | matrix of embeddings (np array) 227 | 228 | """ 229 | try: 230 | with np.load(filename) as data: 231 | return data["embeddings"] 232 | 233 | except IOError: 234 | raise MyIOError(filename) 235 | 236 | 237 | def get_processing_word(vocab_words=None, vocab_chars=None, 238 | lowercase=False, chars=False, allow_unk=True): 239 | """Return lambda function that transform a word (string) into list, 240 | or tuple of (list, id) of int corresponding to the ids of the word and 241 | its corresponding characters. 242 | 243 | Args: 244 | vocab: dict[word] = idx 245 | 246 | Returns: 247 | f("cat") = ([12, 4, 32], 12345) 248 | = (list of char ids, word id) 249 | 250 | """ 251 | def f(word): 252 | # 0. 
get chars of words 253 | if vocab_chars is not None and chars == True: 254 | char_ids = [] 255 | for char in word: 256 | # ignore chars out of vocabulary 257 | if char in vocab_chars: 258 | char_ids += [vocab_chars[char]] 259 | 260 | # 1. preprocess word 261 | if lowercase: 262 | word = word.lower() 263 | if word.isdigit(): 264 | word = NUM 265 | 266 | # 2. get id of word 267 | if vocab_words is not None: 268 | if word in vocab_words: 269 | word = vocab_words[word] 270 | else: 271 | if allow_unk: 272 | word = vocab_words[UNK] 273 | else: 274 | raise Exception("Unknow key is not allowed. Check that "\ 275 | "your vocab (tags?) is correct") 276 | 277 | # 3. return tuple char ids, word id 278 | if vocab_chars is not None and chars == True: 279 | return char_ids, word 280 | else: 281 | return word 282 | 283 | return f 284 | 285 | 286 | def _pad_sequences(sequences, pad_tok, max_length): 287 | """ 288 | Args: 289 | sequences: a generator of list or tuple 290 | pad_tok: the char to pad with 291 | 292 | Returns: 293 | a list of list where each sublist has same length 294 | """ 295 | sequence_padded, sequence_length = [], [] 296 | 297 | for seq in sequences: 298 | seq = list(seq) 299 | seq_ = seq[:max_length] + [pad_tok]*max(max_length - len(seq), 0) 300 | sequence_padded += [seq_] 301 | sequence_length += [min(len(seq), max_length)] 302 | 303 | return sequence_padded, sequence_length 304 | 305 | 306 | def pad_sequences(sequences, pad_tok, nlevels=1): 307 | """ 308 | Args: 309 | sequences: a generator of list or tuple 310 | pad_tok: the char to pad with 311 | nlevels: "depth" of padding, for the case where we have characters ids 312 | 313 | Returns: 314 | a list of list where each sublist has same length 315 | 316 | """ 317 | if nlevels == 1: 318 | max_length = max(map(lambda x : len(x), sequences)) 319 | sequence_padded, sequence_length = _pad_sequences(sequences, 320 | pad_tok, max_length) 321 | 322 | elif nlevels == 2: 323 | max_length_word = max([max(map(lambda x: len(x), seq)) 324 | for seq in sequences]) 325 | sequence_padded, sequence_length = [], [] 326 | for seq in sequences: 327 | # all words are same length now 328 | sp, sl = _pad_sequences(seq, pad_tok, max_length_word) 329 | sequence_padded += [sp] 330 | sequence_length += [sl] 331 | 332 | max_length_sentence = max(map(lambda x : len(x), sequences)) 333 | sequence_padded, _ = _pad_sequences(sequence_padded, 334 | [pad_tok]*max_length_word, max_length_sentence) 335 | sequence_length, _ = _pad_sequences(sequence_length, 0, 336 | max_length_sentence) 337 | 338 | return sequence_padded, sequence_length 339 | 340 | 341 | def minibatches(data, minibatch_size): 342 | """ 343 | Args: 344 | data: generator of (sentence, tags) tuples 345 | minibatch_size: (int) 346 | 347 | Yields: 348 | list of tuples 349 | 350 | """ 351 | x_batch, y_batch = [], [] 352 | for (x, y) in data: 353 | if len(x_batch) == minibatch_size: 354 | yield x_batch, y_batch 355 | x_batch, y_batch = [], [] 356 | 357 | if type(x[0]) == tuple: 358 | x = zip(*x) 359 | x_batch += [x] 360 | y_batch += [y] 361 | 362 | if len(x_batch) != 0: 363 | yield x_batch, y_batch 364 | 365 | 366 | def get_chunk_type(tok, idx_to_tag): 367 | """ 368 | Args: 369 | tok: id of token, ex 4 370 | idx_to_tag: dictionary {4: "B-PER", ...} 371 | 372 | Returns: 373 | tuple: "B", "PER" 374 | 375 | """ 376 | tag_name = idx_to_tag[tok] 377 | tag_class = tag_name.split('-')[0] 378 | tag_type = tag_name.split('-')[-1] 379 | return tag_class, tag_type 380 | 381 | 382 | def get_chunks(seq, tags): 383 | """Given 
a sequence of tags, group entities and their position 384 | 385 | Args: 386 | seq: [4, 4, 0, 0, ...] sequence of labels 387 | tags: dict["O"] = 4 388 | 389 | Returns: 390 | list of (chunk_type, chunk_start, chunk_end) 391 | 392 | Example: 393 | seq = [4, 5, 0, 3] 394 | tags = {"B-PER": 4, "I-PER": 5, "B-LOC": 3} 395 | result = [("PER", 0, 2), ("LOC", 3, 4)] 396 | 397 | """ 398 | default = tags[NONE] 399 | idx_to_tag = {idx: tag for tag, idx in tags.items()} 400 | chunks = [] 401 | chunk_type, chunk_start = None, None 402 | for i, tok in enumerate(seq): 403 | # End of a chunk 1 404 | if tok == default and chunk_type is not None: 405 | # Add a chunk. 406 | chunk = (chunk_type, chunk_start, i) 407 | chunks.append(chunk) 408 | chunk_type, chunk_start = None, None 409 | 410 | # End of a chunk + start of a chunk! 411 | elif tok != default: 412 | tok_chunk_class, tok_chunk_type = get_chunk_type(tok, idx_to_tag) 413 | if chunk_type is None: 414 | chunk_type, chunk_start = tok_chunk_type, i 415 | elif tok_chunk_type != chunk_type or tok_chunk_class == "B": 416 | chunk = (chunk_type, chunk_start, i) 417 | chunks.append(chunk) 418 | chunk_type, chunk_start = tok_chunk_type, i 419 | else: 420 | pass 421 | 422 | # end condition 423 | if chunk_type is not None: 424 | chunk = (chunk_type, chunk_start, len(seq)) 425 | chunks.append(chunk) 426 | 427 | return chunks 428 | -------------------------------------------------------------------------------- /4.NER(LSTM+CRF)/model/general_utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import sys 3 | import logging 4 | import numpy as np 5 | 6 | 7 | def get_logger(filename): 8 | """Return a logger instance that writes in filename 9 | 10 | Args: 11 | filename: (string) path to log.txt 12 | 13 | Returns: 14 | logger: (instance of logger) 15 | 16 | """ 17 | logger = logging.getLogger('logger') 18 | logger.setLevel(logging.DEBUG) 19 | logging.basicConfig(format='%(message)s', level=logging.DEBUG) 20 | handler = logging.FileHandler(filename) 21 | handler.setLevel(logging.DEBUG) 22 | handler.setFormatter(logging.Formatter( 23 | '%(asctime)s:%(levelname)s: %(message)s')) 24 | logging.getLogger().addHandler(handler) 25 | 26 | return logger 27 | 28 | 29 | class Progbar(object): 30 | """Progbar class copied from keras (https://github.com/fchollet/keras/) 31 | 32 | Displays a progress bar. 33 | Small edit : added strict arg to update 34 | # Arguments 35 | target: Total number of steps expected. 36 | interval: Minimum visual progress update interval (in seconds). 37 | """ 38 | 39 | def __init__(self, target, width=30, verbose=1): 40 | self.width = width 41 | self.target = target 42 | self.sum_values = {} 43 | self.unique_values = [] 44 | self.start = time.time() 45 | self.total_width = 0 46 | self.seen_so_far = 0 47 | self.verbose = verbose 48 | 49 | def update(self, current, values=[], exact=[], strict=[]): 50 | """ 51 | Updates the progress bar. 52 | # Arguments 53 | current: Index of current step. 54 | values: List of tuples (name, value_for_last_step). 55 | The progress bar will display averages for these values. 56 | exact: List of tuples (name, value_for_last_step). 57 | The progress bar will display these values directly. 
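strict: List of tuples (name, value).
                These values are stored and shown as-is on every update (not averaged).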
58 | """ 59 | 60 | for k, v in values: 61 | if k not in self.sum_values: 62 | self.sum_values[k] = [v * (current - self.seen_so_far), 63 | current - self.seen_so_far] 64 | self.unique_values.append(k) 65 | else: 66 | self.sum_values[k][0] += v * (current - self.seen_so_far) 67 | self.sum_values[k][1] += (current - self.seen_so_far) 68 | for k, v in exact: 69 | if k not in self.sum_values: 70 | self.unique_values.append(k) 71 | self.sum_values[k] = [v, 1] 72 | 73 | for k, v in strict: 74 | if k not in self.sum_values: 75 | self.unique_values.append(k) 76 | self.sum_values[k] = v 77 | 78 | self.seen_so_far = current 79 | 80 | now = time.time() 81 | if self.verbose == 1: 82 | prev_total_width = self.total_width 83 | sys.stdout.write("\b" * prev_total_width) 84 | sys.stdout.write("\r") 85 | 86 | numdigits = int(np.floor(np.log10(self.target))) + 1 87 | barstr = '%%%dd/%%%dd [' % (numdigits, numdigits) 88 | bar = barstr % (current, self.target) 89 | prog = float(current)/self.target 90 | prog_width = int(self.width*prog) 91 | if prog_width > 0: 92 | bar += ('='*(prog_width-1)) 93 | if current < self.target: 94 | bar += '>' 95 | else: 96 | bar += '=' 97 | bar += ('.'*(self.width-prog_width)) 98 | bar += ']' 99 | sys.stdout.write(bar) 100 | self.total_width = len(bar) 101 | 102 | if current: 103 | time_per_unit = (now - self.start) / current 104 | else: 105 | time_per_unit = 0 106 | eta = time_per_unit*(self.target - current) 107 | info = '' 108 | if current < self.target: 109 | info += ' - ETA: %ds' % eta 110 | else: 111 | info += ' - %ds' % (now - self.start) 112 | for k in self.unique_values: 113 | if type(self.sum_values[k]) is list: 114 | info += ' - %s: %.4f' % (k, 115 | self.sum_values[k][0] / max(1, self.sum_values[k][1])) 116 | else: 117 | info += ' - %s: %s' % (k, self.sum_values[k]) 118 | 119 | self.total_width += len(info) 120 | if prev_total_width > self.total_width: 121 | info += ((prev_total_width-self.total_width) * " ") 122 | 123 | sys.stdout.write(info) 124 | sys.stdout.flush() 125 | 126 | if current >= self.target: 127 | sys.stdout.write("\n") 128 | 129 | if self.verbose == 2: 130 | if current >= self.target: 131 | info = '%ds' % (now - self.start) 132 | for k in self.unique_values: 133 | info += ' - %s: %.4f' % (k, 134 | self.sum_values[k][0] / max(1, self.sum_values[k][1])) 135 | sys.stdout.write(info + "\n") 136 | 137 | def add(self, n, values=[]): 138 | self.update(self.seen_so_far+n, values) 139 | 140 | 141 | -------------------------------------------------------------------------------- /4.NER(LSTM+CRF)/model/ner_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import tensorflow as tf 4 | 5 | 6 | from .data_utils import minibatches, pad_sequences, get_chunks 7 | from .general_utils import Progbar 8 | from .base_model import BaseModel 9 | 10 | 11 | class NERModel(BaseModel): 12 | """Specialized class of Model for NER""" 13 | 14 | def __init__(self, config): 15 | super(NERModel, self).__init__(config) 16 | self.idx_to_tag = {idx: tag for tag, idx in 17 | self.config.vocab_tags.items()} 18 | 19 | 20 | def add_placeholders(self): 21 | """Define placeholders = entries to computational graph""" 22 | # shape = (batch size, max length of sentence in batch) 23 | self.word_ids = tf.placeholder(tf.int32, shape=[None, None], 24 | name="word_ids") 25 | 26 | # shape = (batch size) 27 | self.sequence_lengths = tf.placeholder(tf.int32, shape=[None], 28 | name="sequence_lengths") 29 | 30 | # shape = (batch 
size, max length of sentence, max length of word) 31 | self.char_ids = tf.placeholder(tf.int32, shape=[None, None, None], 32 | name="char_ids") 33 | 34 | # shape = (batch_size, max_length of sentence) 35 | self.word_lengths = tf.placeholder(tf.int32, shape=[None, None], 36 | name="word_lengths") 37 | 38 | # shape = (batch size, max length of sentence in batch) 39 | self.labels = tf.placeholder(tf.int32, shape=[None, None], 40 | name="labels") 41 | 42 | # hyper parameters 43 | self.dropout = tf.placeholder(dtype=tf.float32, shape=[], 44 | name="dropout") 45 | self.lr = tf.placeholder(dtype=tf.float32, shape=[], 46 | name="lr") 47 | 48 | 49 | def get_feed_dict(self, words, labels=None, lr=None, dropout=None): 50 | """Given some data, pad it and build a feed dictionary 51 | 52 | Args: 53 | words: list of sentences. A sentence is a list of ids of a list of 54 | words. A word is a list of ids 55 | labels: list of ids 56 | lr: (float) learning rate 57 | dropout: (float) keep prob 58 | 59 | Returns: 60 | dict {placeholder: value} 61 | 62 | """ 63 | # perform padding of the given data 64 | if self.config.use_chars: 65 | char_ids, word_ids = zip(*words) 66 | word_ids, sequence_lengths = pad_sequences(word_ids, 0) 67 | char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0, 68 | nlevels=2) 69 | else: 70 | word_ids, sequence_lengths = pad_sequences(words, 0) 71 | 72 | # build feed dictionary 73 | feed = { 74 | self.word_ids: word_ids, 75 | self.sequence_lengths: sequence_lengths 76 | } 77 | 78 | if self.config.use_chars: 79 | feed[self.char_ids] = char_ids 80 | feed[self.word_lengths] = word_lengths 81 | 82 | if labels is not None: 83 | labels, _ = pad_sequences(labels, 0) 84 | feed[self.labels] = labels 85 | 86 | if lr is not None: 87 | feed[self.lr] = lr 88 | 89 | if dropout is not None: 90 | feed[self.dropout] = dropout 91 | 92 | return feed, sequence_lengths 93 | 94 | 95 | def add_word_embeddings_op(self): 96 | """Defines self.word_embeddings 97 | 98 | If self.config.embeddings is not None and is a np array initialized 99 | with pre-trained word vectors, the word embeddings is just a look-up 100 | and we don't train the vectors. Otherwise, a random matrix with 101 | the correct shape is initialized. 
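If self.config.use_chars is True, a character-level bi-LSTM is also run
        over each word and its final states are concatenated to the word
        embeddings; dropout is applied to the result.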
102 | """ 103 | with tf.variable_scope("words"): 104 | if self.config.embeddings is None: 105 | self.logger.info("WARNING: randomly initializing word vectors") 106 | _word_embeddings = tf.get_variable( 107 | name="_word_embeddings", 108 | dtype=tf.float32, 109 | shape=[self.config.nwords, self.config.dim_word]) 110 | else: 111 | _word_embeddings = tf.Variable( 112 | self.config.embeddings, 113 | name="_word_embeddings", 114 | dtype=tf.float32, 115 | trainable=self.config.train_embeddings) 116 | 117 | word_embeddings = tf.nn.embedding_lookup(_word_embeddings, 118 | self.word_ids, name="word_embeddings") 119 | 120 | with tf.variable_scope("chars"): 121 | if self.config.use_chars: 122 | # get char embeddings matrix 123 | _char_embeddings = tf.get_variable( 124 | name="_char_embeddings", 125 | dtype=tf.float32, 126 | shape=[self.config.nchars, self.config.dim_char]) 127 | char_embeddings = tf.nn.embedding_lookup(_char_embeddings, 128 | self.char_ids, name="char_embeddings") 129 | 130 | # put the time dimension on axis=1 131 | s = tf.shape(char_embeddings) 132 | char_embeddings = tf.reshape(char_embeddings, 133 | shape=[s[0]*s[1], s[-2], self.config.dim_char]) 134 | word_lengths = tf.reshape(self.word_lengths, shape=[s[0]*s[1]]) 135 | 136 | # bi lstm on chars 137 | cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char, 138 | state_is_tuple=True) 139 | cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char, 140 | state_is_tuple=True) 141 | _output = tf.nn.bidirectional_dynamic_rnn( 142 | cell_fw, cell_bw, char_embeddings, 143 | sequence_length=word_lengths, dtype=tf.float32) 144 | 145 | # read and concat output 146 | _, ((_, output_fw), (_, output_bw)) = _output 147 | output = tf.concat([output_fw, output_bw], axis=-1) 148 | 149 | # shape = (batch size, max sentence length, char hidden size) 150 | output = tf.reshape(output, 151 | shape=[s[0], s[1], 2*self.config.hidden_size_char]) 152 | word_embeddings = tf.concat([word_embeddings, output], axis=-1) 153 | 154 | self.word_embeddings = tf.nn.dropout(word_embeddings, self.dropout) 155 | 156 | 157 | def add_logits_op(self): 158 | """Defines self.logits 159 | 160 | For each word in each sentence of the batch, it corresponds to a vector 161 | of scores, of dimension equal to the number of tags. 162 | """ 163 | with tf.variable_scope("bi-lstm"): 164 | cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm) 165 | cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm) 166 | (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn( 167 | cell_fw, cell_bw, self.word_embeddings, 168 | sequence_length=self.sequence_lengths, dtype=tf.float32) 169 | output = tf.concat([output_fw, output_bw], axis=-1) 170 | output = tf.nn.dropout(output, self.dropout) 171 | 172 | with tf.variable_scope("proj"): 173 | W = tf.get_variable("W", dtype=tf.float32, 174 | shape=[2*self.config.hidden_size_lstm, self.config.ntags]) 175 | 176 | b = tf.get_variable("b", shape=[self.config.ntags], 177 | dtype=tf.float32, initializer=tf.zeros_initializer()) 178 | 179 | nsteps = tf.shape(output)[1] 180 | output = tf.reshape(output, [-1, 2*self.config.hidden_size_lstm]) 181 | pred = tf.matmul(output, W) + b 182 | self.logits = tf.reshape(pred, [-1, nsteps, self.config.ntags]) 183 | 184 | 185 | def add_pred_op(self): 186 | """Defines self.labels_pred 187 | 188 | This op is defined only in the case where we don't use a CRF since in 189 | that case we can make the prediction "in the graph" (thanks to tf 190 | functions in other words). 
With the CRF, as the inference is coded 191 | in Python and not in pure TensorFlow, we have to make the prediction 192 | outside the graph. 193 | """ 194 | if not self.config.use_crf: 195 | self.labels_pred = tf.cast(tf.argmax(self.logits, axis=-1), 196 | tf.int32) 197 | 198 | 199 | def add_loss_op(self): 200 | """Defines the loss""" 201 | if self.config.use_crf: 202 | log_likelihood, trans_params = tf.contrib.crf.crf_log_likelihood( 203 | self.logits, self.labels, self.sequence_lengths) 204 | self.trans_params = trans_params # need to evaluate it for decoding 205 | self.loss = tf.reduce_mean(-log_likelihood) 206 | else: 207 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits( 208 | logits=self.logits, labels=self.labels) 209 | mask = tf.sequence_mask(self.sequence_lengths) 210 | losses = tf.boolean_mask(losses, mask) 211 | self.loss = tf.reduce_mean(losses) 212 | 213 | # for tensorboard 214 | tf.summary.scalar("loss", self.loss) 215 | 216 | 217 | def build(self): 218 | # NER specific functions 219 | self.add_placeholders() 220 | self.add_word_embeddings_op() 221 | self.add_logits_op() 222 | self.add_pred_op() 223 | self.add_loss_op() 224 | 225 | # Generic functions that add training op and initialize session 226 | self.add_train_op(self.config.lr_method, self.lr, self.loss, 227 | self.config.clip) 228 | self.initialize_session() # now self.sess is defined and vars are init 229 | 230 | 231 | def predict_batch(self, words): 232 | """ 233 | Args: 234 | words: list of sentences 235 | 236 | Returns: 237 | labels_pred: list of labels for each sentence 238 | sequence_length 239 | 240 | """ 241 | fd, sequence_lengths = self.get_feed_dict(words, dropout=1.0) 242 | 243 | if self.config.use_crf: 244 | # get tag scores and transition params of CRF 245 | viterbi_sequences = [] 246 | logits, trans_params = self.sess.run( 247 | [self.logits, self.trans_params], feed_dict=fd) 248 | 249 | # iterate over the sentences because no batching in viterbi_decode 250 | for logit, sequence_length in zip(logits, sequence_lengths): 251 | logit = logit[:sequence_length] # keep only the valid steps 252 | viterbi_seq, viterbi_score = tf.contrib.crf.viterbi_decode( 253 | logit, trans_params) 254 | viterbi_sequences += [viterbi_seq] 255 | 256 | return viterbi_sequences, sequence_lengths 257 | 258 | else: 259 | labels_pred = self.sess.run(self.labels_pred, feed_dict=fd) 260 | 261 | return labels_pred, sequence_lengths 262 | 263 | 264 | def run_epoch(self, train, dev, epoch): 265 | """Performs one complete pass over the train set and evaluate on dev 266 | 267 | Args: 268 | train: dataset that yields tuple of sentences, tags 269 | dev: dataset 270 | epoch: (int) index of the current epoch 271 | 272 | Returns: 273 | f1: (python float), score to select model on, higher is better 274 | 275 | """ 276 | # progbar stuff for logging 277 | batch_size = self.config.batch_size 278 | nbatches = (len(train) + batch_size - 1) // batch_size 279 | prog = Progbar(target=nbatches) 280 | 281 | # iterate over dataset 282 | for i, (words, labels) in enumerate(minibatches(train, batch_size)): 283 | fd, _ = self.get_feed_dict(words, labels, self.config.lr, 284 | self.config.dropout) 285 | 286 | _, train_loss, summary = self.sess.run( 287 | [self.train_op, self.loss, self.merged], feed_dict=fd) 288 | 289 | prog.update(i + 1, [("train loss", train_loss)]) 290 | 291 | # tensorboard 292 | if i % 10 == 0: 293 | self.file_writer.add_summary(summary, epoch*nbatches + i) 294 | 295 | metrics = self.run_evaluate(dev) 296 | msg = " - ".join(["{} 
{:04.2f}".format(k, v) 297 | for k, v in metrics.items()]) 298 | self.logger.info(msg) 299 | 300 | return metrics["f1"] 301 | 302 | 303 | def run_evaluate(self, test): 304 | """Evaluates performance on test set 305 | 306 | Args: 307 | test: dataset that yields tuple of (sentences, tags) 308 | 309 | Returns: 310 | metrics: (dict) metrics["acc"] = 98.4, ... 311 | 312 | """ 313 | accs = [] 314 | correct_preds, total_correct, total_preds = 0., 0., 0. 315 | for words, labels in minibatches(test, self.config.batch_size): 316 | labels_pred, sequence_lengths = self.predict_batch(words) 317 | 318 | for lab, lab_pred, length in zip(labels, labels_pred, 319 | sequence_lengths): 320 | lab = lab[:length] 321 | lab_pred = lab_pred[:length] 322 | accs += [a==b for (a, b) in zip(lab, lab_pred)] 323 | 324 | lab_chunks = set(get_chunks(lab, self.config.vocab_tags)) 325 | lab_pred_chunks = set(get_chunks(lab_pred, 326 | self.config.vocab_tags)) 327 | 328 | correct_preds += len(lab_chunks & lab_pred_chunks) 329 | total_preds += len(lab_pred_chunks) 330 | total_correct += len(lab_chunks) 331 | 332 | p = correct_preds / total_preds if correct_preds > 0 else 0 333 | r = correct_preds / total_correct if correct_preds > 0 else 0 334 | f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0 335 | acc = np.mean(accs) 336 | 337 | return {"acc": 100*acc, "f1": 100*f1} 338 | 339 | 340 | def predict(self, words_raw): 341 | """Returns list of tags 342 | 343 | Args: 344 | words_raw: list of words (string), just one sentence (no batch) 345 | 346 | Returns: 347 | preds: list of tags (string), one for each word in the sentence 348 | 349 | """ 350 | words = [self.config.processing_word(w) for w in words_raw] 351 | if type(words[0]) == tuple: 352 | words = zip(*words) 353 | pred_ids, _ = self.predict_batch([words]) 354 | preds = [self.idx_to_tag[idx] for idx in list(pred_ids[0])] 355 | 356 | return preds 357 | -------------------------------------------------------------------------------- /4.NER(LSTM+CRF)/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow>=1.0 2 | numpy 3 | logging 4 | -------------------------------------------------------------------------------- /4.NER(LSTM+CRF)/train.py: -------------------------------------------------------------------------------- 1 | from model.data_utils import CoNLLDataset 2 | from model.ner_model import NERModel 3 | from model.config import Config 4 | 5 | 6 | def main(): 7 | # create instance of config 8 | config = Config() 9 | 10 | # build model 11 | model = NERModel(config) 12 | model.build() 13 | # model.restore_session("results/crf/model.weights/") # optional, restore weights 14 | # model.reinitialize_weights("proj") 15 | 16 | # create datasets 17 | dev = CoNLLDataset(config.filename_dev, config.processing_word, 18 | config.processing_tag, config.max_iter) 19 | train = CoNLLDataset(config.filename_train, config.processing_word, 20 | config.processing_tag, config.max_iter) 21 | 22 | # train model 23 | model.train(train, dev) 24 | 25 | if __name__ == "__main__": 26 | main() 27 | -------------------------------------------------------------------------------- /5.transformer/README.md: -------------------------------------------------------------------------------- 1 | ### 任务五:建立一个transformer 2 | 3 | 这里主要分析如何建立一个transformer,为了避免代码过于复杂,这里先不用transformer解决一个实际问题,而且考虑到transformer是一个十分重要的模型,也有一定难度,仅仅从概念公式上理解是不足够的,所以这里也详细给出了transformer的各个构成部分的代码供学习。 4 | 5 | 1. 知识点: 6 | 7 | 1. [self attention]() 8 | 2. 
[transformer]() 9 | 10 | ### 代码说明 11 | 12 | ​ 简单介绍一下代码结构,也可以把以下的介绍顺序作为分析代码的顺序。首先是encoderdecoder,主要建立编码解码模型,基于transformer,然后逐步实现编码器和解码器的各个构成部分,首先是注意力机制attention,基于注意力机制进一步构成multiHeadAttention,multiHeadAttention的输出会输入到positionwiseFeedForward中进行处理,而这两个结构就构成了解码器或者编码器中的一个子层,为了让模型能够识别出序列的位置,我们还需要positionEncoding,最后,transformer会把结果进行embedding,并交由generator进行处理。 13 | 14 | ​ 说实话,我觉得transformer的模型概念理解起来不难,但是代码的实现对我来说还是有点复杂,可能上述的叙述有点偏差,欢迎指出,我也在进一步研究如何结合transformer进行实际的应用。 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /5.transformer/__pycache__/attention.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JesseYule/NLPBeginner/175762fcbf40f84c6900dcf7453f865dffe0faf6/5.transformer/__pycache__/attention.cpython-37.pyc -------------------------------------------------------------------------------- /5.transformer/__pycache__/embedding.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JesseYule/NLPBeginner/175762fcbf40f84c6900dcf7453f865dffe0faf6/5.transformer/__pycache__/embedding.cpython-37.pyc -------------------------------------------------------------------------------- /5.transformer/__pycache__/encoderdecoder.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JesseYule/NLPBeginner/175762fcbf40f84c6900dcf7453f865dffe0faf6/5.transformer/__pycache__/encoderdecoder.cpython-37.pyc -------------------------------------------------------------------------------- /5.transformer/__pycache__/generator.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JesseYule/NLPBeginner/175762fcbf40f84c6900dcf7453f865dffe0faf6/5.transformer/__pycache__/generator.cpython-37.pyc -------------------------------------------------------------------------------- /5.transformer/__pycache__/multiHeadAttention.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JesseYule/NLPBeginner/175762fcbf40f84c6900dcf7453f865dffe0faf6/5.transformer/__pycache__/multiHeadAttention.cpython-37.pyc -------------------------------------------------------------------------------- /5.transformer/__pycache__/positionalEncoding.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JesseYule/NLPBeginner/175762fcbf40f84c6900dcf7453f865dffe0faf6/5.transformer/__pycache__/positionalEncoding.cpython-37.pyc -------------------------------------------------------------------------------- /5.transformer/__pycache__/positionwiseFeedForward.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JesseYule/NLPBeginner/175762fcbf40f84c6900dcf7453f865dffe0faf6/5.transformer/__pycache__/positionwiseFeedForward.cpython-37.pyc -------------------------------------------------------------------------------- /5.transformer/__pycache__/transformerModel.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JesseYule/NLPBeginner/175762fcbf40f84c6900dcf7453f865dffe0faf6/5.transformer/__pycache__/transformerModel.cpython-37.pyc 
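The following is a minimal, self-contained sketch (not part of the repository) of how the pieces under 5.transformer fit together end to end; the vocabulary sizes, batch shapes and token ids below are made-up placeholders used purely as a smoke test.

```python
import torch
from transformerModel import make_model
from encoderdecoder import subsequent_mask

# build a small model (N=2 layers instead of 6) just to check the wiring
model = make_model(src_vocab=1000, tgt_vocab=1000, N=2)
model.eval()

src = torch.randint(1, 1000, (1, 10))      # (batch, src_len) token ids -- dummy data
tgt = torch.randint(1, 1000, (1, 9))       # (batch, tgt_len) token ids -- dummy data
src_mask = torch.ones(1, 1, 10)            # no padding in this toy batch
tgt_mask = subsequent_mask(tgt.size(1))    # hide future positions from the decoder

out = model(src, tgt, src_mask, tgt_mask)  # decoder states, shape (1, 9, 512)
log_probs = model.generator(out)           # per-position log-probabilities, shape (1, 9, 1000)
print(log_probs.shape)
```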
-------------------------------------------------------------------------------- /5.transformer/attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import math 4 | 5 | 6 | def attention(query, key, value, mask=None, dropout=0.0): 7 | "Compute 'Scaled Dot Product Attention'" 8 | d_k = query.size(-1) 9 | 10 | # Scaled Dot-Product 11 | scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k) 12 | 13 | if mask is not None: 14 | scores = scores.masked_fill(mask == 0, -1e9) 15 | 16 | # 归一化处理 17 | p_attn = F.softmax(scores, dim=-1) 18 | 19 | # (Dropout described below) 20 | p_attn = F.dropout(p_attn, p=dropout) 21 | 22 | return torch.matmul(p_attn, value), p_attn 23 | -------------------------------------------------------------------------------- /5.transformer/embedding.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import math 3 | 4 | 5 | class Embeddings(nn.Module): 6 | def __init__(self, d_model, vocab): 7 | super(Embeddings, self).__init__() 8 | self.lut = nn.Embedding(vocab, d_model) 9 | self.d_model = d_model 10 | 11 | def forward(self, x): 12 | return self.lut(x) * math.sqrt(self.d_model) 13 | -------------------------------------------------------------------------------- /5.transformer/encoderdecoder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import copy 5 | 6 | 7 | # encoderdecoder模型的抽象结构(框架),基于这个结构构建transformer 8 | class EncoderDecoder(nn.Module): 9 | """ 10 | A standard Encoder-Decoder architecture. Base model for this and many 11 | other models. 12 | """ 13 | 14 | def __init__(self, encoder, decoder, src_embed, tgt_embed, generator): 15 | super(EncoderDecoder, self).__init__() 16 | self.encoder = encoder 17 | self.decoder = decoder 18 | self.src_embed = src_embed 19 | self.tgt_embed = tgt_embed 20 | self.generator = generator 21 | 22 | def forward(self, src, tgt, src_mask, tgt_mask): 23 | "Take in and process masked src and target sequences." 24 | # 这里就展示了先编码再解码的过程 25 | memory = self.encoder(self.src_embed(src), src_mask) 26 | output = self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask) 27 | return output 28 | 29 | 30 | # 很多模块都是不断堆叠的,所以构建这个方法复制模块 31 | def clones(module, N): 32 | "Produce N identical layers." 33 | return nn.ModuleList([copy.deepcopy(module) for _ in range(N)]) 34 | 35 | 36 | # 一个encoder有多层,每个子层都会经过归一化处理(norm) 37 | class Encoder(nn.Module): 38 | "Core encoder is a stack of N layers" 39 | 40 | def __init__(self, layer, N): 41 | super(Encoder, self).__init__() 42 | self.layers = clones(layer, N) 43 | self.norm = LayerNorm(layer.size) 44 | 45 | def forward(self, x, mask): 46 | "Pass the input (and mask) through each layer in turn." 47 | for layer in self.layers: 48 | x = layer(x, mask) 49 | return self.norm(x) 50 | 51 | 52 | # 对子层的归一化处理 53 | class LayerNorm(nn.Module): 54 | "Construct a layernorm module (See citation for details)." 
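    # Normalizes each position over the feature dimension:
    #   y = a_2 * (x - mean) / (std + eps) + b_2
    # where a_2 / b_2 are learnable per-feature gain and bias (see forward below).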
55 | def __init__(self, features, eps=1e-6): 56 | super(LayerNorm, self).__init__() 57 | self.a_2 = nn.Parameter(torch.ones(features)) 58 | self.b_2 = nn.Parameter(torch.zeros(features)) 59 | self.eps = eps 60 | 61 | def forward(self, x): 62 | mean = x.mean(-1, keepdim=True) 63 | std = x.std(-1, keepdim=True) 64 | return self.a_2 * (x - mean) / (std + self.eps) + self.b_2 # 加一个eps防止分母为0 65 | 66 | 67 | # 残差连接,防止梯度消失 68 | class SublayerConnection(nn.Module): 69 | """ 70 | A residual connection followed by a layer norm. 71 | Note for code simplicity we apply the norm first as opposed to last. 72 | """ 73 | def __init__(self, size, dropout): 74 | super(SublayerConnection, self).__init__() 75 | self.norm = LayerNorm(size) 76 | self.dropout = nn.Dropout(dropout) 77 | 78 | def forward(self, x, sublayer): 79 | "Apply residual connection to any sublayer function that maintains the same size." 80 | return x + self.dropout(sublayer(self.norm(x))) 81 | 82 | 83 | # 这里定义transformer的encoder层,定义了两个子层,multi-head self attention和feed forward 84 | class EncoderLayer(nn.Module): 85 | "Encoder is made up of two sublayers, self-attn and feed forward (defined below)" 86 | def __init__(self, size, self_attn, feed_forward, dropout): 87 | super(EncoderLayer, self).__init__() 88 | self.self_attn = self_attn 89 | self.feed_forward = feed_forward 90 | self.sublayer = clones(SublayerConnection(size, dropout), 2) 91 | self.size = size 92 | 93 | def forward(self, x, mask): 94 | "Follow Figure 1 (left) for connections." 95 | x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask)) 96 | return self.sublayer[1](x, self.feed_forward) 97 | 98 | 99 | # 定义decoder,每一个layer都是一个解码器模块,接受的输入包含上一个时刻的输出和编码层的输出:x、memory 100 | class Decoder(nn.Module): 101 | "Generic N layer decoder with masking." 102 | 103 | def __init__(self, layer, N): 104 | super(Decoder, self).__init__() 105 | self.layers = clones(layer, N) 106 | self.norm = LayerNorm(layer.size) 107 | 108 | def forward(self, x, memory, src_mask, tgt_mask): 109 | for layer in self.layers: 110 | x = layer(x, memory, src_mask, tgt_mask) 111 | return self.norm(x) 112 | 113 | 114 | # 解码层,这里定义了三个子层,第一个子层self attention处理x,第二个子层处理x和编码层的输出memory 115 | # 第三个子层对第二个子层的输出做feed forward 116 | class DecoderLayer(nn.Module): 117 | "Decoder is made up of three sublayers, self-attn, src-attn, and feed forward (defined below)" 118 | 119 | def __init__(self, size, self_attn, src_attn, feed_forward, dropout): 120 | super(DecoderLayer, self).__init__() 121 | self.size = size 122 | self.self_attn = self_attn 123 | self.src_attn = src_attn 124 | self.feed_forward = feed_forward 125 | self.sublayer = clones(SublayerConnection(size, dropout), 3) 126 | 127 | def forward(self, x, memory, src_mask, tgt_mask): 128 | "Follow Figure 1 (right) for connections." 129 | m = memory 130 | x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask)) 131 | x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask)) # 第一层处理过的x输入到这个第二层,并继续输出到第三层 132 | return self.sublayer[2](x, self.feed_forward) 133 | 134 | 135 | # 该方法主要用于防止解码器读取了后面位置(未来时刻)的信息 136 | def subsequent_mask(size): 137 | "Mask out subsequent positions." 
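    # For example, subsequent_mask(3) allows each position to attend only to
    # itself and earlier positions (1 = visible, 0 = masked):
    #   [[1, 0, 0],
    #    [1, 1, 0],
    #    [1, 1, 1]]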
138 | attn_shape = (1, size, size) 139 | subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8') 140 | return torch.from_numpy(subsequent_mask) == 0 141 | -------------------------------------------------------------------------------- /5.transformer/generator.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | 5 | class Generator(nn.Module): 6 | "Standard generation step. (Not described in the paper.)" 7 | def __init__(self, d_model, vocab): 8 | super(Generator, self).__init__() 9 | self.proj = nn.Linear(d_model, vocab) 10 | 11 | def forward(self, x): 12 | return F.log_softmax(self.proj(x), dim=-1) 13 | -------------------------------------------------------------------------------- /5.transformer/multiHeadAttention.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import copy 3 | from attention import attention 4 | 5 | 6 | def clones(module, N): 7 | "Produce N identical layers." 8 | return nn.ModuleList([copy.deepcopy(module) for _ in range(N)]) 9 | 10 | 11 | # multihead对同一个输入进行多次注意力计算,并将结果拼接在一起 12 | class MultiHeadedAttention(nn.Module): 13 | def __init__(self, h, d_model, dropout=0.1): 14 | "Take in model size and number of heads." 15 | super(MultiHeadedAttention, self).__init__() 16 | assert d_model % h == 0 17 | # We assume d_v always equals d_k 18 | self.d_k = d_model // h 19 | self.h = h 20 | self.p = dropout 21 | self.linears = clones(nn.Linear(d_model, d_model), 4) 22 | self.attn = None 23 | 24 | def forward(self, query, key, value, mask=None): 25 | "Implements Figure 2" 26 | 27 | if mask is not None: 28 | # Same mask applied to all h heads. 29 | mask = mask.unsqueeze(1) 30 | 31 | nbatches = query.size(0) 32 | 33 | # 对Q、K、V进行多个不同的线性变换 34 | query, key, value = [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2) 35 | for l, x in zip(self.linears, (query, key, value))] 36 | 37 | # 进行注意力计算 38 | x, self.attn = attention(query, key, value, mask=mask, dropout=self.p) 39 | 40 | # 拼接结果 41 | x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k) 42 | return self.linears[-1](x) 43 | -------------------------------------------------------------------------------- /5.transformer/positionalEncoding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | from torch.autograd import Variable 5 | 6 | 7 | # 主要让模型能够利用序列的位置信息 8 | # 通过词的位置构建一个和词向量同样维度的向量,再和词向量相加 9 | class PositionalEncoding(nn.Module): 10 | "Implement the PE function." 11 | 12 | def __init__(self, d_model, dropout, max_len=5000): 13 | super(PositionalEncoding, self).__init__() 14 | self.dropout = nn.Dropout(p=dropout) 15 | 16 | # Compute the positional encodings once in log space. 
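        # Sinusoidal encoding from "Attention Is All You Need":
        #   PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
        #   PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))
        # div_term below equals 1 / 10000^(2i / d_model), computed via exp/log for numerical stability.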
17 | pe = torch.zeros(max_len, d_model) 18 | 19 | # 标记词向量的位置 20 | position = torch.arange(0., max_len).unsqueeze(1) 21 | 22 | div_term = torch.exp(torch.arange(0., d_model, 2) * 23 | -(math.log(10000.0) / d_model)) 24 | 25 | pe[:, 0::2] = torch.sin(position * div_term) 26 | pe[:, 1::2] = torch.cos(position * div_term) 27 | 28 | pe = pe.unsqueeze(0) 29 | self.register_buffer('pe', pe) 30 | 31 | def forward(self, x): 32 | x = x + Variable(self.pe[:, :x.size(1)], requires_grad=False) 33 | return self.dropout(x) 34 | -------------------------------------------------------------------------------- /5.transformer/positionwiseFeedForward.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | 5 | # 对不同位置进行同样的线性变换,但参数不共享 6 | class PositionwiseFeedForward(nn.Module): 7 | "Implements FFN equation." 8 | def __init__(self, d_model, d_ff, dropout=0.1): 9 | super(PositionwiseFeedForward, self).__init__() 10 | # Torch linears have a `b` by default. 11 | self.w_1 = nn.Linear(d_model, d_ff) 12 | self.w_2 = nn.Linear(d_ff, d_model) 13 | self.dropout = nn.Dropout(dropout) 14 | 15 | def forward(self, x): 16 | return self.w_2(self.dropout(F.relu(self.w_1(x)))) 17 | -------------------------------------------------------------------------------- /5.transformer/transformerModel.py: -------------------------------------------------------------------------------- 1 | from encoderdecoder import * 2 | from embedding import Embeddings 3 | from generator import Generator 4 | from multiHeadAttention import MultiHeadedAttention 5 | from positionalEncoding import PositionalEncoding 6 | from positionwiseFeedForward import PositionwiseFeedForward 7 | import torch.nn as nn 8 | import copy 9 | 10 | 11 | # transformer的完整结构 12 | def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1): 13 | "Construct a model object based on hyperparameters." 14 | c = copy.deepcopy 15 | attn = MultiHeadedAttention(h, d_model, dropout) 16 | ff = PositionwiseFeedForward(d_model, d_ff, dropout) 17 | position = PositionalEncoding(d_model, dropout) 18 | model = EncoderDecoder( 19 | Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N), 20 | Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N), 21 | nn.Sequential(Embeddings(d_model, src_vocab), c(position)), 22 | nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)), 23 | Generator(d_model, tgt_vocab)) 24 | 25 | # This was important from their code. Initialize parameters with Glorot or fan_avg. 26 | for p in model.parameters(): 27 | if p.dim() > 1: 28 | nn.init.xavier_uniform(p) 29 | return model 30 | -------------------------------------------------------------------------------- /IMG_0611.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JesseYule/NLPBeginner/175762fcbf40f84c6900dcf7453f865dffe0faf6/IMG_0611.PNG -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NLPBeginner 2 | 本项目主要针对有机器学习基础、想入门自然语言处理的朋友,主要基于进行拓展补充。 3 | 4 | 主要涵盖了自然语言处理中比较基础的文本分类、文本匹配、序列标注等问题以及相关基础模型,可按照顺序进行学习。 5 | 6 | 直接运行代码可能会因为数据存储文件路径不同而报错,要根据你的实际情况修改路径,部分数据太大这里没有提供源文件。 7 | 8 | 在学习本项目之前,需掌握: 9 | 10 | 1. python基础:包括numpy、pandas、matplotlib等package,pytorch和tensorflow这两个深度学习框架。 11 | 2. 机器学习和深度模型:最起码要深入了解线性回归、logistic回归、CNN、RNN、LSTM 12 | 13 | ### 学习路线(相关知识点) 14 | 15 | 1. 
基于logistic回归的文本分类: 16 | 17 | * [词袋模型](https://jesseyule.github.io/naturallanguage/bow/content.html) 18 | * [logistic回归](https://jesseyule.github.io/machinelearning/logisticRegression/content.html) 19 | 2. 基于深度学习的文本分类: 20 | 21 | * [卷积神经网络]() 22 | * [循环神经网络]() 23 | * [word2vec]() 24 | * [GloVe]() 25 | 3. 基于注意力机制的文本匹配 26 | * [LSTM]() 27 | * [seq2seq]() 28 | * [注意力机制]() 29 | * [ESIM]() 30 | 4. 基于LSTM+CRF的命名实体识别 31 | - [概率图模型]() 32 | - [隐马尔可夫模型]() 33 | - [Viterbi算法]() 34 | - [条件随机场]() 35 | 5. 构建一个Transformer 36 | * [self attention]() 37 | * [transformer]() 38 | 39 | 40 | 41 | 最后放一张自然语言领域目前的学习路线图,当然其实每年都有很多新技术新模型出现,知识永远都是学不完的,祝愿各位、也希望自己能保持热爱,在这个领域能做出一点贡献。 42 | 43 | ![IMG_0611](./IMG_0611.PNG) --------------------------------------------------------------------------------