├── .gitignore ├── README.md ├── project1-ML-Sentence Classification ├── data_preprocess.py ├── feature_extraction.py ├── main.py └── softmax_regerssion.py ├── project2-DL-Sentence Classification ├── Convolutional Neural Networks for Sentence Classification.pdf ├── dataloader_byhand.py ├── dataloader_bytorchtext.py ├── main.py └── models.py ├── project3-Named Entity Recognition ├── dataloader.py ├── main.py ├── model.pkl ├── models.py └── torchcrf │ └── __init__.py ├── project4-Machine Translation ├── dataloader.py ├── main.py └── models.py └── project5-Text Generation ├── dataloader.py ├── main.py ├── model.pkl ├── model_debug.pkl └── models.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # nlp-beginner-projects 2 | NLP常见任务实现(pytorch版) 3 | 4 | - [x] 项目一:基于softmax regression的文本多分类
5 | Blog post: [Text multi-class classification with softmax regression](https://blog.csdn.net/philpanic9/article/details/106606415) 6 | - [x] Project 2: Text multi-class classification with RNNs and CNNs<br>
7 | Blog post: [Text multi-class classification with RNNs and CNNs](https://blog.csdn.net/philpanic9/article/details/106728786)<br>
8 | - [x] Project 3: Named entity recognition with BiLSTM-CRF<br>
9 | Blog post: [BiLSTM-CRF-based entity recognition](https://blog.csdn.net/philpanic9/article/details/106742297)<br>
10 | > Feed in a sentence to be tagged and get the predicted tag sequence (a minimal inference sketch follows the project list below)<br>
11 | My name is Phil , I am from European Union . -->
12 | ['O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'O'] 13 | 14 | - [x] Project 4: Machine translation<br>
15 | Blog post: [Seq2Seq (with attention) machine translation](https://blog.csdn.net/philpanic9/article/details/106806350)<br>
16 | > **Input a human-readable date, output the machine-readable date**<br>
17 | monday may 7 1983 --> 1983-05-07
18 | 19 march 1998 --> 1998-03-19
19 | 18 jul 2008 --> 2008-07-18
20 | 9/10/70 --> 1970-09-10
21 | thursday january 1 1981 --> 1981-01-01
22 | thursday january 26 2015 --> 2015-01-26
23 | saturday april 18 1990 --> 1990-04-18
24 | sunday may 12 1988 --> 1988-05-12
25 | - [x] Project 5: Text generation<br>
26 | Blog post: https://blog.csdn.net/philpanic9/article/details/106878540<br>
27 | > **Input "我好可爱" to generate an acrostic poem**<br>
28 | 我病恨无我,。
29 | 好一解颜色。
30 | 可怜王经行自远,一解颜色。
31 | 爱绿溪阴。
32 | > **Input "花开有情" to generate an acrostic poem**<br>
33 | 花边行县柳,河桥晚泊船。
34 | 开远树,山鸟助酣歌。
35 | 有情何处,箫管凤初来。
36 | 情何处所,风吹青珊瑚,可怜王孙立
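
Taking project 3 as an example, the sketch below shows roughly how a trained checkpoint can be reloaded for tagging. It assumes the `model.pkl` file saved by `project3-Named Entity Recognition/main.py`, the vocabularies rebuilt by `dataloader.dataset2dataloader`, and torchtext's default `<pad>` token; it mirrors the test code in that script rather than adding anything new.

```python
# Minimal sketch: load the saved BiLSTM-CRF checkpoint and tag one sentence.
# Assumes model.pkl was produced by project3's main.py and that the pad
# token is torchtext's default "<pad>".
import torch
from dataloader import dataset2dataloader

# rebuild the vocabularies that were used during training
_, _, sent_vocab, tag_vocab = dataset2dataloader(batch_size=128)
model = torch.load("model.pkl")
model.eval()

sent = "My name is Phil , I am from European Union ."
ids = [sent_vocab.stoi[word] for word in sent.split(" ")]
input_tensor = torch.tensor([ids])
mask = input_tensor != sent_vocab.stoi["<pad>"]
with torch.no_grad():
    pred = model.predict(input_tensor, mask)  # Viterbi-decoded tag ids
print(sent, "-->", [tag_vocab.itos[tag_id] for tag_id in pred[0]])
```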
37 | 38 | 39 | 相关链接: 40 | 1. [项目1数据集](https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews) 41 | 2. [fdu nlp-beginner](https://github.com/FudanNLP/nlp-beginner) 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /project1-ML-Sentence Classification/data_preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Created on 2020/4/15 21:01 5 | @author: phil 6 | """ 7 | 8 | import pandas as pd 9 | 10 | 11 | def read_data(train_file="../dataset/kaggle-movie-review/train.tsv"): 12 | train_df = pd.read_csv(train_file, sep='\t') 13 | # test_df = pd.read_csv(test_file, sep="\t") 14 | return train_df["Phrase"].values, train_df["Sentiment"].values 15 | 16 | 17 | if __name__ == "__main__": 18 | X_data, y_data = read_data() 19 | print("train size", len(X_data)) 20 | -------------------------------------------------------------------------------- /project1-ML-Sentence Classification/feature_extraction.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Created on 2020/4/15 21:16 5 | @author: phil 6 | """ 7 | 8 | import numpy as np 9 | 10 | 11 | class BagOfWord: 12 | def __init__(self, do_lower_case=False): 13 | self.vocab = {} 14 | self.do_lower_case = do_lower_case 15 | 16 | def fit(self, sent_list): 17 | # sent_list 类型为 List 18 | for sent in sent_list: 19 | if self.do_lower_case: 20 | sent = sent.lower() 21 | words = sent.strip().split(" ") 22 | for word in words: 23 | if word not in self.vocab: 24 | self.vocab[word] = len(self.vocab) 25 | 26 | def transform(self, sent_list): 27 | vocab_size = len(self.vocab) 28 | bag_of_word_feature = np.zeros((len(sent_list), vocab_size)) 29 | for idx, sent in enumerate(sent_list): 30 | if self.do_lower_case: 31 | sent = sent.lower() 32 | words = sent.strip().split(" ") 33 | for word in words: 34 | bag_of_word_feature[idx][self.vocab[word]] += 1 35 | return bag_of_word_feature 36 | 37 | def fit_transform(self, sent_list): 38 | self.fit(sent_list) 39 | return self.transform(sent_list) 40 | 41 | 42 | class NGram: 43 | def __init__(self, ngram, do_lower_case=False): 44 | self.ngram = ngram 45 | self.feature_map = {} 46 | self.do_lower_case = do_lower_case 47 | 48 | def fit(self, sentList): 49 | for gram in self.ngram: 50 | for sent in sentList: 51 | if self.do_lower_case: 52 | sent = sent.lower() 53 | sent = sent.split(" ") 54 | for i in range(len(sent) - gram + 1): 55 | feature = "_".join(sent[i:i + gram]) 56 | if feature not in self.feature_map: 57 | self.feature_map[feature] = len(self.feature_map) 58 | 59 | def transform(self, sentList): 60 | n = len(sentList) 61 | m = len(self.feature_map) 62 | ngram_feature = np.zeros((n, m)) 63 | for idx, sent in enumerate(sentList): 64 | if self.do_lower_case: 65 | sent = sent.lower() 66 | sent = sent.split(" ") 67 | for gram in self.ngram: 68 | for i in range(len(sent) - gram + 1): 69 | feature = "_".join(sent[i:i + gram]) 70 | if feature in self.feature_map: 71 | ngram_feature[idx][self.feature_map[feature]] = 1 72 | return ngram_feature 73 | 74 | def fit_transform(self, sentList): 75 | self.fit(sentList) 76 | return self.transform(sentList) 77 | 78 | 79 | if __name__ == "__main__": 80 | gram = NGram((1, 2)) 81 | sents = ["I love you", "do you love yourself"] 82 | feature = gram.fit_transform(sents) 83 | print(gram.feature_map) 84 | print(feature) 85 | 
-------------------------------------------------------------------------------- /project1-ML-Sentence Classification/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Created on 2020/4/15 21:24 5 | @author: phil 6 | """ 7 | 8 | import numpy as np 9 | from data_preprocess import read_data 10 | from feature_extraction import BagOfWord, NGram 11 | from softmax_regerssion import SoftmaxRegression 12 | import matplotlib.pyplot as plt 13 | from sklearn.model_selection import train_test_split 14 | 15 | if __name__ == '__main__': 16 | debug = 1 17 | # 读入数据 18 | X_data, y_data = read_data() 19 | 20 | if debug == 1: 21 | # index = np.arange(len(X_data)) 22 | # np.random.shuffle(index) 23 | # X_data = X_data[index[:2000]] 24 | # y_data = y_data[index[:2000]] 25 | X_data = X_data[:1000] 26 | y_data = y_data[:1000] 27 | y = np.array(y_data).reshape(len(y_data), 1) 28 | 29 | # 数据集划分 30 | bag_of_word_model = BagOfWord(do_lower_case=True) 31 | ngram_model = NGram(ngram=(1, 2), do_lower_case=True) 32 | X_Bow = bag_of_word_model.fit_transform(X_data) 33 | X_Gram = ngram_model.fit_transform(X_data) 34 | 35 | print("Bow shape", X_Bow.shape) 36 | print("Gram shape", X_Gram.shape) 37 | 38 | X_train_Bow, X_test_Bow, y_train_Bow, y_test_Bow = train_test_split(X_Bow, y, test_size=0.2, random_state=42, stratify=y) 39 | X_train_Gram, X_test_Gram, y_train_Gram, y_test_Gram = train_test_split(X_Gram, y, test_size=0.2, random_state=42, stratify=y) 40 | 41 | # 训练模型 不同特征的差别 42 | epoch = 100 43 | bow_learning_rate = 1 44 | gram_learning_rate = 1 45 | 46 | # 梯度下降 47 | model1 = SoftmaxRegression() 48 | history = model1.fit(X_train_Bow, y_train_Bow, epoch=epoch, learning_rate=bow_learning_rate, print_loss_steps=epoch//10, update_strategy="stochastic") 49 | plt.plot(np.arange(len(history)), np.array(history)) 50 | plt.show() 51 | print("Bow train {} test {}".format(model1.score(X_train_Bow, y_train_Bow), model1.score(X_test_Bow, y_test_Bow))) 52 | 53 | model2 = SoftmaxRegression() 54 | history = model2.fit(X_train_Gram, y_train_Gram, epoch=epoch, learning_rate=gram_learning_rate, print_loss_steps=epoch//10, update_strategy="stochastic") 55 | plt.plot(np.arange(len(history)), np.array(history)) 56 | plt.show() 57 | print("Gram train {} test {}".format(model2.score(X_train_Gram, y_train_Gram), model2.score(X_test_Gram, y_test_Gram))) 58 | 59 | # 样本数量:20000 60 | # epoch = 100 61 | # bow_learning_rate = 0.001 62 | # gram_learning_rate = 0.5 63 | # Bow train 0.7094375 test 0.4885 64 | # Gram train 0.9786875 test 0.5335 -------------------------------------------------------------------------------- /project1-ML-Sentence Classification/softmax_regerssion.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Created on 2020/4/15 20:58 5 | @author: phil 6 | """ 7 | 8 | import numpy as np 9 | 10 | 11 | def softmax(z): 12 | # 稳定版本的softmax,对z的每一行进行softmax 13 | z -= np.max(z, axis=1, keepdims=True) # 先减去该行的最大值 14 | z = np.exp(z) 15 | z /= np.sum(z, axis=1, keepdims=True) 16 | return z 17 | 18 | 19 | class SoftmaxRegression: 20 | def __init__(self): 21 | self.num_of_class = None # 类别数量 22 | self.n = None # 数据个数 23 | self.m = None # 数据维度 24 | self.weight = None # 模型权重 shape (类别数,数据维度) 25 | self.learning_rate = None 26 | 27 | def fit(self, X, y, learning_rate=0.01, epoch=10, num_of_class=5, print_loss_steps=-1, update_strategy="batch"): 
28 | self.n, self.m = X.shape 29 | self.num_of_class = num_of_class 30 | self.weight = np.random.randn(self.num_of_class, self.m) 31 | self.learning_rate = learning_rate 32 | 33 | # 将y换为独热码矩阵,每一行独热码表示一个label 34 | y_one_hot = np.zeros((self.n, self.num_of_class)) 35 | for i in range(self.n): 36 | y_one_hot[i][y[i]] = 1 37 | 38 | loss_history = [] 39 | 40 | for e in range(epoch): 41 | # X (n, m) 每一行表示一个样本 42 | # weight (C, m) 每一行处理一个类别 43 | loss = 0 44 | if update_strategy == "stochastic": 45 | rand_index = np.arange(len(X)) 46 | np.random.shuffle(rand_index) 47 | for index in list(rand_index): 48 | Xi = X[index].reshape(1, -1) 49 | prob = Xi.dot(self.weight.T) 50 | prob = softmax(prob).flatten() 51 | loss += -np.log(prob[y[index]]) 52 | self.weight += Xi.reshape(1, self.m).T.dot((y_one_hot[index] - prob).reshape(1, self.num_of_class)).T 53 | 54 | if update_strategy == "batch": 55 | prob = X.dot(self.weight.T) # (n, C) 每个样本被预测为各个类别 56 | prob = softmax(prob) 57 | 58 | for i in range(self.n): 59 | loss -= np.log(prob[i][y[i]]) 60 | 61 | # 书中给的损失函数 62 | weight_update = np.zeros_like(self.weight) 63 | for i in range(self.n): 64 | weight_update += X[i].reshape(1, self.m).T.dot((y_one_hot[i] - prob[i]).reshape(1, self.num_of_class)).T 65 | self.weight += weight_update * self.learning_rate / self.n 66 | 67 | loss /= self.n 68 | loss_history.append(loss) 69 | if print_loss_steps != -1 and e % print_loss_steps == 0: 70 | print("epoch {} loss {}".format(e, loss)) 71 | return loss_history 72 | 73 | def predict(self, X): 74 | prob = softmax(X.dot(self.weight.T)) 75 | return prob.argmax(axis=1) 76 | 77 | def score(self, X, y): 78 | pred = self.predict(X) 79 | return np.sum(pred.reshape(y.shape) == y) / y.shape[0] -------------------------------------------------------------------------------- /project2-DL-Sentence Classification/Convolutional Neural Networks for Sentence Classification.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/positivepeng/nlp-beginner-projects/2f066cbdd24121938b90ee1d12fe063ad0a46df5/project2-DL-Sentence Classification/Convolutional Neural Networks for Sentence Classification.pdf -------------------------------------------------------------------------------- /project2-DL-Sentence Classification/dataloader_byhand.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Created on 2020/6/12 15:25 5 | @author: phil 6 | """ 7 | 8 | import pandas as pd 9 | import os 10 | import numpy as np 11 | import torch 12 | from sklearn.model_selection import train_test_split 13 | from torch.nn.utils.rnn import pad_sequence 14 | from torch.utils.data import Dataset, DataLoader 15 | 16 | 17 | def prepare_data(dataset_path, sent_col_name, label_col_name): 18 | """ 读出tsv中的句子和标签 """ 19 | file_path = os.path.join(dataset_path, "train.tsv") 20 | data = pd.read_csv(file_path, sep="\t") 21 | X = data[sent_col_name].values 22 | y = data[label_col_name].values 23 | return X, y 24 | 25 | 26 | class Language: 27 | """ 根据句子列表建立词典并将单词列表转换为数值型表示 """ 28 | def __init__(self): 29 | self.word2id = {} 30 | self.id2word = {} 31 | 32 | def fit(self, sent_list): 33 | vocab = set() 34 | for sent in sent_list: 35 | vocab.update(sent.split(" ")) 36 | word_list = ["", ""] + list(vocab) 37 | self.word2id = {word: i for i, word in enumerate(word_list)} 38 | self.id2word = {i: word for i, word in enumerate(word_list)} 39 | 40 | def transform(self, sent_list, 
reverse=False): 41 | sent_list_id = [] 42 | word_mapper = self.word2id if not reverse else self.id2word 43 | unk = self.word2id[""] if not reverse else None 44 | for sent in sent_list: 45 | sent_id = list(map(lambda x: word_mapper.get(x, unk), sent.split(" ") if not reverse else sent)) 46 | sent_list_id.append(sent_id) 47 | return sent_list_id 48 | 49 | 50 | class ClsDataset(Dataset): 51 | """ 文本分类数据集 """ 52 | def __init__(self, sents, labels): 53 | self.sents = sents 54 | self.labels = labels 55 | 56 | def __getitem__(self, item): 57 | return self.sents[item], self.labels[item] 58 | 59 | def __len__(self): 60 | return len(self.sents) 61 | 62 | 63 | def collate_fn(batch_data): 64 | """ 自定义一个batch里面的数据的组织方式 """ 65 | batch_data.sort(key=lambda data_pair: len(data_pair[0]), reverse=True) 66 | 67 | sents, labels = zip(*batch_data) 68 | sents_len = [len(sent) for sent in sents] 69 | sents = [torch.LongTensor(sent) for sent in sents] 70 | padded_sents = pad_sequence(sents, batch_first=True, padding_value=0) 71 | 72 | return torch.LongTensor(padded_sents), torch.LongTensor(labels), torch.FloatTensor(sents_len) 73 | 74 | 75 | def get_wordvec(word2id, vec_file_path, vec_dim=50): 76 | """ 读出txt文件的预训练词向量 """ 77 | print("开始加载词向量") 78 | word_vectors = torch.nn.init.xavier_uniform_(torch.empty(len(word2id), vec_dim)) 79 | word_vectors[0, :] = 0 # 80 | found = 0 81 | with open(vec_file_path, "r", encoding="utf-8") as f: 82 | lines = f.readlines() 83 | for line in lines: 84 | splited = line.split(" ") 85 | if splited[0] in word2id: 86 | found += 1 87 | word_vectors[word2id[splited[0]]] = torch.tensor(list(map(lambda x: float(x), splited[1:]))) 88 | if found == len(word2id) - 1: # 允许找不到 89 | break 90 | print("总共 %d个词,其中%d个找到了对应的词向量" % (len(word2id), found)) 91 | return word_vectors.float() 92 | 93 | 94 | def make_dataloader(dataset_path="../dataset/kaggle-movie-review", sent_col_name="Phrase", label_col_name="Sentiment", batch_size=32, vec_file_path="./.vector_cache/glove.6B.50d.txt", debug=False): 95 | # X, y = prepare_datapairs(dataset_path="../dataset/imdb", sent_col_name="review", label_col_name="sentiment") 96 | X, y = prepare_data(dataset_path=dataset_path, sent_col_name=sent_col_name, label_col_name=label_col_name) 97 | 98 | if debug: 99 | X, y = X[:100], y[:100] 100 | 101 | X_language = Language() 102 | X_language.fit(X) 103 | X = X_language.transform(X) 104 | 105 | word_vectors = get_wordvec(X_language.word2id, vec_file_path=vec_file_path, vec_dim=50) 106 | # 总共 18229个词,其中12769个找到了对应的词向量 word_vectors = get_wordvec(X_language.word2id, 107 | # vec_file_path=r"F:\NLP-pretrained-model\glove.twitter.27B\glove.twitter.27B.50d.txt", vec_dim=50) 108 | 109 | # 测试 110 | # print(X[:2]) 111 | # X_id = X_language.transform(X[:2]) 112 | # print(X_language.transform(X_id, reverse=True)) 113 | 114 | X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42) 115 | 116 | cls_train_dataset, cls_val_dataset = ClsDataset(X_train, y_train), ClsDataset(X_val, y_val) 117 | cls_train_dataloader = DataLoader(cls_train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn) 118 | cls_val_dataloader = DataLoader(cls_val_dataset, batch_size=batch_size, collate_fn=collate_fn) 119 | 120 | return cls_train_dataloader, cls_val_dataloader, word_vectors, X_language 121 | 122 | 123 | if __name__ == "__main__": 124 | cls_train_dataloader, cls_val_dataloader, word_vectors, X_language = make_dataloader(debug=True, batch_size=10) 125 | for batch in cls_train_dataloader: 126 | X, y, lens = batch 
127 | print(X.shape, y.shape) 128 | break 129 | -------------------------------------------------------------------------------- /project2-DL-Sentence Classification/dataloader_bytorchtext.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Created on 2020/5/4 15:35 5 | @author: phil 6 | """ 7 | import os 8 | 9 | import pandas as pd 10 | import spacy 11 | from sklearn.model_selection import train_test_split 12 | from torch.nn import init 13 | from torchtext import data 14 | 15 | 16 | def prepare_data(dataset_path, sent_col_name, label_col_name, debug=False): 17 | """ 读出tsv中的句子和标签 """ 18 | file_path = os.path.join(dataset_path, "train.tsv") 19 | data = pd.read_csv(file_path, sep="\t") 20 | if debug: 21 | data = data.sample(n=100) 22 | X = data[sent_col_name].values 23 | y = data[label_col_name].values 24 | X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42) 25 | train_df, val_df = pd.DataFrame(), pd.DataFrame() 26 | train_df["sent"], train_df["label"] = X_train, y_train 27 | val_df["sent"], val_df["label"] = X_val, y_val 28 | 29 | train_file_path = os.path.join(dataset_path, "train.csv") 30 | val_file_path = os.path.join(dataset_path, "val.csv") 31 | train_df.to_csv(train_file_path, index=False) 32 | val_df.to_csv(val_file_path, index=False) 33 | 34 | return train_file_path, val_file_path 35 | 36 | 37 | def dataset2dataloader(dataset_path="../dataset/kaggle-movie-review", sent_col_name="Phrase", label_col_name="Sentiment", batch_size=32, vec_file_path="./.vector_cache/glove.6B.50d.txt", debug=False): 38 | train_file_name, val_file_name = prepare_data(dataset_path, sent_col_name, label_col_name, debug=debug) 39 | spacy_en = spacy.load('en_core_web_sm') 40 | 41 | def tokenizer(text): 42 | """ 定义分词操作 """ 43 | return [tok.text for tok in spacy_en.tokenizer(text)] 44 | 45 | # 这里只是定义了数据格式 46 | TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True) 47 | LABEL = data.Field(sequential=False, use_vocab=False) 48 | train, val = data.TabularDataset.splits( 49 | path='', train=train_file_name, validation=val_file_name, format='csv', skip_header=True, 50 | fields=[('sent', TEXT), ('label', LABEL)]) 51 | 52 | TEXT.build_vocab(train, vectors='glove.6B.50d') # , max_size=30000) 53 | # 当 corpus 中有的 token 在 vectors 中不存在时 的初始化方式. 
54 | TEXT.vocab.vectors.unk_init = init.xavier_uniform 55 | 56 | DEVICE = "cpu" 57 | train_iter = data.BucketIterator(train, batch_size=batch_size, sort_key=lambda x: len(x.review), device=DEVICE) 58 | val_iter = data.BucketIterator(val, batch_size=batch_size, sort_key=lambda x: len(x.review), shuffle=True, device=DEVICE) 59 | 60 | # 在 test_iter , sort一定要设置成 False, 要不然会被 torchtext 搞乱样本顺序 61 | # test_iter = data.Iterator(dataset=test, batch_size=128, train=False, sort=False, device=DEVICE) 62 | 63 | return train_iter, val_iter, TEXT.vocab.vectors 64 | 65 | 66 | if __name__ == "__main__": 67 | train_iter, val_iter, vectors = dataset2dataloader(batch_size=32, debug=True) 68 | 69 | batch = next(iter(train_iter)) 70 | print(batch.sent.shape) 71 | print(batch.label.shape) 72 | -------------------------------------------------------------------------------- /project2-DL-Sentence Classification/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Created on 2020/4/30 8:33 5 | @author: phil 6 | """ 7 | from torch import optim 8 | import torch 9 | from models import TextRNN, TextCNN 10 | from dataloader_bytorchtext import dataset2dataloader 11 | from dataloader_byhand import make_dataloader 12 | import numpy as np 13 | 14 | if __name__ == "__main__": 15 | model_names = ["LSTM", "RNN", "CNN"] # 彩蛋:按过拟合难度排序,由难到易 16 | learning_rate = 0.001 17 | epoch_num = 500 18 | num_of_class = 5 19 | load_data_by_torchtext = True 20 | 21 | if load_data_by_torchtext: 22 | train_iter, val_iter, word_vectors = dataset2dataloader(batch_size=100, debug=True) 23 | else: 24 | train_iter, val_iter, word_vectors, X_lang = make_dataloader(batch_size=100, debug=True) 25 | 26 | for model_name in model_names[-1:]: 27 | if model_name == "RNN": 28 | model = TextRNN(vocab_size=len(word_vectors), embedding_dim=50, hidden_size=128, num_of_class=num_of_class, weights=word_vectors) 29 | elif model_name == "CNN": 30 | model = TextCNN(vocab_size=len(word_vectors), embedding_dim=50, num_of_class=num_of_class, embedding_vectors=word_vectors) 31 | elif model_name == "LSTM": 32 | model = TextRNN(vocab_size=len(word_vectors), embedding_dim=50, hidden_size=128, num_of_class=num_of_class, weights=word_vectors, rnn_type="LSTM") 33 | optimizer = optim.Adam(model.parameters(), lr=learning_rate) 34 | loss_fun = torch.nn.CrossEntropyLoss() 35 | 36 | for epoch in range(epoch_num): 37 | model.train() # 包含dropout或者BN的模型需要指定 38 | for i, batch in enumerate(train_iter): 39 | if load_data_by_torchtext: 40 | x, y = batch.sent.t(), batch.label 41 | else: 42 | x, y, lens = batch 43 | logits = model(x) 44 | optimizer.zero_grad() 45 | loss = loss_fun(logits, y) 46 | loss.backward() 47 | optimizer.step() 48 | 49 | # with torch.no_grad(): 50 | model.eval() 51 | train_accs = [] 52 | for i, batch in enumerate(train_iter): 53 | if load_data_by_torchtext: 54 | x, y = batch.sent.t(), batch.label 55 | else: 56 | x, y, lens = batch 57 | _, y_pre = torch.max(logits, -1) 58 | acc = torch.mean((torch.tensor(y_pre == y, dtype=torch.float))) 59 | train_accs.append(acc) 60 | train_acc = np.array(train_accs).mean() 61 | 62 | val_accs = [] 63 | for i, batch in enumerate(val_iter): 64 | if load_data_by_torchtext: 65 | x, y = batch.sent.t(), batch.label 66 | else: 67 | x, y, lens = batch 68 | logits = model(x) 69 | _, y_pre = torch.max(logits, -1) 70 | acc = torch.mean((torch.tensor(y_pre == y, dtype=torch.float))) 71 | val_accs.append(acc) 72 | val_acc = 
np.array(val_accs).mean() 73 | print("epoch %d train acc:%.2f, val acc:%.2f" % (epoch, train_acc, val_acc)) 74 | if train_acc >= 0.99: 75 | break 76 | 77 | -------------------------------------------------------------------------------- /project2-DL-Sentence Classification/models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Created on 2020/5/15 22:23 5 | @author: phil 6 | """ 7 | import torch.nn as nn 8 | import torch 9 | import torch.nn.functional as F 10 | 11 | 12 | class TextRNN(nn.Module): 13 | def __init__(self, vocab_size, embedding_dim, hidden_size, num_of_class, weights=None, rnn_type="RNN"): 14 | super(TextRNN, self).__init__() 15 | 16 | self.vocab_size = vocab_size 17 | self.hidden_size = hidden_size 18 | self.num_of_class = num_of_class 19 | self.embedding_dim = embedding_dim 20 | self.rnn_type = rnn_type 21 | 22 | if weights is not None: 23 | self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, _weight=weights) 24 | else: 25 | self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim) 26 | 27 | if rnn_type == "RNN": 28 | self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True) 29 | self.hidden2label = nn.Linear(hidden_size, num_of_class) 30 | elif rnn_type == "LSTM": 31 | self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True, bidirectional=True) 32 | self.hidden2label = nn.Linear(hidden_size*2, num_of_class) 33 | 34 | def forward(self, input_sents): 35 | # input_sents (batch_size, seq_len) 36 | batch_size, seq_len = input_sents.shape 37 | # (batch_size, seq_len, embedding_dim) 38 | embed_out = self.embed(input_sents) 39 | 40 | if self.rnn_type == "RNN": 41 | h0 = torch.randn(1, batch_size, self.hidden_size) 42 | _, hn = self.rnn(embed_out, h0) 43 | elif self.rnn_type == "LSTM": 44 | h0, c0 = torch.randn(2, batch_size, self.hidden_size), torch.randn(2, batch_size, self.hidden_size) 45 | output, (hn, _) = self.lstm(embed_out, (h0, c0)) 46 | 47 | logits = self.hidden2label(hn).squeeze(0) 48 | 49 | return logits 50 | 51 | 52 | class TextCNN(nn.Module): 53 | def __init__(self, vocab_size, embedding_dim, num_of_class, embedding_vectors=None, kernel_num=100, kerner_size=[3, 4, 5], dropout=0.5): 54 | super(TextCNN, self).__init__() 55 | if embedding_vectors is None: 56 | self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim) 57 | else: 58 | self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, _weight=embedding_vectors) 59 | self.convs = nn.ModuleList([nn.Conv2d(1, kernel_num, (K, embedding_dim)) for K in kerner_size]) 60 | self.dropout = nn.Dropout(dropout) 61 | self.feature2label = nn.Linear(3*kernel_num, num_of_class) 62 | 63 | def forward(self, x): 64 | # x shape (batch_size, seq_len) 65 | embed_out = self.embed(x).unsqueeze(1) 66 | conv_out = [F.relu(conv(embed_out)).squeeze(3) for conv in self.convs] 67 | 68 | pool_out = [F.max_pool1d(block, block.size(2)).squeeze(2) for block in conv_out] 69 | 70 | pool_out = torch.cat(pool_out, 1) 71 | 72 | logits = self.feature2label(pool_out) 73 | 74 | return logits 75 | 76 | 77 | if __name__ == "__main__": 78 | model = TextCNN(vocab_size=10, embedding_dim=10, num_of_class=10) 79 | x = torch.randint(10, (10, 20)) 80 | logits = model.forward(x) 81 | 82 | -------------------------------------------------------------------------------- /project3-Named Entity 
Recognition/dataloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Created on 2020/6/13 11:58 5 | @author: phil 6 | """ 7 | import os 8 | import pandas as pd 9 | import spacy 10 | from torch.nn import init 11 | from torchtext import data 12 | 13 | 14 | def prepare_data(dataset_path, debug=False): 15 | train_file_path = os.path.join(dataset_path, "train.txt") 16 | dev_file_path = os.path.join(dataset_path, "dev.txt") 17 | 18 | def process_file(file_path, target_file_path): 19 | sents, tags = [], [] 20 | with open(file_path, "r") as f: 21 | lines = f.readlines() 22 | sent, tag = [], [] 23 | for line in lines: 24 | line = line.strip() 25 | if len(line) == 0: 26 | sents.append(" ".join(sent)) 27 | tags.append(" ".join(tag)) 28 | sent, tag = [], [] 29 | else: 30 | splited = line.split(" ") 31 | sent.append(splited[0]) 32 | tag.append(splited[-1]) 33 | if len(sent) != 0: 34 | sents.append(" ".join(sent)) 35 | tags.append(" ".join(tag)) 36 | df = pd.DataFrame() 37 | df["sent"] = sents if not debug else sents[:100] 38 | df["tag"] = tags if not debug else tags[:100] 39 | df.to_csv(target_file_path, index=False) 40 | 41 | train_csv = os.path.join(dataset_path, "train.csv") if not debug else os.path.join(dataset_path, "train_small.csv") 42 | dev_csv = os.path.join(dataset_path, "dev.csv") if not debug else os.path.join(dataset_path, "train_dev.csv") 43 | 44 | if not os.path.exists(train_csv): 45 | process_file(train_file_path, train_csv) 46 | process_file(dev_file_path, dev_csv) 47 | 48 | return train_csv, dev_csv 49 | 50 | 51 | def dataset2dataloader(dataset_path="../dataset/conll2003-IOB", batch_size=3, debug=False): 52 | train_csv, dev_csv = prepare_data(dataset_path, debug=debug) 53 | 54 | def tokenizer(text): 55 | return text.split(" ") 56 | 57 | # 这里只是定义了数据格式 58 | TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=False) 59 | TAG = data.Field(sequential=True, tokenize=tokenizer, lower=False) 60 | train, val = data.TabularDataset.splits( 61 | path='', train=train_csv, validation=dev_csv, format='csv', skip_header=True, 62 | fields=[('sent', TEXT), ('tag', TAG)]) 63 | 64 | TEXT.build_vocab(train, vectors='glove.6B.50d') # , max_size=30000) 65 | TAG.build_vocab(val) 66 | 67 | # 当 corpus 中有的 token 在 vectors 中不存在时 的初始化方式. 
68 | TEXT.vocab.vectors.unk_init = init.xavier_uniform 69 | 70 | DEVICE = "cpu" 71 | train_iter = data.BucketIterator(train, batch_size=batch_size, sort_key=lambda x: len(x.sent), device=DEVICE) 72 | val_iter = data.BucketIterator(val, batch_size=batch_size, sort_key=lambda x: len(x.sent), device=DEVICE) 73 | 74 | # 在 test_iter , sort一定要设置成 False, 要不然会被 torchtext 搞乱样本顺序 75 | # test_iter = data.Iterator(dataset=test, batch_size=128, train=False, sort=False, device=DEVICE) 76 | 77 | return train_iter, val_iter, TEXT.vocab, TAG.vocab 78 | 79 | 80 | if __name__ == "__main__": 81 | # train_csv, dev_csv = prepare_data(dataset_path="../dataset/conll2003-IOB") 82 | train_iter, val_iter, sent_vocab, tag_vocab = dataset2dataloader(dataset_path="../dataset/conll2003-IOB", debug=True) 83 | word_vectors = sent_vocab.vectors 84 | 85 | for batch in train_iter: 86 | print(batch.sent.shape, batch.tag.shape) 87 | break -------------------------------------------------------------------------------- /project3-Named Entity Recognition/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Created on 2020/6/13 16:15 5 | @author: phil 6 | """ 7 | 8 | from dataloader import dataset2dataloader 9 | from models import BiLSTM_CRF_NER 10 | from torch.optim import Adam 11 | import torch 12 | import numpy as np 13 | import os 14 | 15 | if __name__ == "__main__": 16 | train_iter, val_iter, sent_vocab, tag_vocab = dataset2dataloader(batch_size=128) 17 | word_vectors = sent_vocab.vectors 18 | device = torch.device("cuda") if torch.cuda.is_available() else "cpu" 19 | 20 | model = BiLSTM_CRF_NER(vocab_size=len(sent_vocab.stoi), embedding_dim=50, hidden_size=128, num_tags=len(tag_vocab.stoi), word_vectors=word_vectors, device=device) 21 | 22 | epoch = 10 23 | learning_rate = 0.01 24 | model_path = "model.pkl" 25 | 26 | optimizer = Adam(model.parameters(), lr=learning_rate) 27 | 28 | if os.path.exists(model_path): 29 | model = torch.load(model_path) 30 | else: 31 | for ep in range(epoch): 32 | model.train() 33 | for i, batch in enumerate(train_iter): 34 | x, y = batch.sent.t(), batch.tag.t() 35 | mask = (x != sent_vocab.stoi[""]) 36 | optimizer.zero_grad() 37 | loss = model(x, y, mask) 38 | loss.backward() 39 | optimizer.step() 40 | if i % 100 == 0: 41 | print(f"epoch:{ep}, iter:{i}, loss:{loss.item()}", end=" ") 42 | 43 | model.eval() 44 | train_accs = [] 45 | preds, golds = [], [] 46 | for i, batch in enumerate(train_iter): 47 | x, y = batch.sent.t(), batch.tag.t() 48 | mask = (x != sent_vocab.stoi[""]) 49 | with torch.no_grad(): 50 | preds = model.predict(x, mask) 51 | right, total = 0, 0 52 | for pred, gold in zip(preds, y): 53 | right += np.sum(np.array(pred) == gold[:len(pred)].numpy()) 54 | total += len(pred) 55 | train_accs.append(right*1.0/total) 56 | train_acc = np.array(train_accs).mean() 57 | 58 | val_accs = [] 59 | for i, batch in enumerate(val_iter): 60 | x, y = batch.sent.t(), batch.tag.t() 61 | mask = (x != sent_vocab.stoi[""]) 62 | with torch.no_grad(): 63 | preds = model.predict(x, mask) 64 | right, total = 0, 0 65 | for pred, gold in zip(preds, y): 66 | right += np.sum(np.array(pred) == gold[:len(pred)].numpy()) 67 | total += len(pred) 68 | val_accs.append(right * 1.0 / total) 69 | val_acc = np.array(val_accs).mean() 70 | print("epoch %d train acc:%.2f, val acc:%.2f" % (epoch, train_acc, val_acc)) 71 | torch.save(model, model_path) 72 | test_sents = ["My name is Phil , I am from European Union ."] 73 | for 
sent in test_sents: 74 | ids = [sent_vocab.stoi[word] for word in sent.split(" ")] 75 | input_tensor = torch.tensor([ids]) 76 | mask = input_tensor != sent_vocab.stoi[""] 77 | with torch.no_grad(): 78 | pred = model.predict(input_tensor, mask) 79 | print(sent, "-->", [tag_vocab.itos[tag_id] for tag_id in pred[0]]) 80 | 81 | 82 | -------------------------------------------------------------------------------- /project3-Named Entity Recognition/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/positivepeng/nlp-beginner-projects/2f066cbdd24121938b90ee1d12fe063ad0a46df5/project3-Named Entity Recognition/model.pkl -------------------------------------------------------------------------------- /project3-Named Entity Recognition/models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Created on 2020/6/13 16:01 5 | @author: phil 6 | """ 7 | 8 | import torch.nn as nn 9 | from torchcrf import CRF 10 | import torch 11 | 12 | 13 | class BiLSTM_CRF_NER(nn.Module): 14 | def __init__(self, vocab_size, embedding_dim, hidden_size, num_tags, word_vectors=None, device="cpu"): 15 | super(BiLSTM_CRF_NER, self).__init__() 16 | self.device = device 17 | self.hidden_size = hidden_size 18 | self.embed = nn.Embedding(vocab_size, embedding_dim, _weight=word_vectors).to(device) 19 | self.lstm = nn.LSTM(embedding_dim, hidden_size, bidirectional=True, batch_first=True).to(device) 20 | self.hidden2tag = nn.Linear(hidden_size*2, num_tags) 21 | self.crf = CRF(num_tags=num_tags, batch_first=True).to(device) 22 | 23 | def forward(self, x, y, mask): 24 | emissions = self.get_emissions(x) 25 | loss = -self.crf(emissions=emissions, tags=y, mask=mask) 26 | return loss 27 | 28 | def predict(self, x, mask=None): 29 | emissions = self.get_emissions(x) 30 | preds = self.crf.decode(emissions, mask) 31 | return preds 32 | 33 | def get_emissions(self, x): 34 | batch_size, seq_len = x.shape 35 | embedded = self.embed(x) 36 | h0, c0 = torch.zeros(2, batch_size, self.hidden_size).to(self.device), torch.zeros(2, batch_size, self.hidden_size).to(self.device) 37 | lstm_out, (_, _) = self.lstm(embedded, (h0, c0)) 38 | emissions = self.hidden2tag(lstm_out) 39 | return emissions 40 | -------------------------------------------------------------------------------- /project3-Named Entity Recognition/torchcrf/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.7.2' 2 | 3 | from typing import List, Optional 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | 9 | class CRF(nn.Module): 10 | """Conditional random field. 11 | 12 | This module implements a conditional random field [LMP01]_. The forward computation 13 | of this class computes the log likelihood of the given sequence of tags and 14 | emission score tensor. This class also has `~CRF.decode` method which finds 15 | the best tag sequence given an emission score tensor using `Viterbi algorithm`_. 16 | 17 | Args: 18 | num_tags: Number of tags. 19 | batch_first: Whether the first dimension corresponds to the size of a minibatch. 20 | 21 | Attributes: 22 | start_transitions (`~torch.nn.Parameter`): Start transition score tensor of size 23 | ``(num_tags,)``. 24 | end_transitions (`~torch.nn.Parameter`): End transition score tensor of size 25 | ``(num_tags,)``. 
26 | transitions (`~torch.nn.Parameter`): Transition score tensor of size 27 | ``(num_tags, num_tags)``. 28 | 29 | 30 | .. [LMP01] Lafferty, J., McCallum, A., Pereira, F. (2001). 31 | "Conditional random fields: Probabilistic models for segmenting and 32 | labeling sequence data". *Proc. 18th International Conf. on Machine 33 | Learning*. Morgan Kaufmann. pp. 282–289. 34 | 35 | .. _Viterbi algorithm: https://en.wikipedia.org/wiki/Viterbi_algorithm 36 | """ 37 | 38 | def __init__(self, num_tags: int, batch_first: bool = False) -> None: 39 | if num_tags <= 0: 40 | raise ValueError(f'invalid number of tags: {num_tags}') 41 | super().__init__() 42 | self.num_tags = num_tags 43 | self.batch_first = batch_first 44 | self.start_transitions = nn.Parameter(torch.empty(num_tags)) 45 | self.end_transitions = nn.Parameter(torch.empty(num_tags)) 46 | self.transitions = nn.Parameter(torch.empty(num_tags, num_tags)) 47 | 48 | self.reset_parameters() 49 | 50 | def reset_parameters(self) -> None: 51 | """Initialize the transition parameters. 52 | 53 | The parameters will be initialized randomly from a uniform distribution 54 | between -0.1 and 0.1. 55 | """ 56 | nn.init.uniform_(self.start_transitions, -0.1, 0.1) 57 | nn.init.uniform_(self.end_transitions, -0.1, 0.1) 58 | nn.init.uniform_(self.transitions, -0.1, 0.1) 59 | 60 | def __repr__(self) -> str: 61 | return f'{self.__class__.__name__}(num_tags={self.num_tags})' 62 | 63 | def forward( 64 | self, 65 | emissions: torch.Tensor, 66 | tags: torch.LongTensor, 67 | mask: Optional[torch.ByteTensor] = None, 68 | reduction: str = 'sum', 69 | ) -> torch.Tensor: 70 | """Compute the conditional log likelihood of a sequence of tags given emission scores. 71 | 72 | Args: 73 | emissions (`~torch.Tensor`): Emission score tensor of size 74 | ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``, 75 | ``(batch_size, seq_length, num_tags)`` otherwise. 76 | tags (`~torch.LongTensor`): Sequence of tags tensor of size 77 | ``(seq_length, batch_size)`` if ``batch_first`` is ``False``, 78 | ``(batch_size, seq_length)`` otherwise. 79 | mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)`` 80 | if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise. 81 | reduction: Specifies the reduction to apply to the output: 82 | ``none|sum|mean|token_mean``. ``none``: no reduction will be applied. 83 | ``sum``: the output will be summed over batches. ``mean``: the output will be 84 | averaged over batches. ``token_mean``: the output will be averaged over tokens. 85 | 86 | Returns: 87 | `~torch.Tensor`: The log likelihood. This will have size ``(batch_size,)`` if 88 | reduction is ``none``, ``()`` otherwise. 
89 | """ 90 | self._validate(emissions, tags=tags, mask=mask) 91 | if reduction not in ('none', 'sum', 'mean', 'token_mean'): 92 | raise ValueError(f'invalid reduction: {reduction}') 93 | if mask is None: 94 | mask = torch.ones_like(tags, dtype=torch.uint8) 95 | 96 | if self.batch_first: 97 | emissions = emissions.transpose(0, 1) 98 | tags = tags.transpose(0, 1) 99 | mask = mask.transpose(0, 1) 100 | 101 | # shape: (batch_size,) 102 | numerator = self._compute_score(emissions, tags, mask) 103 | # shape: (batch_size,) 104 | denominator = self._compute_normalizer(emissions, mask) 105 | # shape: (batch_size,) 106 | llh = numerator - denominator 107 | 108 | if reduction == 'none': 109 | return llh 110 | if reduction == 'sum': 111 | return llh.sum() 112 | if reduction == 'mean': 113 | return llh.mean() 114 | assert reduction == 'token_mean' 115 | return llh.sum() / mask.float().sum() 116 | 117 | def decode(self, emissions: torch.Tensor, 118 | mask: Optional[torch.ByteTensor] = None) -> List[List[int]]: 119 | """Find the most likely tag sequence using Viterbi algorithm. 120 | 121 | Args: 122 | emissions (`~torch.Tensor`): Emission score tensor of size 123 | ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``, 124 | ``(batch_size, seq_length, num_tags)`` otherwise. 125 | mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)`` 126 | if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise. 127 | 128 | Returns: 129 | List of list containing the best tag sequence for each batch. 130 | """ 131 | self._validate(emissions, mask=mask) 132 | if mask is None: 133 | mask = emissions.new_ones(emissions.shape[:2], dtype=torch.uint8) 134 | 135 | if self.batch_first: 136 | emissions = emissions.transpose(0, 1) 137 | mask = mask.transpose(0, 1) 138 | 139 | return self._viterbi_decode(emissions, mask) 140 | 141 | def _validate( 142 | self, 143 | emissions: torch.Tensor, 144 | tags: Optional[torch.LongTensor] = None, 145 | mask: Optional[torch.ByteTensor] = None) -> None: 146 | if emissions.dim() != 3: 147 | raise ValueError(f'emissions must have dimension of 3, got {emissions.dim()}') 148 | if emissions.size(2) != self.num_tags: 149 | raise ValueError( 150 | f'expected last dimension of emissions is {self.num_tags}, ' 151 | f'got {emissions.size(2)}') 152 | 153 | if tags is not None: 154 | if emissions.shape[:2] != tags.shape: 155 | raise ValueError( 156 | 'the first two dimensions of emissions and tags must match, ' 157 | f'got {tuple(emissions.shape[:2])} and {tuple(tags.shape)}') 158 | 159 | if mask is not None: 160 | if emissions.shape[:2] != mask.shape: 161 | raise ValueError( 162 | 'the first two dimensions of emissions and mask must match, ' 163 | f'got {tuple(emissions.shape[:2])} and {tuple(mask.shape)}') 164 | no_empty_seq = not self.batch_first and mask[0].all() 165 | no_empty_seq_bf = self.batch_first and mask[:, 0].all() 166 | if not no_empty_seq and not no_empty_seq_bf: 167 | raise ValueError('mask of the first timestep must all be on') 168 | 169 | def _compute_score( 170 | self, emissions: torch.Tensor, tags: torch.LongTensor, 171 | mask: torch.ByteTensor) -> torch.Tensor: 172 | # emissions: (seq_length, batch_size, num_tags) 173 | # tags: (seq_length, batch_size) 174 | # mask: (seq_length, batch_size) 175 | assert emissions.dim() == 3 and tags.dim() == 2 176 | assert emissions.shape[:2] == tags.shape 177 | assert emissions.size(2) == self.num_tags 178 | assert mask.shape == tags.shape 179 | assert mask[0].all() 180 | 181 | seq_length, 
batch_size = tags.shape 182 | mask = mask.float() 183 | 184 | # Start transition score and first emission 185 | # shape: (batch_size,) 186 | score = self.start_transitions[tags[0]] 187 | score += emissions[0, torch.arange(batch_size), tags[0]] 188 | 189 | for i in range(1, seq_length): 190 | # Transition score to next tag, only added if next timestep is valid (mask == 1) 191 | # shape: (batch_size,) 192 | score += self.transitions[tags[i - 1], tags[i]] * mask[i] 193 | 194 | # Emission score for next tag, only added if next timestep is valid (mask == 1) 195 | # shape: (batch_size,) 196 | score += emissions[i, torch.arange(batch_size), tags[i]] * mask[i] 197 | 198 | # End transition score 199 | # shape: (batch_size,) 200 | seq_ends = mask.long().sum(dim=0) - 1 201 | # shape: (batch_size,) 202 | last_tags = tags[seq_ends, torch.arange(batch_size)] 203 | # shape: (batch_size,) 204 | score += self.end_transitions[last_tags] 205 | 206 | return score 207 | 208 | def _compute_normalizer( 209 | self, emissions: torch.Tensor, mask: torch.ByteTensor) -> torch.Tensor: 210 | # emissions: (seq_length, batch_size, num_tags) 211 | # mask: (seq_length, batch_size) 212 | assert emissions.dim() == 3 and mask.dim() == 2 213 | assert emissions.shape[:2] == mask.shape 214 | assert emissions.size(2) == self.num_tags 215 | assert mask[0].all() 216 | 217 | seq_length = emissions.size(0) 218 | 219 | # Start transition score and first emission; score has size of 220 | # (batch_size, num_tags) where for each batch, the j-th column stores 221 | # the score that the first timestep has tag j 222 | # shape: (batch_size, num_tags) 223 | score = self.start_transitions + emissions[0] 224 | 225 | for i in range(1, seq_length): 226 | # Broadcast score for every possible next tag 227 | # shape: (batch_size, num_tags, 1) 228 | broadcast_score = score.unsqueeze(2) 229 | 230 | # Broadcast emission score for every possible current tag 231 | # shape: (batch_size, 1, num_tags) 232 | broadcast_emissions = emissions[i].unsqueeze(1) 233 | 234 | # Compute the score tensor of size (batch_size, num_tags, num_tags) where 235 | # for each sample, entry at row i and column j stores the sum of scores of all 236 | # possible tag sequences so far that end with transitioning from tag i to tag j 237 | # and emitting 238 | # shape: (batch_size, num_tags, num_tags) 239 | next_score = broadcast_score + self.transitions + broadcast_emissions 240 | 241 | # Sum over all possible current tags, but we're in score space, so a sum 242 | # becomes a log-sum-exp: for each sample, entry i stores the sum of scores of 243 | # all possible tag sequences so far, that end in tag i 244 | # shape: (batch_size, num_tags) 245 | next_score = torch.logsumexp(next_score, dim=1) 246 | 247 | # Set score to the next score if this timestep is valid (mask == 1) 248 | # shape: (batch_size, num_tags) 249 | score = torch.where(mask[i].unsqueeze(1), next_score, score) 250 | 251 | # End transition score 252 | # shape: (batch_size, num_tags) 253 | score += self.end_transitions 254 | 255 | # Sum (log-sum-exp) over all possible tags 256 | # shape: (batch_size,) 257 | return torch.logsumexp(score, dim=1) 258 | 259 | def _viterbi_decode(self, emissions: torch.FloatTensor, 260 | mask: torch.ByteTensor) -> List[List[int]]: 261 | # emissions: (seq_length, batch_size, num_tags) 262 | # mask: (seq_length, batch_size) 263 | assert emissions.dim() == 3 and mask.dim() == 2 264 | assert emissions.shape[:2] == mask.shape 265 | assert emissions.size(2) == self.num_tags 266 | assert mask[0].all() 
267 | 268 | seq_length, batch_size = mask.shape 269 | 270 | # Start transition and first emission 271 | # shape: (batch_size, num_tags) 272 | score = self.start_transitions + emissions[0] 273 | history = [] 274 | 275 | # score is a tensor of size (batch_size, num_tags) where for every batch, 276 | # value at column j stores the score of the best tag sequence so far that ends 277 | # with tag j 278 | # history saves where the best tags candidate transitioned from; this is used 279 | # when we trace back the best tag sequence 280 | 281 | # Viterbi algorithm recursive case: we compute the score of the best tag sequence 282 | # for every possible next tag 283 | for i in range(1, seq_length): 284 | # Broadcast viterbi score for every possible next tag 285 | # shape: (batch_size, num_tags, 1) 286 | broadcast_score = score.unsqueeze(2) 287 | 288 | # Broadcast emission score for every possible current tag 289 | # shape: (batch_size, 1, num_tags) 290 | broadcast_emission = emissions[i].unsqueeze(1) 291 | 292 | # Compute the score tensor of size (batch_size, num_tags, num_tags) where 293 | # for each sample, entry at row i and column j stores the score of the best 294 | # tag sequence so far that ends with transitioning from tag i to tag j and emitting 295 | # shape: (batch_size, num_tags, num_tags) 296 | next_score = broadcast_score + self.transitions + broadcast_emission 297 | 298 | # Find the maximum score over all possible current tag 299 | # shape: (batch_size, num_tags) 300 | next_score, indices = next_score.max(dim=1) 301 | 302 | # Set score to the next score if this timestep is valid (mask == 1) 303 | # and save the index that produces the next score 304 | # shape: (batch_size, num_tags) 305 | score = torch.where(mask[i].unsqueeze(1), next_score, score) 306 | history.append(indices) 307 | 308 | # End transition score 309 | # shape: (batch_size, num_tags) 310 | score += self.end_transitions 311 | 312 | # Now, compute the best path for each sample 313 | 314 | # shape: (batch_size,) 315 | seq_ends = mask.long().sum(dim=0) - 1 316 | best_tags_list = [] 317 | 318 | for idx in range(batch_size): 319 | # Find the tag which maximizes the score at the last timestep; this is our best tag 320 | # for the last timestep 321 | _, best_last_tag = score[idx].max(dim=0) 322 | best_tags = [best_last_tag.item()] 323 | 324 | # We trace back where the best last tag comes from, append that to our best tag 325 | # sequence, and trace it back again, and so on 326 | for hist in reversed(history[:seq_ends[idx]]): 327 | best_last_tag = hist[idx][best_tags[-1]] 328 | best_tags.append(best_last_tag.item()) 329 | 330 | # Reverse the order because we start from the last timestep 331 | best_tags.reverse() 332 | best_tags_list.append(best_tags) 333 | 334 | return best_tags_list 335 | -------------------------------------------------------------------------------- /project4-Machine Translation/dataloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Created on 2020/6/8 11:26 5 | @author: phil 6 | """ 7 | 8 | # 参考吴恩达老师网易云深度学习课程作业 9 | import os 10 | import numpy as np 11 | import torch 12 | from faker import Faker 13 | import random 14 | 15 | from torch.nn import init 16 | from tqdm import tqdm 17 | from babel.dates import format_date 18 | from torchtext import data 19 | import pandas as pd 20 | from sklearn.model_selection import train_test_split 21 | fake = Faker() 22 | Faker.seed(12345) 23 | random.seed(12345) 24 | 25 
| # Define format of the data we would like to generate 26 | FORMATS = ['short', 27 | 'medium', 28 | 'long', 29 | 'full', 30 | 'full', 31 | 'full', 32 | 'full', 33 | 'full', 34 | 'full', 35 | 'full', 36 | 'full', 37 | 'full', 38 | 'full', 39 | 'd MMM YYY', 40 | 'd MMMM YYY', 41 | 'dd MMM YYY', 42 | 'd MMM, YYY', 43 | 'd MMMM, YYY', 44 | 'dd, MMM YYY', 45 | 'd MM YY', 46 | 'd MMMM YYY', 47 | 'MMMM d YYY', 48 | 'MMMM d, YYY', 49 | 'dd.MM.YY'] 50 | 51 | # change this if you want it to work with another language 52 | LOCALES = ['en_US'] 53 | 54 | 55 | def load_date(): 56 | """ 57 | Loads some fake dates 58 | :returns: tuple containing human readable string, machine readable string, and date object 59 | """ 60 | dt = fake.date_object() 61 | 62 | try: 63 | human_readable = format_date(dt, format=random.choice(FORMATS), locale='en_US') 64 | human_readable = human_readable.lower() 65 | human_readable = human_readable.replace(',', '') 66 | machine_readable = dt.isoformat() 67 | except AttributeError as e: 68 | return None, None, None 69 | 70 | return human_readable, machine_readable, dt 71 | 72 | 73 | def load_dataset(m): 74 | """ 75 | Loads a dataset with m examples and vocabularies 76 | :m: the number of examples to generate 77 | """ 78 | dataset = [] 79 | for _ in tqdm(range(m)): 80 | h, m, _ = load_date() 81 | if h is not None: 82 | dataset.append([h, m]) 83 | 84 | return dataset 85 | 86 | 87 | def prepare_data(dataset_path=r"../dataset/date-normalization", dataset_size=10, debug=False): 88 | if debug: 89 | dataset_size = 10 90 | train_file = os.path.join(dataset_path, "train_samll.csv") 91 | eval_file = os.path.join(dataset_path, "eval_samll.csv") 92 | else: 93 | train_file = os.path.join(dataset_path, "train.csv") 94 | eval_file = os.path.join(dataset_path, "eval.csv") 95 | if not os.path.exists(train_file) and not os.path.exists(train_file): 96 | dataset = load_dataset(dataset_size) 97 | source, target = zip(*dataset) 98 | X_train, X_test, y_train, y_test = train_test_split(source, target, random_state=42, test_size=0.2) 99 | train_df = pd.DataFrame() 100 | train_df["source"], train_df["target"] = X_train, y_train 101 | eval_df = pd.DataFrame() 102 | eval_df["source"], eval_df["target"] = X_test, y_test 103 | train_df.to_csv(train_file, index=False) 104 | eval_df.to_csv(eval_file, index=False) 105 | return train_file, eval_file 106 | 107 | 108 | def dataset2dataloader(dataset_path, batch_size=10, dataset_size=10, debug=False): 109 | train_csv, dev_csv = prepare_data(dataset_path, dataset_size=dataset_size, debug=debug) 110 | 111 | def tokenizer(text): 112 | return list(text) 113 | 114 | # 这里只是定义了数据格式 115 | SOURCE = data.Field(sequential=True, tokenize=tokenizer, lower=False) 116 | # 目标输出前后需加入特殊的标志符 117 | TARGET = data.Field(sequential=True, tokenize=tokenizer, lower=False, init_token="", eos_token="") 118 | train, val = data.TabularDataset.splits( 119 | path='', train=train_csv, validation=dev_csv, format='csv', skip_header=True, 120 | fields=[('source', SOURCE), ('target', TARGET)]) 121 | 122 | SOURCE.build_vocab(train) 123 | TARGET.build_vocab(train) 124 | 125 | train_iter = data.BucketIterator(train, batch_size=batch_size, sort_key=lambda x: len(x.sent), shuffle=False) 126 | val_iter = data.BucketIterator(val, batch_size=batch_size, sort_key=lambda x: len(x.sent), shuffle=False) 127 | 128 | # 在 test_iter , sort一定要设置成 False, 要不然会被 torchtext 搞乱样本顺序 129 | # test_iter = data.Iterator(dataset=test, batch_size=128, train=False, sort=False, device=DEVICE) 130 | 131 | return train_iter, val_iter, 
SOURCE.vocab, TARGET.vocab -------------------------------------------------------------------------------- /project4-Machine Translation/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Created on 2020/6/8 11:36 5 | @author: phil 6 | """ 7 | from keras.utils import to_categorical 8 | 9 | from dataloader import load_dataset, dataset2dataloader 10 | from models import SimpleNMT 11 | from torch import optim 12 | import torch.nn as nn 13 | import torch 14 | import numpy as np 15 | from pprint import pprint 16 | from tqdm import tqdm 17 | 18 | if __name__ == "__main__": 19 | epoch = 500 20 | learning_rate = 0.001 21 | hidden_size = 64 22 | batch_size = 10 23 | 24 | train_iter, val_iter, source_vocab, target_vocab = dataset2dataloader(dataset_path=r"../dataset/date-normalization", 25 | batch_size=batch_size, dataset_size=10000, debug=True) 26 | source_vocab_size = len(source_vocab.stoi) 27 | target_vocab_size = len(target_vocab.stoi) 28 | 29 | # print(target_vocab.stoi) 30 | 31 | Tx, Ty = 25, 10 # 最大长度 32 | 33 | model = SimpleNMT(in_vocab_size=source_vocab_size, out_vocab_size=target_vocab_size, in_hidden_size=hidden_size, 34 | out_hidden_size=hidden_size, output_size=target_vocab_size, with_attention=True) 35 | 36 | optimizer = optim.Adam(model.parameters(), lr=learning_rate) 37 | criterion = nn.CrossEntropyLoss() 38 | 39 | embed_layer1 = nn.Embedding(source_vocab_size, source_vocab_size, 40 | _weight=torch.from_numpy(np.eye(source_vocab_size))) 41 | embed_layer2 = nn.Embedding(target_vocab_size, target_vocab_size, 42 | _weight=torch.from_numpy(np.eye(target_vocab_size))) 43 | 44 | model.train() 45 | for ep in range(epoch): 46 | epoch_loss = 0 47 | for batch in train_iter: 48 | optimizer.zero_grad() 49 | Xin, Yin, Yout = batch.source.t().long(), batch.target.t()[:, :-1].long(), batch.target.t()[:, 1:] 50 | batch_size = len(Xin) 51 | init_hidden = torch.zeros(1, batch_size, hidden_size) 52 | # if ep == epoch - 1: 53 | # print(Yout) 54 | Xin = embed_layer1(Xin).float() 55 | Yin = embed_layer2(Yin).float() 56 | logits = model(Xin, init_hidden, Yin) 57 | loss = criterion(logits.view(-1, logits.shape[-1]), Yout.flatten()) 58 | epoch_loss += loss.item() 59 | loss.backward() 60 | optimizer.step() 61 | if ep % (epoch // 10) == 0: 62 | print("loss", epoch_loss) 63 | 64 | # 测试训练集输出是否正确 65 | # for batch in train_iter: 66 | # # print(batch.source.t()) 67 | # print(batch.target.t()[:, 1:]) 68 | # print("finish") 69 | # init_hidden = torch.zeros(1, batch_size, hidden_size) 70 | # logits = model(Xin, init_hidden, Yin) 71 | # print(logits.argmax(-1)) 72 | 73 | sents_for_large = ["monday may 7 1983", "19 march 1998", "18 jul 2008", "9/10/70", "thursday january 1 1981", 74 | "thursday january 26 2015", "saturday april 18 1990", "sunday may 12 1988"] 75 | sents = ["monday march 7 1983", "9 may 1998", "thursday january 26 1995", "9/10/70"] 76 | 77 | 78 | def translate(model, sents): 79 | X = [] 80 | for sent in sents: 81 | X.append(list(map(lambda x: source_vocab[x], list(sent))) + [source_vocab[""]] * (Tx - len(sent))) 82 | Xoh = torch.from_numpy(np.array(list(map(lambda x: to_categorical(x, num_classes=source_vocab_size), X)))) 83 | encoder_init_hidden = torch.zeros(1, len(X), hidden_size) 84 | preds = model(Xoh, encoder_init_hidden, decoder_input=None, out_word2index=target_vocab.stoi, 85 | out_index2word=target_vocab.itos, max_len=Ty, out_size=target_vocab_size) 86 | for gold, pred in zip(sents, 
--------------------------------------------------------------------------------
/project4-Machine Translation/models.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | """
4 | Created on 2020/6/10 11:18
5 | @author: phil
6 | """
7 | import torch.nn as nn
8 | import torch
9 | import numpy as np
10 | import torch.nn.functional as F
11 |
12 |
13 | class EncoderRNN(nn.Module):
14 |     def __init__(self, vocab_size, hidden_size, dropout=0.5):
15 |         super(EncoderRNN, self).__init__()
16 |         self.hidden_size = hidden_size
17 |         self.gru = nn.GRU(vocab_size, hidden_size, dropout=dropout, batch_first=True)
18 |
19 |     def forward(self, x, init_hidden):
20 |         seq_output, last_state = self.gru(x, init_hidden)
21 |         return seq_output, last_state
22 |
23 |
24 | class DecoderRNN(nn.Module):
25 |     def __init__(self, vocab_size, hidden_size, output_size, dropout=0.5):
26 |         super(DecoderRNN, self).__init__()
27 |         self.hidden_size = hidden_size
28 |         self.gru = nn.GRU(vocab_size, hidden_size, dropout=dropout, batch_first=True)
29 |         self.hidden2index = nn.Linear(hidden_size, output_size)
30 |
31 |     def forward(self, x, init_state):
32 |         seq_output, last_state = self.gru(x, init_state)
33 |         seq_output = self.hidden2index(seq_output)
34 |         return seq_output, last_state
35 |
36 |
37 | class DecoderAttenRNN(nn.Module):
38 |     def __init__(self, vocab_size, hidden_size, output_size, dropout=0.5):
39 |         super(DecoderAttenRNN, self).__init__()
40 |         self.hidden_size = hidden_size
41 |         self.gru = nn.GRU(vocab_size, hidden_size, dropout=dropout, batch_first=True)
42 |         self.hidden2label = nn.Linear(hidden_size, output_size)
43 |         self.atten_affine = nn.Linear(hidden_size*2, hidden_size)
44 |
45 |     def get_alpha(self, hi, encoder_output):
46 |         # hi shape (1, batch_size, hidden_size)
47 |         # encoder_output (batch, seq_len, hidden_size)
48 |         hi = hi.permute(1, 2, 0)  # (batch_size, hidden_size, 1)
49 |         # print(encoder_output.shape, hi.shape)
50 |         e = torch.bmm(encoder_output, hi).squeeze(2)  # (batch_size, seq_len)
51 |         e = F.softmax(e, dim=1).unsqueeze(2)  # (batch_size, seq_len, 1)
52 |         alpha = (e * encoder_output).sum(dim=1)  # (batch_size, hidden_size)
53 |
54 |         return alpha
55 |
56 |     def forward(self, x, init_state, seq_encoder_output):
57 |         # print(x.shape, init_state.shape, seq_encoder_output.shape)
58 |         batch_size, max_len, _ = x.shape  # one-hot representation
59 |         hi = init_state
60 |         seq_decoder_output = []
61 |         for i in range(max_len):
62 |             # alpha shape (batch_size, hidden_size)
63 |             alpha = self.get_alpha(hi, seq_encoder_output)  # alpha relates the current hidden state to every encoder time step's output
64 |             hi = torch.cat([alpha.unsqueeze(0), hi], dim=2)
65 |             hi = self.atten_affine(hi)
66 |             output, hi = self.gru(x[:, i, :].unsqueeze(1), hi)
67 |             seq_output = self.hidden2label(output.squeeze(1))
68 |             seq_decoder_output.append(seq_output.squeeze(1))
69 |         seq_decoder_output = torch.stack(seq_decoder_output, dim=1)
70 |         return seq_decoder_output, hi
71 |
72 |
73 | class SimpleNMT(nn.Module):
74 |     def __init__(self, in_vocab_size, out_vocab_size, in_hidden_size, out_hidden_size, output_size, with_attention=False):
75 |         super(SimpleNMT, self).__init__()
76 |         self.with_attention = with_attention
77 |         self.encoder = EncoderRNN(in_vocab_size, in_hidden_size)
78 |         if self.with_attention:
79 |             self.decoder = DecoderAttenRNN(out_vocab_size, out_hidden_size, output_size)
80 |         else:
81 |             self.decoder = DecoderRNN(out_vocab_size, out_hidden_size, output_size)
82 |
83 |     def forward(self, encoder_input, encoder_init_hidden, decoder_input=None, out_word2index=None, out_index2word=None,
84 |                 max_len=None, out_size=None):
85 |         encoder_seq_output, encoder_last_state = self.encoder(encoder_input, encoder_init_hidden)
86 |         # During training the decoder is fed the gold token at every time step (teacher forcing)
87 |         if decoder_input is not None:
88 |             if self.with_attention:
89 |                 logits, _ = self.decoder(decoder_input, encoder_last_state, encoder_seq_output)
90 |             else:
91 |                 logits, _ = self.decoder(decoder_input, encoder_last_state)
92 |             return logits
93 |         else:
94 |             # At test time there is no gold answer; keep decoding until the end token appears or the maximum length is reached
95 |             decoded_sents = []
96 |             for i in range(len(encoder_input)):
97 |                 sent = []
98 |                 decoder_input = torch.FloatTensor(np.eye(out_size)[[out_word2index[""]]]).unsqueeze(0)
99 |                 hi = encoder_last_state[:, i, :].unsqueeze(1)
100 |                 for di in range(max_len):
101 |                     if self.with_attention:
102 |                         # alpha = self.decoder.get_alpha(hi, encoder_seq_output[i, :, :].unsqueeze(
103 |                         #     0))  # alpha relates the current hidden state to every encoder time step's output
104 |                         # hi = torch.cat([alpha.unsqueeze(0), hi], dim=2)
105 |                         # hi = self.decoder.atten_affine(hi)
106 |                         # # print(decoder_input.shape, hi.shape, encoder_seq_output.shape)
107 |                         decoder_output, hdi = self.decoder(decoder_input, hi, encoder_seq_output[i, :, :].unsqueeze(0))
108 |                     else:
109 |                         decoder_output, hdi = self.decoder(decoder_input, hi)
110 |                     topv, topi = decoder_output.data.topk(1)
111 |                     topi = topi.item()
112 |                     if topi == out_word2index[""]:
113 |                         break
114 |                     else:
115 |                         sent.append(out_index2word[topi])
116 |                     decoder_input = torch.FloatTensor([np.eye(out_size)[topi]]).unsqueeze(0)
117 |                     hi = hdi
118 |                 decoded_sents.append(sent)
119 |             return decoded_sents
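`DecoderAttenRNN.get_alpha` above is plain dot-product attention: the decoder hidden state is scored against every encoder output with a batched matrix multiplication, the scores are softmax-normalised over the source positions, and the weighted sum of encoder outputs is returned as the context vector. A shape-checking sketch of the same computation, using random tensors rather than repo code:

```python
import torch
import torch.nn.functional as F

batch, seq_len, hidden = 2, 7, 4
encoder_output = torch.randn(batch, seq_len, hidden)   # encoder state at every source position
hi = torch.randn(1, batch, hidden)                      # current decoder hidden state

h = hi.permute(1, 2, 0)                                 # (batch, hidden, 1)
e = torch.bmm(encoder_output, h).squeeze(2)             # (batch, seq_len) raw scores
w = F.softmax(e, dim=1).unsqueeze(2)                    # (batch, seq_len, 1) attention weights
alpha = (w * encoder_output).sum(dim=1)                 # (batch, hidden) context vector
print(alpha.shape)                                      # torch.Size([2, 4])
```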
--------------------------------------------------------------------------------
/project5-Text Generation/dataloader.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | """
4 | Created on 2020/6/18 10:56
5 | @author: phil
6 | """
7 | import os
8 | import pandas as pd
9 | from torchtext import data
10 |
11 |
12 | def prepare_data(dataset_path="../dataset/poetry"):
13 |     file_path = os.path.join(dataset_path, "poetryFromTang.txt")
14 |     target_path = os.path.join(dataset_path, "train.csv")
15 |     if not os.path.exists(target_path):
16 |         with open(file_path, encoding="utf-8") as f:
17 |             lines = f.read().split("\n\n")
18 |             lines = list(map(lambda x: x.replace("\n", ""), lines))
19 |         df = pd.DataFrame()
20 |         df["sent"] = lines
21 |         df.to_csv(target_path, index=False, encoding='utf_8_sig')
22 |     return target_path
23 |
24 |
25 | def dataset2dataloader(dataset_path="../dataset/poetry", batch_size=32, debug=False):
26 |     if debug:
27 |         train_csv = os.path.join(dataset_path, "train_small.csv")
28 |     else:
29 |         train_csv = prepare_data(dataset_path)
30 |
31 |     def tokenizer(text):
32 |         return list(text)
33 |
34 |     SENT = data.Field(sequential=True, tokenize=tokenizer, lower=False, init_token="", eos_token="")
35 |     train, _ = data.TabularDataset.splits(path='', train=train_csv, validation=train_csv, format='csv',
36 |                                           skip_header=True,
37 |                                           fields=[('sent', SENT)])
38 |
39 |     SENT.build_vocab(train)
40 |
41 |     train_iter = data.BucketIterator(train, batch_size=batch_size, sort_key=lambda x: len(x.sent), shuffle=False)
42 |
43 |     # For test_iter, sort must be set to False, otherwise torchtext will scramble the sample order
44 |     # test_iter = data.Iterator(dataset=test, batch_size=128, train=False, sort=False, device=DEVICE)
45 |
46 |     return train_iter, SENT.vocab
47 |
48 |
49 | if __name__ == "__main__":
50 |     train_iter, vocab = dataset2dataloader()
51 |     for batch in train_iter:
52 |         print(batch.sent.t())
53 |         break
54 |
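`prepare_data` above assumes that `poetryFromTang.txt` separates poems by blank lines; each poem is flattened to a single line before being written to `train.csv`. A tiny sketch of that transformation with made-up input (not data from the repo):

```python
# Two short poems separated by a blank line, each with internal line breaks
raw = "床前明月光,\n疑是地上霜。\n\n春眠不觉晓,\n处处闻啼鸟。\n"

# Split on blank lines, then strip the remaining line breaks inside each poem
poems = [p.replace("\n", "") for p in raw.split("\n\n")]
print(poems)
# ['床前明月光,疑是地上霜。', '春眠不觉晓,处处闻啼鸟。']
```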
--------------------------------------------------------------------------------
/project5-Text Generation/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | """
4 | Created on 2020/6/18 12:14
5 | @author: phil
6 | """
7 | from tqdm import tqdm
8 |
9 | from dataloader import dataset2dataloader
10 | from torch.optim import Adam
11 | from models import PoetryModel
12 | import torch
13 | import torch.nn as nn
14 | import numpy as np
15 | import os
16 |
17 | if __name__ == "__main__":
18 |     batch_size = 32
19 |     learning_rate = 0.001
20 |     hidden_size = 128
21 |     epoch = 200
22 |
23 |     train_iter, vocab = dataset2dataloader(batch_size=batch_size)
24 |
25 |     vocab_size = len(vocab.stoi)
26 |     # print(vocab_size, hidden_size, batch_size)
27 |     model = PoetryModel(vocab_size=vocab_size, hidden_size=hidden_size, output_size=vocab_size)
28 |     optimizer = Adam(model.parameters(), lr=learning_rate)
29 |     criterion = nn.CrossEntropyLoss()
30 |
31 |     one_hot_embedding = nn.Embedding(vocab_size, vocab_size, _weight=torch.from_numpy(np.eye(vocab_size)))
32 |
33 |     model_path = "model.pkl"
34 |     if os.path.exists(model_path):
35 |         model = torch.load(model_path)
36 |     else:
37 |         for ep in tqdm(range(epoch)):
38 |             model.train()
39 |             total_loss = 0
40 |             for i, batch in enumerate(train_iter):
41 |                 optimizer.zero_grad()
42 |                 sent = batch.sent.t()
43 |                 x, y = sent[:, :-1], sent[:, 1:]
44 |                 x = one_hot_embedding(x).float()
45 |                 init_hidden = torch.zeros(1, len(x), hidden_size)
46 |                 output, _ = model(x, init_hidden)
47 |                 output = output.reshape(-1, output.shape[-1])
48 |                 y = y.flatten()
49 |                 loss = criterion(output, y)
50 |                 loss.backward()
51 |                 optimizer.step()
52 |                 total_loss += loss.item()
53 |             if ep % (epoch // 10) == 0:
54 |                 print("loss: ", total_loss)
55 |         torch.save(model, model_path)
56 |
57 |     model.eval()
58 |     # test = ["我好可爱"] 我病恨无我,。好一解颜色。可怜王经行自远,一解颜色。爱绿溪阴。
59 |     # test = ["花开有情"] 花边行县柳,河桥晚泊船。开远树,山鸟助酣歌。有情何处,箫管凤初来。情何处所,风吹青珊瑚,可怜王孙立
60 |     test = [""]
61 |     for sent in test:
62 |         sent = list(map(lambda x: vocab.stoi[x], list(sent)))
63 |         x = torch.tensor(sent).unsqueeze(0)
64 |         x = one_hot_embedding(x).float()
65 |         with torch.no_grad():
66 |             output = model.generate(x, stoi=vocab.stoi, poetry_type="hidden head")
67 |         ans = torch.cat(output, dim=1).argmax(-1).squeeze(0)
68 |         for word_id in ans:
69 |             print(vocab.itos[word_id.item()], end="")
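The training loop above is a character-level language model: the target sequence is the input shifted left by one position, so the model learns to predict the next character at every step. A small sketch of that shift, with made-up token ids rather than a real batch:

```python
import torch

# (batch=1, seq_len=5) sequence of vocabulary indices, e.g. start token, three characters, end token
sent = torch.tensor([[2, 14, 7, 9, 3]])
x, y = sent[:, :-1], sent[:, 1:]   # input drops the last token, target drops the first
print(x)   # tensor([[ 2, 14,  7,  9]])
print(y)   # tensor([[14,  7,  9,  3]])
```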
= [""] 61 | for sent in test: 62 | sent = list(map(lambda x: vocab.stoi[x], list(sent))) 63 | x = torch.tensor(sent).unsqueeze(0) 64 | x = one_hot_embedding(x).float() 65 | with torch.no_grad(): 66 | output = model.generate(x, stoi=vocab.stoi, poetry_type="hidden head") 67 | ans = torch.cat(output, dim=1).argmax(-1).squeeze(0) 68 | for word_id in ans: 69 | print(vocab.itos[word_id.item()], end="") -------------------------------------------------------------------------------- /project5-Text Generation/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/positivepeng/nlp-beginner-projects/2f066cbdd24121938b90ee1d12fe063ad0a46df5/project5-Text Generation/model.pkl -------------------------------------------------------------------------------- /project5-Text Generation/model_debug.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/positivepeng/nlp-beginner-projects/2f066cbdd24121938b90ee1d12fe063ad0a46df5/project5-Text Generation/model_debug.pkl -------------------------------------------------------------------------------- /project5-Text Generation/models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Created on 2020/6/18 11:05 5 | @author: phil 6 | """ 7 | 8 | import torch.nn as nn 9 | import torch 10 | 11 | 12 | class PoetryModel(nn.Module): 13 | def __init__(self, vocab_size, hidden_size, output_size, dropout=0.5): 14 | super(PoetryModel, self).__init__() 15 | self.hidden_size = hidden_size 16 | self.gru = nn.GRU(input_size=vocab_size, hidden_size=hidden_size, dropout=dropout, batch_first=True) 17 | self.out = nn.Linear(hidden_size, output_size) 18 | 19 | def forward(self, x, init_hidden): 20 | # print(x.shape, init_hidden.shape) 21 | seq_out, hn = self.gru(x, init_hidden) 22 | output = self.out(seq_out) 23 | return output, hn 24 | 25 | def generate(self, x, stoi, poetry_type="begin", sent_num=4, max_len=15): 26 | init_hidden = torch.zeros(1, 1, self.hidden_size) 27 | output = [] 28 | if poetry_type == "hidden head" and x.shape[1] != sent_num: 29 | print("ERROR:选择了藏头诗但是输入字的个数不等于诗的句子数") 30 | return 31 | 32 | hn = init_hidden 33 | for i in range(sent_num): 34 | if i == 0 and poetry_type == "begin": 35 | seq_out, hn = self.gru(x, hn) 36 | seq_out = seq_out[:, -1, :].unsqueeze(1) 37 | output.append(x) 38 | if poetry_type == "hidden head": 39 | seq_out, hn = self.gru(x[:, i, :].unsqueeze(1), hn) 40 | seq_out = seq_out[:, -1, :].unsqueeze(1) 41 | output.append(x[:, i, :].unsqueeze(1)) 42 | for j in range(max_len): # 每一句的最大长度 43 | # 上一个time step的输出 44 | _, topi = self.out(seq_out).data.topk(1) 45 | topi = topi.item() 46 | xi_from_output = torch.zeros(1, 1, x.shape[-1]) 47 | xi_from_output[0][0][topi] = 1 48 | output.append(xi_from_output) 49 | seq_out, hn = self.gru(xi_from_output, hn) 50 | if topi == stoi["。"]: 51 | break 52 | return output 53 | --------------------------------------------------------------------------------