├── .gitignore ├── README.md ├── project1-ML-Sentence Classification ├── data_preprocess.py ├── feature_extraction.py ├── main.py └── softmax_regerssion.py ├── project2-DL-Sentence Classification ├── Convolutional Neural Networks for Sentence Classification.pdf ├── dataloader_byhand.py ├── dataloader_bytorchtext.py ├── main.py └── models.py ├── project3-Named Entity Recognition ├── dataloader.py ├── main.py ├── model.pkl ├── models.py └── torchcrf │ └── __init__.py ├── project4-Machine Translation ├── dataloader.py ├── main.py └── models.py └── project5-Text Generation ├── dataloader.py ├── main.py ├── model.pkl ├── model_debug.pkl └── models.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # nlp-beginner-projects 2 | NLP常见任务实现(pytorch版) 3 | 4 | - [x] 项目一:基于softmax regression的文本多分类
5 | Blog post: [Text multi-class classification with softmax regression](https://blog.csdn.net/philpanic9/article/details/106606415) 6 | - [x] Project 2: Text multi-class classification with RNNs and CNNs<br>
7 | Blog post: [Text multi-class classification with RNNs and CNNs](https://blog.csdn.net/philpanic9/article/details/106728786)<br>
8 | - [x] Project 3: Named entity recognition with BiLSTM-CRF<br>
9 | Blog post: [BiLSTM-CRF-based entity recognition](https://blog.csdn.net/philpanic9/article/details/106742297)<br>
10 | > Feed in a sentence to be tagged and get the predicted tag sequence (a minimal inference sketch follows the project list below)<br>
11 | My name is Phil , I am from European Union . -->
12 | ['O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'O'] 13 | 14 | - [x] Project 4: Machine translation<br>
15 | Blog post: [Seq2Seq (with attention) machine translation](https://blog.csdn.net/philpanic9/article/details/106806350)<br>
16 | > **Input a human-readable date, output the machine-readable date**<br>
17 | monday may 7 1983 --> 1983-05-07
18 | 19 march 1998 --> 1998-03-19
19 | 18 jul 2008 --> 2008-07-18
20 | 9/10/70 --> 1970-09-10
21 | thursday january 1 1981 --> 1981-01-01
22 | thursday january 26 2015 --> 2015-01-26
23 | saturday april 18 1990 --> 1990-04-18
24 | sunday may 12 1988 --> 1988-05-12
25 | - [x] Project 5: Text generation<br>
26 | Blog post: https://blog.csdn.net/philpanic9/article/details/106878540<br>
27 | > **Input "我好可爱" to generate an acrostic poem**<br>
28 | 我病恨无我,。
29 | 好一解颜色。
30 | 可怜王经行自远,一解颜色。
31 | 爱绿溪阴。
32 | > **Input "花开有情" to generate an acrostic poem**<br>
33 | 花边行县柳,河桥晚泊船。
34 | 开远树,山鸟助酣歌。
35 | 有情何处,箫管凤初来。
36 | 情何处所,风吹青珊瑚,可怜王孙立
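
Taking project 3 as an example, the sketch below shows roughly how a trained checkpoint can be reloaded for tagging. It assumes the `model.pkl` file saved by `project3-Named Entity Recognition/main.py`, the vocabularies rebuilt by `dataloader.dataset2dataloader`, and torchtext's default `<pad>` token; it mirrors the test code in that script rather than adding anything new.

```python
# Minimal sketch: load the saved BiLSTM-CRF checkpoint and tag one sentence.
# Assumes model.pkl was produced by project3's main.py and that the pad
# token is torchtext's default "<pad>".
import torch
from dataloader import dataset2dataloader

# rebuild the vocabularies that were used during training
_, _, sent_vocab, tag_vocab = dataset2dataloader(batch_size=128)
model = torch.load("model.pkl")
model.eval()

sent = "My name is Phil , I am from European Union ."
ids = [sent_vocab.stoi[word] for word in sent.split(" ")]
input_tensor = torch.tensor([ids])
mask = input_tensor != sent_vocab.stoi["<pad>"]
with torch.no_grad():
    pred = model.predict(input_tensor, mask)  # Viterbi-decoded tag ids
print(sent, "-->", [tag_vocab.itos[tag_id] for tag_id in pred[0]])
```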
37 | 38 | 39 | 相关链接: 40 | 1. [项目1数据集](https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews) 41 | 2. [fdu nlp-beginner](https://github.com/FudanNLP/nlp-beginner) 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /project1-ML-Sentence Classification/data_preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Created on 2020/4/15 21:01 5 | @author: phil 6 | """ 7 | 8 | import pandas as pd 9 | 10 | 11 | def read_data(train_file="../dataset/kaggle-movie-review/train.tsv"): 12 | train_df = pd.read_csv(train_file, sep='\t') 13 | # test_df = pd.read_csv(test_file, sep="\t") 14 | return train_df["Phrase"].values, train_df["Sentiment"].values 15 | 16 | 17 | if __name__ == "__main__": 18 | X_data, y_data = read_data() 19 | print("train size", len(X_data)) 20 | -------------------------------------------------------------------------------- /project1-ML-Sentence Classification/feature_extraction.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Created on 2020/4/15 21:16 5 | @author: phil 6 | """ 7 | 8 | import numpy as np 9 | 10 | 11 | class BagOfWord: 12 | def __init__(self, do_lower_case=False): 13 | self.vocab = {} 14 | self.do_lower_case = do_lower_case 15 | 16 | def fit(self, sent_list): 17 | # sent_list 类型为 List 18 | for sent in sent_list: 19 | if self.do_lower_case: 20 | sent = sent.lower() 21 | words = sent.strip().split(" ") 22 | for word in words: 23 | if word not in self.vocab: 24 | self.vocab[word] = len(self.vocab) 25 | 26 | def transform(self, sent_list): 27 | vocab_size = len(self.vocab) 28 | bag_of_word_feature = np.zeros((len(sent_list), vocab_size)) 29 | for idx, sent in enumerate(sent_list): 30 | if self.do_lower_case: 31 | sent = sent.lower() 32 | words = sent.strip().split(" ") 33 | for word in words: 34 | bag_of_word_feature[idx][self.vocab[word]] += 1 35 | return bag_of_word_feature 36 | 37 | def fit_transform(self, sent_list): 38 | self.fit(sent_list) 39 | return self.transform(sent_list) 40 | 41 | 42 | class NGram: 43 | def __init__(self, ngram, do_lower_case=False): 44 | self.ngram = ngram 45 | self.feature_map = {} 46 | self.do_lower_case = do_lower_case 47 | 48 | def fit(self, sentList): 49 | for gram in self.ngram: 50 | for sent in sentList: 51 | if self.do_lower_case: 52 | sent = sent.lower() 53 | sent = sent.split(" ") 54 | for i in range(len(sent) - gram + 1): 55 | feature = "_".join(sent[i:i + gram]) 56 | if feature not in self.feature_map: 57 | self.feature_map[feature] = len(self.feature_map) 58 | 59 | def transform(self, sentList): 60 | n = len(sentList) 61 | m = len(self.feature_map) 62 | ngram_feature = np.zeros((n, m)) 63 | for idx, sent in enumerate(sentList): 64 | if self.do_lower_case: 65 | sent = sent.lower() 66 | sent = sent.split(" ") 67 | for gram in self.ngram: 68 | for i in range(len(sent) - gram + 1): 69 | feature = "_".join(sent[i:i + gram]) 70 | if feature in self.feature_map: 71 | ngram_feature[idx][self.feature_map[feature]] = 1 72 | return ngram_feature 73 | 74 | def fit_transform(self, sentList): 75 | self.fit(sentList) 76 | return self.transform(sentList) 77 | 78 | 79 | if __name__ == "__main__": 80 | gram = NGram((1, 2)) 81 | sents = ["I love you", "do you love yourself"] 82 | feature = gram.fit_transform(sents) 83 | print(gram.feature_map) 84 | print(feature) 85 | 
-------------------------------------------------------------------------------- /project1-ML-Sentence Classification/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Created on 2020/4/15 21:24 5 | @author: phil 6 | """ 7 | 8 | import numpy as np 9 | from data_preprocess import read_data 10 | from feature_extraction import BagOfWord, NGram 11 | from softmax_regerssion import SoftmaxRegression 12 | import matplotlib.pyplot as plt 13 | from sklearn.model_selection import train_test_split 14 | 15 | if __name__ == '__main__': 16 | debug = 1 17 | # 读入数据 18 | X_data, y_data = read_data() 19 | 20 | if debug == 1: 21 | # index = np.arange(len(X_data)) 22 | # np.random.shuffle(index) 23 | # X_data = X_data[index[:2000]] 24 | # y_data = y_data[index[:2000]] 25 | X_data = X_data[:1000] 26 | y_data = y_data[:1000] 27 | y = np.array(y_data).reshape(len(y_data), 1) 28 | 29 | # 数据集划分 30 | bag_of_word_model = BagOfWord(do_lower_case=True) 31 | ngram_model = NGram(ngram=(1, 2), do_lower_case=True) 32 | X_Bow = bag_of_word_model.fit_transform(X_data) 33 | X_Gram = ngram_model.fit_transform(X_data) 34 | 35 | print("Bow shape", X_Bow.shape) 36 | print("Gram shape", X_Gram.shape) 37 | 38 | X_train_Bow, X_test_Bow, y_train_Bow, y_test_Bow = train_test_split(X_Bow, y, test_size=0.2, random_state=42, stratify=y) 39 | X_train_Gram, X_test_Gram, y_train_Gram, y_test_Gram = train_test_split(X_Gram, y, test_size=0.2, random_state=42, stratify=y) 40 | 41 | # 训练模型 不同特征的差别 42 | epoch = 100 43 | bow_learning_rate = 1 44 | gram_learning_rate = 1 45 | 46 | # 梯度下降 47 | model1 = SoftmaxRegression() 48 | history = model1.fit(X_train_Bow, y_train_Bow, epoch=epoch, learning_rate=bow_learning_rate, print_loss_steps=epoch//10, update_strategy="stochastic") 49 | plt.plot(np.arange(len(history)), np.array(history)) 50 | plt.show() 51 | print("Bow train {} test {}".format(model1.score(X_train_Bow, y_train_Bow), model1.score(X_test_Bow, y_test_Bow))) 52 | 53 | model2 = SoftmaxRegression() 54 | history = model2.fit(X_train_Gram, y_train_Gram, epoch=epoch, learning_rate=gram_learning_rate, print_loss_steps=epoch//10, update_strategy="stochastic") 55 | plt.plot(np.arange(len(history)), np.array(history)) 56 | plt.show() 57 | print("Gram train {} test {}".format(model2.score(X_train_Gram, y_train_Gram), model2.score(X_test_Gram, y_test_Gram))) 58 | 59 | # 样本数量:20000 60 | # epoch = 100 61 | # bow_learning_rate = 0.001 62 | # gram_learning_rate = 0.5 63 | # Bow train 0.7094375 test 0.4885 64 | # Gram train 0.9786875 test 0.5335 -------------------------------------------------------------------------------- /project1-ML-Sentence Classification/softmax_regerssion.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Created on 2020/4/15 20:58 5 | @author: phil 6 | """ 7 | 8 | import numpy as np 9 | 10 | 11 | def softmax(z): 12 | # 稳定版本的softmax,对z的每一行进行softmax 13 | z -= np.max(z, axis=1, keepdims=True) # 先减去该行的最大值 14 | z = np.exp(z) 15 | z /= np.sum(z, axis=1, keepdims=True) 16 | return z 17 | 18 | 19 | class SoftmaxRegression: 20 | def __init__(self): 21 | self.num_of_class = None # 类别数量 22 | self.n = None # 数据个数 23 | self.m = None # 数据维度 24 | self.weight = None # 模型权重 shape (类别数,数据维度) 25 | self.learning_rate = None 26 | 27 | def fit(self, X, y, learning_rate=0.01, epoch=10, num_of_class=5, print_loss_steps=-1, update_strategy="batch"): 
28 | self.n, self.m = X.shape 29 | self.num_of_class = num_of_class 30 | self.weight = np.random.randn(self.num_of_class, self.m) 31 | self.learning_rate = learning_rate 32 | 33 | # 将y换为独热码矩阵,每一行独热码表示一个label 34 | y_one_hot = np.zeros((self.n, self.num_of_class)) 35 | for i in range(self.n): 36 | y_one_hot[i][y[i]] = 1 37 | 38 | loss_history = [] 39 | 40 | for e in range(epoch): 41 | # X (n, m) 每一行表示一个样本 42 | # weight (C, m) 每一行处理一个类别 43 | loss = 0 44 | if update_strategy == "stochastic": 45 | rand_index = np.arange(len(X)) 46 | np.random.shuffle(rand_index) 47 | for index in list(rand_index): 48 | Xi = X[index].reshape(1, -1) 49 | prob = Xi.dot(self.weight.T) 50 | prob = softmax(prob).flatten() 51 | loss += -np.log(prob[y[index]]) 52 | self.weight += Xi.reshape(1, self.m).T.dot((y_one_hot[index] - prob).reshape(1, self.num_of_class)).T 53 | 54 | if update_strategy == "batch": 55 | prob = X.dot(self.weight.T) # (n, C) 每个样本被预测为各个类别 56 | prob = softmax(prob) 57 | 58 | for i in range(self.n): 59 | loss -= np.log(prob[i][y[i]]) 60 | 61 | # 书中给的损失函数 62 | weight_update = np.zeros_like(self.weight) 63 | for i in range(self.n): 64 | weight_update += X[i].reshape(1, self.m).T.dot((y_one_hot[i] - prob[i]).reshape(1, self.num_of_class)).T 65 | self.weight += weight_update * self.learning_rate / self.n 66 | 67 | loss /= self.n 68 | loss_history.append(loss) 69 | if print_loss_steps != -1 and e % print_loss_steps == 0: 70 | print("epoch {} loss {}".format(e, loss)) 71 | return loss_history 72 | 73 | def predict(self, X): 74 | prob = softmax(X.dot(self.weight.T)) 75 | return prob.argmax(axis=1) 76 | 77 | def score(self, X, y): 78 | pred = self.predict(X) 79 | return np.sum(pred.reshape(y.shape) == y) / y.shape[0] -------------------------------------------------------------------------------- /project2-DL-Sentence Classification/Convolutional Neural Networks for Sentence Classification.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/positivepeng/nlp-beginner-projects/2f066cbdd24121938b90ee1d12fe063ad0a46df5/project2-DL-Sentence Classification/Convolutional Neural Networks for Sentence Classification.pdf -------------------------------------------------------------------------------- /project2-DL-Sentence Classification/dataloader_byhand.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Created on 2020/6/12 15:25 5 | @author: phil 6 | """ 7 | 8 | import pandas as pd 9 | import os 10 | import numpy as np 11 | import torch 12 | from sklearn.model_selection import train_test_split 13 | from torch.nn.utils.rnn import pad_sequence 14 | from torch.utils.data import Dataset, DataLoader 15 | 16 | 17 | def prepare_data(dataset_path, sent_col_name, label_col_name): 18 | """ 读出tsv中的句子和标签 """ 19 | file_path = os.path.join(dataset_path, "train.tsv") 20 | data = pd.read_csv(file_path, sep="\t") 21 | X = data[sent_col_name].values 22 | y = data[label_col_name].values 23 | return X, y 24 | 25 | 26 | class Language: 27 | """ 根据句子列表建立词典并将单词列表转换为数值型表示 """ 28 | def __init__(self): 29 | self.word2id = {} 30 | self.id2word = {} 31 | 32 | def fit(self, sent_list): 33 | vocab = set() 34 | for sent in sent_list: 35 | vocab.update(sent.split(" ")) 36 | word_list = ["", ""] + list(vocab) 37 | self.word2id = {word: i for i, word in enumerate(word_list)} 38 | self.id2word = {i: word for i, word in enumerate(word_list)} 39 | 40 | def transform(self, sent_list, 
reverse=False): 41 | sent_list_id = [] 42 | word_mapper = self.word2id if not reverse else self.id2word 43 | unk = self.word2id[""] if not reverse else None 44 | for sent in sent_list: 45 | sent_id = list(map(lambda x: word_mapper.get(x, unk), sent.split(" ") if not reverse else sent)) 46 | sent_list_id.append(sent_id) 47 | return sent_list_id 48 | 49 | 50 | class ClsDataset(Dataset): 51 | """ 文本分类数据集 """ 52 | def __init__(self, sents, labels): 53 | self.sents = sents 54 | self.labels = labels 55 | 56 | def __getitem__(self, item): 57 | return self.sents[item], self.labels[item] 58 | 59 | def __len__(self): 60 | return len(self.sents) 61 | 62 | 63 | def collate_fn(batch_data): 64 | """ 自定义一个batch里面的数据的组织方式 """ 65 | batch_data.sort(key=lambda data_pair: len(data_pair[0]), reverse=True) 66 | 67 | sents, labels = zip(*batch_data) 68 | sents_len = [len(sent) for sent in sents] 69 | sents = [torch.LongTensor(sent) for sent in sents] 70 | padded_sents = pad_sequence(sents, batch_first=True, padding_value=0) 71 | 72 | return torch.LongTensor(padded_sents), torch.LongTensor(labels), torch.FloatTensor(sents_len) 73 | 74 | 75 | def get_wordvec(word2id, vec_file_path, vec_dim=50): 76 | """ 读出txt文件的预训练词向量 """ 77 | print("开始加载词向量") 78 | word_vectors = torch.nn.init.xavier_uniform_(torch.empty(len(word2id), vec_dim)) 79 | word_vectors[0, :] = 0 # 80 | found = 0 81 | with open(vec_file_path, "r", encoding="utf-8") as f: 82 | lines = f.readlines() 83 | for line in lines: 84 | splited = line.split(" ") 85 | if splited[0] in word2id: 86 | found += 1 87 | word_vectors[word2id[splited[0]]] = torch.tensor(list(map(lambda x: float(x), splited[1:]))) 88 | if found == len(word2id) - 1: # 允许找不到 89 | break 90 | print("总共 %d个词,其中%d个找到了对应的词向量" % (len(word2id), found)) 91 | return word_vectors.float() 92 | 93 | 94 | def make_dataloader(dataset_path="../dataset/kaggle-movie-review", sent_col_name="Phrase", label_col_name="Sentiment", batch_size=32, vec_file_path="./.vector_cache/glove.6B.50d.txt", debug=False): 95 | # X, y = prepare_datapairs(dataset_path="../dataset/imdb", sent_col_name="review", label_col_name="sentiment") 96 | X, y = prepare_data(dataset_path=dataset_path, sent_col_name=sent_col_name, label_col_name=label_col_name) 97 | 98 | if debug: 99 | X, y = X[:100], y[:100] 100 | 101 | X_language = Language() 102 | X_language.fit(X) 103 | X = X_language.transform(X) 104 | 105 | word_vectors = get_wordvec(X_language.word2id, vec_file_path=vec_file_path, vec_dim=50) 106 | # 总共 18229个词,其中12769个找到了对应的词向量 word_vectors = get_wordvec(X_language.word2id, 107 | # vec_file_path=r"F:\NLP-pretrained-model\glove.twitter.27B\glove.twitter.27B.50d.txt", vec_dim=50) 108 | 109 | # 测试 110 | # print(X[:2]) 111 | # X_id = X_language.transform(X[:2]) 112 | # print(X_language.transform(X_id, reverse=True)) 113 | 114 | X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42) 115 | 116 | cls_train_dataset, cls_val_dataset = ClsDataset(X_train, y_train), ClsDataset(X_val, y_val) 117 | cls_train_dataloader = DataLoader(cls_train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn) 118 | cls_val_dataloader = DataLoader(cls_val_dataset, batch_size=batch_size, collate_fn=collate_fn) 119 | 120 | return cls_train_dataloader, cls_val_dataloader, word_vectors, X_language 121 | 122 | 123 | if __name__ == "__main__": 124 | cls_train_dataloader, cls_val_dataloader, word_vectors, X_language = make_dataloader(debug=True, batch_size=10) 125 | for batch in cls_train_dataloader: 126 | X, y, lens = batch 
127 | print(X.shape, y.shape) 128 | break 129 | -------------------------------------------------------------------------------- /project2-DL-Sentence Classification/dataloader_bytorchtext.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Created on 2020/5/4 15:35 5 | @author: phil 6 | """ 7 | import os 8 | 9 | import pandas as pd 10 | import spacy 11 | from sklearn.model_selection import train_test_split 12 | from torch.nn import init 13 | from torchtext import data 14 | 15 | 16 | def prepare_data(dataset_path, sent_col_name, label_col_name, debug=False): 17 | """ 读出tsv中的句子和标签 """ 18 | file_path = os.path.join(dataset_path, "train.tsv") 19 | data = pd.read_csv(file_path, sep="\t") 20 | if debug: 21 | data = data.sample(n=100) 22 | X = data[sent_col_name].values 23 | y = data[label_col_name].values 24 | X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42) 25 | train_df, val_df = pd.DataFrame(), pd.DataFrame() 26 | train_df["sent"], train_df["label"] = X_train, y_train 27 | val_df["sent"], val_df["label"] = X_val, y_val 28 | 29 | train_file_path = os.path.join(dataset_path, "train.csv") 30 | val_file_path = os.path.join(dataset_path, "val.csv") 31 | train_df.to_csv(train_file_path, index=False) 32 | val_df.to_csv(val_file_path, index=False) 33 | 34 | return train_file_path, val_file_path 35 | 36 | 37 | def dataset2dataloader(dataset_path="../dataset/kaggle-movie-review", sent_col_name="Phrase", label_col_name="Sentiment", batch_size=32, vec_file_path="./.vector_cache/glove.6B.50d.txt", debug=False): 38 | train_file_name, val_file_name = prepare_data(dataset_path, sent_col_name, label_col_name, debug=debug) 39 | spacy_en = spacy.load('en_core_web_sm') 40 | 41 | def tokenizer(text): 42 | """ 定义分词操作 """ 43 | return [tok.text for tok in spacy_en.tokenizer(text)] 44 | 45 | # 这里只是定义了数据格式 46 | TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True) 47 | LABEL = data.Field(sequential=False, use_vocab=False) 48 | train, val = data.TabularDataset.splits( 49 | path='', train=train_file_name, validation=val_file_name, format='csv', skip_header=True, 50 | fields=[('sent', TEXT), ('label', LABEL)]) 51 | 52 | TEXT.build_vocab(train, vectors='glove.6B.50d') # , max_size=30000) 53 | # 当 corpus 中有的 token 在 vectors 中不存在时 的初始化方式. 
54 | TEXT.vocab.vectors.unk_init = init.xavier_uniform 55 | 56 | DEVICE = "cpu" 57 | train_iter = data.BucketIterator(train, batch_size=batch_size, sort_key=lambda x: len(x.review), device=DEVICE) 58 | val_iter = data.BucketIterator(val, batch_size=batch_size, sort_key=lambda x: len(x.review), shuffle=True, device=DEVICE) 59 | 60 | # 在 test_iter , sort一定要设置成 False, 要不然会被 torchtext 搞乱样本顺序 61 | # test_iter = data.Iterator(dataset=test, batch_size=128, train=False, sort=False, device=DEVICE) 62 | 63 | return train_iter, val_iter, TEXT.vocab.vectors 64 | 65 | 66 | if __name__ == "__main__": 67 | train_iter, val_iter, vectors = dataset2dataloader(batch_size=32, debug=True) 68 | 69 | batch = next(iter(train_iter)) 70 | print(batch.sent.shape) 71 | print(batch.label.shape) 72 | -------------------------------------------------------------------------------- /project2-DL-Sentence Classification/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Created on 2020/4/30 8:33 5 | @author: phil 6 | """ 7 | from torch import optim 8 | import torch 9 | from models import TextRNN, TextCNN 10 | from dataloader_bytorchtext import dataset2dataloader 11 | from dataloader_byhand import make_dataloader 12 | import numpy as np 13 | 14 | if __name__ == "__main__": 15 | model_names = ["LSTM", "RNN", "CNN"] # 彩蛋:按过拟合难度排序,由难到易 16 | learning_rate = 0.001 17 | epoch_num = 500 18 | num_of_class = 5 19 | load_data_by_torchtext = True 20 | 21 | if load_data_by_torchtext: 22 | train_iter, val_iter, word_vectors = dataset2dataloader(batch_size=100, debug=True) 23 | else: 24 | train_iter, val_iter, word_vectors, X_lang = make_dataloader(batch_size=100, debug=True) 25 | 26 | for model_name in model_names[-1:]: 27 | if model_name == "RNN": 28 | model = TextRNN(vocab_size=len(word_vectors), embedding_dim=50, hidden_size=128, num_of_class=num_of_class, weights=word_vectors) 29 | elif model_name == "CNN": 30 | model = TextCNN(vocab_size=len(word_vectors), embedding_dim=50, num_of_class=num_of_class, embedding_vectors=word_vectors) 31 | elif model_name == "LSTM": 32 | model = TextRNN(vocab_size=len(word_vectors), embedding_dim=50, hidden_size=128, num_of_class=num_of_class, weights=word_vectors, rnn_type="LSTM") 33 | optimizer = optim.Adam(model.parameters(), lr=learning_rate) 34 | loss_fun = torch.nn.CrossEntropyLoss() 35 | 36 | for epoch in range(epoch_num): 37 | model.train() # 包含dropout或者BN的模型需要指定 38 | for i, batch in enumerate(train_iter): 39 | if load_data_by_torchtext: 40 | x, y = batch.sent.t(), batch.label 41 | else: 42 | x, y, lens = batch 43 | logits = model(x) 44 | optimizer.zero_grad() 45 | loss = loss_fun(logits, y) 46 | loss.backward() 47 | optimizer.step() 48 | 49 | # with torch.no_grad(): 50 | model.eval() 51 | train_accs = [] 52 | for i, batch in enumerate(train_iter): 53 | if load_data_by_torchtext: 54 | x, y = batch.sent.t(), batch.label 55 | else: 56 | x, y, lens = batch 57 | _, y_pre = torch.max(logits, -1) 58 | acc = torch.mean((torch.tensor(y_pre == y, dtype=torch.float))) 59 | train_accs.append(acc) 60 | train_acc = np.array(train_accs).mean() 61 | 62 | val_accs = [] 63 | for i, batch in enumerate(val_iter): 64 | if load_data_by_torchtext: 65 | x, y = batch.sent.t(), batch.label 66 | else: 67 | x, y, lens = batch 68 | logits = model(x) 69 | _, y_pre = torch.max(logits, -1) 70 | acc = torch.mean((torch.tensor(y_pre == y, dtype=torch.float))) 71 | val_accs.append(acc) 72 | val_acc = 
np.array(val_accs).mean() 73 | print("epoch %d train acc:%.2f, val acc:%.2f" % (epoch, train_acc, val_acc)) 74 | if train_acc >= 0.99: 75 | break 76 | 77 | -------------------------------------------------------------------------------- /project2-DL-Sentence Classification/models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Created on 2020/5/15 22:23 5 | @author: phil 6 | """ 7 | import torch.nn as nn 8 | import torch 9 | import torch.nn.functional as F 10 | 11 | 12 | class TextRNN(nn.Module): 13 | def __init__(self, vocab_size, embedding_dim, hidden_size, num_of_class, weights=None, rnn_type="RNN"): 14 | super(TextRNN, self).__init__() 15 | 16 | self.vocab_size = vocab_size 17 | self.hidden_size = hidden_size 18 | self.num_of_class = num_of_class 19 | self.embedding_dim = embedding_dim 20 | self.rnn_type = rnn_type 21 | 22 | if weights is not None: 23 | self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, _weight=weights) 24 | else: 25 | self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim) 26 | 27 | if rnn_type == "RNN": 28 | self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True) 29 | self.hidden2label = nn.Linear(hidden_size, num_of_class) 30 | elif rnn_type == "LSTM": 31 | self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True, bidirectional=True) 32 | self.hidden2label = nn.Linear(hidden_size*2, num_of_class) 33 | 34 | def forward(self, input_sents): 35 | # input_sents (batch_size, seq_len) 36 | batch_size, seq_len = input_sents.shape 37 | # (batch_size, seq_len, embedding_dim) 38 | embed_out = self.embed(input_sents) 39 | 40 | if self.rnn_type == "RNN": 41 | h0 = torch.randn(1, batch_size, self.hidden_size) 42 | _, hn = self.rnn(embed_out, h0) 43 | elif self.rnn_type == "LSTM": 44 | h0, c0 = torch.randn(2, batch_size, self.hidden_size), torch.randn(2, batch_size, self.hidden_size) 45 | output, (hn, _) = self.lstm(embed_out, (h0, c0)) 46 | 47 | logits = self.hidden2label(hn).squeeze(0) 48 | 49 | return logits 50 | 51 | 52 | class TextCNN(nn.Module): 53 | def __init__(self, vocab_size, embedding_dim, num_of_class, embedding_vectors=None, kernel_num=100, kerner_size=[3, 4, 5], dropout=0.5): 54 | super(TextCNN, self).__init__() 55 | if embedding_vectors is None: 56 | self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim) 57 | else: 58 | self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, _weight=embedding_vectors) 59 | self.convs = nn.ModuleList([nn.Conv2d(1, kernel_num, (K, embedding_dim)) for K in kerner_size]) 60 | self.dropout = nn.Dropout(dropout) 61 | self.feature2label = nn.Linear(3*kernel_num, num_of_class) 62 | 63 | def forward(self, x): 64 | # x shape (batch_size, seq_len) 65 | embed_out = self.embed(x).unsqueeze(1) 66 | conv_out = [F.relu(conv(embed_out)).squeeze(3) for conv in self.convs] 67 | 68 | pool_out = [F.max_pool1d(block, block.size(2)).squeeze(2) for block in conv_out] 69 | 70 | pool_out = torch.cat(pool_out, 1) 71 | 72 | logits = self.feature2label(pool_out) 73 | 74 | return logits 75 | 76 | 77 | if __name__ == "__main__": 78 | model = TextCNN(vocab_size=10, embedding_dim=10, num_of_class=10) 79 | x = torch.randint(10, (10, 20)) 80 | logits = model.forward(x) 81 | 82 | -------------------------------------------------------------------------------- /project3-Named Entity 
Recognition/dataloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Created on 2020/6/13 11:58 5 | @author: phil 6 | """ 7 | import os 8 | import pandas as pd 9 | import spacy 10 | from torch.nn import init 11 | from torchtext import data 12 | 13 | 14 | def prepare_data(dataset_path, debug=False): 15 | train_file_path = os.path.join(dataset_path, "train.txt") 16 | dev_file_path = os.path.join(dataset_path, "dev.txt") 17 | 18 | def process_file(file_path, target_file_path): 19 | sents, tags = [], [] 20 | with open(file_path, "r") as f: 21 | lines = f.readlines() 22 | sent, tag = [], [] 23 | for line in lines: 24 | line = line.strip() 25 | if len(line) == 0: 26 | sents.append(" ".join(sent)) 27 | tags.append(" ".join(tag)) 28 | sent, tag = [], [] 29 | else: 30 | splited = line.split(" ") 31 | sent.append(splited[0]) 32 | tag.append(splited[-1]) 33 | if len(sent) != 0: 34 | sents.append(" ".join(sent)) 35 | tags.append(" ".join(tag)) 36 | df = pd.DataFrame() 37 | df["sent"] = sents if not debug else sents[:100] 38 | df["tag"] = tags if not debug else tags[:100] 39 | df.to_csv(target_file_path, index=False) 40 | 41 | train_csv = os.path.join(dataset_path, "train.csv") if not debug else os.path.join(dataset_path, "train_small.csv") 42 | dev_csv = os.path.join(dataset_path, "dev.csv") if not debug else os.path.join(dataset_path, "train_dev.csv") 43 | 44 | if not os.path.exists(train_csv): 45 | process_file(train_file_path, train_csv) 46 | process_file(dev_file_path, dev_csv) 47 | 48 | return train_csv, dev_csv 49 | 50 | 51 | def dataset2dataloader(dataset_path="../dataset/conll2003-IOB", batch_size=3, debug=False): 52 | train_csv, dev_csv = prepare_data(dataset_path, debug=debug) 53 | 54 | def tokenizer(text): 55 | return text.split(" ") 56 | 57 | # 这里只是定义了数据格式 58 | TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=False) 59 | TAG = data.Field(sequential=True, tokenize=tokenizer, lower=False) 60 | train, val = data.TabularDataset.splits( 61 | path='', train=train_csv, validation=dev_csv, format='csv', skip_header=True, 62 | fields=[('sent', TEXT), ('tag', TAG)]) 63 | 64 | TEXT.build_vocab(train, vectors='glove.6B.50d') # , max_size=30000) 65 | TAG.build_vocab(val) 66 | 67 | # 当 corpus 中有的 token 在 vectors 中不存在时 的初始化方式. 
68 | TEXT.vocab.vectors.unk_init = init.xavier_uniform 69 | 70 | DEVICE = "cpu" 71 | train_iter = data.BucketIterator(train, batch_size=batch_size, sort_key=lambda x: len(x.sent), device=DEVICE) 72 | val_iter = data.BucketIterator(val, batch_size=batch_size, sort_key=lambda x: len(x.sent), device=DEVICE) 73 | 74 | # 在 test_iter , sort一定要设置成 False, 要不然会被 torchtext 搞乱样本顺序 75 | # test_iter = data.Iterator(dataset=test, batch_size=128, train=False, sort=False, device=DEVICE) 76 | 77 | return train_iter, val_iter, TEXT.vocab, TAG.vocab 78 | 79 | 80 | if __name__ == "__main__": 81 | # train_csv, dev_csv = prepare_data(dataset_path="../dataset/conll2003-IOB") 82 | train_iter, val_iter, sent_vocab, tag_vocab = dataset2dataloader(dataset_path="../dataset/conll2003-IOB", debug=True) 83 | word_vectors = sent_vocab.vectors 84 | 85 | for batch in train_iter: 86 | print(batch.sent.shape, batch.tag.shape) 87 | break -------------------------------------------------------------------------------- /project3-Named Entity Recognition/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Created on 2020/6/13 16:15 5 | @author: phil 6 | """ 7 | 8 | from dataloader import dataset2dataloader 9 | from models import BiLSTM_CRF_NER 10 | from torch.optim import Adam 11 | import torch 12 | import numpy as np 13 | import os 14 | 15 | if __name__ == "__main__": 16 | train_iter, val_iter, sent_vocab, tag_vocab = dataset2dataloader(batch_size=128) 17 | word_vectors = sent_vocab.vectors 18 | device = torch.device("cuda") if torch.cuda.is_available() else "cpu" 19 | 20 | model = BiLSTM_CRF_NER(vocab_size=len(sent_vocab.stoi), embedding_dim=50, hidden_size=128, num_tags=len(tag_vocab.stoi), word_vectors=word_vectors, device=device) 21 | 22 | epoch = 10 23 | learning_rate = 0.01 24 | model_path = "model.pkl" 25 | 26 | optimizer = Adam(model.parameters(), lr=learning_rate) 27 | 28 | if os.path.exists(model_path): 29 | model = torch.load(model_path) 30 | else: 31 | for ep in range(epoch): 32 | model.train() 33 | for i, batch in enumerate(train_iter): 34 | x, y = batch.sent.t(), batch.tag.t() 35 | mask = (x != sent_vocab.stoi[""]) 36 | optimizer.zero_grad() 37 | loss = model(x, y, mask) 38 | loss.backward() 39 | optimizer.step() 40 | if i % 100 == 0: 41 | print(f"epoch:{ep}, iter:{i}, loss:{loss.item()}", end=" ") 42 | 43 | model.eval() 44 | train_accs = [] 45 | preds, golds = [], [] 46 | for i, batch in enumerate(train_iter): 47 | x, y = batch.sent.t(), batch.tag.t() 48 | mask = (x != sent_vocab.stoi[""]) 49 | with torch.no_grad(): 50 | preds = model.predict(x, mask) 51 | right, total = 0, 0 52 | for pred, gold in zip(preds, y): 53 | right += np.sum(np.array(pred) == gold[:len(pred)].numpy()) 54 | total += len(pred) 55 | train_accs.append(right*1.0/total) 56 | train_acc = np.array(train_accs).mean() 57 | 58 | val_accs = [] 59 | for i, batch in enumerate(val_iter): 60 | x, y = batch.sent.t(), batch.tag.t() 61 | mask = (x != sent_vocab.stoi[""]) 62 | with torch.no_grad(): 63 | preds = model.predict(x, mask) 64 | right, total = 0, 0 65 | for pred, gold in zip(preds, y): 66 | right += np.sum(np.array(pred) == gold[:len(pred)].numpy()) 67 | total += len(pred) 68 | val_accs.append(right * 1.0 / total) 69 | val_acc = np.array(val_accs).mean() 70 | print("epoch %d train acc:%.2f, val acc:%.2f" % (epoch, train_acc, val_acc)) 71 | torch.save(model, model_path) 72 | test_sents = ["My name is Phil , I am from European Union ."] 73 | for 
sent in test_sents: 74 | ids = [sent_vocab.stoi[word] for word in sent.split(" ")] 75 | input_tensor = torch.tensor([ids]) 76 | mask = input_tensor != sent_vocab.stoi[""] 77 | with torch.no_grad(): 78 | pred = model.predict(input_tensor, mask) 79 | print(sent, "-->", [tag_vocab.itos[tag_id] for tag_id in pred[0]]) 80 | 81 | 82 | -------------------------------------------------------------------------------- /project3-Named Entity Recognition/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/positivepeng/nlp-beginner-projects/2f066cbdd24121938b90ee1d12fe063ad0a46df5/project3-Named Entity Recognition/model.pkl -------------------------------------------------------------------------------- /project3-Named Entity Recognition/models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Created on 2020/6/13 16:01 5 | @author: phil 6 | """ 7 | 8 | import torch.nn as nn 9 | from torchcrf import CRF 10 | import torch 11 | 12 | 13 | class BiLSTM_CRF_NER(nn.Module): 14 | def __init__(self, vocab_size, embedding_dim, hidden_size, num_tags, word_vectors=None, device="cpu"): 15 | super(BiLSTM_CRF_NER, self).__init__() 16 | self.device = device 17 | self.hidden_size = hidden_size 18 | self.embed = nn.Embedding(vocab_size, embedding_dim, _weight=word_vectors).to(device) 19 | self.lstm = nn.LSTM(embedding_dim, hidden_size, bidirectional=True, batch_first=True).to(device) 20 | self.hidden2tag = nn.Linear(hidden_size*2, num_tags) 21 | self.crf = CRF(num_tags=num_tags, batch_first=True).to(device) 22 | 23 | def forward(self, x, y, mask): 24 | emissions = self.get_emissions(x) 25 | loss = -self.crf(emissions=emissions, tags=y, mask=mask) 26 | return loss 27 | 28 | def predict(self, x, mask=None): 29 | emissions = self.get_emissions(x) 30 | preds = self.crf.decode(emissions, mask) 31 | return preds 32 | 33 | def get_emissions(self, x): 34 | batch_size, seq_len = x.shape 35 | embedded = self.embed(x) 36 | h0, c0 = torch.zeros(2, batch_size, self.hidden_size).to(self.device), torch.zeros(2, batch_size, self.hidden_size).to(self.device) 37 | lstm_out, (_, _) = self.lstm(embedded, (h0, c0)) 38 | emissions = self.hidden2tag(lstm_out) 39 | return emissions 40 | -------------------------------------------------------------------------------- /project3-Named Entity Recognition/torchcrf/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.7.2' 2 | 3 | from typing import List, Optional 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | 9 | class CRF(nn.Module): 10 | """Conditional random field. 11 | 12 | This module implements a conditional random field [LMP01]_. The forward computation 13 | of this class computes the log likelihood of the given sequence of tags and 14 | emission score tensor. This class also has `~CRF.decode` method which finds 15 | the best tag sequence given an emission score tensor using `Viterbi algorithm`_. 16 | 17 | Args: 18 | num_tags: Number of tags. 19 | batch_first: Whether the first dimension corresponds to the size of a minibatch. 20 | 21 | Attributes: 22 | start_transitions (`~torch.nn.Parameter`): Start transition score tensor of size 23 | ``(num_tags,)``. 24 | end_transitions (`~torch.nn.Parameter`): End transition score tensor of size 25 | ``(num_tags,)``. 
26 | transitions (`~torch.nn.Parameter`): Transition score tensor of size 27 | ``(num_tags, num_tags)``. 28 | 29 | 30 | .. [LMP01] Lafferty, J., McCallum, A., Pereira, F. (2001). 31 | "Conditional random fields: Probabilistic models for segmenting and 32 | labeling sequence data". *Proc. 18th International Conf. on Machine 33 | Learning*. Morgan Kaufmann. pp. 282–289. 34 | 35 | .. _Viterbi algorithm: https://en.wikipedia.org/wiki/Viterbi_algorithm 36 | """ 37 | 38 | def __init__(self, num_tags: int, batch_first: bool = False) -> None: 39 | if num_tags <= 0: 40 | raise ValueError(f'invalid number of tags: {num_tags}') 41 | super().__init__() 42 | self.num_tags = num_tags 43 | self.batch_first = batch_first 44 | self.start_transitions = nn.Parameter(torch.empty(num_tags)) 45 | self.end_transitions = nn.Parameter(torch.empty(num_tags)) 46 | self.transitions = nn.Parameter(torch.empty(num_tags, num_tags)) 47 | 48 | self.reset_parameters() 49 | 50 | def reset_parameters(self) -> None: 51 | """Initialize the transition parameters. 52 | 53 | The parameters will be initialized randomly from a uniform distribution 54 | between -0.1 and 0.1. 55 | """ 56 | nn.init.uniform_(self.start_transitions, -0.1, 0.1) 57 | nn.init.uniform_(self.end_transitions, -0.1, 0.1) 58 | nn.init.uniform_(self.transitions, -0.1, 0.1) 59 | 60 | def __repr__(self) -> str: 61 | return f'{self.__class__.__name__}(num_tags={self.num_tags})' 62 | 63 | def forward( 64 | self, 65 | emissions: torch.Tensor, 66 | tags: torch.LongTensor, 67 | mask: Optional[torch.ByteTensor] = None, 68 | reduction: str = 'sum', 69 | ) -> torch.Tensor: 70 | """Compute the conditional log likelihood of a sequence of tags given emission scores. 71 | 72 | Args: 73 | emissions (`~torch.Tensor`): Emission score tensor of size 74 | ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``, 75 | ``(batch_size, seq_length, num_tags)`` otherwise. 76 | tags (`~torch.LongTensor`): Sequence of tags tensor of size 77 | ``(seq_length, batch_size)`` if ``batch_first`` is ``False``, 78 | ``(batch_size, seq_length)`` otherwise. 79 | mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)`` 80 | if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise. 81 | reduction: Specifies the reduction to apply to the output: 82 | ``none|sum|mean|token_mean``. ``none``: no reduction will be applied. 83 | ``sum``: the output will be summed over batches. ``mean``: the output will be 84 | averaged over batches. ``token_mean``: the output will be averaged over tokens. 85 | 86 | Returns: 87 | `~torch.Tensor`: The log likelihood. This will have size ``(batch_size,)`` if 88 | reduction is ``none``, ``()`` otherwise. 
89 | """ 90 | self._validate(emissions, tags=tags, mask=mask) 91 | if reduction not in ('none', 'sum', 'mean', 'token_mean'): 92 | raise ValueError(f'invalid reduction: {reduction}') 93 | if mask is None: 94 | mask = torch.ones_like(tags, dtype=torch.uint8) 95 | 96 | if self.batch_first: 97 | emissions = emissions.transpose(0, 1) 98 | tags = tags.transpose(0, 1) 99 | mask = mask.transpose(0, 1) 100 | 101 | # shape: (batch_size,) 102 | numerator = self._compute_score(emissions, tags, mask) 103 | # shape: (batch_size,) 104 | denominator = self._compute_normalizer(emissions, mask) 105 | # shape: (batch_size,) 106 | llh = numerator - denominator 107 | 108 | if reduction == 'none': 109 | return llh 110 | if reduction == 'sum': 111 | return llh.sum() 112 | if reduction == 'mean': 113 | return llh.mean() 114 | assert reduction == 'token_mean' 115 | return llh.sum() / mask.float().sum() 116 | 117 | def decode(self, emissions: torch.Tensor, 118 | mask: Optional[torch.ByteTensor] = None) -> List[List[int]]: 119 | """Find the most likely tag sequence using Viterbi algorithm. 120 | 121 | Args: 122 | emissions (`~torch.Tensor`): Emission score tensor of size 123 | ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``, 124 | ``(batch_size, seq_length, num_tags)`` otherwise. 125 | mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)`` 126 | if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise. 127 | 128 | Returns: 129 | List of list containing the best tag sequence for each batch. 130 | """ 131 | self._validate(emissions, mask=mask) 132 | if mask is None: 133 | mask = emissions.new_ones(emissions.shape[:2], dtype=torch.uint8) 134 | 135 | if self.batch_first: 136 | emissions = emissions.transpose(0, 1) 137 | mask = mask.transpose(0, 1) 138 | 139 | return self._viterbi_decode(emissions, mask) 140 | 141 | def _validate( 142 | self, 143 | emissions: torch.Tensor, 144 | tags: Optional[torch.LongTensor] = None, 145 | mask: Optional[torch.ByteTensor] = None) -> None: 146 | if emissions.dim() != 3: 147 | raise ValueError(f'emissions must have dimension of 3, got {emissions.dim()}') 148 | if emissions.size(2) != self.num_tags: 149 | raise ValueError( 150 | f'expected last dimension of emissions is {self.num_tags}, ' 151 | f'got {emissions.size(2)}') 152 | 153 | if tags is not None: 154 | if emissions.shape[:2] != tags.shape: 155 | raise ValueError( 156 | 'the first two dimensions of emissions and tags must match, ' 157 | f'got {tuple(emissions.shape[:2])} and {tuple(tags.shape)}') 158 | 159 | if mask is not None: 160 | if emissions.shape[:2] != mask.shape: 161 | raise ValueError( 162 | 'the first two dimensions of emissions and mask must match, ' 163 | f'got {tuple(emissions.shape[:2])} and {tuple(mask.shape)}') 164 | no_empty_seq = not self.batch_first and mask[0].all() 165 | no_empty_seq_bf = self.batch_first and mask[:, 0].all() 166 | if not no_empty_seq and not no_empty_seq_bf: 167 | raise ValueError('mask of the first timestep must all be on') 168 | 169 | def _compute_score( 170 | self, emissions: torch.Tensor, tags: torch.LongTensor, 171 | mask: torch.ByteTensor) -> torch.Tensor: 172 | # emissions: (seq_length, batch_size, num_tags) 173 | # tags: (seq_length, batch_size) 174 | # mask: (seq_length, batch_size) 175 | assert emissions.dim() == 3 and tags.dim() == 2 176 | assert emissions.shape[:2] == tags.shape 177 | assert emissions.size(2) == self.num_tags 178 | assert mask.shape == tags.shape 179 | assert mask[0].all() 180 | 181 | seq_length, 
batch_size = tags.shape 182 | mask = mask.float() 183 | 184 | # Start transition score and first emission 185 | # shape: (batch_size,) 186 | score = self.start_transitions[tags[0]] 187 | score += emissions[0, torch.arange(batch_size), tags[0]] 188 | 189 | for i in range(1, seq_length): 190 | # Transition score to next tag, only added if next timestep is valid (mask == 1) 191 | # shape: (batch_size,) 192 | score += self.transitions[tags[i - 1], tags[i]] * mask[i] 193 | 194 | # Emission score for next tag, only added if next timestep is valid (mask == 1) 195 | # shape: (batch_size,) 196 | score += emissions[i, torch.arange(batch_size), tags[i]] * mask[i] 197 | 198 | # End transition score 199 | # shape: (batch_size,) 200 | seq_ends = mask.long().sum(dim=0) - 1 201 | # shape: (batch_size,) 202 | last_tags = tags[seq_ends, torch.arange(batch_size)] 203 | # shape: (batch_size,) 204 | score += self.end_transitions[last_tags] 205 | 206 | return score 207 | 208 | def _compute_normalizer( 209 | self, emissions: torch.Tensor, mask: torch.ByteTensor) -> torch.Tensor: 210 | # emissions: (seq_length, batch_size, num_tags) 211 | # mask: (seq_length, batch_size) 212 | assert emissions.dim() == 3 and mask.dim() == 2 213 | assert emissions.shape[:2] == mask.shape 214 | assert emissions.size(2) == self.num_tags 215 | assert mask[0].all() 216 | 217 | seq_length = emissions.size(0) 218 | 219 | # Start transition score and first emission; score has size of 220 | # (batch_size, num_tags) where for each batch, the j-th column stores 221 | # the score that the first timestep has tag j 222 | # shape: (batch_size, num_tags) 223 | score = self.start_transitions + emissions[0] 224 | 225 | for i in range(1, seq_length): 226 | # Broadcast score for every possible next tag 227 | # shape: (batch_size, num_tags, 1) 228 | broadcast_score = score.unsqueeze(2) 229 | 230 | # Broadcast emission score for every possible current tag 231 | # shape: (batch_size, 1, num_tags) 232 | broadcast_emissions = emissions[i].unsqueeze(1) 233 | 234 | # Compute the score tensor of size (batch_size, num_tags, num_tags) where 235 | # for each sample, entry at row i and column j stores the sum of scores of all 236 | # possible tag sequences so far that end with transitioning from tag i to tag j 237 | # and emitting 238 | # shape: (batch_size, num_tags, num_tags) 239 | next_score = broadcast_score + self.transitions + broadcast_emissions 240 | 241 | # Sum over all possible current tags, but we're in score space, so a sum 242 | # becomes a log-sum-exp: for each sample, entry i stores the sum of scores of 243 | # all possible tag sequences so far, that end in tag i 244 | # shape: (batch_size, num_tags) 245 | next_score = torch.logsumexp(next_score, dim=1) 246 | 247 | # Set score to the next score if this timestep is valid (mask == 1) 248 | # shape: (batch_size, num_tags) 249 | score = torch.where(mask[i].unsqueeze(1), next_score, score) 250 | 251 | # End transition score 252 | # shape: (batch_size, num_tags) 253 | score += self.end_transitions 254 | 255 | # Sum (log-sum-exp) over all possible tags 256 | # shape: (batch_size,) 257 | return torch.logsumexp(score, dim=1) 258 | 259 | def _viterbi_decode(self, emissions: torch.FloatTensor, 260 | mask: torch.ByteTensor) -> List[List[int]]: 261 | # emissions: (seq_length, batch_size, num_tags) 262 | # mask: (seq_length, batch_size) 263 | assert emissions.dim() == 3 and mask.dim() == 2 264 | assert emissions.shape[:2] == mask.shape 265 | assert emissions.size(2) == self.num_tags 266 | assert mask[0].all() 
267 | 268 | seq_length, batch_size = mask.shape 269 | 270 | # Start transition and first emission 271 | # shape: (batch_size, num_tags) 272 | score = self.start_transitions + emissions[0] 273 | history = [] 274 | 275 | # score is a tensor of size (batch_size, num_tags) where for every batch, 276 | # value at column j stores the score of the best tag sequence so far that ends 277 | # with tag j 278 | # history saves where the best tags candidate transitioned from; this is used 279 | # when we trace back the best tag sequence 280 | 281 | # Viterbi algorithm recursive case: we compute the score of the best tag sequence 282 | # for every possible next tag 283 | for i in range(1, seq_length): 284 | # Broadcast viterbi score for every possible next tag 285 | # shape: (batch_size, num_tags, 1) 286 | broadcast_score = score.unsqueeze(2) 287 | 288 | # Broadcast emission score for every possible current tag 289 | # shape: (batch_size, 1, num_tags) 290 | broadcast_emission = emissions[i].unsqueeze(1) 291 | 292 | # Compute the score tensor of size (batch_size, num_tags, num_tags) where 293 | # for each sample, entry at row i and column j stores the score of the best 294 | # tag sequence so far that ends with transitioning from tag i to tag j and emitting 295 | # shape: (batch_size, num_tags, num_tags) 296 | next_score = broadcast_score + self.transitions + broadcast_emission 297 | 298 | # Find the maximum score over all possible current tag 299 | # shape: (batch_size, num_tags) 300 | next_score, indices = next_score.max(dim=1) 301 | 302 | # Set score to the next score if this timestep is valid (mask == 1) 303 | # and save the index that produces the next score 304 | # shape: (batch_size, num_tags) 305 | score = torch.where(mask[i].unsqueeze(1), next_score, score) 306 | history.append(indices) 307 | 308 | # End transition score 309 | # shape: (batch_size, num_tags) 310 | score += self.end_transitions 311 | 312 | # Now, compute the best path for each sample 313 | 314 | # shape: (batch_size,) 315 | seq_ends = mask.long().sum(dim=0) - 1 316 | best_tags_list = [] 317 | 318 | for idx in range(batch_size): 319 | # Find the tag which maximizes the score at the last timestep; this is our best tag 320 | # for the last timestep 321 | _, best_last_tag = score[idx].max(dim=0) 322 | best_tags = [best_last_tag.item()] 323 | 324 | # We trace back where the best last tag comes from, append that to our best tag 325 | # sequence, and trace it back again, and so on 326 | for hist in reversed(history[:seq_ends[idx]]): 327 | best_last_tag = hist[idx][best_tags[-1]] 328 | best_tags.append(best_last_tag.item()) 329 | 330 | # Reverse the order because we start from the last timestep 331 | best_tags.reverse() 332 | best_tags_list.append(best_tags) 333 | 334 | return best_tags_list 335 | -------------------------------------------------------------------------------- /project4-Machine Translation/dataloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Created on 2020/6/8 11:26 5 | @author: phil 6 | """ 7 | 8 | # 参考吴恩达老师网易云深度学习课程作业 9 | import os 10 | import numpy as np 11 | import torch 12 | from faker import Faker 13 | import random 14 | 15 | from torch.nn import init 16 | from tqdm import tqdm 17 | from babel.dates import format_date 18 | from torchtext import data 19 | import pandas as pd 20 | from sklearn.model_selection import train_test_split 21 | fake = Faker() 22 | Faker.seed(12345) 23 | random.seed(12345) 24 | 25 
| # Define format of the data we would like to generate 26 | FORMATS = ['short', 27 | 'medium', 28 | 'long', 29 | 'full', 30 | 'full', 31 | 'full', 32 | 'full', 33 | 'full', 34 | 'full', 35 | 'full', 36 | 'full', 37 | 'full', 38 | 'full', 39 | 'd MMM YYY', 40 | 'd MMMM YYY', 41 | 'dd MMM YYY', 42 | 'd MMM, YYY', 43 | 'd MMMM, YYY', 44 | 'dd, MMM YYY', 45 | 'd MM YY', 46 | 'd MMMM YYY', 47 | 'MMMM d YYY', 48 | 'MMMM d, YYY', 49 | 'dd.MM.YY'] 50 | 51 | # change this if you want it to work with another language 52 | LOCALES = ['en_US'] 53 | 54 | 55 | def load_date(): 56 | """ 57 | Loads some fake dates 58 | :returns: tuple containing human readable string, machine readable string, and date object 59 | """ 60 | dt = fake.date_object() 61 | 62 | try: 63 | human_readable = format_date(dt, format=random.choice(FORMATS), locale='en_US') 64 | human_readable = human_readable.lower() 65 | human_readable = human_readable.replace(',', '') 66 | machine_readable = dt.isoformat() 67 | except AttributeError as e: 68 | return None, None, None 69 | 70 | return human_readable, machine_readable, dt 71 | 72 | 73 | def load_dataset(m): 74 | """ 75 | Loads a dataset with m examples and vocabularies 76 | :m: the number of examples to generate 77 | """ 78 | dataset = [] 79 | for _ in tqdm(range(m)): 80 | h, m, _ = load_date() 81 | if h is not None: 82 | dataset.append([h, m]) 83 | 84 | return dataset 85 | 86 | 87 | def prepare_data(dataset_path=r"../dataset/date-normalization", dataset_size=10, debug=False): 88 | if debug: 89 | dataset_size = 10 90 | train_file = os.path.join(dataset_path, "train_samll.csv") 91 | eval_file = os.path.join(dataset_path, "eval_samll.csv") 92 | else: 93 | train_file = os.path.join(dataset_path, "train.csv") 94 | eval_file = os.path.join(dataset_path, "eval.csv") 95 | if not os.path.exists(train_file) and not os.path.exists(train_file): 96 | dataset = load_dataset(dataset_size) 97 | source, target = zip(*dataset) 98 | X_train, X_test, y_train, y_test = train_test_split(source, target, random_state=42, test_size=0.2) 99 | train_df = pd.DataFrame() 100 | train_df["source"], train_df["target"] = X_train, y_train 101 | eval_df = pd.DataFrame() 102 | eval_df["source"], eval_df["target"] = X_test, y_test 103 | train_df.to_csv(train_file, index=False) 104 | eval_df.to_csv(eval_file, index=False) 105 | return train_file, eval_file 106 | 107 | 108 | def dataset2dataloader(dataset_path, batch_size=10, dataset_size=10, debug=False): 109 | train_csv, dev_csv = prepare_data(dataset_path, dataset_size=dataset_size, debug=debug) 110 | 111 | def tokenizer(text): 112 | return list(text) 113 | 114 | # 这里只是定义了数据格式 115 | SOURCE = data.Field(sequential=True, tokenize=tokenizer, lower=False) 116 | # 目标输出前后需加入特殊的标志符 117 | TARGET = data.Field(sequential=True, tokenize=tokenizer, lower=False, init_token="", eos_token="") 118 | train, val = data.TabularDataset.splits( 119 | path='', train=train_csv, validation=dev_csv, format='csv', skip_header=True, 120 | fields=[('source', SOURCE), ('target', TARGET)]) 121 | 122 | SOURCE.build_vocab(train) 123 | TARGET.build_vocab(train) 124 | 125 | train_iter = data.BucketIterator(train, batch_size=batch_size, sort_key=lambda x: len(x.sent), shuffle=False) 126 | val_iter = data.BucketIterator(val, batch_size=batch_size, sort_key=lambda x: len(x.sent), shuffle=False) 127 | 128 | # 在 test_iter , sort一定要设置成 False, 要不然会被 torchtext 搞乱样本顺序 129 | # test_iter = data.Iterator(dataset=test, batch_size=128, train=False, sort=False, device=DEVICE) 130 | 131 | return train_iter, val_iter, 
SOURCE.vocab, TARGET.vocab -------------------------------------------------------------------------------- /project4-Machine Translation/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Created on 2020/6/8 11:36 5 | @author: phil 6 | """ 7 | from keras.utils import to_categorical 8 | 9 | from dataloader import load_dataset, dataset2dataloader 10 | from models import SimpleNMT 11 | from torch import optim 12 | import torch.nn as nn 13 | import torch 14 | import numpy as np 15 | from pprint import pprint 16 | from tqdm import tqdm 17 | 18 | if __name__ == "__main__": 19 | epoch = 500 20 | learning_rate = 0.001 21 | hidden_size = 64 22 | batch_size = 10 23 | 24 | train_iter, val_iter, source_vocab, target_vocab = dataset2dataloader(dataset_path=r"../dataset/date-normalization", 25 | batch_size=batch_size, dataset_size=10000, debug=True) 26 | source_vocab_size = len(source_vocab.stoi) 27 | target_vocab_size = len(target_vocab.stoi) 28 | 29 | # print(target_vocab.stoi) 30 | 31 | Tx, Ty = 25, 10 # 最大长度 32 | 33 | model = SimpleNMT(in_vocab_size=source_vocab_size, out_vocab_size=target_vocab_size, in_hidden_size=hidden_size, 34 | out_hidden_size=hidden_size, output_size=target_vocab_size, with_attention=True) 35 | 36 | optimizer = optim.Adam(model.parameters(), lr=learning_rate) 37 | criterion = nn.CrossEntropyLoss() 38 | 39 | embed_layer1 = nn.Embedding(source_vocab_size, source_vocab_size, 40 | _weight=torch.from_numpy(np.eye(source_vocab_size))) 41 | embed_layer2 = nn.Embedding(target_vocab_size, target_vocab_size, 42 | _weight=torch.from_numpy(np.eye(target_vocab_size))) 43 | 44 | model.train() 45 | for ep in range(epoch): 46 | epoch_loss = 0 47 | for batch in train_iter: 48 | optimizer.zero_grad() 49 | Xin, Yin, Yout = batch.source.t().long(), batch.target.t()[:, :-1].long(), batch.target.t()[:, 1:] 50 | batch_size = len(Xin) 51 | init_hidden = torch.zeros(1, batch_size, hidden_size) 52 | # if ep == epoch - 1: 53 | # print(Yout) 54 | Xin = embed_layer1(Xin).float() 55 | Yin = embed_layer2(Yin).float() 56 | logits = model(Xin, init_hidden, Yin) 57 | loss = criterion(logits.view(-1, logits.shape[-1]), Yout.flatten()) 58 | epoch_loss += loss.item() 59 | loss.backward() 60 | optimizer.step() 61 | if ep % (epoch // 10) == 0: 62 | print("loss", epoch_loss) 63 | 64 | # 测试训练集输出是否正确 65 | # for batch in train_iter: 66 | # # print(batch.source.t()) 67 | # print(batch.target.t()[:, 1:]) 68 | # print("finish") 69 | # init_hidden = torch.zeros(1, batch_size, hidden_size) 70 | # logits = model(Xin, init_hidden, Yin) 71 | # print(logits.argmax(-1)) 72 | 73 | sents_for_large = ["monday may 7 1983", "19 march 1998", "18 jul 2008", "9/10/70", "thursday january 1 1981", 74 | "thursday january 26 2015", "saturday april 18 1990", "sunday may 12 1988"] 75 | sents = ["monday march 7 1983", "9 may 1998", "thursday january 26 1995", "9/10/70"] 76 | 77 | 78 | def translate(model, sents): 79 | X = [] 80 | for sent in sents: 81 | X.append(list(map(lambda x: source_vocab[x], list(sent))) + [source_vocab[""]] * (Tx - len(sent))) 82 | Xoh = torch.from_numpy(np.array(list(map(lambda x: to_categorical(x, num_classes=source_vocab_size), X)))) 83 | encoder_init_hidden = torch.zeros(1, len(X), hidden_size) 84 | preds = model(Xoh, encoder_init_hidden, decoder_input=None, out_word2index=target_vocab.stoi, 85 | out_index2word=target_vocab.itos, max_len=Ty, out_size=target_vocab_size) 86 | for gold, pred in zip(sents, 
--------------------------------------------------------------------------------
/project4-Machine Translation/models.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | """
4 | Created on 2020/6/10 11:18
5 | @author: phil
6 | """
7 | import torch.nn as nn
8 | import torch
9 | import numpy as np
10 | import torch.nn.functional as F
11 |
12 |
13 | class EncoderRNN(nn.Module):
14 |     def __init__(self, vocab_size, hidden_size, dropout=0.5):
15 |         super(EncoderRNN, self).__init__()
16 |         self.hidden_size = hidden_size
17 |         self.gru = nn.GRU(vocab_size, hidden_size, dropout=dropout, batch_first=True)
18 |
19 |     def forward(self, x, init_hidden):
20 |         seq_output, last_state = self.gru(x, init_hidden)
21 |         return seq_output, last_state
22 |
23 |
24 | class DecoderRNN(nn.Module):
25 |     def __init__(self, vocab_size, hidden_size, output_size, dropout=0.5):
26 |         super(DecoderRNN, self).__init__()
27 |         self.hidden_size = hidden_size
28 |         self.gru = nn.GRU(vocab_size, hidden_size, dropout=dropout, batch_first=True)
29 |         self.hidden2index = nn.Linear(hidden_size, output_size)
30 |
31 |     def forward(self, x, init_state):
32 |         seq_output, last_state = self.gru(x, init_state)
33 |         seq_output = self.hidden2index(seq_output)
34 |         return seq_output, last_state
35 |
36 |
37 | class DecoderAttenRNN(nn.Module):
38 |     def __init__(self, vocab_size, hidden_size, output_size, dropout=0.5):
39 |         super(DecoderAttenRNN, self).__init__()
40 |         self.hidden_size = hidden_size
41 |         self.gru = nn.GRU(vocab_size, hidden_size, dropout=dropout, batch_first=True)
42 |         self.hidden2label = nn.Linear(hidden_size, output_size)
43 |         self.atten_affine = nn.Linear(hidden_size*2, hidden_size)
44 |
45 |     def get_alpha(self, hi, encoder_output):
46 |         # hi shape (1, batch_size, hidden_size)
47 |         # encoder_output (batch, seq_len, hidden_size)
48 |         hi = hi.permute(1, 2, 0)  # (batch_size, hidden_size, 1)
49 |         # print(encoder_output.shape, hi.shape)
50 |         e = torch.bmm(encoder_output, hi).squeeze(2)  # (batch_size, seq_len)
51 |         e = F.softmax(e, dim=1).unsqueeze(2)  # (batch_size, seq_len, 1)
52 |         alpha = (e * encoder_output).sum(dim=1)  # (batch_size, hidden_size)
53 |
54 |         return alpha
55 |
56 |     def forward(self, x, init_state, seq_encoder_output):
57 |         # print(x.shape, init_state.shape, seq_encoder_output.shape)
58 |         batch_size, max_len, _ = x.shape  # one-hot representation
59 |         hi = init_state
60 |         seq_decoder_output = []
61 |         for i in range(max_len):
62 |             # alpha shape (batch_size, hidden_size)
63 |             alpha = self.get_alpha(hi, seq_encoder_output)  # alpha relates the current hidden state to every encoder time step's output
64 |             hi = torch.cat([alpha.unsqueeze(0), hi], dim=2)
65 |             hi = self.atten_affine(hi)
66 |             output, hi = self.gru(x[:, i, :].unsqueeze(1), hi)
67 |             seq_output = self.hidden2label(output.squeeze(1))
68 |             seq_decoder_output.append(seq_output.squeeze(1))
69 |         seq_decoder_output = torch.stack(seq_decoder_output, dim=1)
70 |         return seq_decoder_output, hi
71 |
72 |
73 | class SimpleNMT(nn.Module):
74 |     def __init__(self, in_vocab_size, out_vocab_size, in_hidden_size, out_hidden_size, output_size, with_attention=False):
75 |         super(SimpleNMT, self).__init__()
76 |         self.with_attention = with_attention
77 |         self.encoder = EncoderRNN(in_vocab_size, in_hidden_size)
78 |         if self.with_attention:
79 |             self.decoder = DecoderAttenRNN(out_vocab_size, out_hidden_size, output_size)
80 |         else:
81 |             self.decoder = DecoderRNN(out_vocab_size, out_hidden_size, output_size)
82 |
83 |     def forward(self, encoder_input, encoder_init_hidden, decoder_input=None, out_word2index=None, out_index2word=None,
84 |                 max_len=None, out_size=None):
85 |         encoder_seq_output, encoder_last_state = self.encoder(encoder_input, encoder_init_hidden)
86 |         # During training the decoder is fed the gold token at every time step (teacher forcing)
87 |         if decoder_input is not None:
88 |             if self.with_attention:
89 |                 logits, _ = self.decoder(decoder_input, encoder_last_state, encoder_seq_output)
90 |             else:
91 |                 logits, _ = self.decoder(decoder_input, encoder_last_state)
92 |             return logits
93 |         else:
94 |             # At test time there is no gold answer; keep decoding until the end token appears or the maximum length is reached
95 |             decoded_sents = []
96 |             for i in range(len(encoder_input)):
97 |                 sent = []
98 |                 decoder_input = torch.FloatTensor(np.eye(out_size)[[out_word2index[""]]]).unsqueeze(0)
99 |                 hi = encoder_last_state[:, i, :].unsqueeze(1)
100 |                 for di in range(max_len):
101 |                     if self.with_attention:
102 |                         # alpha = self.decoder.get_alpha(hi, encoder_seq_output[i, :, :].unsqueeze(
103 |                         #     0))  # alpha relates the current hidden state to every encoder time step's output
104 |                         # hi = torch.cat([alpha.unsqueeze(0), hi], dim=2)
105 |                         # hi = self.decoder.atten_affine(hi)
106 |                         # # print(decoder_input.shape, hi.shape, encoder_seq_output.shape)
107 |                         decoder_output, hdi = self.decoder(decoder_input, hi, encoder_seq_output[i, :, :].unsqueeze(0))
108 |                     else:
109 |                         decoder_output, hdi = self.decoder(decoder_input, hi)
110 |                     topv, topi = decoder_output.data.topk(1)
111 |                     topi = topi.item()
112 |                     if topi == out_word2index[""]:
113 |                         break
114 |                     else:
115 |                         sent.append(out_index2word[topi])
116 |                     decoder_input = torch.FloatTensor([np.eye(out_size)[topi]]).unsqueeze(0)
117 |                     hi = hdi
118 |                 decoded_sents.append(sent)
119 |             return decoded_sents
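`DecoderAttenRNN.get_alpha` above is plain dot-product attention: the decoder hidden state is scored against every encoder output with a batched matrix multiplication, the scores are softmax-normalised over the source positions, and the weighted sum of encoder outputs is returned as the context vector. A shape-checking sketch of the same computation, using random tensors rather than repo code:

```python
import torch
import torch.nn.functional as F

batch, seq_len, hidden = 2, 7, 4
encoder_output = torch.randn(batch, seq_len, hidden)   # encoder state at every source position
hi = torch.randn(1, batch, hidden)                      # current decoder hidden state

h = hi.permute(1, 2, 0)                                 # (batch, hidden, 1)
e = torch.bmm(encoder_output, h).squeeze(2)             # (batch, seq_len) raw scores
w = F.softmax(e, dim=1).unsqueeze(2)                    # (batch, seq_len, 1) attention weights
alpha = (w * encoder_output).sum(dim=1)                 # (batch, hidden) context vector
print(alpha.shape)                                      # torch.Size([2, 4])
```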
--------------------------------------------------------------------------------
/project5-Text Generation/dataloader.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | """
4 | Created on 2020/6/18 10:56
5 | @author: phil
6 | """
7 | import os
8 | import pandas as pd
9 | from torchtext import data
10 |
11 |
12 | def prepare_data(dataset_path="../dataset/poetry"):
13 |     file_path = os.path.join(dataset_path, "poetryFromTang.txt")
14 |     target_path = os.path.join(dataset_path, "train.csv")
15 |     if not os.path.exists(target_path):
16 |         with open(file_path, encoding="utf-8") as f:
17 |             lines = f.read().split("\n\n")
18 |             lines = list(map(lambda x: x.replace("\n", ""), lines))
19 |         df = pd.DataFrame()
20 |         df["sent"] = lines
21 |         df.to_csv(target_path, index=False, encoding='utf_8_sig')
22 |     return target_path
23 |
24 |
25 | def dataset2dataloader(dataset_path="../dataset/poetry", batch_size=32, debug=False):
26 |     if debug:
27 |         train_csv = os.path.join(dataset_path, "train_small.csv")
28 |     else:
29 |         train_csv = prepare_data(dataset_path)
30 |
31 |     def tokenizer(text):
32 |         return list(text)
33 |
34 |     SENT = data.Field(sequential=True, tokenize=tokenizer, lower=False, init_token="", eos_token="")
35 |     train, _ = data.TabularDataset.splits(path='', train=train_csv, validation=train_csv, format='csv',
36 |                                           skip_header=True,
37 |                                           fields=[('sent', SENT)])
38 |
39 |     SENT.build_vocab(train)
40 |
41 |     train_iter = data.BucketIterator(train, batch_size=batch_size, sort_key=lambda x: len(x.sent), shuffle=False)
42 |
43 |     # For test_iter, sort must be set to False, otherwise torchtext will scramble the sample order
44 |     # test_iter = data.Iterator(dataset=test, batch_size=128, train=False, sort=False, device=DEVICE)
45 |
46 |     return train_iter, SENT.vocab
47 |
48 |
49 | if __name__ == "__main__":
50 |     train_iter, vocab = dataset2dataloader()
51 |     for batch in train_iter:
52 |         print(batch.sent.t())
53 |         break
54 |
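`prepare_data` above assumes that `poetryFromTang.txt` separates poems by blank lines; each poem is flattened to a single line before being written to `train.csv`. A tiny sketch of that transformation with made-up input (not data from the repo):

```python
# Two short poems separated by a blank line, each with internal line breaks
raw = "床前明月光,\n疑是地上霜。\n\n春眠不觉晓,\n处处闻啼鸟。\n"

# Split on blank lines, then strip the remaining line breaks inside each poem
poems = [p.replace("\n", "") for p in raw.split("\n\n")]
print(poems)
# ['床前明月光,疑是地上霜。', '春眠不觉晓,处处闻啼鸟。']
```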
--------------------------------------------------------------------------------
/project5-Text Generation/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | """
4 | Created on 2020/6/18 12:14
5 | @author: phil
6 | """
7 | from tqdm import tqdm
8 |
9 | from dataloader import dataset2dataloader
10 | from torch.optim import Adam
11 | from models import PoetryModel
12 | import torch
13 | import torch.nn as nn
14 | import numpy as np
15 | import os
16 |
17 | if __name__ == "__main__":
18 |     batch_size = 32
19 |     learning_rate = 0.001
20 |     hidden_size = 128
21 |     epoch = 200
22 |
23 |     train_iter, vocab = dataset2dataloader(batch_size=batch_size)
24 |
25 |     vocab_size = len(vocab.stoi)
26 |     # print(vocab_size, hidden_size, batch_size)
27 |     model = PoetryModel(vocab_size=vocab_size, hidden_size=hidden_size, output_size=vocab_size)
28 |     optimizer = Adam(model.parameters(), lr=learning_rate)
29 |     criterion = nn.CrossEntropyLoss()
30 |
31 |     one_hot_embedding = nn.Embedding(vocab_size, vocab_size, _weight=torch.from_numpy(np.eye(vocab_size)))
32 |
33 |     model_path = "model.pkl"
34 |     if os.path.exists(model_path):
35 |         model = torch.load(model_path)
36 |     else:
37 |         for ep in tqdm(range(epoch)):
38 |             model.train()
39 |             total_loss = 0
40 |             for i, batch in enumerate(train_iter):
41 |                 optimizer.zero_grad()
42 |                 sent = batch.sent.t()
43 |                 x, y = sent[:, :-1], sent[:, 1:]
44 |                 x = one_hot_embedding(x).float()
45 |                 init_hidden = torch.zeros(1, len(x), hidden_size)
46 |                 output, _ = model(x, init_hidden)
47 |                 output = output.reshape(-1, output.shape[-1])
48 |                 y = y.flatten()
49 |                 loss = criterion(output, y)
50 |                 loss.backward()
51 |                 optimizer.step()
52 |                 total_loss += loss.item()
53 |             if ep % (epoch // 10) == 0:
54 |                 print("loss: ", total_loss)
55 |         torch.save(model, model_path)
56 |
57 |     model.eval()
58 |     # test = ["我好可爱"] 我病恨无我,。好一解颜色。可怜王经行自远,一解颜色。爱绿溪阴。
59 |     # test = ["花开有情"] 花边行县柳,河桥晚泊船。开远树,山鸟助酣歌。有情何处,箫管凤初来。情何处所,风吹青珊瑚,可怜王孙立
60 |     test = [""]
61 |     for sent in test:
62 |         sent = list(map(lambda x: vocab.stoi[x], list(sent)))
63 |         x = torch.tensor(sent).unsqueeze(0)
64 |         x = one_hot_embedding(x).float()
65 |         with torch.no_grad():
66 |             output = model.generate(x, stoi=vocab.stoi, poetry_type="hidden head")
67 |         ans = torch.cat(output, dim=1).argmax(-1).squeeze(0)
68 |         for word_id in ans:
69 |             print(vocab.itos[word_id.item()], end="")
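The training loop above is a character-level language model: the target sequence is the input shifted left by one position, so the model learns to predict the next character at every step. A small sketch of that shift, with made-up token ids rather than a real batch:

```python
import torch

# (batch=1, seq_len=5) sequence of vocabulary indices, e.g. start token, three characters, end token
sent = torch.tensor([[2, 14, 7, 9, 3]])
x, y = sent[:, :-1], sent[:, 1:]   # input drops the last token, target drops the first
print(x)   # tensor([[ 2, 14,  7,  9]])
print(y)   # tensor([[14,  7,  9,  3]])
```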
= [""] 61 | for sent in test: 62 | sent = list(map(lambda x: vocab.stoi[x], list(sent))) 63 | x = torch.tensor(sent).unsqueeze(0) 64 | x = one_hot_embedding(x).float() 65 | with torch.no_grad(): 66 | output = model.generate(x, stoi=vocab.stoi, poetry_type="hidden head") 67 | ans = torch.cat(output, dim=1).argmax(-1).squeeze(0) 68 | for word_id in ans: 69 | print(vocab.itos[word_id.item()], end="") -------------------------------------------------------------------------------- /project5-Text Generation/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/positivepeng/nlp-beginner-projects/2f066cbdd24121938b90ee1d12fe063ad0a46df5/project5-Text Generation/model.pkl -------------------------------------------------------------------------------- /project5-Text Generation/model_debug.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/positivepeng/nlp-beginner-projects/2f066cbdd24121938b90ee1d12fe063ad0a46df5/project5-Text Generation/model_debug.pkl -------------------------------------------------------------------------------- /project5-Text Generation/models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Created on 2020/6/18 11:05 5 | @author: phil 6 | """ 7 | 8 | import torch.nn as nn 9 | import torch 10 | 11 | 12 | class PoetryModel(nn.Module): 13 | def __init__(self, vocab_size, hidden_size, output_size, dropout=0.5): 14 | super(PoetryModel, self).__init__() 15 | self.hidden_size = hidden_size 16 | self.gru = nn.GRU(input_size=vocab_size, hidden_size=hidden_size, dropout=dropout, batch_first=True) 17 | self.out = nn.Linear(hidden_size, output_size) 18 | 19 | def forward(self, x, init_hidden): 20 | # print(x.shape, init_hidden.shape) 21 | seq_out, hn = self.gru(x, init_hidden) 22 | output = self.out(seq_out) 23 | return output, hn 24 | 25 | def generate(self, x, stoi, poetry_type="begin", sent_num=4, max_len=15): 26 | init_hidden = torch.zeros(1, 1, self.hidden_size) 27 | output = [] 28 | if poetry_type == "hidden head" and x.shape[1] != sent_num: 29 | print("ERROR:选择了藏头诗但是输入字的个数不等于诗的句子数") 30 | return 31 | 32 | hn = init_hidden 33 | for i in range(sent_num): 34 | if i == 0 and poetry_type == "begin": 35 | seq_out, hn = self.gru(x, hn) 36 | seq_out = seq_out[:, -1, :].unsqueeze(1) 37 | output.append(x) 38 | if poetry_type == "hidden head": 39 | seq_out, hn = self.gru(x[:, i, :].unsqueeze(1), hn) 40 | seq_out = seq_out[:, -1, :].unsqueeze(1) 41 | output.append(x[:, i, :].unsqueeze(1)) 42 | for j in range(max_len): # 每一句的最大长度 43 | # 上一个time step的输出 44 | _, topi = self.out(seq_out).data.topk(1) 45 | topi = topi.item() 46 | xi_from_output = torch.zeros(1, 1, x.shape[-1]) 47 | xi_from_output[0][0][topi] = 1 48 | output.append(xi_from_output) 49 | seq_out, hn = self.gru(xi_from_output, hn) 50 | if topi == stoi["。"]: 51 | break 52 | return output 53 | --------------------------------------------------------------------------------