├── .gitignore
├── README.md
├── project1-ML-Sentence Classification
│   ├── data_preprocess.py
│   ├── feature_extraction.py
│   ├── main.py
│   └── softmax_regerssion.py
├── project2-DL-Sentence Classification
│   ├── Convolutional Neural Networks for Sentence Classification.pdf
│   ├── dataloader_byhand.py
│   ├── dataloader_bytorchtext.py
│   ├── main.py
│   └── models.py
├── project3-Named Entity Recognition
│   ├── dataloader.py
│   ├── main.py
│   ├── model.pkl
│   ├── models.py
│   └── torchcrf
│       └── __init__.py
├── project4-Machine Translation
│   ├── dataloader.py
│   ├── main.py
│   └── models.py
└── project5-Text Generation
    ├── dataloader.py
    ├── main.py
    ├── model.pkl
    ├── model_debug.pkl
    └── models.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # nlp-beginner-projects
2 | Implementations of common NLP tasks (PyTorch)
3 |
4 | - [x] Project 1: Multi-class text classification with softmax regression
5 | Blog post: [Multi-class text classification with softmax regression](https://blog.csdn.net/philpanic9/article/details/106606415)
6 | - [x] Project 2: Multi-class text classification with RNNs and CNNs
7 | Blog post: [Multi-class text classification with RNNs and CNNs](https://blog.csdn.net/philpanic9/article/details/106728786)
8 | - [x] Project 3: Named entity recognition with BiLSTM-CRF
9 | Blog post: [Named entity recognition with BiLSTM-CRF](https://blog.csdn.net/philpanic9/article/details/106742297)
10 | > Input a sentence to be tagged and get the predicted tag sequence (see the tagging sketch after the project list)
11 | My name is Phil , I am from European Union . -->
12 | ['O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'O']
13 |
14 | - [x] Project 4: Machine translation
15 | Blog post: [Seq2Seq machine translation (with attention)](https://blog.csdn.net/philpanic9/article/details/106806350)
16 | > **Input a human-readable date and translate it into a machine-readable date**
17 | monday may 7 1983 --> 1983-05-07
18 | 19 march 1998 --> 1998-03-19
19 | 18 jul 2008 --> 2008-07-18
20 | 9/10/70 --> 1970-09-10
21 | thursday january 1 1981 --> 1981-01-01
22 | thursday january 26 2015 --> 2015-01-26
23 | saturday april 18 1990 --> 1990-04-18
24 | sunday may 12 1988 --> 1988-05-12
25 | - [x] Project 5: Text generation
26 | Blog post: https://blog.csdn.net/philpanic9/article/details/106878540
27 | > **Input "我好可爱" and generate an acrostic poem**
28 | 我病恨无我,。
29 | 好一解颜色。
30 | 可怜王经行自远,一解颜色。
31 | 爱绿溪阴。
32 | > **Input "花开有情" and generate an acrostic poem**
33 | 花边行县柳,河桥晚泊船。
34 | 开远树,山鸟助酣歌。
35 | 有情何处,箫管凤初来。
36 | 情何处所,风吹青珊瑚,可怜王孙立
37 |
38 |
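The tagging example under project 3 is produced by code along the following lines: a minimal sketch condensed from `project3-Named Entity Recognition/main.py`, assuming the trained checkpoint `model.pkl` and the vocabularies built by that project's dataloader.

```python
import torch
from dataloader import dataset2dataloader

# Rebuild the word/tag vocabularies used at training time
_, _, sent_vocab, tag_vocab = dataset2dataloader(batch_size=128)

model = torch.load("model.pkl")  # trained BiLSTM-CRF checkpoint
model.eval()

sent = "My name is Phil , I am from European Union ."
ids = [sent_vocab.stoi[word] for word in sent.split(" ")]
input_tensor = torch.tensor([ids])
mask = input_tensor != sent_vocab.stoi["<pad>"]  # mask out padding positions
with torch.no_grad():
    pred = model.predict(input_tensor, mask)
print(sent, "-->", [tag_vocab.itos[tag_id] for tag_id in pred[0]])
```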
39 | Related links:
40 | 1. [Project 1 dataset](https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews) (used by the pipeline sketch below)
41 | 2. [fdu nlp-beginner](https://github.com/FudanNLP/nlp-beginner)
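
For orientation, project 1's pipeline (bag-of-words features plus softmax regression) boils down to roughly the following, condensed from `project1-ML-Sentence Classification/main.py` (same dataset path and hyperparameters as used there):

```python
from data_preprocess import read_data
from feature_extraction import BagOfWord
from softmax_regerssion import SoftmaxRegression
from sklearn.model_selection import train_test_split

X_data, y_data = read_data()                              # phrases and sentiment labels (0-4)
X = BagOfWord(do_lower_case=True).fit_transform(X_data)   # dense bag-of-word count features
y = y_data.reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

model = SoftmaxRegression()
model.fit(X_train, y_train, epoch=100, learning_rate=1, update_strategy="stochastic")
print("train acc", model.score(X_train, y_train), "test acc", model.score(X_test, y_test))
```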
42 |
43 |
44 |
45 |
46 |
--------------------------------------------------------------------------------
/project1-ML-Sentence Classification/data_preprocess.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | """
4 | Created on 2020/4/15 21:01
5 | @author: phil
6 | """
7 |
8 | import pandas as pd
9 |
10 |
11 | def read_data(train_file="../dataset/kaggle-movie-review/train.tsv"):
12 | train_df = pd.read_csv(train_file, sep='\t')
13 | # test_df = pd.read_csv(test_file, sep="\t")
14 | return train_df["Phrase"].values, train_df["Sentiment"].values
15 |
16 |
17 | if __name__ == "__main__":
18 | X_data, y_data = read_data()
19 | print("train size", len(X_data))
20 |
--------------------------------------------------------------------------------
/project1-ML-Sentence Classification/feature_extraction.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | """
4 | Created on 2020/4/15 21:16
5 | @author: phil
6 | """
7 |
8 | import numpy as np
9 |
10 |
11 | class BagOfWord:
12 | def __init__(self, do_lower_case=False):
13 | self.vocab = {}
14 | self.do_lower_case = do_lower_case
15 |
16 | def fit(self, sent_list):
17 |         # sent_list is a list of sentence strings
18 | for sent in sent_list:
19 | if self.do_lower_case:
20 | sent = sent.lower()
21 | words = sent.strip().split(" ")
22 | for word in words:
23 | if word not in self.vocab:
24 | self.vocab[word] = len(self.vocab)
25 |
26 | def transform(self, sent_list):
27 | vocab_size = len(self.vocab)
28 | bag_of_word_feature = np.zeros((len(sent_list), vocab_size))
29 | for idx, sent in enumerate(sent_list):
30 | if self.do_lower_case:
31 | sent = sent.lower()
32 | words = sent.strip().split(" ")
33 | for word in words:
34 | bag_of_word_feature[idx][self.vocab[word]] += 1
35 | return bag_of_word_feature
36 |
37 | def fit_transform(self, sent_list):
38 | self.fit(sent_list)
39 | return self.transform(sent_list)
40 |
41 |
42 | class NGram:
43 | def __init__(self, ngram, do_lower_case=False):
44 | self.ngram = ngram
45 | self.feature_map = {}
46 | self.do_lower_case = do_lower_case
47 |
48 | def fit(self, sentList):
49 | for gram in self.ngram:
50 | for sent in sentList:
51 | if self.do_lower_case:
52 | sent = sent.lower()
53 | sent = sent.split(" ")
54 | for i in range(len(sent) - gram + 1):
55 | feature = "_".join(sent[i:i + gram])
56 | if feature not in self.feature_map:
57 | self.feature_map[feature] = len(self.feature_map)
58 |
59 | def transform(self, sentList):
60 | n = len(sentList)
61 | m = len(self.feature_map)
62 | ngram_feature = np.zeros((n, m))
63 | for idx, sent in enumerate(sentList):
64 | if self.do_lower_case:
65 | sent = sent.lower()
66 | sent = sent.split(" ")
67 | for gram in self.ngram:
68 | for i in range(len(sent) - gram + 1):
69 | feature = "_".join(sent[i:i + gram])
70 | if feature in self.feature_map:
71 | ngram_feature[idx][self.feature_map[feature]] = 1
72 | return ngram_feature
73 |
74 | def fit_transform(self, sentList):
75 | self.fit(sentList)
76 | return self.transform(sentList)
77 |
78 |
79 | if __name__ == "__main__":
80 | gram = NGram((1, 2))
81 | sents = ["I love you", "do you love yourself"]
82 | feature = gram.fit_transform(sents)
83 | print(gram.feature_map)
84 | print(feature)
85 |
--------------------------------------------------------------------------------
/project1-ML-Sentence Classification/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | """
4 | Created on 2020/4/15 21:24
5 | @author: phil
6 | """
7 |
8 | import numpy as np
9 | from data_preprocess import read_data
10 | from feature_extraction import BagOfWord, NGram
11 | from softmax_regerssion import SoftmaxRegression
12 | import matplotlib.pyplot as plt
13 | from sklearn.model_selection import train_test_split
14 |
15 | if __name__ == '__main__':
16 | debug = 1
17 |     # Load the data
18 | X_data, y_data = read_data()
19 |
20 | if debug == 1:
21 | # index = np.arange(len(X_data))
22 | # np.random.shuffle(index)
23 | # X_data = X_data[index[:2000]]
24 | # y_data = y_data[index[:2000]]
25 | X_data = X_data[:1000]
26 | y_data = y_data[:1000]
27 | y = np.array(y_data).reshape(len(y_data), 1)
28 |
29 |     # Feature extraction and dataset split
30 | bag_of_word_model = BagOfWord(do_lower_case=True)
31 | ngram_model = NGram(ngram=(1, 2), do_lower_case=True)
32 | X_Bow = bag_of_word_model.fit_transform(X_data)
33 | X_Gram = ngram_model.fit_transform(X_data)
34 |
35 | print("Bow shape", X_Bow.shape)
36 | print("Gram shape", X_Gram.shape)
37 |
38 | X_train_Bow, X_test_Bow, y_train_Bow, y_test_Bow = train_test_split(X_Bow, y, test_size=0.2, random_state=42, stratify=y)
39 | X_train_Gram, X_test_Gram, y_train_Gram, y_test_Gram = train_test_split(X_Gram, y, test_size=0.2, random_state=42, stratify=y)
40 |
41 |     # Train models to compare the two feature types
42 | epoch = 100
43 | bow_learning_rate = 1
44 | gram_learning_rate = 1
45 |
46 |     # Gradient descent
47 | model1 = SoftmaxRegression()
48 | history = model1.fit(X_train_Bow, y_train_Bow, epoch=epoch, learning_rate=bow_learning_rate, print_loss_steps=epoch//10, update_strategy="stochastic")
49 | plt.plot(np.arange(len(history)), np.array(history))
50 | plt.show()
51 | print("Bow train {} test {}".format(model1.score(X_train_Bow, y_train_Bow), model1.score(X_test_Bow, y_test_Bow)))
52 |
53 | model2 = SoftmaxRegression()
54 | history = model2.fit(X_train_Gram, y_train_Gram, epoch=epoch, learning_rate=gram_learning_rate, print_loss_steps=epoch//10, update_strategy="stochastic")
55 | plt.plot(np.arange(len(history)), np.array(history))
56 | plt.show()
57 | print("Gram train {} test {}".format(model2.score(X_train_Gram, y_train_Gram), model2.score(X_test_Gram, y_test_Gram)))
58 |
59 |     # Sample size: 20000
60 | # epoch = 100
61 | # bow_learning_rate = 0.001
62 | # gram_learning_rate = 0.5
63 | # Bow train 0.7094375 test 0.4885
64 | # Gram train 0.9786875 test 0.5335
--------------------------------------------------------------------------------
/project1-ML-Sentence Classification/softmax_regerssion.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | """
4 | Created on 2020/4/15 20:58
5 | @author: phil
6 | """
7 |
8 | import numpy as np
9 |
10 |
11 | def softmax(z):
12 |     # Numerically stable softmax, applied row-wise: softmax(z) = softmax(z - max(z))
13 |     z -= np.max(z, axis=1, keepdims=True)  # subtract each row's max so exp() cannot overflow
14 | z = np.exp(z)
15 | z /= np.sum(z, axis=1, keepdims=True)
16 | return z
17 |
18 |
19 | class SoftmaxRegression:
20 | def __init__(self):
21 |         self.num_of_class = None  # number of classes
22 |         self.n = None  # number of samples
23 |         self.m = None  # feature dimension
24 |         self.weight = None  # model weights, shape (num_of_class, feature dimension)
25 | self.learning_rate = None
26 |
27 | def fit(self, X, y, learning_rate=0.01, epoch=10, num_of_class=5, print_loss_steps=-1, update_strategy="batch"):
28 | self.n, self.m = X.shape
29 | self.num_of_class = num_of_class
30 | self.weight = np.random.randn(self.num_of_class, self.m)
31 | self.learning_rate = learning_rate
32 |
33 |         # Convert y into a one-hot matrix; each row is the one-hot encoding of one label
34 | y_one_hot = np.zeros((self.n, self.num_of_class))
35 | for i in range(self.n):
36 | y_one_hot[i][y[i]] = 1
37 |
38 | loss_history = []
39 |
40 | for e in range(epoch):
41 |             # X (n, m): each row is one sample
42 |             # weight (C, m): each row corresponds to one class
43 | loss = 0
44 | if update_strategy == "stochastic":
45 | rand_index = np.arange(len(X))
46 | np.random.shuffle(rand_index)
47 | for index in list(rand_index):
48 | Xi = X[index].reshape(1, -1)
49 | prob = Xi.dot(self.weight.T)
50 | prob = softmax(prob).flatten()
51 | loss += -np.log(prob[y[index]])
52 |                 self.weight += self.learning_rate * Xi.reshape(1, self.m).T.dot((y_one_hot[index] - prob).reshape(1, self.num_of_class)).T
53 |
54 | if update_strategy == "batch":
55 |                 prob = X.dot(self.weight.T)  # (n, C): predicted class scores for every sample
56 | prob = softmax(prob)
57 |
58 | for i in range(self.n):
59 | loss -= np.log(prob[i][y[i]])
60 |
61 |                 # Gradient update for the loss in the book: W += lr/n * sum_i (y_i - p_i)^T x_i
62 | weight_update = np.zeros_like(self.weight)
63 | for i in range(self.n):
64 | weight_update += X[i].reshape(1, self.m).T.dot((y_one_hot[i] - prob[i]).reshape(1, self.num_of_class)).T
65 | self.weight += weight_update * self.learning_rate / self.n
66 |
67 | loss /= self.n
68 | loss_history.append(loss)
69 | if print_loss_steps != -1 and e % print_loss_steps == 0:
70 | print("epoch {} loss {}".format(e, loss))
71 | return loss_history
72 |
73 | def predict(self, X):
74 | prob = softmax(X.dot(self.weight.T))
75 | return prob.argmax(axis=1)
76 |
77 | def score(self, X, y):
78 | pred = self.predict(X)
79 | return np.sum(pred.reshape(y.shape) == y) / y.shape[0]
--------------------------------------------------------------------------------
/project2-DL-Sentence Classification/Convolutional Neural Networks for Sentence Classification.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/positivepeng/nlp-beginner-projects/2f066cbdd24121938b90ee1d12fe063ad0a46df5/project2-DL-Sentence Classification/Convolutional Neural Networks for Sentence Classification.pdf
--------------------------------------------------------------------------------
/project2-DL-Sentence Classification/dataloader_byhand.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | """
4 | Created on 2020/6/12 15:25
5 | @author: phil
6 | """
7 |
8 | import pandas as pd
9 | import os
10 | import numpy as np
11 | import torch
12 | from sklearn.model_selection import train_test_split
13 | from torch.nn.utils.rnn import pad_sequence
14 | from torch.utils.data import Dataset, DataLoader
15 |
16 |
17 | def prepare_data(dataset_path, sent_col_name, label_col_name):
18 |     """ Read sentences and labels from the tsv file """
19 | file_path = os.path.join(dataset_path, "train.tsv")
20 | data = pd.read_csv(file_path, sep="\t")
21 | X = data[sent_col_name].values
22 | y = data[label_col_name].values
23 | return X, y
24 |
25 |
26 | class Language:
27 |     """ Build a vocabulary from a list of sentences and map tokens to integer ids """
28 | def __init__(self):
29 | self.word2id = {}
30 | self.id2word = {}
31 |
32 | def fit(self, sent_list):
33 | vocab = set()
34 | for sent in sent_list:
35 | vocab.update(sent.split(" "))
36 |         word_list = ["<pad>", "<unk>"] + list(vocab)  # index 0 is reserved for padding
37 | self.word2id = {word: i for i, word in enumerate(word_list)}
38 | self.id2word = {i: word for i, word in enumerate(word_list)}
39 |
40 | def transform(self, sent_list, reverse=False):
41 | sent_list_id = []
42 | word_mapper = self.word2id if not reverse else self.id2word
43 |         unk = self.word2id["<unk>"] if not reverse else None
44 | for sent in sent_list:
45 | sent_id = list(map(lambda x: word_mapper.get(x, unk), sent.split(" ") if not reverse else sent))
46 | sent_list_id.append(sent_id)
47 | return sent_list_id
48 |
49 |
50 | class ClsDataset(Dataset):
51 | """ 文本分类数据集 """
52 | def __init__(self, sents, labels):
53 | self.sents = sents
54 | self.labels = labels
55 |
56 | def __getitem__(self, item):
57 | return self.sents[item], self.labels[item]
58 |
59 | def __len__(self):
60 | return len(self.sents)
61 |
62 |
63 | def collate_fn(batch_data):
64 |     """ Custom collate function defining how samples are assembled into a batch """
65 | batch_data.sort(key=lambda data_pair: len(data_pair[0]), reverse=True)
66 |
67 | sents, labels = zip(*batch_data)
68 | sents_len = [len(sent) for sent in sents]
69 | sents = [torch.LongTensor(sent) for sent in sents]
70 | padded_sents = pad_sequence(sents, batch_first=True, padding_value=0)
71 |
72 | return torch.LongTensor(padded_sents), torch.LongTensor(labels), torch.FloatTensor(sents_len)
73 |
74 |
75 | def get_wordvec(word2id, vec_file_path, vec_dim=50):
76 |     """ Load pre-trained word vectors from a txt file """
77 |     print("Loading word vectors ...")
78 | word_vectors = torch.nn.init.xavier_uniform_(torch.empty(len(word2id), vec_dim))
79 |     word_vectors[0, :] = 0  # zero vector for the <pad> token
80 | found = 0
81 | with open(vec_file_path, "r", encoding="utf-8") as f:
82 | lines = f.readlines()
83 | for line in lines:
84 | splited = line.split(" ")
85 | if splited[0] in word2id:
86 | found += 1
87 | word_vectors[word2id[splited[0]]] = torch.tensor(list(map(lambda x: float(x), splited[1:])))
88 |                 if found == len(word2id) - 1:  # allow <unk> to go unmatched
89 | break
90 |     print("%d words in total, %d of them matched a pre-trained vector" % (len(word2id), found))
91 | return word_vectors.float()
92 |
93 |
94 | def make_dataloader(dataset_path="../dataset/kaggle-movie-review", sent_col_name="Phrase", label_col_name="Sentiment", batch_size=32, vec_file_path="./.vector_cache/glove.6B.50d.txt", debug=False):
95 | # X, y = prepare_datapairs(dataset_path="../dataset/imdb", sent_col_name="review", label_col_name="sentiment")
96 | X, y = prepare_data(dataset_path=dataset_path, sent_col_name=sent_col_name, label_col_name=label_col_name)
97 |
98 | if debug:
99 | X, y = X[:100], y[:100]
100 |
101 | X_language = Language()
102 | X_language.fit(X)
103 | X = X_language.transform(X)
104 |
105 | word_vectors = get_wordvec(X_language.word2id, vec_file_path=vec_file_path, vec_dim=50)
106 |     # 18229 words in total, 12769 matched a pre-trained vector: word_vectors = get_wordvec(X_language.word2id,
107 | # vec_file_path=r"F:\NLP-pretrained-model\glove.twitter.27B\glove.twitter.27B.50d.txt", vec_dim=50)
108 |
109 |     # sanity check
110 | # print(X[:2])
111 | # X_id = X_language.transform(X[:2])
112 | # print(X_language.transform(X_id, reverse=True))
113 |
114 | X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
115 |
116 | cls_train_dataset, cls_val_dataset = ClsDataset(X_train, y_train), ClsDataset(X_val, y_val)
117 | cls_train_dataloader = DataLoader(cls_train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
118 | cls_val_dataloader = DataLoader(cls_val_dataset, batch_size=batch_size, collate_fn=collate_fn)
119 |
120 | return cls_train_dataloader, cls_val_dataloader, word_vectors, X_language
121 |
122 |
123 | if __name__ == "__main__":
124 | cls_train_dataloader, cls_val_dataloader, word_vectors, X_language = make_dataloader(debug=True, batch_size=10)
125 | for batch in cls_train_dataloader:
126 | X, y, lens = batch
127 | print(X.shape, y.shape)
128 | break
129 |
--------------------------------------------------------------------------------
/project2-DL-Sentence Classification/dataloader_bytorchtext.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | """
4 | Created on 2020/5/4 15:35
5 | @author: phil
6 | """
7 | import os
8 |
9 | import pandas as pd
10 | import spacy
11 | from sklearn.model_selection import train_test_split
12 | from torch.nn import init
13 | from torchtext import data
14 |
15 |
16 | def prepare_data(dataset_path, sent_col_name, label_col_name, debug=False):
17 |     """ Read sentences and labels from the tsv file and write train/val csv splits """
18 | file_path = os.path.join(dataset_path, "train.tsv")
19 | data = pd.read_csv(file_path, sep="\t")
20 | if debug:
21 | data = data.sample(n=100)
22 | X = data[sent_col_name].values
23 | y = data[label_col_name].values
24 | X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
25 | train_df, val_df = pd.DataFrame(), pd.DataFrame()
26 | train_df["sent"], train_df["label"] = X_train, y_train
27 | val_df["sent"], val_df["label"] = X_val, y_val
28 |
29 | train_file_path = os.path.join(dataset_path, "train.csv")
30 | val_file_path = os.path.join(dataset_path, "val.csv")
31 | train_df.to_csv(train_file_path, index=False)
32 | val_df.to_csv(val_file_path, index=False)
33 |
34 | return train_file_path, val_file_path
35 |
36 |
37 | def dataset2dataloader(dataset_path="../dataset/kaggle-movie-review", sent_col_name="Phrase", label_col_name="Sentiment", batch_size=32, vec_file_path="./.vector_cache/glove.6B.50d.txt", debug=False):
38 | train_file_name, val_file_name = prepare_data(dataset_path, sent_col_name, label_col_name, debug=debug)
39 | spacy_en = spacy.load('en_core_web_sm')
40 |
41 | def tokenizer(text):
42 |         """ Tokenization using spaCy """
43 | return [tok.text for tok in spacy_en.tokenizer(text)]
44 |
45 |     # These only define the data fields / format
46 | TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True)
47 | LABEL = data.Field(sequential=False, use_vocab=False)
48 | train, val = data.TabularDataset.splits(
49 | path='', train=train_file_name, validation=val_file_name, format='csv', skip_header=True,
50 | fields=[('sent', TEXT), ('label', LABEL)])
51 |
52 | TEXT.build_vocab(train, vectors='glove.6B.50d') # , max_size=30000)
53 |     # Initialization for tokens that appear in the corpus but not in the pre-trained vectors.
54 | TEXT.vocab.vectors.unk_init = init.xavier_uniform
55 |
56 | DEVICE = "cpu"
57 |     train_iter = data.BucketIterator(train, batch_size=batch_size, sort_key=lambda x: len(x.sent), device=DEVICE)
58 |     val_iter = data.BucketIterator(val, batch_size=batch_size, sort_key=lambda x: len(x.sent), shuffle=True, device=DEVICE)
59 |
60 |     # For test_iter, sort must be set to False, otherwise torchtext will scramble the sample order
61 | # test_iter = data.Iterator(dataset=test, batch_size=128, train=False, sort=False, device=DEVICE)
62 |
63 | return train_iter, val_iter, TEXT.vocab.vectors
64 |
65 |
66 | if __name__ == "__main__":
67 | train_iter, val_iter, vectors = dataset2dataloader(batch_size=32, debug=True)
68 |
69 | batch = next(iter(train_iter))
70 | print(batch.sent.shape)
71 | print(batch.label.shape)
72 |
--------------------------------------------------------------------------------
/project2-DL-Sentence Classification/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | """
4 | Created on 2020/4/30 8:33
5 | @author: phil
6 | """
7 | from torch import optim
8 | import torch
9 | from models import TextRNN, TextCNN
10 | from dataloader_bytorchtext import dataset2dataloader
11 | from dataloader_byhand import make_dataloader
12 | import numpy as np
13 |
14 | if __name__ == "__main__":
15 |     model_names = ["LSTM", "RNN", "CNN"]  # Easter egg: ordered by how hard they are to overfit, hardest first
16 | learning_rate = 0.001
17 | epoch_num = 500
18 | num_of_class = 5
19 | load_data_by_torchtext = True
20 |
21 | if load_data_by_torchtext:
22 | train_iter, val_iter, word_vectors = dataset2dataloader(batch_size=100, debug=True)
23 | else:
24 | train_iter, val_iter, word_vectors, X_lang = make_dataloader(batch_size=100, debug=True)
25 |
26 | for model_name in model_names[-1:]:
27 | if model_name == "RNN":
28 | model = TextRNN(vocab_size=len(word_vectors), embedding_dim=50, hidden_size=128, num_of_class=num_of_class, weights=word_vectors)
29 | elif model_name == "CNN":
30 | model = TextCNN(vocab_size=len(word_vectors), embedding_dim=50, num_of_class=num_of_class, embedding_vectors=word_vectors)
31 | elif model_name == "LSTM":
32 | model = TextRNN(vocab_size=len(word_vectors), embedding_dim=50, hidden_size=128, num_of_class=num_of_class, weights=word_vectors, rnn_type="LSTM")
33 | optimizer = optim.Adam(model.parameters(), lr=learning_rate)
34 | loss_fun = torch.nn.CrossEntropyLoss()
35 |
36 | for epoch in range(epoch_num):
37 |             model.train()  # models with dropout or batch norm must be switched between train/eval modes explicitly
38 | for i, batch in enumerate(train_iter):
39 | if load_data_by_torchtext:
40 | x, y = batch.sent.t(), batch.label
41 | else:
42 | x, y, lens = batch
43 | logits = model(x)
44 | optimizer.zero_grad()
45 | loss = loss_fun(logits, y)
46 | loss.backward()
47 | optimizer.step()
48 |
49 | # with torch.no_grad():
50 | model.eval()
51 | train_accs = []
52 | for i, batch in enumerate(train_iter):
53 | if load_data_by_torchtext:
54 | x, y = batch.sent.t(), batch.label
55 | else:
56 | x, y, lens = batch
57 |                 _, y_pre = torch.max(model(x), -1)
58 | acc = torch.mean((torch.tensor(y_pre == y, dtype=torch.float)))
59 | train_accs.append(acc)
60 | train_acc = np.array(train_accs).mean()
61 |
62 | val_accs = []
63 | for i, batch in enumerate(val_iter):
64 | if load_data_by_torchtext:
65 | x, y = batch.sent.t(), batch.label
66 | else:
67 | x, y, lens = batch
68 | logits = model(x)
69 | _, y_pre = torch.max(logits, -1)
70 | acc = torch.mean((torch.tensor(y_pre == y, dtype=torch.float)))
71 | val_accs.append(acc)
72 | val_acc = np.array(val_accs).mean()
73 | print("epoch %d train acc:%.2f, val acc:%.2f" % (epoch, train_acc, val_acc))
74 | if train_acc >= 0.99:
75 | break
76 |
77 |
--------------------------------------------------------------------------------
/project2-DL-Sentence Classification/models.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | """
4 | Created on 2020/5/15 22:23
5 | @author: phil
6 | """
7 | import torch.nn as nn
8 | import torch
9 | import torch.nn.functional as F
10 |
11 |
12 | class TextRNN(nn.Module):
13 | def __init__(self, vocab_size, embedding_dim, hidden_size, num_of_class, weights=None, rnn_type="RNN"):
14 | super(TextRNN, self).__init__()
15 |
16 | self.vocab_size = vocab_size
17 | self.hidden_size = hidden_size
18 | self.num_of_class = num_of_class
19 | self.embedding_dim = embedding_dim
20 | self.rnn_type = rnn_type
21 |
22 | if weights is not None:
23 | self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, _weight=weights)
24 | else:
25 | self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
26 |
27 | if rnn_type == "RNN":
28 | self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True)
29 | self.hidden2label = nn.Linear(hidden_size, num_of_class)
30 | elif rnn_type == "LSTM":
31 | self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True, bidirectional=True)
32 | self.hidden2label = nn.Linear(hidden_size*2, num_of_class)
33 |
34 | def forward(self, input_sents):
35 | # input_sents (batch_size, seq_len)
36 | batch_size, seq_len = input_sents.shape
37 | # (batch_size, seq_len, embedding_dim)
38 | embed_out = self.embed(input_sents)
39 |
40 | if self.rnn_type == "RNN":
41 | h0 = torch.randn(1, batch_size, self.hidden_size)
42 | _, hn = self.rnn(embed_out, h0)
43 |         elif self.rnn_type == "LSTM":
44 |             h0, c0 = torch.randn(2, batch_size, self.hidden_size), torch.randn(2, batch_size, self.hidden_size)
45 |             output, (hn, _) = self.lstm(embed_out, (h0, c0))
46 |             hn = torch.cat([hn[0], hn[1]], dim=-1).unsqueeze(0)  # concatenate the final states of the two directions
47 |         logits = self.hidden2label(hn).squeeze(0)
48 |
49 | return logits
50 |
51 |
52 | class TextCNN(nn.Module):
53 | def __init__(self, vocab_size, embedding_dim, num_of_class, embedding_vectors=None, kernel_num=100, kerner_size=[3, 4, 5], dropout=0.5):
54 | super(TextCNN, self).__init__()
55 | if embedding_vectors is None:
56 | self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
57 | else:
58 | self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, _weight=embedding_vectors)
59 | self.convs = nn.ModuleList([nn.Conv2d(1, kernel_num, (K, embedding_dim)) for K in kerner_size])
60 | self.dropout = nn.Dropout(dropout)
61 | self.feature2label = nn.Linear(3*kernel_num, num_of_class)
62 |
63 | def forward(self, x):
64 | # x shape (batch_size, seq_len)
65 | embed_out = self.embed(x).unsqueeze(1)
66 | conv_out = [F.relu(conv(embed_out)).squeeze(3) for conv in self.convs]
67 |
68 | pool_out = [F.max_pool1d(block, block.size(2)).squeeze(2) for block in conv_out]
69 |
70 | pool_out = torch.cat(pool_out, 1)
71 |
72 | logits = self.feature2label(pool_out)
73 |
74 | return logits
75 |
76 |
77 | if __name__ == "__main__":
78 | model = TextCNN(vocab_size=10, embedding_dim=10, num_of_class=10)
79 | x = torch.randint(10, (10, 20))
80 | logits = model.forward(x)
81 |
82 |
--------------------------------------------------------------------------------
/project3-Named Entity Recognition/dataloader.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | """
4 | Created on 2020/6/13 11:58
5 | @author: phil
6 | """
7 | import os
8 | import pandas as pd
9 | import spacy
10 | from torch.nn import init
11 | from torchtext import data
12 |
13 |
14 | def prepare_data(dataset_path, debug=False):
15 | train_file_path = os.path.join(dataset_path, "train.txt")
16 | dev_file_path = os.path.join(dataset_path, "dev.txt")
17 |
18 | def process_file(file_path, target_file_path):
19 | sents, tags = [], []
20 | with open(file_path, "r") as f:
21 | lines = f.readlines()
22 | sent, tag = [], []
23 | for line in lines:
24 | line = line.strip()
25 | if len(line) == 0:
26 | sents.append(" ".join(sent))
27 | tags.append(" ".join(tag))
28 | sent, tag = [], []
29 | else:
30 | splited = line.split(" ")
31 | sent.append(splited[0])
32 | tag.append(splited[-1])
33 | if len(sent) != 0:
34 | sents.append(" ".join(sent))
35 | tags.append(" ".join(tag))
36 | df = pd.DataFrame()
37 | df["sent"] = sents if not debug else sents[:100]
38 | df["tag"] = tags if not debug else tags[:100]
39 | df.to_csv(target_file_path, index=False)
40 |
41 | train_csv = os.path.join(dataset_path, "train.csv") if not debug else os.path.join(dataset_path, "train_small.csv")
42 | dev_csv = os.path.join(dataset_path, "dev.csv") if not debug else os.path.join(dataset_path, "train_dev.csv")
43 |
44 | if not os.path.exists(train_csv):
45 | process_file(train_file_path, train_csv)
46 | process_file(dev_file_path, dev_csv)
47 |
48 | return train_csv, dev_csv
49 |
50 |
51 | def dataset2dataloader(dataset_path="../dataset/conll2003-IOB", batch_size=3, debug=False):
52 | train_csv, dev_csv = prepare_data(dataset_path, debug=debug)
53 |
54 | def tokenizer(text):
55 | return text.split(" ")
56 |
57 |     # These only define the data fields / format
58 | TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=False)
59 | TAG = data.Field(sequential=True, tokenize=tokenizer, lower=False)
60 | train, val = data.TabularDataset.splits(
61 | path='', train=train_csv, validation=dev_csv, format='csv', skip_header=True,
62 | fields=[('sent', TEXT), ('tag', TAG)])
63 |
64 | TEXT.build_vocab(train, vectors='glove.6B.50d') # , max_size=30000)
65 | TAG.build_vocab(val)
66 |
67 |     # Initialization for tokens that appear in the corpus but not in the pre-trained vectors.
68 | TEXT.vocab.vectors.unk_init = init.xavier_uniform
69 |
70 | DEVICE = "cpu"
71 | train_iter = data.BucketIterator(train, batch_size=batch_size, sort_key=lambda x: len(x.sent), device=DEVICE)
72 | val_iter = data.BucketIterator(val, batch_size=batch_size, sort_key=lambda x: len(x.sent), device=DEVICE)
73 |
74 |     # For test_iter, sort must be set to False, otherwise torchtext will scramble the sample order
75 | # test_iter = data.Iterator(dataset=test, batch_size=128, train=False, sort=False, device=DEVICE)
76 |
77 | return train_iter, val_iter, TEXT.vocab, TAG.vocab
78 |
79 |
80 | if __name__ == "__main__":
81 | # train_csv, dev_csv = prepare_data(dataset_path="../dataset/conll2003-IOB")
82 | train_iter, val_iter, sent_vocab, tag_vocab = dataset2dataloader(dataset_path="../dataset/conll2003-IOB", debug=True)
83 | word_vectors = sent_vocab.vectors
84 |
85 | for batch in train_iter:
86 | print(batch.sent.shape, batch.tag.shape)
87 | break
--------------------------------------------------------------------------------
/project3-Named Entity Recognition/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | """
4 | Created on 2020/6/13 16:15
5 | @author: phil
6 | """
7 |
8 | from dataloader import dataset2dataloader
9 | from models import BiLSTM_CRF_NER
10 | from torch.optim import Adam
11 | import torch
12 | import numpy as np
13 | import os
14 |
15 | if __name__ == "__main__":
16 | train_iter, val_iter, sent_vocab, tag_vocab = dataset2dataloader(batch_size=128)
17 | word_vectors = sent_vocab.vectors
18 | device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
19 |
20 | model = BiLSTM_CRF_NER(vocab_size=len(sent_vocab.stoi), embedding_dim=50, hidden_size=128, num_tags=len(tag_vocab.stoi), word_vectors=word_vectors, device=device)
21 |
22 | epoch = 10
23 | learning_rate = 0.01
24 | model_path = "model.pkl"
25 |
26 | optimizer = Adam(model.parameters(), lr=learning_rate)
27 |
28 | if os.path.exists(model_path):
29 | model = torch.load(model_path)
30 | else:
31 | for ep in range(epoch):
32 | model.train()
33 | for i, batch in enumerate(train_iter):
34 | x, y = batch.sent.t(), batch.tag.t()
35 |                 mask = (x != sent_vocab.stoi["<pad>"])
36 | optimizer.zero_grad()
37 | loss = model(x, y, mask)
38 | loss.backward()
39 | optimizer.step()
40 | if i % 100 == 0:
41 | print(f"epoch:{ep}, iter:{i}, loss:{loss.item()}", end=" ")
42 |
43 | model.eval()
44 | train_accs = []
45 | preds, golds = [], []
46 | for i, batch in enumerate(train_iter):
47 | x, y = batch.sent.t(), batch.tag.t()
48 |                 mask = (x != sent_vocab.stoi["<pad>"])
49 | with torch.no_grad():
50 | preds = model.predict(x, mask)
51 | right, total = 0, 0
52 | for pred, gold in zip(preds, y):
53 | right += np.sum(np.array(pred) == gold[:len(pred)].numpy())
54 | total += len(pred)
55 | train_accs.append(right*1.0/total)
56 | train_acc = np.array(train_accs).mean()
57 |
58 | val_accs = []
59 | for i, batch in enumerate(val_iter):
60 | x, y = batch.sent.t(), batch.tag.t()
61 |                 mask = (x != sent_vocab.stoi["<pad>"])
62 | with torch.no_grad():
63 | preds = model.predict(x, mask)
64 | right, total = 0, 0
65 | for pred, gold in zip(preds, y):
66 | right += np.sum(np.array(pred) == gold[:len(pred)].numpy())
67 | total += len(pred)
68 | val_accs.append(right * 1.0 / total)
69 | val_acc = np.array(val_accs).mean()
70 |             print("epoch %d train acc:%.2f, val acc:%.2f" % (ep, train_acc, val_acc))
71 | torch.save(model, model_path)
72 | test_sents = ["My name is Phil , I am from European Union ."]
73 | for sent in test_sents:
74 | ids = [sent_vocab.stoi[word] for word in sent.split(" ")]
75 | input_tensor = torch.tensor([ids])
76 |         mask = input_tensor != sent_vocab.stoi["<pad>"]
77 | with torch.no_grad():
78 | pred = model.predict(input_tensor, mask)
79 | print(sent, "-->", [tag_vocab.itos[tag_id] for tag_id in pred[0]])
80 |
81 |
82 |
--------------------------------------------------------------------------------
/project3-Named Entity Recognition/model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/positivepeng/nlp-beginner-projects/2f066cbdd24121938b90ee1d12fe063ad0a46df5/project3-Named Entity Recognition/model.pkl
--------------------------------------------------------------------------------
/project3-Named Entity Recognition/models.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | """
4 | Created on 2020/6/13 16:01
5 | @author: phil
6 | """
7 |
8 | import torch.nn as nn
9 | from torchcrf import CRF
10 | import torch
11 |
12 |
13 | class BiLSTM_CRF_NER(nn.Module):
14 | def __init__(self, vocab_size, embedding_dim, hidden_size, num_tags, word_vectors=None, device="cpu"):
15 | super(BiLSTM_CRF_NER, self).__init__()
16 | self.device = device
17 | self.hidden_size = hidden_size
18 | self.embed = nn.Embedding(vocab_size, embedding_dim, _weight=word_vectors).to(device)
19 | self.lstm = nn.LSTM(embedding_dim, hidden_size, bidirectional=True, batch_first=True).to(device)
20 |         self.hidden2tag = nn.Linear(hidden_size*2, num_tags).to(device)
21 | self.crf = CRF(num_tags=num_tags, batch_first=True).to(device)
22 |
23 | def forward(self, x, y, mask):
24 | emissions = self.get_emissions(x)
25 | loss = -self.crf(emissions=emissions, tags=y, mask=mask)
26 | return loss
27 |
28 | def predict(self, x, mask=None):
29 | emissions = self.get_emissions(x)
30 | preds = self.crf.decode(emissions, mask)
31 | return preds
32 |
33 | def get_emissions(self, x):
34 | batch_size, seq_len = x.shape
35 | embedded = self.embed(x)
36 | h0, c0 = torch.zeros(2, batch_size, self.hidden_size).to(self.device), torch.zeros(2, batch_size, self.hidden_size).to(self.device)
37 | lstm_out, (_, _) = self.lstm(embedded, (h0, c0))
38 | emissions = self.hidden2tag(lstm_out)
39 | return emissions
40 |
--------------------------------------------------------------------------------
/project3-Named Entity Recognition/torchcrf/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = '0.7.2'
2 |
3 | from typing import List, Optional
4 |
5 | import torch
6 | import torch.nn as nn
7 |
8 |
9 | class CRF(nn.Module):
10 | """Conditional random field.
11 |
12 | This module implements a conditional random field [LMP01]_. The forward computation
13 | of this class computes the log likelihood of the given sequence of tags and
14 | emission score tensor. This class also has `~CRF.decode` method which finds
15 | the best tag sequence given an emission score tensor using `Viterbi algorithm`_.
16 |
17 | Args:
18 | num_tags: Number of tags.
19 | batch_first: Whether the first dimension corresponds to the size of a minibatch.
20 |
21 | Attributes:
22 | start_transitions (`~torch.nn.Parameter`): Start transition score tensor of size
23 | ``(num_tags,)``.
24 | end_transitions (`~torch.nn.Parameter`): End transition score tensor of size
25 | ``(num_tags,)``.
26 | transitions (`~torch.nn.Parameter`): Transition score tensor of size
27 | ``(num_tags, num_tags)``.
28 |
29 |
30 | .. [LMP01] Lafferty, J., McCallum, A., Pereira, F. (2001).
31 | "Conditional random fields: Probabilistic models for segmenting and
32 | labeling sequence data". *Proc. 18th International Conf. on Machine
33 | Learning*. Morgan Kaufmann. pp. 282–289.
34 |
35 | .. _Viterbi algorithm: https://en.wikipedia.org/wiki/Viterbi_algorithm
36 | """
37 |
38 | def __init__(self, num_tags: int, batch_first: bool = False) -> None:
39 | if num_tags <= 0:
40 | raise ValueError(f'invalid number of tags: {num_tags}')
41 | super().__init__()
42 | self.num_tags = num_tags
43 | self.batch_first = batch_first
44 | self.start_transitions = nn.Parameter(torch.empty(num_tags))
45 | self.end_transitions = nn.Parameter(torch.empty(num_tags))
46 | self.transitions = nn.Parameter(torch.empty(num_tags, num_tags))
47 |
48 | self.reset_parameters()
49 |
50 | def reset_parameters(self) -> None:
51 | """Initialize the transition parameters.
52 |
53 | The parameters will be initialized randomly from a uniform distribution
54 | between -0.1 and 0.1.
55 | """
56 | nn.init.uniform_(self.start_transitions, -0.1, 0.1)
57 | nn.init.uniform_(self.end_transitions, -0.1, 0.1)
58 | nn.init.uniform_(self.transitions, -0.1, 0.1)
59 |
60 | def __repr__(self) -> str:
61 | return f'{self.__class__.__name__}(num_tags={self.num_tags})'
62 |
63 | def forward(
64 | self,
65 | emissions: torch.Tensor,
66 | tags: torch.LongTensor,
67 | mask: Optional[torch.ByteTensor] = None,
68 | reduction: str = 'sum',
69 | ) -> torch.Tensor:
70 | """Compute the conditional log likelihood of a sequence of tags given emission scores.
71 |
72 | Args:
73 | emissions (`~torch.Tensor`): Emission score tensor of size
74 | ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``,
75 | ``(batch_size, seq_length, num_tags)`` otherwise.
76 | tags (`~torch.LongTensor`): Sequence of tags tensor of size
77 | ``(seq_length, batch_size)`` if ``batch_first`` is ``False``,
78 | ``(batch_size, seq_length)`` otherwise.
79 | mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)``
80 | if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise.
81 | reduction: Specifies the reduction to apply to the output:
82 | ``none|sum|mean|token_mean``. ``none``: no reduction will be applied.
83 | ``sum``: the output will be summed over batches. ``mean``: the output will be
84 | averaged over batches. ``token_mean``: the output will be averaged over tokens.
85 |
86 | Returns:
87 | `~torch.Tensor`: The log likelihood. This will have size ``(batch_size,)`` if
88 | reduction is ``none``, ``()`` otherwise.
89 | """
90 | self._validate(emissions, tags=tags, mask=mask)
91 | if reduction not in ('none', 'sum', 'mean', 'token_mean'):
92 | raise ValueError(f'invalid reduction: {reduction}')
93 | if mask is None:
94 | mask = torch.ones_like(tags, dtype=torch.uint8)
95 |
96 | if self.batch_first:
97 | emissions = emissions.transpose(0, 1)
98 | tags = tags.transpose(0, 1)
99 | mask = mask.transpose(0, 1)
100 |
101 | # shape: (batch_size,)
102 | numerator = self._compute_score(emissions, tags, mask)
103 | # shape: (batch_size,)
104 | denominator = self._compute_normalizer(emissions, mask)
105 | # shape: (batch_size,)
106 | llh = numerator - denominator
107 |
108 | if reduction == 'none':
109 | return llh
110 | if reduction == 'sum':
111 | return llh.sum()
112 | if reduction == 'mean':
113 | return llh.mean()
114 | assert reduction == 'token_mean'
115 | return llh.sum() / mask.float().sum()
116 |
117 | def decode(self, emissions: torch.Tensor,
118 | mask: Optional[torch.ByteTensor] = None) -> List[List[int]]:
119 | """Find the most likely tag sequence using Viterbi algorithm.
120 |
121 | Args:
122 | emissions (`~torch.Tensor`): Emission score tensor of size
123 | ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``,
124 | ``(batch_size, seq_length, num_tags)`` otherwise.
125 | mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)``
126 | if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise.
127 |
128 | Returns:
129 | List of list containing the best tag sequence for each batch.
130 | """
131 | self._validate(emissions, mask=mask)
132 | if mask is None:
133 | mask = emissions.new_ones(emissions.shape[:2], dtype=torch.uint8)
134 |
135 | if self.batch_first:
136 | emissions = emissions.transpose(0, 1)
137 | mask = mask.transpose(0, 1)
138 |
139 | return self._viterbi_decode(emissions, mask)
140 |
141 | def _validate(
142 | self,
143 | emissions: torch.Tensor,
144 | tags: Optional[torch.LongTensor] = None,
145 | mask: Optional[torch.ByteTensor] = None) -> None:
146 | if emissions.dim() != 3:
147 | raise ValueError(f'emissions must have dimension of 3, got {emissions.dim()}')
148 | if emissions.size(2) != self.num_tags:
149 | raise ValueError(
150 | f'expected last dimension of emissions is {self.num_tags}, '
151 | f'got {emissions.size(2)}')
152 |
153 | if tags is not None:
154 | if emissions.shape[:2] != tags.shape:
155 | raise ValueError(
156 | 'the first two dimensions of emissions and tags must match, '
157 | f'got {tuple(emissions.shape[:2])} and {tuple(tags.shape)}')
158 |
159 | if mask is not None:
160 | if emissions.shape[:2] != mask.shape:
161 | raise ValueError(
162 | 'the first two dimensions of emissions and mask must match, '
163 | f'got {tuple(emissions.shape[:2])} and {tuple(mask.shape)}')
164 | no_empty_seq = not self.batch_first and mask[0].all()
165 | no_empty_seq_bf = self.batch_first and mask[:, 0].all()
166 | if not no_empty_seq and not no_empty_seq_bf:
167 | raise ValueError('mask of the first timestep must all be on')
168 |
169 | def _compute_score(
170 | self, emissions: torch.Tensor, tags: torch.LongTensor,
171 | mask: torch.ByteTensor) -> torch.Tensor:
172 | # emissions: (seq_length, batch_size, num_tags)
173 | # tags: (seq_length, batch_size)
174 | # mask: (seq_length, batch_size)
175 | assert emissions.dim() == 3 and tags.dim() == 2
176 | assert emissions.shape[:2] == tags.shape
177 | assert emissions.size(2) == self.num_tags
178 | assert mask.shape == tags.shape
179 | assert mask[0].all()
180 |
181 | seq_length, batch_size = tags.shape
182 | mask = mask.float()
183 |
184 | # Start transition score and first emission
185 | # shape: (batch_size,)
186 | score = self.start_transitions[tags[0]]
187 | score += emissions[0, torch.arange(batch_size), tags[0]]
188 |
189 | for i in range(1, seq_length):
190 | # Transition score to next tag, only added if next timestep is valid (mask == 1)
191 | # shape: (batch_size,)
192 | score += self.transitions[tags[i - 1], tags[i]] * mask[i]
193 |
194 | # Emission score for next tag, only added if next timestep is valid (mask == 1)
195 | # shape: (batch_size,)
196 | score += emissions[i, torch.arange(batch_size), tags[i]] * mask[i]
197 |
198 | # End transition score
199 | # shape: (batch_size,)
200 | seq_ends = mask.long().sum(dim=0) - 1
201 | # shape: (batch_size,)
202 | last_tags = tags[seq_ends, torch.arange(batch_size)]
203 | # shape: (batch_size,)
204 | score += self.end_transitions[last_tags]
205 |
206 | return score
207 |
208 | def _compute_normalizer(
209 | self, emissions: torch.Tensor, mask: torch.ByteTensor) -> torch.Tensor:
210 | # emissions: (seq_length, batch_size, num_tags)
211 | # mask: (seq_length, batch_size)
212 | assert emissions.dim() == 3 and mask.dim() == 2
213 | assert emissions.shape[:2] == mask.shape
214 | assert emissions.size(2) == self.num_tags
215 | assert mask[0].all()
216 |
217 | seq_length = emissions.size(0)
218 |
219 | # Start transition score and first emission; score has size of
220 | # (batch_size, num_tags) where for each batch, the j-th column stores
221 | # the score that the first timestep has tag j
222 | # shape: (batch_size, num_tags)
223 | score = self.start_transitions + emissions[0]
224 |
225 | for i in range(1, seq_length):
226 | # Broadcast score for every possible next tag
227 | # shape: (batch_size, num_tags, 1)
228 | broadcast_score = score.unsqueeze(2)
229 |
230 | # Broadcast emission score for every possible current tag
231 | # shape: (batch_size, 1, num_tags)
232 | broadcast_emissions = emissions[i].unsqueeze(1)
233 |
234 | # Compute the score tensor of size (batch_size, num_tags, num_tags) where
235 | # for each sample, entry at row i and column j stores the sum of scores of all
236 | # possible tag sequences so far that end with transitioning from tag i to tag j
237 | # and emitting
238 | # shape: (batch_size, num_tags, num_tags)
239 | next_score = broadcast_score + self.transitions + broadcast_emissions
240 |
241 | # Sum over all possible current tags, but we're in score space, so a sum
242 | # becomes a log-sum-exp: for each sample, entry i stores the sum of scores of
243 | # all possible tag sequences so far, that end in tag i
244 | # shape: (batch_size, num_tags)
245 | next_score = torch.logsumexp(next_score, dim=1)
246 |
247 | # Set score to the next score if this timestep is valid (mask == 1)
248 | # shape: (batch_size, num_tags)
249 | score = torch.where(mask[i].unsqueeze(1), next_score, score)
250 |
251 | # End transition score
252 | # shape: (batch_size, num_tags)
253 | score += self.end_transitions
254 |
255 | # Sum (log-sum-exp) over all possible tags
256 | # shape: (batch_size,)
257 | return torch.logsumexp(score, dim=1)
258 |
259 | def _viterbi_decode(self, emissions: torch.FloatTensor,
260 | mask: torch.ByteTensor) -> List[List[int]]:
261 | # emissions: (seq_length, batch_size, num_tags)
262 | # mask: (seq_length, batch_size)
263 | assert emissions.dim() == 3 and mask.dim() == 2
264 | assert emissions.shape[:2] == mask.shape
265 | assert emissions.size(2) == self.num_tags
266 | assert mask[0].all()
267 |
268 | seq_length, batch_size = mask.shape
269 |
270 | # Start transition and first emission
271 | # shape: (batch_size, num_tags)
272 | score = self.start_transitions + emissions[0]
273 | history = []
274 |
275 | # score is a tensor of size (batch_size, num_tags) where for every batch,
276 | # value at column j stores the score of the best tag sequence so far that ends
277 | # with tag j
278 | # history saves where the best tags candidate transitioned from; this is used
279 | # when we trace back the best tag sequence
280 |
281 | # Viterbi algorithm recursive case: we compute the score of the best tag sequence
282 | # for every possible next tag
283 | for i in range(1, seq_length):
284 | # Broadcast viterbi score for every possible next tag
285 | # shape: (batch_size, num_tags, 1)
286 | broadcast_score = score.unsqueeze(2)
287 |
288 | # Broadcast emission score for every possible current tag
289 | # shape: (batch_size, 1, num_tags)
290 | broadcast_emission = emissions[i].unsqueeze(1)
291 |
292 | # Compute the score tensor of size (batch_size, num_tags, num_tags) where
293 | # for each sample, entry at row i and column j stores the score of the best
294 | # tag sequence so far that ends with transitioning from tag i to tag j and emitting
295 | # shape: (batch_size, num_tags, num_tags)
296 | next_score = broadcast_score + self.transitions + broadcast_emission
297 |
298 | # Find the maximum score over all possible current tag
299 | # shape: (batch_size, num_tags)
300 | next_score, indices = next_score.max(dim=1)
301 |
302 | # Set score to the next score if this timestep is valid (mask == 1)
303 | # and save the index that produces the next score
304 | # shape: (batch_size, num_tags)
305 | score = torch.where(mask[i].unsqueeze(1), next_score, score)
306 | history.append(indices)
307 |
308 | # End transition score
309 | # shape: (batch_size, num_tags)
310 | score += self.end_transitions
311 |
312 | # Now, compute the best path for each sample
313 |
314 | # shape: (batch_size,)
315 | seq_ends = mask.long().sum(dim=0) - 1
316 | best_tags_list = []
317 |
318 | for idx in range(batch_size):
319 | # Find the tag which maximizes the score at the last timestep; this is our best tag
320 | # for the last timestep
321 | _, best_last_tag = score[idx].max(dim=0)
322 | best_tags = [best_last_tag.item()]
323 |
324 | # We trace back where the best last tag comes from, append that to our best tag
325 | # sequence, and trace it back again, and so on
326 | for hist in reversed(history[:seq_ends[idx]]):
327 | best_last_tag = hist[idx][best_tags[-1]]
328 | best_tags.append(best_last_tag.item())
329 |
330 | # Reverse the order because we start from the last timestep
331 | best_tags.reverse()
332 | best_tags_list.append(best_tags)
333 |
334 | return best_tags_list
335 |
--------------------------------------------------------------------------------
/project4-Machine Translation/dataloader.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | """
4 | Created on 2020/6/8 11:26
5 | @author: phil
6 | """
7 |
8 | # Adapted from an assignment in Andrew Ng's deep learning course (NetEase Cloud Classroom)
9 | import os
10 | import numpy as np
11 | import torch
12 | from faker import Faker
13 | import random
14 |
15 | from torch.nn import init
16 | from tqdm import tqdm
17 | from babel.dates import format_date
18 | from torchtext import data
19 | import pandas as pd
20 | from sklearn.model_selection import train_test_split
21 | fake = Faker()
22 | Faker.seed(12345)
23 | random.seed(12345)
24 |
25 | # Define format of the data we would like to generate
26 | FORMATS = ['short',
27 | 'medium',
28 | 'long',
29 | 'full',
30 | 'full',
31 | 'full',
32 | 'full',
33 | 'full',
34 | 'full',
35 | 'full',
36 | 'full',
37 | 'full',
38 | 'full',
39 | 'd MMM YYY',
40 | 'd MMMM YYY',
41 | 'dd MMM YYY',
42 | 'd MMM, YYY',
43 | 'd MMMM, YYY',
44 | 'dd, MMM YYY',
45 | 'd MM YY',
46 | 'd MMMM YYY',
47 | 'MMMM d YYY',
48 | 'MMMM d, YYY',
49 | 'dd.MM.YY']
50 |
51 | # change this if you want it to work with another language
52 | LOCALES = ['en_US']
53 |
54 |
55 | def load_date():
56 | """
57 | Loads some fake dates
58 | :returns: tuple containing human readable string, machine readable string, and date object
59 | """
60 | dt = fake.date_object()
61 |
62 | try:
63 | human_readable = format_date(dt, format=random.choice(FORMATS), locale='en_US')
64 | human_readable = human_readable.lower()
65 | human_readable = human_readable.replace(',', '')
66 | machine_readable = dt.isoformat()
67 | except AttributeError as e:
68 | return None, None, None
69 |
70 | return human_readable, machine_readable, dt
71 |
72 |
73 | def load_dataset(m):
74 | """
75 | Loads a dataset with m examples and vocabularies
76 | :m: the number of examples to generate
77 | """
78 | dataset = []
79 | for _ in tqdm(range(m)):
80 | h, m, _ = load_date()
81 | if h is not None:
82 | dataset.append([h, m])
83 |
84 | return dataset
85 |
86 |
87 | def prepare_data(dataset_path=r"../dataset/date-normalization", dataset_size=10, debug=False):
88 | if debug:
89 | dataset_size = 10
90 |         train_file = os.path.join(dataset_path, "train_small.csv")
91 |         eval_file = os.path.join(dataset_path, "eval_small.csv")
92 | else:
93 | train_file = os.path.join(dataset_path, "train.csv")
94 | eval_file = os.path.join(dataset_path, "eval.csv")
95 |     if not os.path.exists(train_file) and not os.path.exists(eval_file):
96 | dataset = load_dataset(dataset_size)
97 | source, target = zip(*dataset)
98 | X_train, X_test, y_train, y_test = train_test_split(source, target, random_state=42, test_size=0.2)
99 | train_df = pd.DataFrame()
100 | train_df["source"], train_df["target"] = X_train, y_train
101 | eval_df = pd.DataFrame()
102 | eval_df["source"], eval_df["target"] = X_test, y_test
103 | train_df.to_csv(train_file, index=False)
104 | eval_df.to_csv(eval_file, index=False)
105 | return train_file, eval_file
106 |
107 |
108 | def dataset2dataloader(dataset_path, batch_size=10, dataset_size=10, debug=False):
109 | train_csv, dev_csv = prepare_data(dataset_path, dataset_size=dataset_size, debug=debug)
110 |
111 | def tokenizer(text):
112 | return list(text)
113 |
114 |     # These only define the data fields / format
115 | SOURCE = data.Field(sequential=True, tokenize=tokenizer, lower=False)
116 |     # Special start/end markers are added around the target sequence
117 |     TARGET = data.Field(sequential=True, tokenize=tokenizer, lower=False, init_token="<sos>", eos_token="<eos>")
118 | train, val = data.TabularDataset.splits(
119 | path='', train=train_csv, validation=dev_csv, format='csv', skip_header=True,
120 | fields=[('source', SOURCE), ('target', TARGET)])
121 |
122 | SOURCE.build_vocab(train)
123 | TARGET.build_vocab(train)
124 |
125 |     train_iter = data.BucketIterator(train, batch_size=batch_size, sort_key=lambda x: len(x.source), shuffle=False)
126 |     val_iter = data.BucketIterator(val, batch_size=batch_size, sort_key=lambda x: len(x.source), shuffle=False)
127 |
128 |     # For test_iter, sort must be set to False, otherwise torchtext will scramble the sample order
129 | # test_iter = data.Iterator(dataset=test, batch_size=128, train=False, sort=False, device=DEVICE)
130 |
131 | return train_iter, val_iter, SOURCE.vocab, TARGET.vocab
--------------------------------------------------------------------------------
/project4-Machine Translation/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | """
4 | Created on 2020/6/8 11:36
5 | @author: phil
6 | """
7 | from keras.utils import to_categorical
8 |
9 | from dataloader import load_dataset, dataset2dataloader
10 | from models import SimpleNMT
11 | from torch import optim
12 | import torch.nn as nn
13 | import torch
14 | import numpy as np
15 | from pprint import pprint
16 | from tqdm import tqdm
17 |
18 | if __name__ == "__main__":
19 | epoch = 500
20 | learning_rate = 0.001
21 | hidden_size = 64
22 | batch_size = 10
23 |
24 | train_iter, val_iter, source_vocab, target_vocab = dataset2dataloader(dataset_path=r"../dataset/date-normalization",
25 | batch_size=batch_size, dataset_size=10000, debug=True)
26 | source_vocab_size = len(source_vocab.stoi)
27 | target_vocab_size = len(target_vocab.stoi)
28 |
29 | # print(target_vocab.stoi)
30 |
31 |     Tx, Ty = 25, 10  # maximum source / target sequence lengths
32 |
33 | model = SimpleNMT(in_vocab_size=source_vocab_size, out_vocab_size=target_vocab_size, in_hidden_size=hidden_size,
34 | out_hidden_size=hidden_size, output_size=target_vocab_size, with_attention=True)
35 |
36 | optimizer = optim.Adam(model.parameters(), lr=learning_rate)
37 | criterion = nn.CrossEntropyLoss()
38 |
39 | embed_layer1 = nn.Embedding(source_vocab_size, source_vocab_size,
40 | _weight=torch.from_numpy(np.eye(source_vocab_size)))
41 | embed_layer2 = nn.Embedding(target_vocab_size, target_vocab_size,
42 | _weight=torch.from_numpy(np.eye(target_vocab_size)))
43 |
44 | model.train()
45 | for ep in range(epoch):
46 | epoch_loss = 0
47 | for batch in train_iter:
48 | optimizer.zero_grad()
49 | Xin, Yin, Yout = batch.source.t().long(), batch.target.t()[:, :-1].long(), batch.target.t()[:, 1:]
50 | batch_size = len(Xin)
51 | init_hidden = torch.zeros(1, batch_size, hidden_size)
52 | # if ep == epoch - 1:
53 | # print(Yout)
54 | Xin = embed_layer1(Xin).float()
55 | Yin = embed_layer2(Yin).float()
56 | logits = model(Xin, init_hidden, Yin)
57 | loss = criterion(logits.view(-1, logits.shape[-1]), Yout.flatten())
58 | epoch_loss += loss.item()
59 | loss.backward()
60 | optimizer.step()
61 | if ep % (epoch // 10) == 0:
62 | print("loss", epoch_loss)
63 |
64 | # Check whether the model reproduces the training-set outputs correctly
65 | # for batch in train_iter:
66 | # # print(batch.source.t())
67 | # print(batch.target.t()[:, 1:])
68 | # print("finish")
69 | # init_hidden = torch.zeros(1, batch_size, hidden_size)
70 | # logits = model(Xin, init_hidden, Yin)
71 | # print(logits.argmax(-1))
72 |
73 | sents_for_large = ["monday may 7 1983", "19 march 1998", "18 jul 2008", "9/10/70", "thursday january 1 1981",
74 | "thursday january 26 2015", "saturday april 18 1990", "sunday may 12 1988"]
75 | sents = ["monday march 7 1983", "9 may 1998", "thursday january 26 1995", "9/10/70"]
76 |
77 |
78 | def translate(model, sents):
79 | X = []
80 | for sent in sents:
81 | X.append(list(map(lambda x: source_vocab[x], list(sent))) + [source_vocab[""]] * (Tx - len(sent)))
82 | Xoh = torch.from_numpy(np.array(list(map(lambda x: to_categorical(x, num_classes=source_vocab_size), X))))
83 | encoder_init_hidden = torch.zeros(1, len(X), hidden_size)
84 | preds = model(Xoh, encoder_init_hidden, decoder_input=None, out_word2index=target_vocab.stoi,
85 | out_index2word=target_vocab.itos, max_len=Ty, out_size=target_vocab_size)
86 | for gold, pred in zip(sents, preds):
87 | print(gold, "-->", "".join(pred))
88 |
89 |
90 | translate(model, sents)
91 |
92 | """ 不使用 attention
93 | dataset_size : 10000
94 | loss 940.5139790773392
95 | loss 151.68325132876635
96 | loss 17.91189043689519
97 | loss 8.461621267197188
98 | loss 0.4571912245155545
99 | loss 4.067497536438168
100 | loss 0.02432645454427984
101 | loss 0.022933890589229122
102 | loss 1.740354736426525
103 | loss 2.7019595313686295
104 | monday may 7 1983 --> 1983-05-07
105 | 19 march 1998 --> 1998-03-19
106 | 18 jul 2008 --> 2008-07-18
107 | 9/10/70 --> 1970-09-10
108 | thursday january 1 1981 --> 1981-01-01
109 | thursday january 26 2015 --> 2015-01-26
110 | saturday april 18 1990 --> 1990-04-18
111 | sunday may 12 1988 --> 1988-05-12
112 | """
113 |
114 | """使用attention
115 | loss 870.4544065594673
116 | loss 65.41884177550673
117 | loss 53.339022306521656
118 | loss 0.08635593753569992
119 | loss 0.057157438381182146
120 | loss 0.0006471980702968949
121 | loss 0.09261544834953384
122 | loss 0.000922315769471993
123 | loss 0.00961817828419953
124 | loss 0.06814217135979561
125 | monday may 7 1983 --> 1983-05-07
126 | 19 march 1998 --> 1998-03-19
127 | 18 jul 2008 --> 2008-07-18
128 | 9/10/70 --> 1970-09-10
129 | thursday january 1 1981 --> 1981-01-01
130 | thursday january 26 2015 --> 2015-01-26
131 | saturday april 18 1990 --> 1990-04-18
132 | sunday may 12 1988 --> 1988-05-12
133 | """
134 |
--------------------------------------------------------------------------------
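
The embed_layer1/embed_layer2 trick in main.py above builds one-hot inputs by giving nn.Embedding an identity matrix as its weight; since those weights are never handed to the optimizer, they stay fixed. A small sketch of the same idea in isolation (the vocabulary size here is arbitrary):

    import numpy as np
    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    vocab_size = 5
    one_hot = nn.Embedding(vocab_size, vocab_size, _weight=torch.from_numpy(np.eye(vocab_size)))
    idx = torch.tensor([[0, 3, 4]])             # (batch=1, seq_len=3) token ids
    print(one_hot(idx).float())                 # (1, 3, 5) rows of the identity matrix, i.e. one-hot vectors
    print(F.one_hot(idx, vocab_size).float())   # equivalent built-in, for comparison
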
/project4-Machine Translation/models.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | """
4 | Created on 2020/6/10 11:18
5 | @author: phil
6 | """
7 | import torch.nn as nn
8 | import torch
9 | import numpy as np
10 | import torch.nn.functional as F
11 |
12 |
13 | class EncoderRNN(nn.Module):
14 | def __init__(self, vocab_size, hidden_size, dropout=0.5):
15 | super(EncoderRNN, self).__init__()
16 | self.hidden_size = hidden_size
17 | self.gru = nn.GRU(vocab_size, hidden_size, dropout=dropout, batch_first=True)
18 |
19 | def forward(self, x, init_hidden):
20 | seq_output, last_state = self.gru(x, init_hidden)
21 | return seq_output, last_state
22 |
23 |
24 | class DecoderRNN(nn.Module):
25 | def __init__(self, vocab_size, hidden_size, output_size, dropout=0.5):
26 | super(DecoderRNN, self).__init__()
27 | self.hidden_size = hidden_size
28 | self.gru = nn.GRU(vocab_size, hidden_size, dropout=dropout, batch_first=True)
29 | self.hidden2index = nn.Linear(hidden_size, output_size)
30 |
31 | def forward(self, x, init_state):
32 | seq_output, last_state = self.gru(x, init_state)
33 | seq_output = self.hidden2index(seq_output)
34 | return seq_output, last_state
35 |
36 |
37 | class DecoderAttenRNN(nn.Module):
38 | def __init__(self, vocab_size, hidden_size, output_size, dropout=0.5):
39 | super(DecoderAttenRNN, self).__init__()
40 | self.hidden_size = hidden_size
41 | self.gru = nn.GRU(vocab_size, hidden_size, dropout=dropout, batch_first=True)
42 | self.hidden2label = nn.Linear(hidden_size, output_size)
43 | self.atten_affine = nn.Linear(hidden_size*2, hidden_size)
44 |
45 | def get_alpha(self, hi, encoder_output):
46 | # hi shape (1, batch_size, hidden_size)
47 | # encoder_output (batch, seq_len, hidden_size)
48 | hi = hi.permute(1, 2, 0) # (batch_size, hidden_size, 1)
49 | # print(encoder_output.shape, hi.shape)
50 | e = torch.bmm(encoder_output, hi).squeeze(2) # (batch_size, seq_len)
51 | e = F.softmax(e, dim=1).unsqueeze(2) # (batch_size, seq_len, 1)
52 | alpha = (e * encoder_output).sum(dim=1) # (batch_size, hidden_size)
53 |
54 | return alpha
55 |
56 | def forward(self, x, init_state, seq_encoder_output):
57 | # print(x.shape, init_state.shape, seq_encoder_output.shape)
58 | batch_size, max_len, _ = x.shape  # x is the one-hot encoded decoder input
59 | hi = init_state
60 | seq_decoder_output = []
61 | for i in range(max_len):
62 | # alpha shape (batch_size, hidden_size)
63 | alpha = self.get_alpha(hi, seq_encoder_output)  # alpha: attention-weighted sum of encoder outputs (context vector) for the current decoder hidden state
64 | hi = torch.cat([alpha.unsqueeze(0), hi], dim=2)
65 | hi = self.atten_affine(hi)
66 | output, hi = self.gru(x[:, i, :].unsqueeze(1), hi)
67 | seq_output = self.hidden2label(output.squeeze(1))
68 | seq_decoder_output.append(seq_output.squeeze(1))
69 | seq_decoder_output = torch.stack(seq_decoder_output, dim=1)
70 | return seq_decoder_output, hi
71 |
72 |
73 | class SimpleNMT(nn.Module):
74 | def __init__(self, in_vocab_size, out_vocab_size, in_hidden_size, out_hidden_size, output_size, with_attention=False):
75 | super(SimpleNMT, self).__init__()
76 | self.with_attention = with_attention
77 | self.encoder = EncoderRNN(in_vocab_size, in_hidden_size)
78 | if self.with_attention:
79 | self.decoder = DecoderAttenRNN(out_vocab_size, out_hidden_size, output_size)
80 | else:
81 | self.decoder = DecoderRNN(out_vocab_size, out_hidden_size, output_size)
82 |
83 | def forward(self, encoder_input, encoder_init_hidden, decoder_input=None, out_word2index=None, out_index2word=None,
84 | max_len=None, out_size=None):
85 | encoder_seq_output, encoder_last_state = self.encoder(encoder_input, encoder_init_hidden)
86 | # During training, the decoder is fed the gold token at every time step (teacher forcing)
87 | if decoder_input is not None:
88 | if self.with_attention:
89 | logits, _ = self.decoder(decoder_input, encoder_last_state, encoder_seq_output)
90 | else:
91 | logits, _ = self.decoder(decoder_input, encoder_last_state)
92 | return logits
93 | else:
94 | # At test time there is no gold answer: keep decoding until the end token is produced or the maximum length is reached
95 | decoded_sents = []
96 | for i in range(len(encoder_input)):
97 | sent = []
98 | decoder_input = torch.FloatTensor(np.eye(out_size)[[out_word2index[""]]]).unsqueeze(0)
99 | hi = encoder_last_state[:, i, :].unsqueeze(1)
100 | for di in range(max_len):
101 | if self.with_attention:
102 | # alpha = self.decoder.get_alpha(hi, encoder_seq_output[i, :, :].unsqueeze(
103 | # 0))  # alpha: attention-weighted context for the current decoder hidden state
104 | # hi = torch.cat([alpha.unsqueeze(0), hi], dim=2)
105 | # hi = self.decoder.atten_affine(hi)
106 | # # print(decoder_input.shape, hi.shape, encoder_seq_output.shape)
107 | decoder_output, hdi = self.decoder(decoder_input, hi, encoder_seq_output[i, :, :].unsqueeze(0))
108 | else:
109 | decoder_output, hdi = self.decoder(decoder_input, hi)
110 | topv, topi = decoder_output.data.topk(1)
111 | topi = topi.item()
112 | if topi == out_word2index[""]:
113 | break
114 | else:
115 | sent.append(out_index2word[topi])
116 | decoder_input = torch.FloatTensor([np.eye(out_size)[topi]]).unsqueeze(0)
117 | hi = hdi
118 | decoded_sents.append(sent)
119 | return decoded_sents
120 |
--------------------------------------------------------------------------------
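
For reference, DecoderAttenRNN.get_alpha above is plain dot-product attention: score each encoder output against the current decoder hidden state, softmax over the source positions, and take the weighted sum as a context vector. A standalone sketch with explicit (arbitrary) shapes:

    import torch
    import torch.nn.functional as F

    batch, seq_len, hidden = 2, 7, 64
    encoder_output = torch.randn(batch, seq_len, hidden)   # all encoder time steps
    hi = torch.randn(1, batch, hidden)                      # current decoder hidden state

    scores = torch.bmm(encoder_output, hi.permute(1, 2, 0)).squeeze(2)  # (batch, seq_len) dot products
    weights = F.softmax(scores, dim=1).unsqueeze(2)                      # (batch, seq_len, 1) attention weights
    context = (weights * encoder_output).sum(dim=1)                      # (batch, hidden) context vector
    print(context.shape)  # torch.Size([2, 64])
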
/project5-Text Generation/dataloader.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | """
4 | Created on 2020/6/18 10:56
5 | @author: phil
6 | """
7 | import os
8 | import pandas as pd
9 | from torchtext import data
10 |
11 |
12 | def prepare_data(dataset_path="../dataset/poetry"):
13 | file_path = os.path.join(dataset_path, "poetryFromTang.txt")
14 | target_path = os.path.join(dataset_path, "train.csv")
15 | if not os.path.exists(target_path):
16 | with open(file_path, encoding="utf-8") as f:
17 | lines = f.read().split("\n\n")
18 | lines = list(map(lambda x: x.replace("\n", ""), lines))
19 | df = pd.DataFrame()
20 | df["sent"] = lines
21 | df.to_csv(target_path, index=False, encoding='utf_8_sig')
22 | return target_path
23 |
24 |
25 | def dataset2dataloader(dataset_path="../dataset/poetry", batch_size=32, debug=False):
26 | if debug:
27 | train_csv = os.path.join(dataset_path, "train_small.csv")
28 | else:
29 | train_csv = prepare_data(dataset_path)
30 |
31 | def tokenizer(text):
32 | return list(text)
33 |
34 | SENT = data.Field(sequential=True, tokenize=tokenizer, lower=False, init_token="", eos_token="")
35 | train, _ = data.TabularDataset.splits(path='', train=train_csv, validation=train_csv, format='csv',
36 | skip_header=True,
37 | fields=[('sent', SENT)])
38 |
39 | SENT.build_vocab(train)
40 |
41 | train_iter = data.BucketIterator(train, batch_size=batch_size, sort_key=lambda x: len(x.sent), shuffle=False)
42 |
43 | # For a test_iter, sort must be set to False, otherwise torchtext will shuffle the sample order
44 | # test_iter = data.Iterator(dataset=test, batch_size=128, train=False, sort=False, device=DEVICE)
45 |
46 | return train_iter, SENT.vocab
47 |
48 |
49 | if __name__ == "__main__":
50 | train_iter, vocab = dataset2dataloader()
51 | for batch in train_iter:
52 | print(batch.sent.t())
53 | break
54 |
--------------------------------------------------------------------------------
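
prepare_data above assumes poetryFromTang.txt separates poems with a blank line and may wrap one poem over several lines, so it splits on "\n\n" and strips the remaining newlines. A toy illustration with made-up contents:

    # Hypothetical file contents, just to show the parsing.
    raw = "白日依山尽,\n黄河入海流。\n\n床前明月光,\n疑是地上霜。"
    poems = [p.replace("\n", "") for p in raw.split("\n\n")]
    print(poems)  # ['白日依山尽,黄河入海流。', '床前明月光,疑是地上霜。']
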
/project5-Text Generation/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | """
4 | Created on 2020/6/18 12:14
5 | @author: phil
6 | """
7 | from tqdm import tqdm
8 |
9 | from dataloader import dataset2dataloader
10 | from torch.optim import Adam
11 | from models import PoetryModel
12 | import torch
13 | import torch.nn as nn
14 | import numpy as np
15 | import os
16 |
17 | if __name__ == "__main__":
18 | batch_size = 32
19 | learning_rate = 0.001
20 | hidden_size = 128
21 | epoch = 200
22 |
23 | train_iter, vocab = dataset2dataloader(batch_size=batch_size)
24 |
25 | vocab_size = len(vocab.stoi)
26 | # print(vocab_size, hidden_size, batch_size)
27 | model = PoetryModel(vocab_size=vocab_size, hidden_size=hidden_size, output_size=vocab_size)
28 | optimizer = Adam(model.parameters(), lr=learning_rate)
29 | criterion = nn.CrossEntropyLoss()
30 |
31 | one_hot_embedding = nn.Embedding(vocab_size, vocab_size, _weight=torch.from_numpy(np.eye(vocab_size)))
32 |
33 | model_path = "model.pkl"
34 | if os.path.exists(model_path):
35 | model = torch.load(model_path)
36 | else:
37 | for ep in tqdm(range(epoch)):
38 | model.train()
39 | total_loss = 0
40 | for i, batch in enumerate(train_iter):
41 | optimizer.zero_grad()
42 | sent = batch.sent.t()
43 | x, y = sent[:, :-1], sent[:, 1:]
44 | x = one_hot_embedding(x).float()
45 | init_hidden = torch.zeros(1, len(x), hidden_size)
46 | output, _ = model(x, init_hidden)
47 | output = output.reshape(-1, output.shape[-1])
48 | y = y.flatten()
49 | loss = criterion(output, y)
50 | loss.backward()
51 | optimizer.step()
52 | total_loss += loss.item()
53 | if ep % (epoch // 10) == 0:
54 | print("loss: ", total_loss)
55 | torch.save(model, model_path)
56 |
57 | model.eval()
58 | # test = ["我好可爱"]  -> sample acrostic output: 我病恨无我,。好一解颜色。可怜王经行自远,一解颜色。爱绿溪阴。
59 | # test = ["花开有情"]  -> sample acrostic output: 花边行县柳,河桥晚泊船。开远树,山鸟助酣歌。有情何处,箫管凤初来。情何处所,风吹青珊瑚,可怜王孙立
60 | test = [""]
61 | for sent in test:
62 | sent = list(map(lambda x: vocab.stoi[x], list(sent)))
63 | x = torch.tensor(sent).unsqueeze(0)
64 | x = one_hot_embedding(x).float()
65 | with torch.no_grad():
66 | output = model.generate(x, stoi=vocab.stoi, poetry_type="hidden head")
67 | ans = torch.cat(output, dim=1).argmax(-1).squeeze(0)
68 | for word_id in ans:
69 | print(vocab.itos[word_id.item()], end="")
--------------------------------------------------------------------------------
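
Besides the acrostic ("hidden head") mode used above, PoetryModel.generate also supports a "begin" mode, where the seed characters start the first line instead of heading each line. A sketch of calling it that way (the seed string is hypothetical; model, vocab and one_hot_embedding are the ones defined above):

    seed = "春眠不觉晓"  # hypothetical seed text
    ids = torch.tensor([[vocab.stoi[ch] for ch in seed]])
    x = one_hot_embedding(ids).float()
    with torch.no_grad():
        pieces = model.generate(x, stoi=vocab.stoi, poetry_type="begin")
    text = "".join(vocab.itos[i.item()] for i in torch.cat(pieces, dim=1).argmax(-1).squeeze(0))
    print(text)
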
/project5-Text Generation/model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/positivepeng/nlp-beginner-projects/2f066cbdd24121938b90ee1d12fe063ad0a46df5/project5-Text Generation/model.pkl
--------------------------------------------------------------------------------
/project5-Text Generation/model_debug.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/positivepeng/nlp-beginner-projects/2f066cbdd24121938b90ee1d12fe063ad0a46df5/project5-Text Generation/model_debug.pkl
--------------------------------------------------------------------------------
/project5-Text Generation/models.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | """
4 | Created on 2020/6/18 11:05
5 | @author: phil
6 | """
7 |
8 | import torch.nn as nn
9 | import torch
10 |
11 |
12 | class PoetryModel(nn.Module):
13 | def __init__(self, vocab_size, hidden_size, output_size, dropout=0.5):
14 | super(PoetryModel, self).__init__()
15 | self.hidden_size = hidden_size
16 | self.gru = nn.GRU(input_size=vocab_size, hidden_size=hidden_size, dropout=dropout, batch_first=True)
17 | self.out = nn.Linear(hidden_size, output_size)
18 |
19 | def forward(self, x, init_hidden):
20 | # print(x.shape, init_hidden.shape)
21 | seq_out, hn = self.gru(x, init_hidden)
22 | output = self.out(seq_out)
23 | return output, hn
24 |
25 | def generate(self, x, stoi, poetry_type="begin", sent_num=4, max_len=15):
26 | init_hidden = torch.zeros(1, 1, self.hidden_size)
27 | output = []
28 | if poetry_type == "hidden head" and x.shape[1] != sent_num:
29 | print("ERROR:选择了藏头诗但是输入字的个数不等于诗的句子数")
30 | return
31 |
32 | hn = init_hidden
33 | for i in range(sent_num):
34 | if i == 0 and poetry_type == "begin":
35 | seq_out, hn = self.gru(x, hn)
36 | seq_out = seq_out[:, -1, :].unsqueeze(1)
37 | output.append(x)
38 | if poetry_type == "hidden head":
39 | seq_out, hn = self.gru(x[:, i, :].unsqueeze(1), hn)
40 | seq_out = seq_out[:, -1, :].unsqueeze(1)
41 | output.append(x[:, i, :].unsqueeze(1))
42 | for j in range(max_len):  # maximum length of each line
43 | # output of the previous time step
44 | _, topi = self.out(seq_out).data.topk(1)
45 | topi = topi.item()
46 | xi_from_output = torch.zeros(1, 1, x.shape[-1])
47 | xi_from_output[0][0][topi] = 1
48 | output.append(xi_from_output)
49 | seq_out, hn = self.gru(xi_from_output, hn)
50 | if topi == stoi["。"]:
51 | break
52 | return output
53 |
--------------------------------------------------------------------------------
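
The inner loop of generate above is greedy decoding with one-hot feedback: project the last GRU output to vocabulary logits, take the argmax, turn it back into a one-hot vector, and feed that as the next input. A stripped-down sketch of a single such step (sizes are arbitrary):

    import torch
    import torch.nn as nn

    vocab_size, hidden_size = 10, 16
    gru = nn.GRU(vocab_size, hidden_size, batch_first=True)
    out = nn.Linear(hidden_size, vocab_size)

    hn = torch.zeros(1, 1, hidden_size)
    seq_out = torch.randn(1, 1, hidden_size)   # stand-in for the previous GRU output

    _, topi = out(seq_out).data.topk(1)        # greedy choice of the next token id
    next_x = torch.zeros(1, 1, vocab_size)
    next_x[0, 0, topi.item()] = 1              # re-encode it as a one-hot input
    seq_out, hn = gru(next_x, hn)              # advance the decoder one step
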