├── README.md
├── bert
│   ├── bert.pdf
│   ├── train.py
│   ├── model.py
│   └── dataprocess.py
├── requirements.txt
├── word2vec
│   ├── requirements.txt
│   ├── word2vec.pdf
│   ├── negative_sampling.py
│   ├── word2vecGensim.py
│   ├── vocabs.py
│   ├── CBOW.py
│   ├── skip_gram.py
│   ├── CBOW_negative_sampling.py
│   ├── skip_gram_hierarchical_softmax.py
│   ├── CBOW_hierarchical_softmax.py
│   ├── skip_gram_negative_sampling.py
│   └── hierarchical_softmax.py
├── transformer
│   ├── transformer.pdf
│   └── transformer.py
└── .gitignore
/README.md:
--------------------------------------------------------------------------------
# nlp

- word2vec
- transformer
- bert

--------------------------------------------------------------------------------
/bert/bert.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rexrex9/nlp_torchtext/HEAD/bert/bert.pdf
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
treelib==1.6.1
torch==1.11.0
tqdm==4.63.1
gensim==4.1.2
--------------------------------------------------------------------------------
/word2vec/requirements.txt:
--------------------------------------------------------------------------------
treelib==1.6.1
torch==1.11.0
tqdm==4.63.1
gensim==4.1.2
--------------------------------------------------------------------------------
/word2vec/word2vec.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rexrex9/nlp_torchtext/HEAD/word2vec/word2vec.pdf
--------------------------------------------------------------------------------
/transformer/transformer.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rexrex9/nlp_torchtext/HEAD/transformer/transformer.pdf
--------------------------------------------------------------------------------
/word2vec/negative_sampling.py:
--------------------------------------------------------------------------------
import random

def negative_sample(pos, vocabs):
    # Draw as many negative words as there are positive ones, never re-using a positive word.
    return random.sample(set(vocabs) - set(pos), len(pos))
--------------------------------------------------------------------------------
/word2vec/word2vecGensim.py:
--------------------------------------------------------------------------------
from gensim.models import word2vec

if __name__ == '__main__':
    s1 = [0,1,2,3,4]
    s2 = [0,2,4,5,6]
    s3 = [2,3,4,4,6]
    s4 = [1,3,5,0,3]
    seqs = [s1,s2,s3,s4]
    model = word2vec.Word2Vec(seqs, vector_size=16, min_count=1)

    print(model.wv[1])

    print(model.wv.most_similar(1, topn=3))
--------------------------------------------------------------------------------
/word2vec/vocabs.py:
--------------------------------------------------------------------------------

def getvocabsOnlyIndex(seqs):
    vocabs = set()
    for seq in seqs:
        vocabs |= set(seq)
    vocabs = list(vocabs)
    return vocabs

def getVocabs(seqs):
    vocabs = set()
    for seq in seqs:
        vocabs |= set(seq)
    vocabs = list(vocabs)
    vocab_map = dict(zip(vocabs, range(len(vocabs))))
    return vocabs, vocab_map
--------------------------------------------------------------------------------
/word2vec/CBOW.py:
--------------------------------------------------------------------------------
from torch import nn
import torch
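# CBOW with a full softmax over the vocabulary: the context words in the window
# are embedded and averaged, the average is scored against every word's output
# embedding, and the centre word serves as the classification target.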
from word2vec.vocabs import getvocabsOnlyIndex 4 | 5 | class CBOW(nn.Module): 6 | 7 | def __init__(self,vocabs,vector_size): 8 | super().__init__() 9 | self.vocabs = torch.LongTensor(vocabs) 10 | vocab_numbers = len(vocabs) 11 | self.word_embs = nn.Embedding(vocab_numbers,vector_size) 12 | self.bkp_word_embs = nn.Embedding(vocab_numbers,vector_size) 13 | self.softmax = nn.Softmax() 14 | 15 | def forward(self,x): 16 | x = self.word_embs(x) 17 | x = torch.mean(x,0) 18 | bkp = self.bkp_word_embs(self.vocabs) 19 | y = torch.matmul(x,bkp.T) 20 | y = self.softmax(y) 21 | return torch.unsqueeze(y,0) 22 | 23 | def word2vec( seqs, window_size = 1 ): 24 | vocabs = getvocabsOnlyIndex(seqs) 25 | net = CBOW(vocabs,vector_size=16) 26 | criterion = torch.nn.CrossEntropyLoss() 27 | optimizer = torch.optim.SGD( net.parameters(), lr=0.01) 28 | net.train() 29 | for seq in seqs: 30 | for i in range(0,len(seq)-(window_size*2)): 31 | optimizer.zero_grad() 32 | window = seq[i:i+1+window_size*2] 33 | #[1] 34 | y = torch.LongTensor([window[window_size]]) 35 | window.pop(window_size) 36 | #[window*2] 37 | x = torch.LongTensor(window) 38 | y_pred = net(x) 39 | loss = criterion(y_pred, y) 40 | loss.backward() 41 | optimizer.step() 42 | print(loss) 43 | 44 | if __name__ == '__main__': 45 | s1 = [0,1,2,3,4] 46 | s2 = [0,2,4,5,6] 47 | s3 = [2,3,4,4,6] 48 | s4 = [1,3,5,0,3] 49 | seqs = [s1,s2,s3,s4] 50 | word2vec(seqs) -------------------------------------------------------------------------------- /word2vec/skip_gram.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch 3 | from word2vec.vocabs import getvocabsOnlyIndex 4 | 5 | class Skip_Gram(nn.Module): 6 | 7 | def __init__(self,vocabs,vector_size): 8 | super().__init__() 9 | self.vocabs = torch.LongTensor(vocabs) 10 | vocab_numbers = len(vocabs) 11 | self.word_embs = nn.Embedding(vocab_numbers,vector_size) 12 | self.bkp_word_embs = nn.Embedding(vocab_numbers,vector_size) 13 | self.softmax = nn.Softmax() 14 | 15 | def forward(self,x): 16 | x = self.word_embs(x) 17 | bkp = self.bkp_word_embs(self.vocabs) 18 | y = torch.matmul(x,bkp.T) 19 | y = self.softmax(y) 20 | return y 21 | 22 | def word2vec( seqs, window_size = 1 ): 23 | vocabs = getvocabsOnlyIndex(seqs) 24 | net = Skip_Gram(vocabs,vector_size=16) 25 | criterion = torch.nn.CrossEntropyLoss() 26 | optimizer = torch.optim.SGD( net.parameters(), lr=0.01) 27 | net.train() 28 | for seq in seqs: 29 | for i in range(0,len(seq)-(window_size*2)): 30 | optimizer.zero_grad() 31 | window = seq[i:i+1+window_size*2] 32 | # [window*2] 33 | x = torch.LongTensor([window[window_size] for _ in range(window_size*2)]) 34 | y_pred = net(x) 35 | window.pop(window_size) 36 | # [window*2] 37 | y = torch.LongTensor(window) 38 | loss = criterion(y_pred, y) 39 | print(loss) 40 | loss.backward() 41 | optimizer.step() 42 | 43 | if __name__ == '__main__': 44 | s1 = [0,1,2,3,4] 45 | s2 = [0,2,4,5,6] 46 | s3 = [2,3,4,4,6] 47 | s4 = [1,3,5,0,3] 48 | seqs = [s1,s2,s3,s4] 49 | word2vec(seqs) -------------------------------------------------------------------------------- /word2vec/CBOW_negative_sampling.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch 3 | from word2vec.vocabs import getvocabsOnlyIndex 4 | from word2vec.negative_sampling import negative_sample 5 | 6 | class CBOW_With_Negative_Sampling(nn.Module): 7 | 8 | def __init__(self,vocab_numbers,vector_size): 9 | super().__init__() 10 | 11 | 
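        # Two embedding tables: word_embs embeds the surrounding context words (which
        # get averaged in forward), while bkp_word_embs embeds the candidate centre
        # words. Negative sampling turns training into binary classification: the true
        # centre word is labelled 1, each sampled negative word 0 (BCELoss in word2vec()).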
self.word_embs = nn.Embedding(vocab_numbers,vector_size) 12 | self.bkp_word_embs = nn.Embedding(vocab_numbers,vector_size) 13 | 14 | def forward(self,cs,os): 15 | os = self.word_embs(os) 16 | os = torch.mean(os,0) 17 | cs = self.bkp_word_embs(cs) 18 | y = torch.sum(cs*os,1) 19 | return torch.sigmoid(y) 20 | 21 | def word2vec( seqs, window_size = 1 ): 22 | vocabs = getvocabsOnlyIndex(seqs) 23 | net = CBOW_With_Negative_Sampling(len(vocabs),vector_size=16) 24 | criterion = torch.nn.BCELoss() 25 | optimizer = torch.optim.SGD( net.parameters(), lr=0.01) 26 | net.train() 27 | for seq in seqs: 28 | for i in range(0,len(seq)-(window_size*2)): 29 | window = seq[i:i+1+window_size*2] 30 | cs = [window[window_size]] 31 | neg = negative_sample(cs,vocabs) 32 | cs.extend(neg) 33 | #[2] 34 | cs = torch.LongTensor(cs) 35 | y=[1,0] 36 | window.pop(window_size) 37 | 38 | optimizer.zero_grad() 39 | #[2,window_size*2] 40 | os = torch.concat([torch.unsqueeze(torch.LongTensor(window),0) for _ in y],0) 41 | y_pred = net(cs,os) 42 | y = torch.FloatTensor(y) 43 | loss = criterion(y_pred,y) 44 | loss.backward() 45 | optimizer.step() 46 | print(loss) 47 | 48 | 49 | if __name__ == '__main__': 50 | s1 = [0,1,2,3,4] 51 | s2 = [0,2,4,5,6] 52 | s3 = [2,3,4,4,6] 53 | s4 = [1,3,5,0,3] 54 | seqs = [s1,s2,s3,s4] 55 | word2vec(seqs) -------------------------------------------------------------------------------- /word2vec/skip_gram_hierarchical_softmax.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch 3 | from word2vec.vocabs import getvocabsOnlyIndex 4 | from word2vec.hierarchical_softmax import Hierarchical_Softmax 5 | 6 | class Skip_Gram_With_Hierarchical_Softmax(nn.Module): 7 | 8 | def __init__(self,node_number,vector_size): 9 | super().__init__() 10 | self.node_embs = nn.Embedding(node_number,vector_size) 11 | 12 | def forward(self,cs,nodes): 13 | cs = self.node_embs(cs) 14 | nodes = self.node_embs(nodes) 15 | y = torch.sum(cs*nodes,1) 16 | y = torch.sigmoid(y) 17 | return y 18 | 19 | def word2vec( seqs, window_size = 1 ): 20 | vocabs = getvocabsOnlyIndex(seqs) 21 | HS = Hierarchical_Softmax(vocabs) 22 | 23 | net = Skip_Gram_With_Hierarchical_Softmax(HS.getNodeNumber(),vector_size=16) 24 | criterion = torch.nn.BCELoss() 25 | optimizer = torch.optim.SGD( net.parameters(), lr=0.01) 26 | net.train() 27 | for seq in seqs: 28 | for i in range(0,len(seq)-(window_size*2)): 29 | window = seq[i:i+1+window_size*2] 30 | x = window[window_size] 31 | window.pop(window_size) 32 | paths, isLefts = HS.getPathByLeaves(window) #得到所有节点与是否是走左边的标注 33 | 34 | # 开始迭代 35 | optimizer.zero_grad() 36 | # [ window*deep ] 37 | cs = torch.LongTensor([x for _ in paths]) 38 | # [ window*deep ] 39 | nodes = torch.LongTensor(paths) 40 | y_pred = net(cs,nodes) 41 | # [ window*deep ] 42 | y = torch.FloatTensor(isLefts) 43 | loss = criterion(y_pred, y) 44 | loss.backward() 45 | print(loss) 46 | optimizer.step() 47 | 48 | if __name__ == '__main__': 49 | s1 = [0,1,2,3,4] 50 | s2 = [0,2,4,5,6] 51 | s3 = [2,3,4,4,6] 52 | s4 = [1,3,5,0,3] 53 | seqs = [s1,s2,s3,s4] 54 | word2vec(seqs) -------------------------------------------------------------------------------- /word2vec/CBOW_hierarchical_softmax.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch 3 | from word2vec.vocabs import getvocabsOnlyIndex 4 | from word2vec.hierarchical_softmax import Hierarchical_Softmax 5 | 6 | class 
CBOW_With_Hierarchical_Softmax(nn.Module): 7 | 8 | def __init__(self,node_number,vector_size): 9 | super().__init__() 10 | self.node_embs = nn.Embedding(node_number,vector_size) 11 | 12 | def forward(self,os,nodes): 13 | os = self.node_embs(os) 14 | os = torch.mean(os, 1) 15 | nodes = self.node_embs(nodes) 16 | y = torch.sum(os*nodes,1) 17 | y = torch.sigmoid(y) 18 | return y 19 | 20 | def word2vec( seqs, window_size = 1 ): 21 | vocabs = getvocabsOnlyIndex(seqs) 22 | HS = Hierarchical_Softmax(vocabs) 23 | 24 | net = CBOW_With_Hierarchical_Softmax(HS.getNodeNumber(),vector_size=16) 25 | criterion = torch.nn.BCELoss() 26 | optimizer = torch.optim.SGD( net.parameters(), lr=0.01) 27 | net.train() 28 | for seq in seqs: 29 | for i in range(0,len(seq)-(window_size*2)): 30 | window = seq[i:i+1+window_size*2] 31 | cs = [window[window_size]] 32 | paths, isLefts = HS.getPathByLeaves(cs) #得到所有节点与是否是走左边的标注 33 | window.pop(window_size) 34 | # 开始迭代 35 | optimizer.zero_grad() 36 | # [deep,window] 37 | os = torch.concat([torch.unsqueeze(torch.LongTensor(window), 0) for _ in paths], 0) 38 | # [deep] 39 | nodes = torch.LongTensor(paths) 40 | y_pred = net(os,nodes) 41 | # [deep] 42 | y = torch.FloatTensor(isLefts) 43 | loss = criterion(y_pred, y) 44 | loss.backward() 45 | optimizer.step() 46 | print(loss) 47 | 48 | if __name__ == '__main__': 49 | s1 = [0,1,2,3,4] 50 | s2 = [0,2,4,5,6] 51 | s3 = [2,3,4,4,6] 52 | s4 = [1,3,5,0,3] 53 | seqs = [s1,s2,s3,s4] 54 | word2vec(seqs) -------------------------------------------------------------------------------- /word2vec/skip_gram_negative_sampling.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch 3 | from word2vec.vocabs import getvocabsOnlyIndex 4 | from word2vec.negative_sampling import negative_sample 5 | 6 | class Skip_Gram_With_Negative_Sampling(nn.Module): 7 | 8 | def __init__(self,vocab_numbers,vector_size): 9 | super().__init__() 10 | self.word_embs = nn.Embedding(vocab_numbers,vector_size) 11 | self.bkp_word_embs = nn.Embedding(vocab_numbers,vector_size) 12 | 13 | def forward(self,cs,os): 14 | cs = self.word_embs(cs) 15 | os = self.bkp_word_embs(os) 16 | y = torch.sum(cs*os,1) 17 | y = torch.sigmoid(y) 18 | return y 19 | 20 | def word2vec( seqs, window_size = 1 ): 21 | vocabs = getvocabsOnlyIndex(seqs) 22 | net = Skip_Gram_With_Negative_Sampling(len(vocabs),vector_size=16) 23 | criterion = torch.nn.BCELoss() 24 | optimizer = torch.optim.SGD( net.parameters(), lr=0.01) 25 | net.train() 26 | for seq in seqs: 27 | for i in range(0,len(seq)-(window_size*2)): 28 | window = seq[i:i+1+window_size*2] 29 | # [window*2+neg] 30 | cs = torch.LongTensor([window[window_size] for _ in range(window_size*4)]) 31 | print(cs.shape) 32 | window.pop(window_size) 33 | y = [1 for _ in window] 34 | neg = negative_sample(window,vocabs) 35 | y.extend([0 for _ in neg]) 36 | window.extend(neg) 37 | 38 | optimizer.zero_grad() 39 | # [window*2+neg] 40 | os = torch.LongTensor(window) 41 | y_pred = net(cs,os) 42 | # [window*2+neg] 43 | y = torch.FloatTensor(y) 44 | loss = criterion(y_pred,y) 45 | loss.backward() 46 | optimizer.step() 47 | print(loss) 48 | 49 | 50 | if __name__ == '__main__': 51 | s1 = [0,1,2,3,4] 52 | s2 = [0,2,4,5,6] 53 | s3 = [2,3,4,4,6] 54 | s4 = [1,3,5,0,3] 55 | seqs = [s1,s2,s3,s4] 56 | word2vec(seqs) -------------------------------------------------------------------------------- /bert/train.py: -------------------------------------------------------------------------------- 1 | from bert 
import model 2 | from bert import dataprocess as dp 3 | import torch 4 | from torch.utils.data import DataLoader 5 | 6 | class Dataset(torch.utils.data.Dataset): 7 | def __init__(self,tokenss, segmentss, mlm_pred_positionss, nsp_Y, mlm_Y): 8 | self.tokenss = torch.LongTensor(tokenss) 9 | self.segmentss = torch.LongTensor(segmentss) 10 | self.mlm_pred_positionss= torch.LongTensor(mlm_pred_positionss) 11 | self.nsp_Y = torch.LongTensor(nsp_Y) 12 | self.mlm_Y = torch.LongTensor(mlm_Y) 13 | 14 | def __getitem__(self, idx): 15 | return (self.tokenss[idx], self.segmentss[idx], 16 | self.mlm_pred_positionss[idx], self.nsp_Y[idx], 17 | self.mlm_Y[idx]) 18 | 19 | def __len__(self): 20 | return len(self.tokenss) 21 | 22 | 23 | def train(epochs = 10,batchSize=2): 24 | tokenss, segmentss, mlm_pred_positionss, nsp_Y, mlm_Y, vocab_dict = dp.getPreData(dp.seqs) 25 | 26 | dataSet = Dataset(tokenss, segmentss, mlm_pred_positionss, nsp_Y, mlm_Y) 27 | net = model.BERTModel(vocab_size=len(vocab_dict), e_dim=32, transformer_h_dim=32, 28 | mlm_h_dim=32, n_heads=3, n_layers=12, max_len=128) 29 | optimizer = torch.optim.Adam(net.parameters(), lr=0.01) 30 | criterion = torch.nn.CrossEntropyLoss() 31 | for e in range(epochs): 32 | for tokenss, segmentss, mlm_pred_positionss, nsp_Y, mlm_Y in DataLoader(dataSet, batch_size=batchSize, shuffle=True): 33 | optimizer.zero_grad() 34 | encoded_X, mlm_Y_hat, nsp_Y_hat = net(tokenss, segmentss, mlm_pred_positionss) 35 | mlm_Y_hat = mlm_Y_hat.reshape(-1,len(vocab_dict)) 36 | mlm_Y = mlm_Y.reshape(-1) 37 | mlm_loss = criterion(mlm_Y_hat,mlm_Y) 38 | nsp_loss = criterion(nsp_Y_hat,nsp_Y) 39 | loss = mlm_loss + nsp_loss 40 | loss.backward() 41 | optimizer.step() 42 | print('epoch {}, loss = {:.4f}'.format(e,loss)) 43 | 44 | if __name__ == '__main__': 45 | ''' 46 | 该示例代码未考虑padding,如要考虑padding则用填充,并记录valid_lens(实际长度)方便并行计算。 47 | ''' 48 | train() -------------------------------------------------------------------------------- /word2vec/hierarchical_softmax.py: -------------------------------------------------------------------------------- 1 | import math 2 | from treelib import Tree,Node 3 | 4 | class Hierarchical_Softmax(): 5 | 6 | def __init__(self,vocabs): 7 | self.vocabs = vocabs 8 | self.tree = self.__getBinaryTreeByVocabs() 9 | self.leaf_path_map = self.__set_paths_to_leaves_map() 10 | #self.tree.show() 11 | 12 | def __getBinaryTreeByVocabs(self): 13 | deep=math.ceil(math.log2(len(self.vocabs))) 14 | startValue = len(self.vocabs) 15 | tree = Tree() 16 | tree.create_node(startValue, startValue) 17 | lastNodes = [startValue] 18 | startValue+=1 19 | for i in range(deep-1): 20 | parents = [] 21 | for lastNode in lastNodes: 22 | tree.create_node(startValue, startValue, parent=lastNode) 23 | parents.append(startValue) 24 | startValue += 1 25 | tree.create_node(startValue, startValue, parent=lastNode) 26 | parents.append(startValue) 27 | startValue += 1 28 | lastNodes=parents 29 | i = 0 30 | for vacab in self.vocabs: 31 | tree.create_node(vacab,vacab,parent=lastNodes[math.floor(i/2)]) 32 | i+=1 33 | 34 | return tree 35 | 36 | def __set_paths_to_leaves_map(self): 37 | leaf_path_map = {} 38 | lst = self.tree.paths_to_leaves() 39 | for l in lst: 40 | leaf_path_map[l[-1]]=l 41 | return leaf_path_map 42 | 43 | def isLeft(self,v,p): 44 | return v == self.tree.children(p)[0].identifier 45 | 46 | def getPathToByOneLeaf(self,leaf): 47 | ''' 48 | 根据叶子节点得到所有父节点以及是否走向左边的标注。 49 | :param leaf: 叶子节点的id 50 | :return: ([7, 8, 11], [1, 0, 0]) 51 | ''' 52 | path = self.leaf_path_map[leaf] 53 | 
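        # Walk the root-to-leaf path: for every internal node on the path, record 1 if
        # the next hop goes to its left child and 0 otherwise. These 0/1 labels are the
        # binary targets that hierarchical softmax trains against.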
islefts=[] 54 | for i in range(len(path)-1): 55 | islefts.append(int(self.isLeft(path[i+1],path[i]))) 56 | return path[:-1],islefts 57 | 58 | def getPathByLeaves(self,leafs): 59 | paths,isLefts = [],[] 60 | for leaf in leafs: 61 | path,isLeft = self.getPathToByOneLeaf(leaf) 62 | paths.extend(path) 63 | isLefts.extend(isLeft) 64 | return paths,isLefts 65 | 66 | def getNodeNumber(self): 67 | return len(self.tree.nodes) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /bert/model.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch 3 | from torch.nn import functional as F 4 | from transformer.transformer import EncoderLayer 5 | 6 | class BERTEncoder(nn.Module): 7 | 8 | def __init__(self, vocab_size, e_dim, h_dim, n_heads, n_layers, max_len=1024): 9 | ''' 10 | :param vocab_size: 词汇数量 11 | :param e_dim: 词向量维度 12 | :param h_dim: Transformer编码层中间层的维度 13 | :param n_heads: Transformer多头注意力的头数 14 | :param n_layers: Transformer编码层的层数 15 | :param max_len: 序列最长长度 16 | ''' 17 | super(BERTEncoder, self).__init__() 18 | self.token_embedding = nn.Embedding(vocab_size, e_dim) 19 | self.segment_embedding = nn.Embedding(2, e_dim) 20 | self.pos_embedding = nn.Parameter(torch.randn(1, max_len,e_dim)) 21 | self.encoder_layers = nn.ModuleList( [EncoderLayer( e_dim, h_dim, n_heads ) for _ in range( n_layers )] ) 22 | 23 | def forward(self, tokens, segments): 24 | X = self.token_embedding(tokens) + self.segment_embedding(segments) 25 | X = X + self.pos_embedding.data[:, :X.shape[1], :] 26 | for layer in self.encoder_layers: 27 | X = layer(X) 28 | return X 29 | 30 | class MaskLM(nn.Module): 31 | 32 | def __init__(self, vocab_size, h_dim, e_dim): 33 | super(MaskLM, self).__init__() 34 | self.mlp = nn.Sequential(nn.Linear(e_dim, h_dim), 35 | nn.ReLU(), 36 | nn.LayerNorm(h_dim), 37 | nn.Linear(h_dim, vocab_size), 38 | nn.Softmax()) 39 | 40 | def forward(self, X, pred_positions): 41 | num_pred_positions = pred_positions.shape[1] 42 | pred_positions = pred_positions.reshape(-1) 43 | batch_size = X.shape[0] 44 | batch_idx = torch.arange(0, batch_size) 45 | batch_idx = torch.repeat_interleave(batch_idx, num_pred_positions) 46 | masked_X = X[batch_idx, pred_positions] 47 | masked_X = masked_X.reshape((batch_size, num_pred_positions, -1)) 48 | mlm_Y_hat = self.mlp(masked_X) 49 | return mlm_Y_hat 50 | 51 | class NextSentencePred(nn.Module): 52 | 53 | def __init__(self, e_dim): 54 | super(NextSentencePred, self).__init__() 55 | self.output = nn.Linear(e_dim, 2) 56 | 57 | def forward(self, X): 58 | return F.softmax(self.output(X)) 59 | 60 | class BERTModel(nn.Module): 61 | 62 | def __init__( self, vocab_size, e_dim, transformer_h_dim, mlm_h_dim, n_heads, n_layers, max_len = 1024 ): 63 | ''' 64 | :param vocab_size: 词汇数量 65 | :param e_dim: 词向量维度 66 | :param transformer_h_dim: transformer中间隐藏层的维度 67 | :param mlm_h_dim: mlm网络中间隐藏层维度 68 | :param n_heads: Transformer多头注意力的头数 69 | :param n_layers: Transformer编码层的层数 70 | :param max_len: 序列最长长度 71 | ''' 72 | super(BERTModel, self).__init__() 73 | self.encoder = BERTEncoder(vocab_size, e_dim, transformer_h_dim, n_heads, n_layers, max_len=max_len) 74 | 75 | self.mlm = MaskLM(vocab_size, mlm_h_dim, e_dim) 76 | self.nsp = NextSentencePred(e_dim) 77 | 78 | def forward(self, tokens, segments, pred_positions=None): 79 | 
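        # Encode the token ids together with their segment ids, then (1) gather the
        # hidden states at the masked positions for the MLM head and (2) feed the
        # hidden state of the first position (the '<cls>' token) to the NSP head.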
        encoded_X = self.encoder(tokens, segments)

        # MLM predictions are only meaningful when masked positions are supplied.
        if pred_positions is not None:
            mlm_Y_hat = self.mlm(encoded_X, pred_positions)
        else:
            mlm_Y_hat = None
        nsp_Y_hat = self.nsp(encoded_X[:, 0, :])
        return encoded_X, mlm_Y_hat, nsp_Y_hat

if __name__ == '__main__':
    net = BERTModel(vocab_size=100, e_dim=768, transformer_h_dim=768, mlm_h_dim=768, n_heads=3, n_layers=12, max_len = 1024)
    batch_size = 24
    tokens = torch.randint(0,100,(batch_size,12))
    segments = torch.cat([torch.zeros(batch_size,7,dtype=int),torch.ones(batch_size,5,dtype=int)],dim=1)
    pred_positions = torch.randint(0,12,(batch_size,3))
    encoded_X, mlm_Y_hat, nsp_Y_hat = net(tokens,segments,pred_positions)

    print(encoded_X.shape)
    print(mlm_Y_hat.shape)
    print(nsp_Y_hat.shape)
--------------------------------------------------------------------------------
/bert/dataprocess.py:
--------------------------------------------------------------------------------
import random

def getVocabs(data):
    # Reserved special tokens used by the BERT-style preprocessing below.
    vocab_list = ['<pad>', '<mask>', '<cls>', '<sep>']
    vocab_set = set()
    for ss in data:
        for s in ss:
            vocab_set |= set(s)
    vocab_list.extend(list(vocab_set))
    vocab_dict = {v: i for i, v in enumerate(vocab_list)}
    return vocab_dict, vocab_set

def getTokensAndSegmentsSingle(tokens_a, tokens_b):
    tokens = ['<cls>'] + tokens_a + ['<sep>']
    segments = [0] * (len(tokens_a) + 2)
    tokens += tokens_b + ['<sep>']
    segments += [1] * (len(tokens_b) + 1)
    return tokens, segments

def getTokensAndSegments(data):
    tokens, segments = [], []
    for s in data:
        token, seg = getTokensAndSegmentsSingle(*s)
        tokens.append(token)
        segments.append(seg)
    return tokens, segments

def getParas(data):
    paras = []
    for d in data:
        paras.append(d[0])
        paras.append(d[1])
    return paras

def getNspData(data):
    paras = getParas(data)
    nsp_data = []
    nsp_Y = []
    for d in data:
        sentences = [d[0]]
        if random.random() < 0.5:
            sentences.append(d[1])
            nsp_Y.append(1)
        else:
            sentences.append(random.choice(paras))
            nsp_Y.append(0)
        nsp_data.append(sentences)
    return nsp_data, nsp_Y

def mapping(tokenss, mlm_true_wordss, vocab_dict):
    n_tokenss, mlm_Y = [], []
    for tokens in tokenss:
        n_tokenss.append([vocab_dict[token] for token in tokens])
    for words in mlm_true_wordss:
        mlm_Y.append([vocab_dict[word] for word in words])
    return n_tokenss, mlm_Y

def maskMlmData(tokens, vocab_set):
    num_pred = round(len(tokens) * 0.15)  # predict 15% of the tokens, chosen at random
    mlm_true_words, mlm_pred_positions = [], []
    for i in range(num_pred):
        while True:  # keep sampling while the chosen position holds '<cls>', '<sep>' or '<mask>'
            change_index = random.choice(range(len(tokens)))
            if tokens[change_index] not in ['<cls>', '<sep>', '<mask>']:
                break
        mlm_pred_positions.append(change_index)
        mlm_true_words.append(tokens[change_index])
        if random.random() < 0.8:  # 80% of the time: replace with '<mask>'
            tokens[change_index] = '<mask>'
        else:
            # 10%: replace with a random word, remaining 10%: keep the word unchanged
            if random.random() < 0.5:
                tokens[change_index] = random.choice(list(vocab_set))
    return tokens, mlm_true_words, mlm_pred_positions

def getMlmData(tokenss, vocab_set):
    n_tokenss, mlm_true_wordss, mlm_pred_positionss = [], [], []
    for tokens in tokenss:
        tokens, mlm_true_words, mlm_pred_positions = maskMlmData(tokens, vocab_set)
        n_tokenss.append(tokens)
        mlm_true_wordss.append(mlm_true_words)
        mlm_pred_positionss.append(mlm_pred_positions)
    return n_tokenss, mlm_true_wordss, mlm_pred_positionss
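# Illustrative example (one possible random outcome, not a fixed result): for
# tokens = ['<cls>','i','b','c','d','e','f','<sep>','a','m','c','f','j','g','<sep>'],
# round(15 * 0.15) = 2 positions are chosen, e.g. indices 3 and 9, giving
#   masked tokens      -> [... ,'<mask>', ... ,'<mask>', ...]
#   mlm_true_words     -> ['c', 'm']
#   mlm_pred_positions -> [3, 9]
# mapping() then converts both the tokens and the true words into vocabulary indices.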

def getPreData(data):
    vocab_dict, vocab_set = getVocabs(data)
    nsp_data, nsp_Y = getNspData(data)  # build the text data for the NSP task
    tokenss, segmentss = getTokensAndSegments(nsp_data)  # build the inputs required by the BERT encoder
    tokenss, mlm_true_wordss, mlm_pred_positionss = getMlmData(tokenss, vocab_set)  # build the text data for the MLM task
    tokenss, mlm_Y = mapping(tokenss, mlm_true_wordss, vocab_dict)  # map tokens to vocabulary indices
    return tokenss, segmentss, mlm_pred_positionss, nsp_Y, mlm_Y, vocab_dict

seqs = [[['i','b','c','d','e','f'],['a','m','c','f','j','g']],
        [['d','e','f','e','a','f'],['a','b','c','d','e','d']],
        [['h','i','j','k','h','b'],['a','b','e','f','g','a']],
        [['a','b','c','d','e','f'],['a','b','c','e','m','g']],
        [['b','l','n','e','f','h'],['e','e','m','d','j','f']],
        [['b','g','d','m','f','g'],['e','e','c','d','e','f']]]

if __name__ == '__main__':
    getPreData(seqs)
--------------------------------------------------------------------------------
/transformer/transformer.py:
--------------------------------------------------------------------------------

import torch
from torch import nn
from torch.autograd import Variable

# Multi-head attention layer
class MultiHeadAttentionLayer( nn.Module ):

    def __init__( self, e_dim, h_dim, n_heads ):
        '''
        :param e_dim: dimension of the input vectors
        :param h_dim: output dimension of each single-head attention layer
        :param n_heads: number of attention heads
        '''
        super().__init__()
        self.atte_layers = nn.ModuleList([ OneHeadAttention( e_dim, h_dim ) for _ in range( n_heads ) ] )
        self.l = nn.Linear( h_dim * n_heads, e_dim)

    def forward( self, seq_inputs, querys = None, mask = None ):
        outs = []
        for one in self.atte_layers:
            out = one( seq_inputs, querys, mask )
            outs.append( out )
        # [ batch, seq_lens, h_dim * n_heads ]
        outs = torch.cat( outs, dim=-1 )
        # [ batch, seq_lens, e_dim ]
        outs = self.l( outs )
        return outs

# Single-head attention layer
class OneHeadAttention( nn.Module ):

    def __init__( self, e_dim, h_dim ):
        '''
        :param e_dim: input vector dimension
        :param h_dim: output vector dimension
        '''
        super().__init__()
        self.h_dim = h_dim

        # Linear projections that produce Q, K and V
        self.lQ = nn.Linear( e_dim, h_dim )
        self.lK = nn.Linear( e_dim, h_dim )
        self.lV = nn.Linear( e_dim, h_dim )

    def forward( self, seq_inputs , querys = None, mask = None ):
        '''
        :param seq_inputs: #[ batch, seq_lens, e_dim ]
        :param querys: #[ batch, seq_lens, e_dim ]
        :param mask: #[ 1, seq_lens, seq_lens ]
        :return:
        '''
        # If an encoder output is given, project that tensor as the queries; otherwise this is plain self-attention
        if querys is not None:
            Q = self.lQ( querys ) #[ batch, seq_lens, h_dim ]
        else:
            Q = self.lQ( seq_inputs ) #[ batch, seq_lens, h_dim ]
        K = self.lK( seq_inputs ) #[ batch, seq_lens, h_dim ]
        V = self.lV( seq_inputs ) #[ batch, seq_lens, h_dim ]
        # [ batch, seq_lens, seq_lens ]
        QK = torch.matmul( Q,K.permute( 0, 2, 1 ) )
        # [ batch, seq_lens, seq_lens ]
        QK /= ( self.h_dim ** 0.5 )

        # Positions where the mask is 0 are set to -1e9, i.e. their scores are hidden from the softmax
        if mask is not None:
            QK = QK.masked_fill( mask == 0, -1e9 )
        # [ batch, seq_lens, seq_lens ]
        a = torch.softmax( QK, dim = -1 )
        # [ batch, seq_lens, h_dim ]
        outs = torch.matmul( a, V )
        return outs

# Position-wise feed-forward network
class FeedForward(nn.Module):

    def __init__( self, e_dim, ff_dim, drop_rate = 0.1 ):
        super().__init__()
        self.l1 = nn.Linear( e_dim, ff_dim )
        self.l2 = nn.Linear( ff_dim, e_dim )
        self.drop_out =
nn.Dropout( drop_rate ) 82 | 83 | def forward( self, x ): 84 | outs = self.l1( x ) 85 | outs = self.l2( self.drop_out( torch.relu( outs ) ) ) 86 | return outs 87 | 88 | #位置编码 89 | class PositionalEncoding( nn.Module ): 90 | 91 | def __init__( self, e_dim, dropout = 0.1, max_len = 512 ): 92 | super().__init__() 93 | self.dropout = nn.Dropout( p = dropout ) 94 | pe = torch.zeros( max_len, e_dim ) 95 | position = torch.arange( 0, max_len ).unsqueeze( 1 ) 96 | 97 | div_term = 10000.0 ** ( torch.arange( 0, e_dim, 2 ) / e_dim ) 98 | 99 | #偶数位计算sin, 奇数位计算cos 100 | pe[ :, 0::2 ] = torch.sin( position / div_term ) 101 | pe[ :, 1::2 ] = torch.cos( position / div_term ) 102 | 103 | pe = pe.unsqueeze(0) 104 | self.pe = pe 105 | 106 | def forward( self, x ): 107 | x = x + Variable( self.pe[:, : x.size( 1 ) ], requires_grad = False ) 108 | return self.dropout( x ) 109 | 110 | #编码层 111 | class EncoderLayer(nn.Module): 112 | 113 | def __init__( self, e_dim, h_dim, n_heads, drop_rate = 0.1 ): 114 | ''' 115 | :param e_dim: 输入向量的维度 116 | :param h_dim: 注意力层中间隐含层的维度 117 | :param n_heads: 多头注意力的头目数量 118 | :param drop_rate: drop out的比例 119 | ''' 120 | super().__init__() 121 | # 初始化多头注意力层 122 | self.attention = MultiHeadAttentionLayer( e_dim, h_dim, n_heads ) 123 | # 初始化注意力层之后的LN 124 | self.a_LN = nn.LayerNorm( e_dim ) 125 | # 初始化前馈神经网络层 126 | self.ff_layer = FeedForward( e_dim, e_dim//2 ) 127 | # 初始化前馈网络之后的LN 128 | self.ff_LN = nn.LayerNorm( e_dim ) 129 | 130 | self.drop_out = nn.Dropout( drop_rate ) 131 | 132 | def forward(self, seq_inputs ): 133 | # seq_inputs = [batch, seqs_len, e_dim] 134 | # 多头注意力, 输出维度[ batch, seq_lens, e_dim ] 135 | outs_ = self.attention( seq_inputs ) 136 | # 残差连接与LN, 输出维度[ batch, seq_lens, e_dim ] 137 | outs = self.a_LN( seq_inputs + self.drop_out( outs_ ) ) 138 | # 前馈神经网络, 输出维度[ batch, seq_lens, e_dim ] 139 | outs_ = self.ff_layer( outs ) 140 | # 残差与LN, 输出维度[ batch, seq_lens, e_dim ] 141 | outs = self.ff_LN( outs + self.drop_out( outs_) ) 142 | return outs 143 | 144 | 145 | class TransformerEncoder(nn.Module): 146 | 147 | def __init__(self, e_dim, h_dim, n_heads, n_layers, drop_rate = 0.1 ): 148 | ''' 149 | :param e_dim: 输入向量的维度 150 | :param h_dim: 注意力层中间隐含层的维度 151 | :param n_heads: 多头注意力的头目数量 152 | :param n_layers: 编码层的数量 153 | :param drop_rate: drop out的比例 154 | ''' 155 | super().__init__() 156 | #初始化位置编码层 157 | self.position_encoding = PositionalEncoding( e_dim ) 158 | #初始化N个“编码层” 159 | self.encoder_layers = nn.ModuleList( [EncoderLayer( e_dim, h_dim, n_heads, drop_rate ) 160 | for _ in range( n_layers )] ) 161 | def forward( self, seq_inputs ): 162 | ''' 163 | :param seq_inputs: 已经经过Embedding层的张量,维度是[ batch, seq_lens, dim ] 164 | :return: 与输入张量维度一样的张量,维度是[ batch, seq_lens, dim ] 165 | ''' 166 | #先进行位置编码 167 | seq_inputs = self.position_encoding( seq_inputs ) 168 | #输入进N个“编码层”中开始传播 169 | for layer in self.encoder_layers: 170 | seq_inputs = layer( seq_inputs ) 171 | return seq_inputs 172 | 173 | 174 | #生成mask序列 175 | def subsequent_mask( size ): 176 | subsequent_mask = torch.triu( torch.ones( (1, size, size) ) ) == 0 177 | return subsequent_mask 178 | 179 | 180 | #解码层 181 | class DecoderLayer(nn.Module): 182 | 183 | def __init__( self, e_dim, h_dim, n_heads, drop_rate = 0.1 ): 184 | ''' 185 | :param e_dim: 输入向量的维度 186 | :param h_dim: 注意力层中间隐含层的维度 187 | :param n_heads: 多头注意力的头目数量 188 | :param drop_rate: drop out的比例 189 | ''' 190 | super().__init__() 191 | 192 | # 初始化自注意力层 193 | self.self_attention = MultiHeadAttentionLayer( e_dim, h_dim, n_heads ) 194 | # 初始化自注意力层之后的LN 195 | 
self.sa_LN = nn.LayerNorm( e_dim ) 196 | # 初始化交互注意力层 197 | self.interactive_attention = MultiHeadAttentionLayer( e_dim, h_dim, n_heads ) 198 | # 初始化交互注意力层之后的LN 199 | self.ia_LN = nn.LayerNorm (e_dim ) 200 | # 初始化前馈神经网络层 201 | self.ff_layer = FeedForward( e_dim, e_dim//2 ) 202 | # 初始化前馈网络之后的LN 203 | self.ff_LN = nn.LayerNorm( e_dim ) 204 | 205 | self.drop_out = nn.Dropout( drop_rate ) 206 | 207 | def forward( self, seq_inputs , querys, mask ): 208 | ''' 209 | :param seq_inputs: [ batch, seqs_len, e_dim ] 210 | :param querys: encoder的输出 211 | :param mask: 遮盖位置的标注序列 [ 1, seqs_len, seqs_len ] 212 | ''' 213 | # 自注意力层, 输出维度[ batch, seq_lens, e_dim ] 214 | outs_ = self.self_attention( seq_inputs , mask=mask ) 215 | # 残差连与LN, 输出维度[ batch, seq_lens, e_dim ] 216 | outs = self.sa_LN( seq_inputs + self.drop_out( outs_ ) ) 217 | # 交互注意力层, 输出维度[ batch, seq_lens, e_dim ] 218 | outs_ = self.interactive_attention( outs, querys ) 219 | # 残差连与LN, 输出维度[ batch, seq_lens, e_dim 220 | outs = self.ia_LN( outs + self.drop_out(outs_) ) 221 | # 前馈神经网络, 输出维度[ batch, seq_lens, e_dim ] 222 | outs_ = self.ff_layer( outs ) 223 | # 残差与LN, 输出维度[ batch, seq_lens, e_dim ] 224 | outs = self.ff_LN( outs + self.drop_out( outs_) ) 225 | return outs 226 | 227 | 228 | 229 | class TransformerDecoder(nn.Module): 230 | def __init__(self, e_dim, h_dim, n_heads, n_layers, n_classes,drop_rate = 0.1 ): 231 | ''' 232 | :param e_dim: 输入向量的维度 233 | :param h_dim: 注意力层中间隐含层的维度 234 | :param n_heads: 多头注意力的头目数量 235 | :param n_layers: 解码层的数量 236 | :param n_classes: 类别数 237 | :param drop_rate: drop out的比例 238 | ''' 239 | super().__init__() 240 | # 初始化位置编码层 241 | self.position_encoding = PositionalEncoding( e_dim ) 242 | # 初始化N个“解码层” 243 | self.decoder_layers = nn.ModuleList( [DecoderLayer( e_dim, h_dim, n_heads, drop_rate ) 244 | for _ in range( n_layers )] ) 245 | # 线性层 246 | self.linear = nn.Linear(e_dim,n_classes) 247 | # softmax激活函数 248 | self.softmax = nn.Softmax() 249 | 250 | 251 | def forward( self, seq_inputs, querys ): 252 | ''' 253 | :param seq_inputs: 已经经过Embedding层的张量,维度是[ batch, seq_lens, dim ] 254 | :param querys: encoder的输出,维度是[ batch, seq_lens, dim ] 255 | :return: 与输入张量维度一样的张量,维度是[ batch, seq_lens, dim ] 256 | ''' 257 | # 先进行位置编码 258 | seq_inputs = self.position_encoding( seq_inputs ) 259 | # 得到mask序列 260 | mask = subsequent_mask( seq_inputs.shape[1] ) 261 | # 输入进N个“解码层”中开始传播 262 | for layer in self.decoder_layers: 263 | seq_inputs = layer( seq_inputs, querys, mask ) 264 | # 最终线性变化后Softmax归一化 265 | seq_outputs = self.softmax(self.linear(seq_inputs)) 266 | return seq_outputs 267 | 268 | class Transformer(nn.Module): 269 | 270 | def __init__(self,e_dim, h_dim, n_heads, n_layers, n_classes,drop_rate=0.1): 271 | super().__init__() 272 | self.encoder = TransformerEncoder(e_dim, h_dim, n_heads, n_layers, drop_rate) 273 | self.decoder = TransformerDecoder(e_dim, h_dim, n_heads, n_layers, n_classes,drop_rate) 274 | 275 | def forward(self,input,output): 276 | querys = self.encoder(input) 277 | pred_seqs = self.decoder(output,querys) 278 | return pred_seqs 279 | 280 | 281 | 282 | if __name__ == '__main__': 283 | input = torch.randn( 5, 3, 12 ) 284 | net = Transformer(12, 8, 3, 6, 20) 285 | output = torch.randn( 5, 3, 12) 286 | pred_seqs = net(input,output) 287 | 288 | print(pred_seqs.shape) --------------------------------------------------------------------------------
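A small, self-contained sketch of the causal-mask behaviour used by the decoder above (the two helper names are illustrative, not part of the repository). torch.triu keeps the main diagonal by default, so subsequent_mask as written also hides each position's own score; the diagonal=1 variant shown for comparison is the more common convention and lets position i attend to itself as well as to earlier positions:

import torch

def subsequent_mask(size):                       # as defined in transformer.py
    return torch.triu(torch.ones((1, size, size))) == 0

def subsequent_mask_incl_self(size):             # common variant: keep the diagonal visible
    return torch.triu(torch.ones((1, size, size)), diagonal=1) == 0

scores = torch.randn(1, 4, 4)
for mask in (subsequent_mask(4), subsequent_mask_incl_self(4)):
    masked = scores.masked_fill(mask == 0, -1e9)  # same filling rule as in OneHeadAttention
    print(torch.softmax(masked, dim=-1))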