├── .gitignore ├── BERT.py ├── BERT_window_mask.py ├── BLEU.py ├── CBOW.py ├── ELMo.py ├── GPT.py ├── LICENSE ├── README.md ├── cnn-lm.py ├── pytorch ├── BERT.py ├── CBOW.py ├── ELMo.py ├── GPT.py ├── README.md ├── __pycache__ │ ├── transformer.cpython-37.pyc │ ├── transformer.cpython-38.pyc │ ├── utils.cpython-37.pyc │ └── utils.cpython-38.pyc ├── cnn_lm.py ├── seq2seq.py ├── seq2seq_attention.py ├── skip_gram.py ├── transformer.py ├── utils.py └── visual.py ├── requirements.txt ├── seq2seq.py ├── seq2seq_attention.py ├── simple_realize ├── CBOW.py ├── README.md ├── cnn-lm.py ├── imgs │ ├── attention.gif │ ├── seq2seq-embedding.gif │ └── skip-gram.gif ├── seq2seq.py ├── seq2seq_attention.py ├── skip-gram.py └── transformer.py ├── skip-gram.py ├── tf_idf.py ├── tf_idf_sklearn.py ├── transformer.py ├── utils.py └── visual.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_* 2 | .idea/ 3 | visual/ 4 | MRPC/ 5 | img/ 6 | -------------------------------------------------------------------------------- /BERT.py: -------------------------------------------------------------------------------- 1 | # [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/pdf/1810.04805.pdf) 2 | import numpy as np 3 | import tensorflow as tf 4 | import utils # this refers to utils.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/) 5 | import time 6 | from GPT import GPT 7 | import os 8 | import pickle 9 | 10 | 11 | class BERT(GPT): 12 | def __init__(self, model_dim, max_len, n_layer, n_head, n_vocab, lr, max_seg=3, drop_rate=0.1, padding_idx=0): 13 | super().__init__(model_dim, max_len, n_layer, n_head, n_vocab, lr, max_seg, drop_rate, padding_idx) 14 | # I think task emb is not necessary for pretraining, 15 | # because the aim of all tasks is to train a universal sentence embedding 16 | # the body encoder is the same across all tasks, 17 | # and different output layer defines different task just like transfer learning. 18 | # finetuning replaces output layer and leaves the body encoder unchanged. 
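# Pretraining in this file combines two objectives (see step() below):
#     loss = mlm_loss + 0.2 * nsp_loss
# the masked-language-model loss is averaged only over the positions picked
# out by loss_mask, while the next-sentence-prediction loss is down-weighted
# by 0.2 -- a weighting chosen for this tutorial rather than taken from the
# original BERT paper.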
19 | 20 | # self.task_emb = keras.layers.Embedding( 21 | # input_dim=n_task, output_dim=model_dim, # [n_task, dim] 22 | # embeddings_initializer=tf.initializers.RandomNormal(0., 0.01), 23 | # ) 24 | 25 | def step(self, seqs, segs, seqs_, loss_mask, nsp_labels): 26 | with tf.GradientTape() as tape: 27 | mlm_logits, nsp_logits = self.call(seqs, segs, training=True) 28 | mlm_loss_batch = tf.boolean_mask(self.cross_entropy(seqs_, mlm_logits), loss_mask) 29 | mlm_loss = tf.reduce_mean(mlm_loss_batch) 30 | nsp_loss = tf.reduce_mean(self.cross_entropy(nsp_labels, nsp_logits)) 31 | loss = mlm_loss + 0.2 * nsp_loss 32 | grads = tape.gradient(loss, self.trainable_variables) 33 | self.opt.apply_gradients(zip(grads, self.trainable_variables)) 34 | return loss, mlm_logits 35 | 36 | def mask(self, seqs): 37 | mask = tf.cast(tf.math.equal(seqs, self.padding_idx), tf.float32) 38 | return mask[:, tf.newaxis, tf.newaxis, :] # [n, 1, 1, step] 39 | 40 | 41 | def _get_loss_mask(len_arange, seq, pad_id): 42 | rand_id = np.random.choice(len_arange, size=max(2, int(MASK_RATE * len(len_arange))), replace=False) 43 | loss_mask = np.full_like(seq, pad_id, dtype=np.bool) 44 | loss_mask[rand_id] = True 45 | return loss_mask[None, :], rand_id 46 | 47 | 48 | def do_mask(seq, len_arange, pad_id, mask_id): 49 | loss_mask, rand_id = _get_loss_mask(len_arange, seq, pad_id) 50 | seq[rand_id] = mask_id 51 | return loss_mask 52 | 53 | 54 | def do_replace(seq, len_arange, pad_id, word_ids): 55 | loss_mask, rand_id = _get_loss_mask(len_arange, seq, pad_id) 56 | seq[rand_id] = np.random.choice(word_ids, size=len(rand_id)) 57 | return loss_mask 58 | 59 | 60 | def do_nothing(seq, len_arange, pad_id): 61 | loss_mask, _ = _get_loss_mask(len_arange, seq, pad_id) 62 | return loss_mask 63 | 64 | 65 | def random_mask_or_replace(data, arange, batch_size): 66 | seqs, segs, xlen, nsp_labels = data.sample(batch_size) 67 | seqs_ = seqs.copy() 68 | p = np.random.random() 69 | if p < 0.7: 70 | # mask 71 | loss_mask = np.concatenate( 72 | [do_mask( 73 | seqs[i], 74 | np.concatenate((arange[:xlen[i, 0]], arange[xlen[i, 0] + 1:xlen[i].sum() + 1])), 75 | data.pad_id, 76 | data.v2i[""]) for i in range(len(seqs))], axis=0) 77 | elif p < 0.85: 78 | # do nothing 79 | loss_mask = np.concatenate( 80 | [do_nothing( 81 | seqs[i], 82 | np.concatenate((arange[:xlen[i, 0]], arange[xlen[i, 0] + 1:xlen[i].sum() + 1])), 83 | data.pad_id) for i in range(len(seqs))], axis=0) 84 | else: 85 | # replace 86 | loss_mask = np.concatenate( 87 | [do_replace( 88 | seqs[i], 89 | np.concatenate((arange[:xlen[i, 0]], arange[xlen[i, 0] + 1:xlen[i].sum() + 1])), 90 | data.pad_id, 91 | data.word_ids) for i in range(len(seqs))], axis=0) 92 | return seqs, segs, seqs_, loss_mask, xlen, nsp_labels 93 | 94 | 95 | def train(model, data, step=10000, name="bert"): 96 | t0 = time.time() 97 | arange = np.arange(0, data.max_len) 98 | for t in range(step): 99 | seqs, segs, seqs_, loss_mask, xlen, nsp_labels = random_mask_or_replace(data, arange, 16) 100 | loss, pred = model.step(seqs, segs, seqs_, loss_mask, nsp_labels) 101 | if t % 100 == 0: 102 | pred = pred[0].numpy().argmax(axis=1) 103 | t1 = time.time() 104 | print( 105 | "\n\nstep: ", t, 106 | "| time: %.2f" % (t1 - t0), 107 | "| loss: %.3f" % loss.numpy(), 108 | "\n| tgt: ", " ".join([data.i2v[i] for i in seqs[0][:xlen[0].sum()+1]]), 109 | "\n| prd: ", " ".join([data.i2v[i] for i in pred[:xlen[0].sum()+1]]), 110 | "\n| tgt word: ", [data.i2v[i] for i in seqs_[0]*loss_mask[0] if i != data.v2i[""]], 111 | "\n| prd word: ", [data.i2v[i] 
for i in pred*loss_mask[0] if i != data.v2i[""]], 112 | ) 113 | t0 = t1 114 | os.makedirs("./visual/models/%s" % name, exist_ok=True) 115 | model.save_weights("./visual/models/%s/model.ckpt" % name) 116 | 117 | 118 | def export_attention(model, data, name="bert"): 119 | model.load_weights("./visual/models/%s/model.ckpt" % name) 120 | 121 | # save attention matrix for visualization 122 | seqs, segs, xlen, nsp_labels = data.sample(32) 123 | model.call(seqs, segs, False) 124 | data = {"src": [[data.i2v[i] for i in seqs[j]] for j in range(len(seqs))], "attentions": model.attentions} 125 | path = "./visual/tmp/%s_attention_matrix.pkl" % name 126 | os.makedirs(os.path.dirname(path), exist_ok=True) 127 | with open(path, "wb") as f: 128 | pickle.dump(data, f) 129 | 130 | 131 | if __name__ == "__main__": 132 | utils.set_soft_gpu(True) 133 | MODEL_DIM = 256 134 | N_LAYER = 4 135 | LEARNING_RATE = 1e-4 136 | MASK_RATE = 0.15 137 | 138 | d = utils.MRPCData("./MRPC", 2000) 139 | print("num word: ", d.num_word) 140 | m = BERT( 141 | model_dim=MODEL_DIM, max_len=d.max_len, n_layer=N_LAYER, n_head=4, n_vocab=d.num_word, 142 | lr=LEARNING_RATE, max_seg=d.num_seg, drop_rate=0.2, padding_idx=d.v2i[""]) 143 | train(m, d, step=10000, name="bert") 144 | export_attention(m, d, "bert") 145 | 146 | -------------------------------------------------------------------------------- /BERT_window_mask.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from GPT import GPT, train, export_attention 3 | import utils # this refers to utils.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/) 4 | 5 | 6 | class BERT(GPT): 7 | def __init__(self, model_dim, max_len, n_layer, n_head, n_vocab, lr, max_seg=3, drop_rate=0.1, padding_idx=0): 8 | super().__init__(model_dim, max_len, n_layer, n_head, n_vocab, lr, max_seg, drop_rate, padding_idx) 9 | 10 | def mask(self, seqs): 11 | """ 12 | abcd-- 13 | a010011 14 | b001011 15 | c000111 16 | d000011 17 | -000001 18 | -000000 19 | 20 | a is a embedding for a-cd 21 | b is a embedding for ab-d 22 | c is a embedding for abc- 23 | later, b embedding will + another b embedding from previous residual input to predict c 24 | """ 25 | eye = tf.eye(self.max_len+1, batch_shape=[len(seqs)], dtype=tf.float32)[:, 1:, :-1] 26 | pad = tf.math.equal(seqs, self.padding_idx) 27 | mask = tf.where(pad[:, tf.newaxis, tf.newaxis, :], 1, eye[:, tf.newaxis, :, :]) 28 | return mask # [n, 1, step, step] 29 | 30 | 31 | if __name__ == "__main__": 32 | utils.set_soft_gpu(True) 33 | MODEL_DIM = 256 34 | N_LAYER = 4 35 | LEARNING_RATE = 1e-4 36 | d = utils.MRPCData("./MRPC", 2000) 37 | print("num word: ", d.num_word) 38 | m = BERT( 39 | model_dim=MODEL_DIM, max_len=d.max_len - 1, n_layer=N_LAYER, n_head=4, n_vocab=d.num_word, 40 | lr=LEARNING_RATE, max_seg=d.num_seg, drop_rate=0.2, padding_idx=d.pad_id) 41 | train(m, d, step=5000, name="bert_window_mask") 42 | export_attention(m, d, "bert_window_mask") 43 | 44 | -------------------------------------------------------------------------------- /BLEU.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import Counter 3 | 4 | 5 | def BLEU(references, generated, max_grams=4, weights=None): 6 | ref_list = [ref.lower().split(" ") for ref in references] 7 | gen = generated.lower().split(" ") 8 | cpn = np.empty((max_grams,), dtype=np.float32) 9 | for n in range(1, max_grams+1): 10 | gen_gram = [" ".join(gen[i:i+n]) for i in 
range(0, len(gen)-n+1)] 11 | refs_gram = [[" ".join(ref[i:i+n]) for i in range(0, len(ref)-n+1)] for ref in ref_list] 12 | g_counter = Counter(gen_gram) 13 | r_counters = [Counter(ref_gram) for ref_gram in refs_gram] 14 | count_clip = 0 15 | for k, v in g_counter.items(): 16 | count_clip += min(v, max([r.get(k, 0) for r in r_counters])) 17 | cpn[n-1] = count_clip/sum(g_counter.values()) 18 | 19 | ls = len(gen) 20 | lc = max([len(ref) for ref in ref_list]) 21 | brevity_penalty = 1 if lc > ls else np.exp(1-ls/lc) 22 | 23 | if weights is None: 24 | weights = np.ones_like(cpn) 25 | bleu = brevity_penalty * np.exp(np.mean(weights * np.log(cpn))) 26 | return bleu 27 | 28 | 29 | bleu = BLEU(["The cat is on the mat", "There is a cat on the mat"], "The cat is on the mat", 3) 30 | print(bleu) -------------------------------------------------------------------------------- /CBOW.py: -------------------------------------------------------------------------------- 1 | # [Efficient Estimation of Word Representations in Vector Space](https://arxiv.org/pdf/1301.3781.pdf) 2 | from tensorflow import keras 3 | import tensorflow as tf 4 | from utils import process_w2v_data # this refers to utils.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/) 5 | from visual import show_w2v_word_embedding # this refers to visual.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/) 6 | 7 | corpus = [ 8 | # numbers 9 | "5 2 4 8 6 2 3 6 4", 10 | "4 8 5 6 9 5 5 6", 11 | "1 1 5 2 3 3 8", 12 | "3 6 9 6 8 7 4 6 3", 13 | "8 9 9 6 1 4 3 4", 14 | "1 0 2 0 2 1 3 3 3 3 3", 15 | "9 3 3 0 1 4 7 8", 16 | "9 9 8 5 6 7 1 2 3 0 1 0", 17 | 18 | # alphabets, expecting that 9 is close to letters 19 | "a t g q e h 9 u f", 20 | "e q y u o i p s", 21 | "q o 9 p l k j o k k o p", 22 | "h g y i u t t a e q", 23 | "i k d q r e 9 e a d", 24 | "o p d g 9 s a f g a", 25 | "i u y g h k l a s w", 26 | "o l u y a o g f s", 27 | "o p i u y g d a s j d l", 28 | "u k i l o 9 l j s", 29 | "y g i s h k j l f r f", 30 | "i o h n 9 9 d 9 f a 9", 31 | ] 32 | 33 | 34 | class CBOW(keras.Model): 35 | def __init__(self, v_dim, emb_dim): 36 | super().__init__() 37 | self.v_dim = v_dim 38 | self.embeddings = keras.layers.Embedding( 39 | input_dim=v_dim, output_dim=emb_dim, # [n_vocab, emb_dim] 40 | embeddings_initializer=keras.initializers.RandomNormal(0., 0.1), 41 | ) 42 | 43 | # noise-contrastive estimation 44 | self.nce_w = self.add_weight( 45 | name="nce_w", shape=[v_dim, emb_dim], 46 | initializer=keras.initializers.TruncatedNormal(0., 0.1)) # [n_vocab, emb_dim] 47 | self.nce_b = self.add_weight( 48 | name="nce_b", shape=(v_dim,), 49 | initializer=keras.initializers.Constant(0.1)) # [n_vocab, ] 50 | 51 | self.opt = keras.optimizers.Adam(0.01) 52 | 53 | def call(self, x, training=None, mask=None): 54 | # x.shape = [n, skip_window*2] 55 | o = self.embeddings(x) # [n, skip_window*2, emb_dim] 56 | o = tf.reduce_mean(o, axis=1) # [n, emb_dim] 57 | return o 58 | 59 | # negative sampling: take one positive label and num_sampled negative labels to compute the loss 60 | # in order to reduce the computation of full softmax 61 | def loss(self, x, y, training=None): 62 | embedded = self.call(x, training) 63 | return tf.reduce_mean( 64 | tf.nn.nce_loss( 65 | weights=self.nce_w, biases=self.nce_b, labels=tf.expand_dims(y, axis=1), 66 | inputs=embedded, num_sampled=5, num_classes=self.v_dim)) 67 | 68 | def step(self, x, y): 69 | with tf.GradientTape() as tape: 70 | loss = self.loss(x, y, True) 71 | grads = tape.gradient(loss, self.trainable_variables) 72 | 
self.opt.apply_gradients(zip(grads, self.trainable_variables)) 73 | return loss.numpy() 74 | 75 | 76 | def train(model, data): 77 | for t in range(2500): 78 | bx, by = data.sample(8) 79 | loss = model.step(bx, by) 80 | if t % 200 == 0: 81 | print("step: {} | loss: {}".format(t, loss)) 82 | 83 | 84 | if __name__ == "__main__": 85 | d = process_w2v_data(corpus, skip_window=2, method="cbow") 86 | m = CBOW(d.num_word, 2) 87 | train(m, d) 88 | 89 | # plotting 90 | show_w2v_word_embedding(m, d, "./visual/results/cbow.png") -------------------------------------------------------------------------------- /ELMo.py: -------------------------------------------------------------------------------- 1 | # [Deep contextualized word representations](https://arxiv.org/pdf/1802.05365.pdf) 2 | from tensorflow import keras 3 | import tensorflow as tf 4 | import utils # this refers to utils.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/) 5 | import time 6 | import os 7 | 8 | 9 | class ELMo(keras.Model): 10 | def __init__(self, v_dim, emb_dim, units, n_layers, lr): 11 | super().__init__() 12 | self.n_layers = n_layers 13 | self.units = units 14 | 15 | # encoder 16 | self.word_embed = keras.layers.Embedding( 17 | input_dim=v_dim, output_dim=emb_dim, # [n_vocab, emb_dim] 18 | embeddings_initializer=keras.initializers.RandomNormal(0., 0.001), 19 | mask_zero=True, 20 | ) 21 | # forward lstm 22 | self.fs = [keras.layers.LSTM(units, return_sequences=True) for _ in range(n_layers)] 23 | self.f_logits = keras.layers.Dense(v_dim) 24 | # backward lstm 25 | self.bs = [keras.layers.LSTM(units, return_sequences=True, go_backwards=True) for _ in range(n_layers)] 26 | self.b_logits = keras.layers.Dense(v_dim) 27 | 28 | self.cross_entropy1 = keras.losses.SparseCategoricalCrossentropy(from_logits=True) 29 | self.cross_entropy2 = keras.losses.SparseCategoricalCrossentropy(from_logits=True) 30 | self.opt = keras.optimizers.Adam(lr) 31 | 32 | def call(self, seqs): 33 | embedded = self.word_embed(seqs) # [n, step, dim] 34 | """ 35 | 0123 forward 36 | 1234 forward predict 37 | 1234 backward 38 | 0123 backward predict 39 | """ 40 | mask = self.word_embed.compute_mask(seqs) 41 | fxs, bxs = [embedded[:, :-1]], [embedded[:, 1:]] 42 | for fl, bl in zip(self.fs, self.bs): 43 | fx = fl( 44 | fxs[-1], mask=mask[:, :-1], initial_state=fl.get_initial_state(fxs[-1]) 45 | ) # [n, step-1, dim] 46 | bx = bl( 47 | bxs[-1], mask=mask[:, 1:], initial_state=bl.get_initial_state(bxs[-1]) 48 | ) # [n, step-1, dim] 49 | fxs.append(fx) # predict 1234 50 | bxs.append(tf.reverse(bx, axis=[1])) # predict 0123 51 | return fxs, bxs 52 | 53 | def step(self, seqs): 54 | with tf.GradientTape() as tape: 55 | fxs, bxs = self.call(seqs) 56 | fo, bo = self.f_logits(fxs[-1]), self.b_logits(bxs[-1]) 57 | loss = (self.cross_entropy1(seqs[:, 1:], fo) + self.cross_entropy2(seqs[:, :-1], bo))/2 58 | grads = tape.gradient(loss, self.trainable_variables) 59 | self.opt.apply_gradients(zip(grads, self.trainable_variables)) 60 | return loss, (fo, bo) 61 | 62 | def get_emb(self, seqs): 63 | fxs, bxs = self.call(seqs) 64 | xs = [ 65 | tf.concat((fxs[0][:, 1:, :], bxs[0][:, :-1, :]), axis=2).numpy() # from word embedding 66 | ] + [ 67 | tf.concat((f[:, :-1, :], b[:, 1:, :]), axis=2).numpy() for f, b in zip(fxs[1:], bxs[1:])] # from sentence embedding 68 | for x in xs: 69 | print("layers shape=", x.shape) 70 | return xs 71 | 72 | 73 | def train(model, data, step): 74 | t0 = time.time() 75 | for t in range(step): 76 | seqs = data.sample(BATCH_SIZE) 77 | loss, (fo, bo) 
= model.step(seqs) 78 | if t % 80 == 0: 79 | fp = fo[0].numpy().argmax(axis=1) 80 | bp = bo[0].numpy().argmax(axis=1) 81 | t1 = time.time() 82 | print( 83 | "\n\nstep: ", t, 84 | "| time: %.2f" % (t1 - t0), 85 | "| loss: %.3f" % loss.numpy(), 86 | "\n| tgt: ", " ".join([data.i2v[i] for i in seqs[0] if i != data.pad_id]), 87 | "\n| f_prd: ", " ".join([data.i2v[i] for i in fp if i != data.pad_id]), 88 | "\n| b_prd: ", " ".join([data.i2v[i] for i in bp if i != data.pad_id]), 89 | ) 90 | t0 = t1 91 | os.makedirs("./visual/models/elmo", exist_ok=True) 92 | model.save_weights("./visual/models/elmo/model.ckpt") 93 | 94 | 95 | def export_w2v(model, data): 96 | model.load_weights("./visual/models/elmo/model.ckpt") 97 | emb = model.get_emb(data.sample(4)) 98 | print(emb) 99 | 100 | 101 | if __name__ == "__main__": 102 | utils.set_soft_gpu(True) 103 | UNITS = 256 104 | N_LAYERS = 2 105 | BATCH_SIZE = 16 106 | LEARNING_RATE = 2e-3 107 | d = utils.MRPCSingle("./MRPC", rows=2000) 108 | print("num word: ", d.num_word) 109 | m = ELMo(d.num_word, emb_dim=UNITS, units=UNITS, n_layers=N_LAYERS, lr=LEARNING_RATE) 110 | train(m, d, 10000) 111 | export_w2v(m, d) -------------------------------------------------------------------------------- /GPT.py: -------------------------------------------------------------------------------- 1 | # [Improving Language Understanding by Generative Pre-Training](https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf) 2 | import tensorflow as tf 3 | from tensorflow import keras 4 | import utils # this refers to utils.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/) 5 | import time 6 | from transformer import Encoder 7 | import pickle 8 | import os 9 | 10 | 11 | class GPT(keras.Model): 12 | def __init__(self, model_dim, max_len, n_layer, n_head, n_vocab, lr, max_seg=3, drop_rate=0.1, padding_idx=0): 13 | super().__init__() 14 | self.padding_idx = padding_idx 15 | self.n_vocab = n_vocab 16 | self.max_len = max_len 17 | 18 | # I think task emb is not necessary for pretraining, 19 | # because the aim of all tasks is to train a universal sentence embedding 20 | # the body encoder is the same across all tasks, 21 | # and different output layer defines different task just like transfer learning. 22 | # finetuning replaces output layer and leaves the body encoder unchanged. 
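# Illustrative sketch (not in the original code) of that fine-tuning idea: a
# hypothetical downstream classifier could keep the pretrained body and only
# attach a fresh head, e.g.
#     clf_head = keras.layers.Dense(n_classes)   # n_classes comes from the downstream task
#     z = self.encoder(self.input_emb(seqs, segs), training=False, mask=self.mask(seqs))
#     clf_logits = clf_head(tf.reshape(z, [z.shape[0], -1]))   # same pooling as task_nsp
# so a per-task embedding is unnecessary during pretraining.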
23 | 24 | # self.task_emb = keras.layers.Embedding( 25 | # input_dim=n_task, output_dim=model_dim, # [n_task, dim] 26 | # embeddings_initializer=tf.initializers.RandomNormal(0., 0.01), 27 | # ) 28 | 29 | self.word_emb = keras.layers.Embedding( 30 | input_dim=n_vocab, output_dim=model_dim, # [n_vocab, dim] 31 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.01), 32 | ) 33 | self.segment_emb = keras.layers.Embedding( 34 | input_dim=max_seg, output_dim=model_dim, # [max_seg, dim] 35 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.01), 36 | ) 37 | self.position_emb = self.add_weight( 38 | name="pos", shape=[1, max_len, model_dim], dtype=tf.float32, # [1, step, dim] 39 | initializer=keras.initializers.RandomNormal(0., 0.01)) 40 | self.encoder = Encoder(n_head, model_dim, drop_rate, n_layer) 41 | self.task_mlm = keras.layers.Dense(n_vocab) 42 | self.task_nsp = keras.layers.Dense(2) 43 | 44 | self.cross_entropy = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none") 45 | self.opt = keras.optimizers.Adam(lr) 46 | 47 | def call(self, seqs, segs, training=False): 48 | embed = self.input_emb(seqs, segs) # [n, step, dim] 49 | z = self.encoder(embed, training=training, mask=self.mask(seqs)) # [n, step, dim] 50 | mlm_logits = self.task_mlm(z) # [n, step, n_vocab] 51 | nsp_logits = self.task_nsp(tf.reshape(z, [z.shape[0], -1])) # [n, n_cls] 52 | return mlm_logits, nsp_logits 53 | 54 | def step(self, seqs, segs, seqs_, nsp_labels): 55 | with tf.GradientTape() as tape: 56 | mlm_logits, nsp_logits = self.call(seqs, segs, training=True) 57 | pad_mask = tf.math.not_equal(seqs_, self.padding_idx) 58 | pred_loss = tf.reduce_mean(tf.boolean_mask(self.cross_entropy(seqs_, mlm_logits), pad_mask)) 59 | nsp_loss = tf.reduce_mean(self.cross_entropy(nsp_labels, nsp_logits)) 60 | loss = pred_loss + 0.2 * nsp_loss 61 | grads = tape.gradient(loss, self.trainable_variables) 62 | self.opt.apply_gradients(zip(grads, self.trainable_variables)) 63 | return loss, mlm_logits 64 | 65 | def input_emb(self, seqs, segs): 66 | return self.word_emb(seqs) + self.segment_emb(segs) + self.position_emb # [n, step, dim] 67 | 68 | def mask(self, seqs): 69 | """ 70 | abcd-- 71 | a011111 72 | b001111 73 | c000111 74 | d000011 75 | -000011 76 | -000011 77 | 78 | force head not to see afterward. eg. 
79 | a is a embedding for a--- 80 | b is a embedding for ab-- 81 | c is a embedding for abc- 82 | later, b embedding will + b another embedding from previous residual input to predict c 83 | """ 84 | mask = 1 - tf.linalg.band_part(tf.ones((self.max_len, self.max_len)), -1, 0) 85 | pad = tf.math.equal(seqs, self.padding_idx) 86 | mask = tf.where(pad[:, tf.newaxis, tf.newaxis, :], 1, mask[tf.newaxis, tf.newaxis, :, :]) 87 | return mask # (step, step) 88 | 89 | @property 90 | def attentions(self): 91 | attentions = { 92 | "encoder": [l.mh.attention.numpy() for l in self.encoder.ls], 93 | } 94 | return attentions 95 | 96 | 97 | def train(model, data, step=10000, name="gpt"): 98 | t0 = time.time() 99 | for t in range(step): 100 | seqs, segs, xlen, nsp_labels = data.sample(16) 101 | loss, pred = model.step(seqs[:, :-1], segs[:, :-1], seqs[:, 1:], nsp_labels) 102 | if t % 100 == 0: 103 | pred = pred[0].numpy().argmax(axis=1) 104 | t1 = time.time() 105 | print( 106 | "\n\nstep: ", t, 107 | "| time: %.2f" % (t1 - t0), 108 | "| loss: %.3f" % loss.numpy(), 109 | "\n| tgt: ", " ".join([data.i2v[i] for i in seqs[0, 1:][:xlen[0].sum()+1]]), 110 | "\n| prd: ", " ".join([data.i2v[i] for i in pred[:xlen[0].sum()+1]]), 111 | ) 112 | t0 = t1 113 | os.makedirs("./visual/models/%s" % name, exist_ok=True) 114 | model.save_weights("./visual/models/%s/model.ckpt" % name) 115 | 116 | 117 | def export_attention(model, data, name="gpt"): 118 | model.load_weights("./visual/models/%s/model.ckpt" % name) 119 | 120 | # save attention matrix for visualization 121 | seqs, segs, xlen, nsp_labels = data.sample(32) 122 | model.call(seqs[:, :-1], segs[:, :-1], False) 123 | data = {"src": [[data.i2v[i] for i in seqs[j]] for j in range(len(seqs))], "attentions": model.attentions} 124 | path = "./visual/tmp/%s_attention_matrix.pkl" % name 125 | os.makedirs(os.path.dirname(path), exist_ok=True) 126 | with open(path, "wb") as f: 127 | pickle.dump(data, f) 128 | 129 | 130 | if __name__ == "__main__": 131 | utils.set_soft_gpu(True) 132 | MODEL_DIM = 256 133 | N_LAYER = 4 134 | LEARNING_RATE = 1e-4 135 | 136 | d = utils.MRPCData("./MRPC", 2000) 137 | print("num word: ", d.num_word) 138 | m = GPT( 139 | model_dim=MODEL_DIM, max_len=d.max_len - 1, n_layer=N_LAYER, n_head=4, n_vocab=d.num_word, 140 | lr=LEARNING_RATE, max_seg=d.num_seg, drop_rate=0.2, padding_idx=d.pad_id) 141 | train(m, d, step=5000, name="gpt") 142 | export_attention(m, d, name="gpt") 143 | 144 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Andrew Gambardella 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Natural Language Processing Tutorial 2 | 3 | Tutorial in Chinese can be found in [mofanpy.com](https://mofanpy.com/tutorials/machine-learning/nlp/). 4 | 5 | This repo includes many simple implementations of models in Neural Language Processing (NLP). 6 | 7 | All code implementations in this tutorial are organized as following: 8 | 9 | 1. Search Engine 10 | - [TF-IDF numpy / TF-IDF skearn](#TF-IDF) 11 | 2. Understand Word (W2V) 12 | - [Continuous Bag of Words (CBOW)](#Word2Vec) 13 | - [Skip-Gram](#Word2Vec) 14 | 3. Understand Sentence (Seq2Seq) 15 | - [seq2seq](#Seq2Seq) 16 | - [CNN language model](#CNNLanguageModel) 17 | 4. All about Attention 18 | - [seq2seq with attention](#Seq2SeqAttention) 19 | - [Transformer](#Transformer) 20 | 5. Pretrained Models 21 | - [ELMo](#ELMO) 22 | - [GPT](#GPT) 23 | - [BERT](#BERT) 24 | 25 | Thanks for the contribution made by [@W1Fl](https://github.com/W1Fl) with a simplified keras codes in [simple_realize](simple_realize). 26 | And the a [pytorch version of this NLP](/pytorch) tutorial made by [@ruifanxu](https://github.com/ruifan831). 27 | 28 | ## Installation 29 | 30 | ```shell script 31 | $ git clone https://github.com/MorvanZhou/NLP-Tutorials 32 | $ cd NLP-Tutorials/ 33 | $ sudo pip3 install -r requirements.txt 34 | ``` 35 | 36 | 37 | ## TF-IDF 38 | 39 | TF-IDF numpy [code](tf_idf.py) 40 | 41 | TF-IDF short sklearn [code](tf_idf_sklearn.py) 42 | 43 | 44 | image 45 | 46 | 47 | 48 | ## Word2Vec 49 | [Efficient Estimation of Word Representations in Vector Space](https://arxiv.org/pdf/1301.3781.pdf) 50 | 51 | Skip-Gram [code](skip-gram.py) 52 | 53 | CBOW [code](CBOW.py) 54 | 55 | 56 | image 57 | 58 | 59 | 60 | image 61 | 62 | 63 | 64 | image 65 | 66 | 67 | 68 | ## Seq2Seq 69 | [Sequence to Sequence Learning with Neural Networks](https://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf) 70 | 71 | Seq2Seq [code](seq2seq.py) 72 | 73 | 74 | image 75 | 76 | 77 | ## CNNLanguageModel 78 | [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/pdf/1408.5882.pdf) 79 | 80 | CNN language model [code](cnn-lm.py) 81 | 82 | 83 | image 84 | 85 | 86 | 87 | ## Seq2SeqAttention 88 | [Effective Approaches to Attention-based Neural Machine Translation](https://arxiv.org/pdf/1508.04025.pdf) 89 | 90 | Seq2Seq Attention [code](seq2seq_attention.py) 91 | 92 | 93 | image 94 | 95 | 96 | image 97 | 98 | 99 | 100 | 101 | ## Transformer 102 | [Attention Is All You Need](https://arxiv.org/pdf/1706.03762.pdf) 103 | 104 | Transformer [code](transformer.py) 105 | 106 | 107 | image 108 | 109 | 110 | image 111 | 112 | 113 | image 114 | 115 | 116 | 117 | ## ELMO 118 | [Deep contextualized word representations](https://arxiv.org/pdf/1802.05365.pdf) 119 | 120 | ELMO [code](ELMo.py) 121 | 122 | 123 | image 124 | 125 | 126 | image 127 | 128 | 129 | 130 | ## GPT 131 | [Improving Language Understanding by Generative Pre-Training](https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf) 132 | 133 | GPT [code](GPT.py) 134 | 135 | 136 | image 137 | 138 | 139 
| image 140 | 141 | 142 | 143 | ## BERT 144 | [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/pdf/1810.04805.pdf) 145 | 146 | BERT [code](BERT.py) 147 | 148 | My new attempt [Bert with window mask](BERT_window_mask.py) 149 | 150 | 151 | image 152 | 153 | 154 | image 155 | 156 | 157 | -------------------------------------------------------------------------------- /cnn-lm.py: -------------------------------------------------------------------------------- 1 | # a modification from [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/pdf/1408.5882.pdf) 2 | 3 | import tensorflow as tf 4 | from tensorflow import keras 5 | import numpy as np 6 | import utils # this refers to utils.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/) 7 | import tensorflow_addons as tfa 8 | 9 | 10 | class CNNTranslation(keras.Model): 11 | def __init__(self, enc_v_dim, dec_v_dim, emb_dim, units, max_pred_len, start_token, end_token): 12 | super().__init__() 13 | self.units = units 14 | 15 | # encoder 16 | self.enc_embeddings = keras.layers.Embedding( 17 | input_dim=enc_v_dim, output_dim=emb_dim, # [enc_n_vocab, emb_dim] 18 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.1), 19 | ) 20 | self.conv2ds = [ 21 | keras.layers.Conv2D(16, (n, emb_dim), padding="valid", activation=keras.activations.relu) 22 | for n in range(2, 5)] 23 | self.max_pools = [keras.layers.MaxPool2D((n, 1)) for n in [7, 6, 5]] 24 | self.encoder = keras.layers.Dense(units, activation=keras.activations.relu) 25 | 26 | # decoder 27 | self.dec_embeddings = keras.layers.Embedding( 28 | input_dim=dec_v_dim, output_dim=emb_dim, # [dec_n_vocab, emb_dim] 29 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.1), 30 | ) 31 | self.decoder_cell = keras.layers.LSTMCell(units=units) 32 | decoder_dense = keras.layers.Dense(dec_v_dim) 33 | # train decoder 34 | self.decoder_train = tfa.seq2seq.BasicDecoder( 35 | cell=self.decoder_cell, 36 | sampler=tfa.seq2seq.sampler.TrainingSampler(), # sampler for train 37 | output_layer=decoder_dense 38 | ) 39 | # predict decoder 40 | self.decoder_eval = tfa.seq2seq.BasicDecoder( 41 | cell=self.decoder_cell, 42 | sampler=tfa.seq2seq.sampler.GreedyEmbeddingSampler(), # sampler for predict 43 | output_layer=decoder_dense 44 | ) 45 | 46 | self.cross_entropy = keras.losses.SparseCategoricalCrossentropy(from_logits=True) 47 | self.opt = keras.optimizers.Adam(0.01) 48 | self.max_pred_len = max_pred_len 49 | self.start_token = start_token 50 | self.end_token = end_token 51 | 52 | def encode(self, x): 53 | embedded = self.enc_embeddings(x) # [n, step, emb] 54 | o = tf.expand_dims(embedded, axis=3) # [n, step=8, emb=16, 1] 55 | co = [conv2d(o) for conv2d in self.conv2ds] # [n, 7, 1, 16], [n, 6, 1, 16], [n, 5, 1, 16] 56 | co = [self.max_pools[i](co[i]) for i in range(len(co))] # [n, 1, 1, 16] * 3 57 | co = [tf.squeeze(c, axis=[1, 2]) for c in co] # [n, 16] * 3 58 | o = tf.concat(co, axis=1) # [n, 16*3] 59 | h = self.encoder(o) # [n, units] 60 | return [h, h] 61 | 62 | def inference(self, x): 63 | s = self.encode(x) 64 | done, i, s = self.decoder_eval.initialize( 65 | self.dec_embeddings.variables[0], 66 | start_tokens=tf.fill([x.shape[0], ], self.start_token), 67 | end_token=self.end_token, 68 | initial_state=s, 69 | ) 70 | pred_id = np.zeros((x.shape[0], self.max_pred_len), dtype=np.int32) 71 | for l in range(self.max_pred_len): 72 | o, s, i, done = self.decoder_eval.step( 73 | time=l, inputs=i, state=s, training=False) 74 | 
pred_id[:, l] = o.sample_id 75 | return pred_id 76 | 77 | def train_logits(self, x, y, seq_len): 78 | s = self.encode(x) 79 | dec_in = y[:, :-1] # ignore 80 | dec_emb_in = self.dec_embeddings(dec_in) 81 | o, _, _ = self.decoder_train(dec_emb_in, s, sequence_length=seq_len) 82 | logits = o.rnn_output 83 | return logits 84 | 85 | def step(self, x, y, seq_len): 86 | with tf.GradientTape() as tape: 87 | logits = self.train_logits(x, y, seq_len) 88 | dec_out = y[:, 1:] # ignore 89 | loss = self.cross_entropy(dec_out, logits) 90 | grads = tape.gradient(loss, self.trainable_variables) 91 | self.opt.apply_gradients(zip(grads, self.trainable_variables)) 92 | return loss.numpy() 93 | 94 | 95 | def train(): 96 | # get and process data 97 | data = utils.DateData(4000) 98 | print("Chinese time order: yy/mm/dd ", data.date_cn[:3], "\nEnglish time order: dd/M/yyyy ", data.date_en[:3]) 99 | print("vocabularies: ", data.vocab) 100 | print("x index sample: \n{}\n{}".format(data.idx2str(data.x[0]), data.x[0]), 101 | "\ny index sample: \n{}\n{}".format(data.idx2str(data.y[0]), data.y[0])) 102 | 103 | model = CNNTranslation( 104 | data.num_word, data.num_word, emb_dim=16, units=32, 105 | max_pred_len=11, start_token=data.start_token, end_token=data.end_token) 106 | 107 | # training 108 | for t in range(1500): 109 | bx, by, decoder_len = data.sample(32) 110 | loss = model.step(bx, by, decoder_len) 111 | if t % 70 == 0: 112 | target = data.idx2str(by[0, 1:-1]) 113 | pred = model.inference(bx[0:1]) 114 | res = data.idx2str(pred[0]) 115 | src = data.idx2str(bx[0]) 116 | print( 117 | "t: ", t, 118 | "| loss: %.3f" % loss, 119 | "| input: ", src, 120 | "| target: ", target, 121 | "| inference: ", res, 122 | ) 123 | 124 | 125 | if __name__ == "__main__": 126 | train() 127 | -------------------------------------------------------------------------------- /pytorch/BERT.py: -------------------------------------------------------------------------------- 1 | from pickle import load 2 | import numpy as np 3 | from torch import nn 4 | import torch 5 | from torch.utils.data import DataLoader 6 | from torch.nn.functional import cross_entropy,softmax, relu 7 | 8 | import utils 9 | from GPT import GPT 10 | import os 11 | import pickle 12 | 13 | MASK_RATE = 0.15 14 | 15 | class BERT(GPT): 16 | 17 | def __init__( 18 | self, model_dim, max_len, num_layer, num_head, n_vocab, lr, 19 | max_seg=3, drop_rate=0.2, padding_idx=0) -> None: 20 | super().__init__(model_dim, max_len, num_layer, num_head, n_vocab, lr, max_seg, drop_rate, padding_idx) 21 | 22 | def step(self,seqs,segs,seqs_, loss_mask,nsp_labels): 23 | device = next(self.parameters()).device 24 | self.opt.zero_grad() 25 | mlm_logits, nsp_logits = self(seqs, segs, training=True) # [n, step, n_vocab], [n, n_cls] 26 | mlm_loss = cross_entropy( 27 | torch.masked_select(mlm_logits,loss_mask).reshape(-1,mlm_logits.shape[2]), 28 | torch.masked_select(seqs_,loss_mask.squeeze(2)) 29 | ) 30 | nsp_loss = cross_entropy(nsp_logits,nsp_labels.reshape(-1)) 31 | loss = mlm_loss + 0.2 * nsp_loss 32 | loss.backward() 33 | self.opt.step() 34 | return loss.cpu().data.numpy(),mlm_logits 35 | 36 | def mask(self, seqs): 37 | mask = torch.eq(seqs,self.padding_idx) 38 | return mask[:, None, None, :] 39 | 40 | def _get_loss_mask(len_arange, seq, pad_id): 41 | rand_id = np.random.choice(len_arange, size=max(2, int(MASK_RATE * len(len_arange))), replace=False) 42 | loss_mask = np.full_like(seq, pad_id, dtype=np.bool) 43 | loss_mask[rand_id] = True 44 | return loss_mask[None, :], rand_id 45 | 46 | def 
do_mask(seq, len_arange, pad_id, mask_id): 47 | loss_mask, rand_id = _get_loss_mask(len_arange, seq, pad_id) 48 | seq[rand_id] = mask_id 49 | return loss_mask 50 | 51 | def do_replace(seq, len_arange, pad_id, word_ids): 52 | loss_mask, rand_id = _get_loss_mask(len_arange, seq, pad_id) 53 | seq[rand_id] = torch.from_numpy(np.random.choice(word_ids, size=len(rand_id))).type(torch.IntTensor) 54 | return loss_mask 55 | 56 | def do_nothing(seq, len_arange, pad_id): 57 | loss_mask, _ = _get_loss_mask(len_arange, seq, pad_id) 58 | return loss_mask 59 | 60 | def random_mask_or_replace(data,arange,dataset): 61 | seqs, segs,xlen,nsp_labels = data 62 | seqs_ = seqs.data.clone() 63 | p = np.random.random() 64 | if p < 0.7: 65 | # mask 66 | loss_mask = np.concatenate([ 67 | do_mask( 68 | seqs[i], 69 | np.concatenate((arange[:xlen[i,0]],arange[xlen[i,0]+1:xlen[i].sum()+1])), 70 | dataset.pad_id, 71 | dataset.mask_id 72 | ) 73 | for i in range(len(seqs))], axis=0) 74 | elif p < 0.85: 75 | # do nothing 76 | loss_mask = np.concatenate([ 77 | do_nothing( 78 | seqs[i], 79 | np.concatenate((arange[:xlen[i,0]],arange[xlen[i,0]+1:xlen[i].sum()+1])), 80 | dataset.pad_id 81 | ) 82 | for i in range(len(seqs))], axis=0) 83 | else: 84 | # replace 85 | loss_mask = np.concatenate([ 86 | do_replace( 87 | seqs[i], 88 | np.concatenate((arange[:xlen[i,0]],arange[xlen[i,0]+1:xlen[i].sum()+1])), 89 | dataset.pad_id, 90 | dataset.word_ids 91 | ) 92 | for i in range(len(seqs))], axis=0) 93 | loss_mask = torch.from_numpy(loss_mask).unsqueeze(2) 94 | return seqs, segs, seqs_, loss_mask, xlen, nsp_labels 95 | 96 | def train(): 97 | MODEL_DIM = 256 98 | N_LAYER = 4 99 | LEARNING_RATE = 1e-4 100 | dataset = utils.MRPCData("./MRPC",2000) 101 | print("num word: ",dataset.num_word) 102 | model = BERT( 103 | model_dim=MODEL_DIM, max_len=dataset.max_len, num_layer=N_LAYER, num_head=4, n_vocab=dataset.num_word, 104 | lr=LEARNING_RATE, max_seg=dataset.num_seg, drop_rate=0.2, padding_idx=dataset.pad_id 105 | ) 106 | if torch.cuda.is_available(): 107 | print("GPU train avaliable") 108 | device =torch.device("cuda") 109 | model = model.cuda() 110 | else: 111 | device = torch.device("cpu") 112 | model = model.cpu() 113 | 114 | loader = DataLoader(dataset,batch_size=32,shuffle=True) 115 | arange = np.arange(0,dataset.max_len) 116 | for epoch in range(500): 117 | for batch_idx, batch in enumerate(loader): 118 | seqs, segs, seqs_, loss_mask, xlen, nsp_labels = random_mask_or_replace(batch,arange,dataset) 119 | seqs, segs, seqs_, nsp_labels, loss_mask = seqs.type(torch.LongTensor).to(device), segs.type(torch.LongTensor).to(device),seqs_.type(torch.LongTensor).to(device),nsp_labels.to(device),loss_mask.to(device) 120 | loss, pred = model.step(seqs, segs, seqs_, loss_mask, nsp_labels) 121 | if batch_idx % 100 == 0: 122 | pred = pred[0].cpu().data.numpy().argmax(axis=1) 123 | print( 124 | "\n\nEpoch: ",epoch, 125 | "|batch: ", batch_idx, 126 | "| loss: %.3f" % loss, 127 | "\n| tgt: ", " ".join([dataset.i2v[i] for i in seqs[0].cpu().data.numpy()[:xlen[0].sum()+1]]), 128 | "\n| prd: ", " ".join([dataset.i2v[i] for i in pred[:xlen[0].sum()+1]]), 129 | "\n| tgt word: ", [dataset.i2v[i] for i in (seqs_[0]*loss_mask[0].view(-1)).cpu().data.numpy() if i != dataset.v2i[""]], 130 | "\n| prd word: ", [dataset.i2v[i] for i in pred*(loss_mask[0].view(-1).cpu().data.numpy()) if i != dataset.v2i[""]], 131 | ) 132 | os.makedirs("./visual/models/bert",exist_ok=True) 133 | torch.save(model.state_dict(),"./visual/models/bert/model.pth") 134 | 
export_attention(model,device,dataset) 135 | 136 | def export_attention(model,device,data,name="bert"): 137 | model.load_state_dict(torch.load("./visual/models/bert/model.pth",map_location=device)) 138 | seqs, segs,xlen,nsp_labels = data[:32] 139 | seqs, segs,xlen,nsp_labels = torch.from_numpy(seqs),torch.from_numpy(segs),torch.from_numpy(xlen),torch.from_numpy(nsp_labels) 140 | seqs, segs,nsp_labels = seqs.type(torch.LongTensor).to(device), segs.type(torch.LongTensor).to(device),nsp_labels.to(device) 141 | model(seqs,segs,False) 142 | seqs = seqs.cpu().data.numpy() 143 | data = {"src": [[data.i2v[i] for i in seqs[j]] for j in range(len(seqs))], "attentions": model.attentions} 144 | path = "./visual/tmp/%s_attention_matrix.pkl" % name 145 | os.makedirs(os.path.dirname(path), exist_ok=True) 146 | with open(path, "wb") as f: 147 | pickle.dump(data, f) 148 | if __name__ == "__main__": 149 | train() -------------------------------------------------------------------------------- /pytorch/CBOW.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch 3 | from torch.nn.functional import cross_entropy,softmax 4 | from utils import Dataset,process_w2v_data 5 | from visual import show_w2v_word_embedding 6 | 7 | corpus = [ 8 | # numbers 9 | "5 2 4 8 6 2 3 6 4", 10 | "4 8 5 6 9 5 5 6", 11 | "1 1 5 2 3 3 8", 12 | "3 6 9 6 8 7 4 6 3", 13 | "8 9 9 6 1 4 3 4", 14 | "1 0 2 0 2 1 3 3 3 3 3", 15 | "9 3 3 0 1 4 7 8", 16 | "9 9 8 5 6 7 1 2 3 0 1 0", 17 | 18 | # alphabets, expecting that 9 is close to letters 19 | "a t g q e h 9 u f", 20 | "e q y u o i p s", 21 | "q o 9 p l k j o k k o p", 22 | "h g y i u t t a e q", 23 | "i k d q r e 9 e a d", 24 | "o p d g 9 s a f g a", 25 | "i u y g h k l a s w", 26 | "o l u y a o g f s", 27 | "o p i u y g d a s j d l", 28 | "u k i l o 9 l j s", 29 | "y g i s h k j l f r f", 30 | "i o h n 9 9 d 9 f a 9", 31 | ] 32 | 33 | class CBOW(nn.Module): 34 | def __init__(self,v_dim,emb_dim): 35 | super().__init__() 36 | self.v_dim = v_dim 37 | self.embeddings = nn.Embedding(v_dim,emb_dim) 38 | self.embeddings.weight.data.normal_(0,0.1) 39 | 40 | # self.opt = torch.optim.Adam(0.01) 41 | self.hidden_out = nn.Linear(emb_dim,v_dim) 42 | self.opt = torch.optim.SGD(self.parameters(),momentum=0.9,lr=0.01) 43 | 44 | def forward(self,x,training=None, mask=None): 45 | # x.shape = [n,skip_window*2] 46 | o = self.embeddings(x) # [n, skip_window*2, emb_dim] 47 | o = torch.mean(o,dim=1) # [n, emb_dim] 48 | return o 49 | 50 | def loss(self, x, y, training=None): 51 | embedded = self(x,training) 52 | pred= self.hidden_out(embedded) 53 | return cross_entropy(pred,y) 54 | 55 | def step(self,x,y): 56 | self.opt.zero_grad() 57 | loss = self.loss(x,y,True) 58 | loss.backward() 59 | self.opt.step() 60 | return loss.detach().numpy() 61 | 62 | def train(model,data): 63 | if torch.cuda.is_available(): 64 | print("GPU train avaliable") 65 | device =torch.device("cuda") 66 | model = model.cuda() 67 | else: 68 | device = torch.device("cpu") 69 | model = model.cpu() 70 | for t in range(2500): 71 | bx,by = data.sample(16) 72 | bx,by = torch.from_numpy(bx).to(device), torch.from_numpy(by).to(device) 73 | loss = model.step(bx,by) 74 | if t%200 == 0: 75 | print(f"step: {t} | loss: {loss}") 76 | 77 | if __name__ == "__main__": 78 | d = process_w2v_data(corpus,skip_window=2, method="cbow") 79 | m = CBOW(d.num_word, 2) 80 | train(m,d) 81 | 82 | show_w2v_word_embedding(m,d,"./visual/results/cbow.png") 
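Below is a minimal usage sketch (not part of the repository) showing how the embedding table of the PyTorch CBOW above can be queried for nearest neighbours after training. It assumes the dataset returned by `process_w2v_data` exposes the `v2i`/`i2v` lookups used elsewhere in this folder; the helper name `nearest_words` is made up for illustration.

```python
import torch

def nearest_words(model, data, word, k=3):
    # cosine similarity between one word vector and the whole embedding table
    emb = model.embeddings.weight.data                # [n_vocab, emb_dim]
    query = emb[data.v2i[word]].unsqueeze(0)          # [1, emb_dim]
    sim = torch.cosine_similarity(query, emb, dim=1)  # [n_vocab]
    top = sim.argsort(descending=True)[1:k + 1]       # skip the word itself
    return [data.i2v[i.item()] for i in top]

# e.g. nearest_words(m, d, "9") is expected to return mostly letters on this toy corpus
```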
-------------------------------------------------------------------------------- /pytorch/ELMo.py: -------------------------------------------------------------------------------- 1 | from torch import nn,optim 2 | import torch 3 | from torch.nn.functional import cross_entropy,softmax 4 | import utils 5 | from torch.utils.data import DataLoader 6 | import os 7 | 8 | 9 | class ELMo(nn.Module): 10 | 11 | def __init__(self, v_dim, emb_dim, units, n_layers, lr): 12 | super().__init__() 13 | self.n_layers = n_layers 14 | self.units = units 15 | self.v_dim = v_dim 16 | 17 | # encoder 18 | self.word_embed = nn.Embedding(num_embeddings= v_dim, embedding_dim= emb_dim,padding_idx=0) 19 | self.word_embed.weight.data.normal_(0,0.1) 20 | 21 | # forward LSTM 22 | self.fs = nn.ModuleList( 23 | [nn.LSTM(input_size = emb_dim, hidden_size = units, batch_first=True) if i==0 else nn.LSTM(input_size = units, hidden_size = units, batch_first=True) for i in range(n_layers)]) 24 | self.f_logits = nn.Linear(in_features=units, out_features=v_dim) 25 | 26 | # backward LSTM 27 | self.bs = nn.ModuleList( 28 | [nn.LSTM(input_size = emb_dim, hidden_size = units, batch_first=True) if i==0 else nn.LSTM(input_size = units, hidden_size = units, batch_first=True) for i in range(n_layers)]) 29 | self.b_logits = nn.Linear(in_features=units, out_features=v_dim) 30 | 31 | self.opt = optim.Adam(self.parameters(),lr = lr) 32 | 33 | def forward(self,seqs): 34 | device = next(self.parameters()).device 35 | embedded = self.word_embed(seqs) # [n, step, emb_dim] 36 | fxs = [embedded[:, :-1, :]] # [n, step-1, emb_dim] 37 | bxs = [embedded[:, 1:, :]] # [n, step-1, emb_dim] 38 | (h_f,c_f) = (torch.zeros(1,seqs.shape[0],self.units).to(device),torch.zeros(1,seqs.shape[0],self.units).to(device)) 39 | (h_b,c_b) = (torch.zeros(1,seqs.shape[0],self.units).to(device),torch.zeros(1,seqs.shape[0],self.units).to(device)) 40 | for fl,bl in zip(self.fs,self.bs): 41 | output_f,(h_f,c_f) = fl(fxs[-1], (h_f,c_f)) # [n, step-1, units], [1, n, units] 42 | fxs.append(output_f) 43 | 44 | output_b,(h_b,c_b) = bl(torch.flip(bxs[-1],dims=[1,]), (h_b,c_b)) # [n, step-1, units], [1, n, units] 45 | bxs.append(torch.flip(output_b,dims=(1,))) 46 | return fxs,bxs 47 | 48 | def step(self,seqs): 49 | self.opt.zero_grad() 50 | fo,bo = self(seqs) 51 | fo = self.f_logits(fo[-1]) # [n, step-1, v_dim] 52 | bo = self.b_logits(bo[-1]) # [n, step-1, v_dim] 53 | loss = ( 54 | cross_entropy(fo.reshape(-1,self.v_dim),seqs[:,1:].reshape(-1)) + 55 | cross_entropy(bo.reshape(-1,self.v_dim),seqs[:,:-1].reshape(-1)))/2 56 | loss.backward() 57 | self.opt.step() 58 | return loss.cpu().detach().numpy(), (fo,bo) 59 | 60 | def get_emb(self,seqs): 61 | fxs,bxs = self(seqs) 62 | xs = [ 63 | torch.cat((fxs[0][:,1:,:],bxs[0][:,:-1,:]),dim=2).cpu().data.numpy() 64 | ] + [ 65 | torch.cat((f[:,1:,:],b[:,:-1,:]),dim=2).cpu().data.numpy() for f,b in zip(fxs[1:],bxs[1:]) 66 | ] 67 | for x in xs: 68 | print("layers shape=",x.shape) 69 | return xs 70 | 71 | 72 | 73 | def train(): 74 | dataset = utils.MRPCSingle("./MRPC",rows=2000) 75 | UNITS = 256 76 | N_LAYERS = 2 77 | BATCH_SIZE = 16 78 | LEARNING_RATE = 2e-3 79 | print('num word: ',dataset.num_word) 80 | model = ELMo(v_dim = dataset.num_word,emb_dim = UNITS, units=UNITS, n_layers=N_LAYERS,lr=LEARNING_RATE) 81 | if torch.cuda.is_available(): 82 | print("GPU train avaliable") 83 | device =torch.device("cuda") 84 | model = model.cuda() 85 | else: 86 | device = torch.device("cpu") 87 | model = model.cpu() 88 | loader = 
DataLoader(dataset,batch_size=BATCH_SIZE,shuffle=True) 89 | for i in range(10): 90 | for batch_idx , batch in enumerate(loader): 91 | batch = batch.type(torch.LongTensor).to(device) 92 | loss, (fo,bo) = model.step(batch) 93 | if batch_idx % 20 ==0: 94 | fp = fo[0].cpu().data.numpy().argmax(axis=1) 95 | bp = bo[0].cpu().data.numpy().argmax(axis=1) 96 | print("\n\nEpoch: ", i, 97 | "| batch: ", batch_idx, 98 | "| loss: %.3f" % loss, 99 | "\n| tgt: ", " ".join([dataset.i2v[i] for i in batch[0].cpu().data.numpy() if i != dataset.pad_id]), 100 | "\n| f_prd: ", " ".join([dataset.i2v[i] for i in fp if i != dataset.pad_id]), 101 | "\n| b_prd: ", " ".join([dataset.i2v[i] for i in bp if i != dataset.pad_id]), 102 | ) 103 | os.makedirs("./visual/models/elmo",exist_ok=True) 104 | torch.save(model.state_dict(),"./visual/models/elmo/model.pth") 105 | export_w2v(model,batch[:4],device) 106 | 107 | def export_w2v(model,data,device): 108 | model.load_state_dict(torch.load("./visual/models/elmo/model.pth",map_location=device)) 109 | emb = model.get_emb(data) 110 | print(emb) 111 | if __name__ == "__main__": 112 | train() 113 | -------------------------------------------------------------------------------- /pytorch/GPT.py: -------------------------------------------------------------------------------- 1 | from transformer import Encoder 2 | from torch import nn,optim 3 | from torch.nn.functional import cross_entropy,softmax, relu 4 | from torch.utils.data import DataLoader 5 | from torch.utils.data.dataloader import default_collate 6 | 7 | import torch 8 | import utils 9 | import os 10 | import pickle 11 | 12 | class GPT(nn.Module): 13 | 14 | def __init__(self, model_dim, max_len, num_layer, num_head, n_vocab, lr, max_seg=3, drop_rate=0.2,padding_idx=0): 15 | super().__init__() 16 | self.padding_idx = padding_idx 17 | self.n_vocab = n_vocab 18 | self.max_len = max_len 19 | 20 | self.word_emb = nn.Embedding(n_vocab,model_dim) 21 | self.word_emb.weight.data.normal_(0,0.1) 22 | 23 | self.segment_emb = nn.Embedding(num_embeddings= max_seg, embedding_dim=model_dim) 24 | self.segment_emb.weight.data.normal_(0,0.1) 25 | self.position_emb = torch.empty(1,max_len,model_dim) 26 | nn.init.kaiming_normal_(self.position_emb,mode='fan_out', nonlinearity='relu') 27 | self.position_emb = nn.Parameter(self.position_emb) 28 | 29 | 30 | self.encoder = Encoder(n_head=num_head, emb_dim=model_dim, drop_rate=drop_rate, n_layer=num_layer) 31 | self.task_mlm = nn.Linear(in_features=model_dim, out_features=n_vocab) 32 | self.task_nsp = nn.Linear(in_features=model_dim*self.max_len, out_features=2) 33 | 34 | self.opt = optim.Adam(self.parameters(),lr) 35 | 36 | def forward(self,seqs, segs, training=False): 37 | embed = self.input_emb(seqs, segs) 38 | z = self.encoder(embed, training, mask = self.mask(seqs)) # [n, step, model_dim] 39 | mlm_logits = self.task_mlm(z) # [n, step, n_vocab] 40 | nsp_logits = self.task_nsp(z.reshape(z.shape[0],-1)) # [n, n_cls] 41 | return mlm_logits, nsp_logits 42 | 43 | def step(self, seqs, segs, seqs_, nsp_labels): 44 | self.opt.zero_grad() 45 | mlm_logits, nsp_logits = self(seqs, segs, training=True) 46 | pred_loss = cross_entropy(mlm_logits.reshape(-1,self.n_vocab),seqs_.reshape(-1)) 47 | nsp_loss = cross_entropy(nsp_logits,nsp_labels.reshape(-1)) 48 | loss = pred_loss + 0.2 * nsp_loss 49 | loss.backward() 50 | self.opt.step() 51 | return loss.cpu().data.numpy(), mlm_logits 52 | 53 | def input_emb(self,seqs, segs): 54 | # device = next(self.parameters()).device 55 | # self.position_emb = 
self.position_emb.to(device) 56 | return self.word_emb(seqs) + self.segment_emb(segs) + self.position_emb 57 | 58 | def mask(self, seqs): 59 | device = next(self.parameters()).device 60 | batch_size, seq_len = seqs.shape 61 | mask = torch.triu(torch.ones((seq_len,seq_len), dtype=torch.long), diagonal=1).to(device) # [seq_len ,seq_len] 62 | pad = torch.eq(seqs,self.padding_idx) # [n, seq_len] 63 | mask = torch.where(pad[:,None,None,:],1,mask[None,None,:,:]).to(device) # [n, 1, seq_len, seq_len] 64 | return mask>0 # [n, 1, seq_len, seq_len] 65 | 66 | @property 67 | def attentions(self): 68 | attentions = { 69 | "encoder": [l.mh.attention.cpu().data.numpy() for l in self.encoder.encoder_layers] 70 | } 71 | return attentions 72 | 73 | def train(): 74 | MODEL_DIM = 256 75 | N_LAYER = 4 76 | LEARNING_RATE = 1e-4 77 | dataset = utils.MRPCData("./MRPC",2000) 78 | print("num word: ",dataset.num_word) 79 | model = GPT( 80 | model_dim=MODEL_DIM, max_len=dataset.max_len-1, num_layer=N_LAYER, num_head=4, n_vocab=dataset.num_word, 81 | lr=LEARNING_RATE, max_seg=dataset.num_seg, drop_rate=0.2, padding_idx=dataset.pad_id 82 | ) 83 | if torch.cuda.is_available(): 84 | print("GPU train avaliable") 85 | device =torch.device("cuda") 86 | model = model.cuda() 87 | else: 88 | device = torch.device("cpu") 89 | model = model.cpu() 90 | 91 | loader = DataLoader(dataset,batch_size=32,shuffle=True) 92 | 93 | for epoch in range(100): 94 | for batch_idx, batch in enumerate(loader): 95 | seqs, segs,xlen,nsp_labels = batch 96 | seqs, segs,nsp_labels = seqs.type(torch.LongTensor).to(device), segs.type(torch.LongTensor).to(device),nsp_labels.to(device) 97 | # pred: [n, step, n_vocab] 98 | loss,pred = model.step(seqs=seqs[:,:-1], segs= segs[:,:-1], seqs_=seqs[:,1:], nsp_labels=nsp_labels) 99 | if batch_idx %100 == 0: 100 | pred = pred[0].cpu().data.numpy().argmax(axis = 1) # [step] 101 | print( 102 | "Epoch: ",epoch, 103 | "|batch: ", batch_idx, 104 | "| loss: %.3f" % loss, 105 | "\n| tgt: ", " ".join([dataset.i2v[i] for i in seqs[0, 1:].cpu().data.numpy()[:xlen[0].sum()+1]]), 106 | "\n| prd: ", " ".join([dataset.i2v[i] for i in pred[:xlen[0].sum()+1]]), 107 | ) 108 | os.makedirs("./visual/models/gpt",exist_ok=True) 109 | torch.save(model.state_dict(),"./visual/models/gpt/model.pth") 110 | export_attention(model,device,dataset) 111 | 112 | def export_attention(model,device,data,name="gpt"): 113 | model.load_state_dict(torch.load("./visual/models/gpt/model.pth",map_location=device)) 114 | seqs, segs,xlen,nsp_labels = data[:32] 115 | seqs, segs,xlen,nsp_labels = torch.from_numpy(seqs),torch.from_numpy(segs),torch.from_numpy(xlen),torch.from_numpy(nsp_labels) 116 | seqs, segs,nsp_labels = seqs.type(torch.LongTensor).to(device), segs.type(torch.LongTensor).to(device),nsp_labels.to(device) 117 | model(seqs[:,:-1],segs[:,:-1],False) 118 | seqs = seqs.cpu().data.numpy() 119 | data = {"src": [[data.i2v[i] for i in seqs[j]] for j in range(len(seqs))], "attentions": model.attentions} 120 | path = "./visual/tmp/%s_attention_matrix.pkl" % name 121 | os.makedirs(os.path.dirname(path), exist_ok=True) 122 | with open(path, "wb") as f: 123 | pickle.dump(data, f) 124 | if __name__ == "__main__": 125 | train() 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /pytorch/README.md: -------------------------------------------------------------------------------- 1 | ### Dependencies: 2 | |Library| Version | 3 | |----- |-----| 4 | |PyTorch|1.7.1| 5 | |NumPy|1.18.1| 6 | 
|Matplotlib|3.3.4| -------------------------------------------------------------------------------- /pytorch/__pycache__/transformer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MorvanZhou/NLP-Tutorials/3aa02a13151b696e8c034da5f7bd2ef4294a5e5f/pytorch/__pycache__/transformer.cpython-37.pyc -------------------------------------------------------------------------------- /pytorch/__pycache__/transformer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MorvanZhou/NLP-Tutorials/3aa02a13151b696e8c034da5f7bd2ef4294a5e5f/pytorch/__pycache__/transformer.cpython-38.pyc -------------------------------------------------------------------------------- /pytorch/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MorvanZhou/NLP-Tutorials/3aa02a13151b696e8c034da5f7bd2ef4294a5e5f/pytorch/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /pytorch/__pycache__/utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MorvanZhou/NLP-Tutorials/3aa02a13151b696e8c034da5f7bd2ef4294a5e5f/pytorch/__pycache__/utils.cpython-38.pyc -------------------------------------------------------------------------------- /pytorch/cnn_lm.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch 3 | import numpy as np 4 | import utils 5 | from torch.utils.data import DataLoader 6 | from torch.nn.functional import cross_entropy,softmax, relu 7 | 8 | 9 | 10 | class CNNTranslation(nn.Module): 11 | 12 | def __init__(self,enc_v_dim, dec_v_dim, emb_dim, units, max_pred_len, start_token, end_token): 13 | super().__init__() 14 | self.units = units 15 | self.dec_v_dim = dec_v_dim 16 | 17 | 18 | # encoder 19 | self.enc_embeddings = nn.Embedding(enc_v_dim,emb_dim) 20 | self.enc_embeddings.weight.data.normal_(0,0.1) 21 | self.conv2ds = [nn.Conv2d(1,16,(n,emb_dim),padding=0) for n in range(2,5)] 22 | self.max_pools = [nn.MaxPool2d((n,1)) for n in [7,6,5]] 23 | self.encoder = nn.Linear(16*3,units) 24 | 25 | # decoder 26 | self.dec_embeddings = nn.Embedding(dec_v_dim,emb_dim) 27 | self.dec_embeddings.weight.data.normal_(0,0.1) 28 | self.decoder_cell = nn.LSTMCell(emb_dim,units) 29 | self.decoder_dense = nn.Linear(units,dec_v_dim) 30 | 31 | self.opt = torch.optim.Adam(self.parameters(),lr=0.001) 32 | self.max_pred_len = max_pred_len 33 | self.start_token = start_token 34 | self.end_token = end_token 35 | 36 | def encode(self,x): 37 | embedded = self.enc_embeddings(x) # [n, step, emb] 38 | o = torch.unsqueeze(embedded,1) # [n, 1, step=8, emb=16] 39 | co = [relu(conv2d(o)) for conv2d in self.conv2ds] # [n, 16, 7, 1], [n, 16, 6, 1], [n, 16, 5, 1] 40 | co = [self.max_pools[i](co[i]) for i in range(len(co))] # [n, 16, 1, 1] * 3 41 | co = [torch.squeeze(torch.squeeze(c,dim=3),dim=2) for c in co] # [n, 16] * 3 42 | o = torch.cat(co,dim=1) # [n, 16*3] 43 | h = self.encoder(o) # [n, units] 44 | return [h,h] 45 | 46 | def inference(self,x): 47 | self.eval() 48 | hx,cx = self.encode(x) 49 | start = torch.ones(x.shape[0],1) 50 | start[:,0] = torch.tensor(self.start_token) 51 | start= start.type(torch.LongTensor) 52 | dec_emb_in = self.dec_embeddings(start) # [n, step, emb] 53 | dec_emb_in 
= dec_emb_in.permute(1,0,2) # [step, n, emb] 54 | dec_in = dec_emb_in[0] # The first word use for decoding 55 | output = [] 56 | for i in range(self.max_pred_len): 57 | hx, cx = self.decoder_cell(dec_in, (hx, cx)) 58 | o = self.decoder_dense(hx) 59 | o = o.argmax(dim=1).view(-1,1) 60 | dec_in=self.dec_embeddings(o).permute(1,0,2)[0] 61 | output.append(o) 62 | output = torch.stack(output,dim=0) # [self.max_pred_len, n, 1] 63 | self.train() 64 | 65 | return output.permute(1,0,2).view(-1,self.max_pred_len) # [n, self.max_pred_len] 66 | 67 | def train_logit(self,x,y): 68 | hx,cx = self.encode(x) #[n, units] 69 | dec_in = y[:,:-1] 70 | dec_emb_in = self.dec_embeddings(dec_in) 71 | dec_emb_in = dec_emb_in.permute(1,0,2) 72 | output = [] 73 | for i in range(dec_emb_in.shape[0]): 74 | hx, cx = self.decoder_cell(dec_emb_in[i], (hx, cx)) 75 | o = self.decoder_dense(hx) 76 | output.append(o) 77 | output = torch.stack(output,dim=0) 78 | return output.permute(1,0,2) 79 | 80 | def step(self,x,y): 81 | self.opt.zero_grad() 82 | batch_size = x.shape[0] 83 | logit = self.train_logit(x,y) 84 | dec_out = y[:,1:] 85 | loss = cross_entropy(logit.reshape(-1,self.dec_v_dim),dec_out.reshape(-1)) 86 | loss.backward() 87 | self.opt.step() 88 | return loss.detach().numpy() 89 | 90 | 91 | def train(): 92 | dataset = utils.DateData(4000) 93 | print("Chinese time order: yy/mm/dd ",dataset.date_cn[:3],"\nEnglish time order: dd/M/yyyy", dataset.date_en[:3]) 94 | print("Vocabularies: ", dataset.vocab) 95 | print(f"x index sample: \n{dataset.idx2str(dataset.x[0])}\n{dataset.x[0]}", 96 | f"\ny index sample: \n{dataset.idx2str(dataset.y[0])}\n{dataset.y[0]}") 97 | loader = DataLoader(dataset,batch_size=32,shuffle=True) 98 | model = CNNTranslation(dataset.num_word,dataset.num_word,emb_dim=16,units=32,max_pred_len=11,start_token=dataset.start_token,end_token=dataset.end_token) 99 | 100 | for i in range(100): 101 | for batch_idx , batch in enumerate(loader): 102 | bx, by, decoder_len = batch 103 | loss = model.step(bx,by) 104 | if batch_idx % 70 == 0: 105 | target = dataset.idx2str(by[0, 1:-1].data.numpy()) 106 | pred = model.inference(bx[0:1]) 107 | res = dataset.idx2str(pred[0].data.numpy()) 108 | src = dataset.idx2str(bx[0].data.numpy()) 109 | print( 110 | "Epoch: ",i, 111 | "| t: ", batch_idx, 112 | "| loss: %.3f" % loss, 113 | "| input: ", src, 114 | "| target: ", target, 115 | "| inference: ", res, 116 | ) 117 | 118 | 119 | if __name__ == "__main__": 120 | train() -------------------------------------------------------------------------------- /pytorch/seq2seq.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch 3 | import numpy as np 4 | import utils 5 | from torch.utils.data import DataLoader 6 | from torch.nn.functional import cross_entropy,softmax 7 | 8 | 9 | class Seq2Seq(nn.Module): 10 | def __init__(self,enc_v_dim, dec_v_dim, emb_dim, units, max_pred_len, start_token, end_token): 11 | super().__init__() 12 | self.units = units 13 | self.dec_v_dim = dec_v_dim 14 | 15 | # encoder 16 | self.enc_embeddings = nn.Embedding(enc_v_dim,emb_dim) 17 | self.enc_embeddings.weight.data.normal_(0,0.1) 18 | self.encoder = nn.LSTM(emb_dim,units,1,batch_first=True) 19 | 20 | 21 | # decoder 22 | self.dec_embeddings = nn.Embedding(dec_v_dim,emb_dim) 23 | self.dec_embeddings.weight.data.normal_(0,0.1) 24 | self.decoder_cell = nn.LSTMCell(emb_dim,units) 25 | self.decoder_dense = nn.Linear(units,dec_v_dim) 26 | 27 | self.opt = 
torch.optim.Adam(self.parameters(),lr=0.001) 28 | self.max_pred_len = max_pred_len 29 | self.start_token = start_token 30 | self.end_token = end_token 31 | 32 | 33 | def encode(self,x): 34 | embedded = self.enc_embeddings(x) # [n, step, emb] 35 | hidden = (torch.zeros(1,x.shape[0],self.units),torch.zeros(1,x.shape[0],self.units)) 36 | o,(h,c) = self.encoder(embedded,hidden) 37 | return h,c 38 | 39 | def inference(self,x): 40 | self.eval() 41 | hx,cx = self.encode(x) 42 | hx,cx = hx[0],cx[0] 43 | start = torch.ones(x.shape[0],1) 44 | start[:,0] = torch.tensor(self.start_token) 45 | start= start.type(torch.LongTensor) 46 | dec_emb_in = self.dec_embeddings(start) 47 | dec_emb_in = dec_emb_in.permute(1,0,2) 48 | dec_in = dec_emb_in[0] 49 | output = [] 50 | for i in range(self.max_pred_len): 51 | hx, cx = self.decoder_cell(dec_in, (hx, cx)) 52 | o = self.decoder_dense(hx) 53 | o = o.argmax(dim=1).view(-1,1) 54 | dec_in=self.dec_embeddings(o).permute(1,0,2)[0] 55 | output.append(o) 56 | output = torch.stack(output,dim=0) 57 | self.train() 58 | 59 | return output.permute(1,0,2).view(-1,self.max_pred_len) 60 | 61 | 62 | def train_logit(self,x,y): 63 | hx,cx = self.encode(x) 64 | hx,cx = hx[0],cx[0] 65 | dec_in = y[:,:-1] 66 | dec_emb_in = self.dec_embeddings(dec_in) 67 | dec_emb_in = dec_emb_in.permute(1,0,2) 68 | output = [] 69 | for i in range(dec_emb_in.shape[0]): 70 | hx, cx = self.decoder_cell(dec_emb_in[i], (hx, cx)) 71 | o = self.decoder_dense(hx) 72 | output.append(o) 73 | output = torch.stack(output,dim=0) 74 | return output.permute(1,0,2) 75 | 76 | def step(self,x,y): 77 | self.opt.zero_grad() 78 | batch_size = x.shape[0] 79 | logit = self.train_logit(x,y) 80 | dec_out = y[:,1:] 81 | loss = cross_entropy(logit.reshape(-1,self.dec_v_dim),dec_out.reshape(-1)) 82 | loss.backward() 83 | self.opt.step() 84 | return loss.detach().numpy() 85 | 86 | def train(): 87 | dataset = utils.DateData(4000) 88 | print("Chinese time order: yy/mm/dd ",dataset.date_cn[:3],"\nEnglish time order: dd/M/yyyy", dataset.date_en[:3]) 89 | print("Vocabularies: ", dataset.vocab) 90 | print(f"x index sample: \n{dataset.idx2str(dataset.x[0])}\n{dataset.x[0]}", 91 | f"\ny index sample: \n{dataset.idx2str(dataset.y[0])}\n{dataset.y[0]}") 92 | loader = DataLoader(dataset,batch_size=32,shuffle=True) 93 | model = Seq2Seq(dataset.num_word,dataset.num_word,emb_dim=16,units=32,max_pred_len=11,start_token=dataset.start_token,end_token=dataset.end_token) 94 | for i in range(100): 95 | for batch_idx , batch in enumerate(loader): 96 | bx, by, decoder_len = batch 97 | bx = bx.type(torch.LongTensor) 98 | by = by.type(torch.LongTensor) 99 | loss = model.step(bx,by) 100 | if batch_idx % 70 == 0: 101 | target = dataset.idx2str(by[0, 1:-1].data.numpy()) 102 | pred = model.inference(bx[0:1]) 103 | res = dataset.idx2str(pred[0].data.numpy()) 104 | src = dataset.idx2str(bx[0].data.numpy()) 105 | print( 106 | "Epoch: ",i, 107 | "| t: ", batch_idx, 108 | "| loss: %.3f" % loss, 109 | "| input: ", src, 110 | "| target: ", target, 111 | "| inference: ", res, 112 | ) 113 | 114 | 115 | if __name__ == "__main__": 116 | train() 117 | 118 | 119 | 120 | -------------------------------------------------------------------------------- /pytorch/seq2seq_attention.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch 3 | import numpy as np 4 | import utils 5 | from torch.utils.data import DataLoader 6 | from torch.nn.functional import cross_entropy,softmax 7 | 8 | class 
Seq2Seq(nn.Module): 9 | def __init__(self,enc_v_dim, dec_v_dim, emb_dim, units, max_pred_len, start_token, end_token): 10 | super().__init__() 11 | self.units = units 12 | self.dec_v_dim = dec_v_dim 13 | 14 | # encoder 15 | self.enc_embeddings = nn.Embedding(enc_v_dim,emb_dim) 16 | self.enc_embeddings.weight.data.normal_(0,0.1) 17 | self.encoder = nn.LSTM(emb_dim,units,1,batch_first=True) 18 | 19 | # decoder 20 | self.dec_embeddings = nn.Embedding(dec_v_dim,emb_dim) 21 | self.attn = nn.Linear(units,units) 22 | self.decoder_cell = nn.LSTMCell(emb_dim,units) 23 | self.decoder_dense = nn.Linear(units*2,dec_v_dim) 24 | 25 | self.opt = torch.optim.Adam(self.parameters(),lr=0.001) 26 | self.max_pred_len = max_pred_len 27 | self.start_token = start_token 28 | self.end_token = end_token 29 | 30 | def encode(self,x): 31 | embedded = self.enc_embeddings(x) # [n, step, emb] 32 | hidden = (torch.zeros(1,x.shape[0],self.units),torch.zeros(1,x.shape[0],self.units)) 33 | o,(h,c) = self.encoder(embedded,hidden) # [n, step, units], [num_layers * num_directions, n, units] 34 | return o,h,c 35 | 36 | def inference(self,x,return_align=False): 37 | self.eval() 38 | o,hx,cx = self.encode(x) # [n, step, units], [num_layers * num_directions, n, units] * 2 39 | hx,cx = hx[0],cx[0] # [n, units] 40 | start = torch.ones(x.shape[0],1) # [n, 1] 41 | start[:,0] = torch.tensor(self.start_token) 42 | start= start.type(torch.LongTensor) 43 | dec_emb_in = self.dec_embeddings(start) # [n, 1, emb_dim] 44 | dec_emb_in = dec_emb_in.permute(1,0,2) # [1, n, emb_dim] 45 | dec_in = dec_emb_in[0] # [n, emb_dim] 46 | output = [] 47 | for i in range(self.max_pred_len): 48 | attn_prod = torch.matmul(self.attn(hx.unsqueeze(1)),o.permute(0,2,1)) # [n, 1, step] 49 | att_weight = softmax(attn_prod, dim=2) # [n, 1, step] 50 | context = torch.matmul(att_weight,o) # [n, 1, units] 51 | # attn_prod = torch.matmul(self.attn(o),hx.unsqueeze(2)) # [n, step, 1] 52 | # attn_weight = softmax(attn_prod,dim=1) # [n, step, 1] 53 | # context = torch.matmul(o.permute(0,2,1),attn_weight) # [n, units, 1] 54 | hx, cx = self.decoder_cell(dec_in, (hx, cx)) 55 | hc = torch.cat([context.squeeze(1),hx],dim=1) # [n, units *2] 56 | # hc = torch.cat([context.squeeze(2),hx],dim=1) # [n, units *2] 57 | result = self.decoder_dense(hc) 58 | result = result.argmax(dim=1).view(-1,1) 59 | dec_in=self.dec_embeddings(result).permute(1,0,2)[0] 60 | output.append(result) 61 | output = torch.stack(output,dim=0) 62 | self.train() 63 | 64 | return output.permute(1,0,2).view(-1,self.max_pred_len) 65 | 66 | def train_logit(self,x,y): 67 | o,hx,cx = self.encode(x) # [n, step, units], [num_layers * num_directions, n, units] * 2 68 | hx,cx = hx[0],cx[0] # [n, units] 69 | dec_in = y[:,:-1] # [n, step] 70 | dec_emb_in = self.dec_embeddings(dec_in) # [n, step, emb_dim] 71 | dec_emb_in = dec_emb_in.permute(1,0,2) # [step, n, emb_dim] 72 | output = [] 73 | for i in range(dec_emb_in.shape[0]): 74 | # General Attention: 75 | # score(ht,hs) = (ht^T)(Wa)hs 76 | # hs is the output from encoder 77 | # ht is the previous hidden state from decoder 78 | # self.attn(o): [n, step, units] 79 | attn_prod = torch.matmul(self.attn(hx.unsqueeze(1)),o.permute(0,2,1)) # [n, 1, step] 80 | att_weight = softmax(attn_prod, dim=2) # [n, 1, step] 81 | context = torch.matmul(att_weight,o) # [n, 1, units] 82 | # attn_prod = torch.matmul(self.attn(o),hx.unsqueeze(2)) # [n, step, 1] 83 | # attn_weight = softmax(attn_prod,dim=1) # [n, step, 1] 84 | # context = torch.matmul(o.permute(0,2,1),attn_weight) # [n, units, 1] 
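# Shape walk-through, using the defaults in train() below (units=32 and 8-character
# source dates, so the encoder output o is [n, 8, 32]):
#   self.attn(hx.unsqueeze(1))  -> [n, 1, 32]   i.e. the (Wa)ht term of the "general" score above
#   o.permute(0, 2, 1)          -> [n, 32, 8]
#   attn_prod                   -> [n, 1, 8]    one score per encoder step
# The commented-out variant computes the same scores with transposed shapes; either way
# softmax runs over the encoder steps, and the resulting context vector is concatenated
# with the new decoder state below before the projection to the vocabulary.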
85 | hx, cx = self.decoder_cell(dec_emb_in[i], (hx, cx)) # [n, units] 86 | hc = torch.cat([context.squeeze(1),hx],dim=1) # [n, units *2] 87 | # hc = torch.cat([context.squeeze(2),hx],dim=1) # [n, units *2] 88 | result = self.decoder_dense(hc) # [n, dec_v_dim] 89 | output.append(result) 90 | output = torch.stack(output,dim=0) # [step, n, dec_v_dim] 91 | return output.permute(1,0,2) # [n, step, dec_v_dim] 92 | 93 | def step(self,x,y): 94 | self.opt.zero_grad() 95 | batch_size = x.shape[0] 96 | logit = self.train_logit(x,y) 97 | dec_out = y[:,1:] 98 | loss = cross_entropy(logit.reshape(-1,self.dec_v_dim),dec_out.reshape(-1)) 99 | loss.backward() 100 | self.opt.step() 101 | return loss.detach().numpy() 102 | 103 | 104 | def train(): 105 | dataset = utils.DateData(4000) 106 | print("Chinese time order: yy/mm/dd ",dataset.date_cn[:3],"\nEnglish time order: dd/M/yyyy", dataset.date_en[:3]) 107 | print("Vocabularies: ", dataset.vocab) 108 | print(f"x index sample: \n{dataset.idx2str(dataset.x[0])}\n{dataset.x[0]}", 109 | f"\ny index sample: \n{dataset.idx2str(dataset.y[0])}\n{dataset.y[0]}") 110 | loader = DataLoader(dataset,batch_size=32,shuffle=True) 111 | model = Seq2Seq(dataset.num_word,dataset.num_word,emb_dim=16,units=32,max_pred_len=11,start_token=dataset.start_token,end_token=dataset.end_token) 112 | for i in range(100): 113 | for batch_idx , batch in enumerate(loader): 114 | bx, by, decoder_len = batch 115 | loss = model.step(bx,by) 116 | if batch_idx % 70 == 0: 117 | target = dataset.idx2str(by[0, 1:-1].data.numpy()) 118 | pred = model.inference(bx[0:1]) 119 | res = dataset.idx2str(pred[0].data.numpy()) 120 | src = dataset.idx2str(bx[0].data.numpy()) 121 | print( 122 | "Epoch: ",i, 123 | "| t: ", batch_idx, 124 | "| loss: %.3f" % loss, 125 | "| input: ", src, 126 | "| target: ", target, 127 | "| inference: ", res, 128 | ) 129 | # pkl_data = {"i2v": dataset.i2v, "x": dataset.x[:6], "y": dataset.y[:6], "align": model.inference(dataset.x[:6], return_align=True)} 130 | 131 | # with open("./visual/tmp/attention_align.pkl", "wb") as f: 132 | # pickle.dump(pkl_data, f) 133 | 134 | if __name__ == "__main__": 135 | train() -------------------------------------------------------------------------------- /pytorch/skip_gram.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch 3 | from torch.nn.functional import cross_entropy,softmax 4 | from utils import Dataset,process_w2v_data 5 | from visual import show_w2v_word_embedding 6 | 7 | corpus = [ 8 | # numbers 9 | "5 2 4 8 6 2 3 6 4", 10 | "4 8 5 6 9 5 5 6", 11 | "1 1 5 2 3 3 8", 12 | "3 6 9 6 8 7 4 6 3", 13 | "8 9 9 6 1 4 3 4", 14 | "1 0 2 0 2 1 3 3 3 3 3", 15 | "9 3 3 0 1 4 7 8", 16 | "9 9 8 5 6 7 1 2 3 0 1 0", 17 | 18 | # alphabets, expecting that 9 is close to letters 19 | "a t g q e h 9 u f", 20 | "e q y u o i p s", 21 | "q o 9 p l k j o k k o p", 22 | "h g y i u t t a e q", 23 | "i k d q r e 9 e a d", 24 | "o p d g 9 s a f g a", 25 | "i u y g h k l a s w", 26 | "o l u y a o g f s", 27 | "o p i u y g d a s j d l", 28 | "u k i l o 9 l j s", 29 | "y g i s h k j l f r f", 30 | "i o h n 9 9 d 9 f a 9", 31 | ] 32 | 33 | 34 | class SkipGram(nn.Module): 35 | 36 | def __init__(self,v_dim,emb_dim): 37 | super().__init__() 38 | self.v_dim = v_dim 39 | self.embeddings = nn.Embedding(v_dim,emb_dim) 40 | self.embeddings.weight.data.normal_(0,0.1) 41 | self.hidden_out = nn.Linear(emb_dim,v_dim) 42 | 43 | self.opt = torch.optim.Adam(self.parameters(),lr=0.01) 44 | 45 | def 
forward(self,x,training=None, mask=None): 46 | # x.shape = [n,] 47 | o = self.embeddings(x) # [n, emb_dim] 48 | return o 49 | 50 | def loss(self,x,y,training=None): 51 | embedded = self(x,training) 52 | pred= self.hidden_out(embedded) 53 | return cross_entropy(pred,y) 54 | 55 | def step(self,x,y): 56 | self.opt.zero_grad() 57 | loss = self.loss(x,y,True) 58 | loss.backward() 59 | self.opt.step() 60 | return loss.detach().numpy() 61 | 62 | def train(model,data): 63 | if torch.cuda.is_available(): 64 | print("GPU train avaliable") 65 | device =torch.device("cuda") 66 | model = model.cuda() 67 | else: 68 | device = torch.device("cpu") 69 | model = model.cpu() 70 | for t in range(2500): 71 | bx,by = data.sample(8) 72 | bx,by = torch.from_numpy(bx).to(device), torch.from_numpy(by).to(device) 73 | loss = model.step(bx,by) 74 | if t%200 == 0: 75 | print(f"step: {t} | loss: {loss}") 76 | 77 | 78 | if __name__ == "__main__": 79 | d = process_w2v_data(corpus,skip_window=2, method="skip_gram") 80 | m = SkipGram(d.num_word, 2) 81 | train(m,d) 82 | 83 | #plotting 84 | show_w2v_word_embedding(m,d,"./visual/results/skipgram.png") 85 | 86 | 87 | -------------------------------------------------------------------------------- /pytorch/transformer.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from torch.nn.functional import cross_entropy,softmax, relu 3 | import numpy as np 4 | import torch 5 | from torch.utils import data 6 | import utils 7 | from torch.utils.data import DataLoader 8 | import argparse 9 | 10 | MAX_LEN = 11 11 | 12 | class MultiHead(nn.Module): 13 | def __init__(self, n_head, model_dim, drop_rate): 14 | super().__init__() 15 | self.head_dim = model_dim // n_head 16 | self.n_head = n_head 17 | self.model_dim = model_dim 18 | self.wq = nn.Linear(model_dim, n_head * self.head_dim) 19 | self.wk = nn.Linear(model_dim, n_head * self.head_dim) 20 | self.wv = nn.Linear(model_dim, n_head * self.head_dim) 21 | 22 | self.o_dense = nn.Linear(model_dim, model_dim) 23 | self.o_drop = nn.Dropout(drop_rate) 24 | self.layer_norm = nn.LayerNorm(model_dim) 25 | self.attention = None 26 | 27 | def forward(self,q,k,v,mask,training): 28 | # residual connect 29 | residual = q 30 | dim_per_head= self.head_dim 31 | num_heads = self.n_head 32 | batch_size = q.size(0) 33 | 34 | # linear projection 35 | key = self.wk(k) # [n, step, num_heads * head_dim] 36 | value = self.wv(v) # [n, step, num_heads * head_dim] 37 | query = self.wq(q) # [n, step, num_heads * head_dim] 38 | 39 | # split by head 40 | query = self.split_heads(query) # [n, n_head, q_step, h_dim] 41 | key = self.split_heads(key) 42 | value = self.split_heads(value) # [n, h, step, h_dim] 43 | context = self.scaled_dot_product_attention(query,key, value, mask) # [n, q_step, h*dv] 44 | o = self.o_dense(context) # [n, step, dim] 45 | o = self.o_drop(o) 46 | 47 | o = self.layer_norm(residual+o) 48 | return o 49 | 50 | def split_heads(self, x): 51 | x = torch.reshape(x,(x.shape[0], x.shape[1], self.n_head, self.head_dim)) 52 | return x.permute(0,2,1,3) 53 | 54 | def scaled_dot_product_attention(self, q, k, v, mask=None): 55 | dk = torch.tensor(k.shape[-1]).type(torch.float) 56 | score = torch.matmul(q,k.permute(0,1,3,2)) / (torch.sqrt(dk) + 1e-8) # [n, n_head, step, step] 57 | if mask is not None: 58 | # change the value at masked position to negative infinity, 59 | # so the attention score at these positions after softmax will close to 0. 
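# (Reference: this is the scaled dot-product attention of "Attention Is All You Need",
#  softmax(Q K^T / sqrt(d_k)) V, with q/k/v already split per head as [n, n_head, step, head_dim]
#  and score as [n, n_head, step, step]; the small 1e-8 only guards the division.
#  Note that masked_fill_ is in-place, so `score` itself is overwritten before the softmax.)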
60 | score = score.masked_fill_(mask,-np.inf) 61 | self.attention = softmax(score,dim=-1) 62 | context = torch.matmul(self.attention,v) # [n, num_head, step, head_dim] 63 | context = context.permute(0,2,1,3) # [n, step, num_head, head_dim] 64 | context = context.reshape((context.shape[0], context.shape[1],-1)) 65 | return context # [n, step, model_dim] 66 | 67 | class PositionWiseFFN(nn.Module): 68 | def __init__(self,model_dim, dropout = 0.0): 69 | super().__init__() 70 | dff = model_dim*4 71 | self.l = nn.Linear(model_dim,dff) 72 | self.o = nn.Linear(dff,model_dim) 73 | self.dropout = nn.Dropout(dropout) 74 | self.layer_norm = nn.LayerNorm(model_dim) 75 | 76 | def forward(self,x): 77 | o = relu(self.l(x)) 78 | o = self.o(o) 79 | o = self.dropout(o) 80 | 81 | o = self.layer_norm(x + o) 82 | return o # [n, step, dim] 83 | 84 | 85 | 86 | class EncoderLayer(nn.Module): 87 | 88 | def __init__(self, n_head, emb_dim, drop_rate): 89 | super().__init__() 90 | self.mh = MultiHead(n_head, emb_dim, drop_rate) 91 | self.ffn = PositionWiseFFN(emb_dim,drop_rate) 92 | 93 | def forward(self, xz, training, mask): 94 | # xz: [n, step, emb_dim] 95 | context = self.mh(xz, xz, xz, mask, training) # [n, step, emb_dim] 96 | o = self.ffn(context) 97 | return o 98 | 99 | class Encoder(nn.Module): 100 | def __init__(self, n_head, emb_dim, drop_rate, n_layer): 101 | super().__init__() 102 | self.encoder_layers = nn.ModuleList( 103 | [EncoderLayer(n_head, emb_dim, drop_rate) for _ in range(n_layer)] 104 | ) 105 | def forward(self, xz, training, mask): 106 | 107 | for encoder in self.encoder_layers: 108 | xz = encoder(xz,training,mask) 109 | return xz # [n, step, emb_dim] 110 | 111 | class DecoderLayer(nn.Module): 112 | def __init__(self,n_head,model_dim,drop_rate): 113 | super().__init__() 114 | self.mh = nn.ModuleList([MultiHead(n_head, model_dim, drop_rate) for _ in range(2)]) 115 | self.ffn = PositionWiseFFN(model_dim,drop_rate) 116 | 117 | def forward(self,yz, xz, training, yz_look_ahead_mask,xz_pad_mask): 118 | dec_output = self.mh[0](yz, yz, yz, yz_look_ahead_mask, training) # [n, step, model_dim] 119 | 120 | dec_output = self.mh[1](dec_output, xz, xz, xz_pad_mask, training) # [n, step, model_dim] 121 | 122 | dec_output = self.ffn(dec_output) # [n, step, model_dim] 123 | 124 | return dec_output 125 | 126 | class Decoder(nn.Module): 127 | def __init__(self, n_head, model_dim, drop_rate, n_layer): 128 | super().__init__() 129 | 130 | self.num_layers = n_layer 131 | 132 | self.decoder_layers = nn.ModuleList( 133 | [DecoderLayer(n_head, model_dim, drop_rate) for _ in range(n_layer)] 134 | ) 135 | 136 | def forward(self, yz, xz, training, yz_look_ahead_mask, xz_pad_mask): 137 | for decoder in self.decoder_layers: 138 | yz = decoder(yz, xz, training, yz_look_ahead_mask, xz_pad_mask) 139 | return yz # [n, step, model_dim] 140 | 141 | class PositionEmbedding(nn.Module): 142 | def __init__(self, max_len, emb_dim, n_vocab): 143 | super().__init__() 144 | pos = np.expand_dims(np.arange(max_len),1) # [max_len, 1] 145 | pe = pos / np.power(1000, 2*np.expand_dims(np.arange(emb_dim)//2,0)/emb_dim) # [max_len, emb_dim] 146 | pe[:, 0::2] = np.sin(pe[:, 0::2]) 147 | pe[:, 1::2] = np.cos(pe[:, 1::2]) 148 | pe = np.expand_dims(pe,0) # [1, max_len, emb_dim] 149 | self.pe = torch.from_numpy(pe).type(torch.float32) 150 | self.embeddings = nn.Embedding(n_vocab,emb_dim) 151 | self.embeddings.weight.data.normal_(0,0.1) 152 | 153 | def forward(self, x): 154 | device = self.embeddings.weight.device 155 | self.pe = self.pe.to(device) 156 
| x_embed = self.embeddings(x) + self.pe # [n, step, emb_dim] 157 | return x_embed # [n, step, emb_dim] 158 | 159 | class Transformer(nn.Module): 160 | def __init__(self, n_vocab, max_len, n_layer = 6, emb_dim=512, n_head = 8, drop_rate=0.1, padding_idx=0): 161 | super().__init__() 162 | self.max_len = max_len 163 | self.padding_idx = torch.tensor(padding_idx) 164 | self.dec_v_emb = n_vocab 165 | 166 | self.embed = PositionEmbedding(max_len, emb_dim, n_vocab) 167 | self.encoder = Encoder(n_head, emb_dim, drop_rate, n_layer) 168 | self.decoder = Decoder(n_head, emb_dim, drop_rate, n_layer) 169 | self.o = nn.Linear(emb_dim,n_vocab) 170 | self.opt = torch.optim.Adam(self.parameters(),lr=0.002) 171 | 172 | def forward(self,x,y,training= None): 173 | x_embed, y_embed = self.embed(x), self.embed(y) # [n, step, emb_dim] * 2 174 | pad_mask = self._pad_mask(x) # [n, 1, step, step] 175 | encoded_z = self.encoder(x_embed,training,pad_mask) # [n, step, emb_dim] 176 | yz_look_ahead_mask = self._look_ahead_mask(y) # [n, 1, step, step] 177 | decoded_z = self.decoder(y_embed,encoded_z, training, yz_look_ahead_mask, pad_mask) # [n, step, emb_dim] 178 | o = self.o(decoded_z) # [n, step, n_vocab] 179 | return o 180 | 181 | def step(self, x, y): 182 | self.opt.zero_grad() 183 | logits = self(x,y[:, :-1],training=True) 184 | pad_mask = ~torch.eq(y[:,1:],self.padding_idx) # [n, seq_len] 185 | loss = cross_entropy(logits.reshape(-1, self.dec_v_emb),y[:,1:].reshape(-1)) 186 | loss.backward() 187 | self.opt.step() 188 | return loss.cpu().data.numpy(), logits 189 | 190 | def _pad_bool(self, seqs): 191 | o = torch.eq(seqs,self.padding_idx) # [n, step] 192 | return o 193 | def _pad_mask(self, seqs): 194 | len_q = seqs.size(1) 195 | mask = self._pad_bool(seqs).unsqueeze(1).expand(-1,len_q,-1) # [n, len_q, step] 196 | return mask.unsqueeze(1) # [n, 1, len_q, step] 197 | 198 | def _look_ahead_mask(self,seqs): 199 | device = next(self.parameters()).device 200 | batch_size, seq_len = seqs.shape 201 | mask = torch.triu(torch.ones((seq_len,seq_len), dtype=torch.long), diagonal=1).to(device) # [seq_len ,seq_len] 202 | mask = torch.where(self._pad_bool(seqs)[:,None,None,:],1,mask[None,None,:,:]).to(device) # [n, 1, seq_len, seq_len] 203 | return mask>0 # [n, 1, seq_len, seq_len] 204 | 205 | def translate(self, src, v2i, i2v): 206 | self.eval() 207 | device = next(self.parameters()).device 208 | src_pad = src 209 | # Initialize Decoder input by constructing a matrix M([n, self.max_len+1]) with initial value: 210 | # M[n,0] = start token id 211 | # M[n,:] = 0 212 | target = torch.from_numpy(utils.pad_zero(np.array([[v2i[""], ] for _ in range(len(src))]), self.max_len+1)).to(device) 213 | x_embed = self.embed(src_pad) 214 | encoded_z = self.encoder(x_embed,False,mask=self._pad_mask(src_pad)) 215 | for i in range(0,self.max_len): 216 | y = target[:,:-1] 217 | y_embed = self.embed(y) 218 | decoded_z = self.decoder(y_embed,encoded_z,False,self._look_ahead_mask(y),self._pad_mask(src_pad)) 219 | o = self.o(decoded_z)[:,i,:] 220 | idx = o.argmax(dim = 1).detach() 221 | # Update the Decoder input, to predict for the next position. 
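# (In other words, translate() decodes greedily: at step i the decoder re-reads everything
#  predicted so far via target[:, :-1], takes the argmax of the output at position i, and
#  writes it into target[:, i+1], spending one forward pass per output position up to max_len.)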
222 | target[:,i+1] = idx 223 | self.train() 224 | return target 225 | 226 | 227 | 228 | 229 | def train(emb_dim=32,n_layer=3,n_head=4): 230 | 231 | dataset = utils.DateData(4000) 232 | print("Chinese time order: yy/mm/dd ",dataset.date_cn[:3],"\nEnglish time order: dd/M/yyyy", dataset.date_en[:3]) 233 | print("Vocabularies: ", dataset.vocab) 234 | print(f"x index sample: \n{dataset.idx2str(dataset.x[0])}\n{dataset.x[0]}", 235 | f"\ny index sample: \n{dataset.idx2str(dataset.y[0])}\n{dataset.y[0]}") 236 | loader = DataLoader(dataset,batch_size=32,shuffle=True) 237 | model = Transformer(n_vocab=dataset.num_word, max_len=MAX_LEN, n_layer = n_layer, emb_dim=emb_dim, n_head = n_head, drop_rate=0.1, padding_idx=0) 238 | if torch.cuda.is_available(): 239 | print("GPU train avaliable") 240 | device =torch.device("cuda") 241 | model = model.cuda() 242 | else: 243 | device = torch.device("cpu") 244 | model = model.cpu() 245 | for i in range(100): 246 | for batch_idx , batch in enumerate(loader): 247 | bx, by, decoder_len = batch 248 | bx, by = torch.from_numpy(utils.pad_zero(bx,max_len = MAX_LEN)).type(torch.LongTensor).to(device), torch.from_numpy(utils.pad_zero(by,MAX_LEN+1)).type(torch.LongTensor).to(device) 249 | loss, logits = model.step(bx,by) 250 | if batch_idx%50 == 0: 251 | target = dataset.idx2str(by[0, 1:-1].cpu().data.numpy()) 252 | pred = model.translate(bx[0:1],dataset.v2i,dataset.i2v) 253 | res = dataset.idx2str(pred[0].cpu().data.numpy()) 254 | src = dataset.idx2str(bx[0].cpu().data.numpy()) 255 | print( 256 | "Epoch: ",i, 257 | "| t: ", batch_idx, 258 | "| loss: %.3f" % loss, 259 | "| input: ", src, 260 | "| target: ", target, 261 | "| inference: ", res, 262 | ) 263 | 264 | if __name__ == "__main__": 265 | parser = argparse.ArgumentParser() 266 | parser.add_argument("--emb_dim",type=int, help="change the model dimension") 267 | parser.add_argument("--n_layer",type=int, help="change the number of layers in Encoder and Decoder") 268 | parser.add_argument("--n_head",type=int, help="change the number of heads in MultiHeadAttention") 269 | 270 | args = parser.parse_args() 271 | args = dict(filter(lambda x: x[1],vars(args).items())) 272 | train(**args) -------------------------------------------------------------------------------- /pytorch/utils.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import numpy as np 3 | from torch.utils.data import Dataset as tDataset 4 | import datetime 5 | import os 6 | import re 7 | import pandas as pd 8 | import requests 9 | import torch 10 | 11 | PAD_ID = 0 12 | class DateData(tDataset): 13 | def __init__(self,n): 14 | np.random.seed(1) 15 | self.date_cn = [] 16 | self.date_en = [] 17 | for timestamp in np.random.randint(143835585, 2043835585, n): 18 | date = datetime.datetime.fromtimestamp(timestamp) 19 | self.date_cn.append(date.strftime("%y-%m-%d")) 20 | self.date_en.append(date.strftime("%d/%b/%Y")) 21 | self.vocab= set( 22 | [str(i) for i in range(0,10)] + ["-","/","",""] + [i.split("/")[1] for i in self.date_en] 23 | ) 24 | self.v2i = {v:i for i,v in enumerate(sorted(list(self.vocab)), start=1)} 25 | self.v2i[""] = PAD_ID 26 | self.vocab.add("") 27 | self.i2v = {i:v for v,i in self.v2i.items()} 28 | self.x,self.y=[],[] 29 | for cn,en in zip(self.date_cn,self.date_en): 30 | self.x.append([self.v2i[v] for v in cn]) 31 | self.y.append([self.v2i[""], ] + [self.v2i[v] for v in en[:3]] + [ 32 | self.v2i[en[3:6]]] + [self.v2i[v] for v in en[6:]] + [self.v2i[""],]) 33 | self.x,self.y = 
np.array(self.x),np.array(self.y) 34 | self.start_token = self.v2i[""] 35 | self.end_token = self.v2i[""] 36 | 37 | def __len__(self): 38 | return len(self.x) 39 | 40 | @property 41 | def num_word(self): 42 | return len(self.vocab) 43 | 44 | def __getitem__(self, index): 45 | return self.x[index],self.y[index], len(self.y[index])-1 46 | 47 | def idx2str(self,idx): 48 | x=[] 49 | for i in idx: 50 | x.append(self.i2v[i]) 51 | if i == self.end_token: 52 | break 53 | return "".join(x) 54 | 55 | def pad_zero(seqs, max_len): 56 | padded = np.full((len(seqs), max_len), fill_value=PAD_ID, dtype=np.int32) 57 | for i, seq in enumerate(seqs): 58 | padded[i, :len(seq)] = seq 59 | return padded 60 | 61 | class Dataset: 62 | def __init__(self,x,y,v2i,i2v): 63 | self.x,self.y = x,y 64 | self.v2i, self.i2v = v2i,i2v 65 | self.vocab = v2i.keys() 66 | 67 | def sample(self,n): 68 | b_idx = np.random.randint(0,len(self.x),n) 69 | bx,by = self.x[b_idx],self.y[b_idx] 70 | return bx,by 71 | @property 72 | def num_word(self): 73 | return len(self.v2i) 74 | 75 | def process_w2v_data(corpus,skip_window=2,method = "skip_gram"): 76 | all_words = [sentence.split(" ") for sentence in corpus] 77 | # groups all the iterables together and produces a single iterable as output 78 | all_words = np.array(list(itertools.chain(*all_words))) 79 | vocab,v_count = np.unique(all_words,return_counts=True) 80 | vocab = vocab[np.argsort(v_count)[::-1]] 81 | 82 | print("All vocabularies are sorted by frequency in decresing oreder") 83 | v2i = {v:i for i,v in enumerate(vocab)} 84 | i2v = {i:v for v,i in v2i.items()} 85 | 86 | pairs = [] 87 | js = [i for i in range(-skip_window,skip_window+1) if i!=0] 88 | 89 | for c in corpus: 90 | words = c.split(" ") 91 | w_idx = [v2i[w] for w in words] 92 | if method == "skip_gram": 93 | for i in range(len(w_idx)): 94 | for j in js: 95 | if i+j<0 or i+j>= len(w_idx): 96 | continue 97 | pairs.append((w_idx[i],w_idx[i+j])) 98 | elif method.lower() == "cbow": 99 | for i in range(skip_window,len(w_idx)-skip_window): 100 | context = [] 101 | for j in js: 102 | context.append(w_idx[i+j]) 103 | pairs.append(context+[w_idx[i]]) 104 | else: 105 | raise ValueError 106 | 107 | pairs = np.array(pairs) 108 | print("5 expample pairs:\n",pairs[:5]) 109 | if method.lower()=="skip_gram": 110 | x,y = pairs[:,0],pairs[:,1] 111 | elif method.lower() == "cbow": 112 | x,y = pairs[:,:-1],pairs[:,-1] 113 | else: 114 | raise ValueError 115 | return Dataset(x,y,v2i,i2v) 116 | 117 | def maybe_download_mrpc(save_dir="./MRPC/", proxy=None): 118 | train_url = 'https://mofanpy.com/static/files/MRPC/msr_paraphrase_train.txt' 119 | test_url = 'https://mofanpy.com/static/files/MRPC/msr_paraphrase_test.txt' 120 | os.makedirs(save_dir, exist_ok=True) 121 | proxies = {"http": proxy, "https": proxy} 122 | for url in [train_url, test_url]: 123 | raw_path = os.path.join(save_dir, url.split("/")[-1]) 124 | if not os.path.isfile(raw_path): 125 | print("downloading from %s" % url) 126 | r = requests.get(url, proxies=proxies) 127 | with open(raw_path, "w", encoding="utf-8") as f: 128 | f.write(r.text.replace('"', "")) 129 | print("completed") 130 | 131 | 132 | def _text_standardize(text): 133 | text = re.sub(r'—', '-', text) 134 | text = re.sub(r'–', '-', text) 135 | text = re.sub(r'―', '-', text) 136 | text = re.sub(r" \d+(,\d+)?(\.\d+)? 
", " ", text) 137 | text = re.sub(r" \d+-+?\d*", " -", text) 138 | return text.strip() 139 | 140 | 141 | def _process_mrpc(dir="./MRPC", rows=None): 142 | data = {"train": None, "test": None} 143 | files = os.listdir(dir) 144 | for f in files: 145 | df = pd.read_csv(os.path.join(dir, f), sep='\t', nrows=rows) 146 | k = "train" if "train" in f else "test" 147 | data[k] = {"is_same": df.iloc[:, 0].values, "s1": df["#1 String"].values, "s2": df["#2 String"].values} 148 | vocab = set() 149 | for n in ["train", "test"]: 150 | for m in ["s1", "s2"]: 151 | for i in range(len(data[n][m])): 152 | data[n][m][i] = _text_standardize(data[n][m][i].lower()) 153 | cs = data[n][m][i].split(" ") 154 | vocab.update(set(cs)) 155 | v2i = {v: i for i, v in enumerate(sorted(vocab), start=1)} 156 | v2i[""] = PAD_ID 157 | v2i[""] = len(v2i) 158 | v2i[""] = len(v2i) 159 | v2i[""] = len(v2i) 160 | i2v = {i: v for v, i in v2i.items()} 161 | for n in ["train", "test"]: 162 | for m in ["s1", "s2"]: 163 | data[n][m+"id"] = [[v2i[v] for v in c.split(" ")] for c in data[n][m]] 164 | return data, v2i, i2v 165 | 166 | class MRPCData(tDataset): 167 | num_seg = 3 168 | pad_id = PAD_ID 169 | 170 | def __init__(self, data_dir="./MRPC/", rows=None, proxy=None): 171 | maybe_download_mrpc(save_dir=data_dir, proxy=proxy) 172 | data, self.v2i, self.i2v = _process_mrpc(data_dir, rows) 173 | self.max_len = max( 174 | [len(s1) + len(s2) + 3 for s1, s2 in zip( 175 | data["train"]["s1id"] + data["test"]["s1id"], data["train"]["s2id"] + data["test"]["s2id"])]) 176 | 177 | self.xlen = np.array([ 178 | [ 179 | len(data["train"]["s1id"][i]), len(data["train"]["s2id"][i]) 180 | ] for i in range(len(data["train"]["s1id"]))], dtype=int) 181 | x = [ 182 | [self.v2i[""]] + data["train"]["s1id"][i] + [self.v2i[""]] + data["train"]["s2id"][i] + [self.v2i[""]] 183 | for i in range(len(self.xlen)) 184 | ] 185 | self.x = pad_zero(x, max_len=self.max_len) 186 | self.nsp_y = data["train"]["is_same"][:, None] 187 | 188 | self.seg = np.full(self.x.shape, self.num_seg-1, np.int32) 189 | for i in range(len(x)): 190 | si = self.xlen[i][0] + 2 191 | self.seg[i, :si] = 0 192 | si_ = si + self.xlen[i][1] + 1 193 | self.seg[i, si:si_] = 1 194 | 195 | self.word_ids = np.array(list(set(self.i2v.keys()).difference( 196 | [self.v2i[v] for v in ["", "", ""]]))) 197 | 198 | def __getitem__(self,idx): 199 | return self.x[idx], self.seg[idx], self.xlen[idx], self.nsp_y[idx] 200 | 201 | def sample(self, n): 202 | bi = np.random.randint(0, self.x.shape[0], size=n) 203 | bx, bs, bl, by = self.x[bi], self.seg[bi], self.xlen[bi], self.nsp_y[bi] 204 | return bx, bs, bl, by 205 | 206 | @property 207 | def num_word(self): 208 | return len(self.v2i) 209 | 210 | def __len__(self): 211 | return len(self.x) 212 | 213 | @property 214 | def mask_id(self): 215 | return self.v2i[""] 216 | 217 | class MRPCSingle(tDataset): 218 | pad_id = PAD_ID 219 | 220 | def __init__(self,data_dir="./MRPC/",rows = None, proxy= None): 221 | maybe_download_mrpc(save_dir=data_dir, proxy=proxy) 222 | 223 | data, self.v2i, self.i2v = _process_mrpc(data_dir, rows) 224 | 225 | self.max_len = max([len(s) + 2 for s in data["train"]["s1id"] + data["train"]["s2id"]]) 226 | x = [ 227 | [self.v2i[""]] + data["train"]["s1id"][i] + [self.v2i[""]] 228 | for i in range(len(data["train"]["s1id"])) 229 | ] 230 | x += [ 231 | [self.v2i[""]] + data["train"]["s2id"][i] + [self.v2i[""]] 232 | for i in range(len(data["train"]["s2id"])) 233 | ] 234 | self.x = pad_zero(x, max_len=self.max_len) 235 | self.word_ids = 
np.array(list(set(self.i2v.keys()).difference([self.v2i[""]]))) 236 | def sample(self, n): 237 | bi = np.random.randint(0, self.x.shape[0], size=n) 238 | bx = self.x[bi] 239 | return bx 240 | 241 | @property 242 | def num_word(self): 243 | return len(self.v2i) 244 | 245 | def __getitem__(self, index): 246 | return self.x[index] 247 | 248 | 249 | def __len__(self): 250 | return len(self.x) 251 | -------------------------------------------------------------------------------- /pytorch/visual.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | def show_w2v_word_embedding(model,data,path): 4 | word_emb = model.embeddings.weight.data.numpy() 5 | for i in range(data.num_word): 6 | c = "blue" 7 | try: 8 | int(data.i2v[i]) 9 | except: 10 | c = "red" 11 | 12 | plt.text(word_emb[i,0],word_emb[i,1], s= data.i2v[i], color=c,weight = "bold") 13 | 14 | plt.xlim(word_emb[:,0].min() - 0.5, word_emb[:,0].max()+0.5) 15 | plt.ylim(word_emb[:,1].min() - 0.5, word_emb[:,1].max()+0.5) 16 | plt.xticks(()) 17 | plt.yticks(()) 18 | plt.xlabel("embedding dim1") 19 | plt.ylabel("embedding dim2") 20 | plt.savefig(path,dpi=300,format="png") 21 | plt.show() -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib==3.2.1 2 | numpy==1.18.5 3 | pandas==1.0.4 4 | requests==2.23.0 5 | sklearn==0.23.0 6 | tensorflow==2.3.1 7 | tensorflow-addons==0.10.0 8 | -------------------------------------------------------------------------------- /seq2seq.py: -------------------------------------------------------------------------------- 1 | # [Sequence to Sequence Learning with Neural Networks](https://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf) 2 | import tensorflow as tf 3 | from tensorflow import keras 4 | import numpy as np 5 | import utils # this refers to utils.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/) 6 | import tensorflow_addons as tfa 7 | 8 | 9 | class Seq2Seq(keras.Model): 10 | def __init__(self, enc_v_dim, dec_v_dim, emb_dim, units, max_pred_len, start_token, end_token): 11 | super().__init__() 12 | self.units = units 13 | 14 | # encoder 15 | self.enc_embeddings = keras.layers.Embedding( 16 | input_dim=enc_v_dim, output_dim=emb_dim, # [enc_n_vocab, emb_dim] 17 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.1), 18 | ) 19 | self.encoder = keras.layers.LSTM(units=units, return_sequences=True, return_state=True) 20 | 21 | # decoder 22 | self.dec_embeddings = keras.layers.Embedding( 23 | input_dim=dec_v_dim, output_dim=emb_dim, # [dec_n_vocab, emb_dim] 24 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.1), 25 | ) 26 | self.decoder_cell = keras.layers.LSTMCell(units=units) 27 | decoder_dense = keras.layers.Dense(dec_v_dim) 28 | # train decoder 29 | self.decoder_train = tfa.seq2seq.BasicDecoder( 30 | cell=self.decoder_cell, 31 | sampler=tfa.seq2seq.sampler.TrainingSampler(), # sampler for train 32 | output_layer=decoder_dense 33 | ) 34 | # predict decoder 35 | self.decoder_eval = tfa.seq2seq.BasicDecoder( 36 | cell=self.decoder_cell, 37 | sampler=tfa.seq2seq.sampler.GreedyEmbeddingSampler(), # sampler for predict 38 | output_layer=decoder_dense 39 | ) 40 | 41 | self.cross_entropy = keras.losses.SparseCategoricalCrossentropy(from_logits=True) 42 | self.opt = keras.optimizers.Adam(0.01) 43 | self.max_pred_len = max_pred_len 44 | 
self.start_token = start_token 45 | self.end_token = end_token 46 | 47 | def encode(self, x): 48 | embedded = self.enc_embeddings(x) 49 | init_s = [tf.zeros((x.shape[0], self.units)), tf.zeros((x.shape[0], self.units))] 50 | o, h, c = self.encoder(embedded, initial_state=init_s) 51 | return [h, c] 52 | 53 | def inference(self, x): 54 | s = self.encode(x) 55 | done, i, s = self.decoder_eval.initialize( 56 | self.dec_embeddings.variables[0], 57 | start_tokens=tf.fill([x.shape[0], ], self.start_token), 58 | end_token=self.end_token, 59 | initial_state=s, 60 | ) 61 | pred_id = np.zeros((x.shape[0], self.max_pred_len), dtype=np.int32) 62 | for l in range(self.max_pred_len): 63 | o, s, i, done = self.decoder_eval.step( 64 | time=l, inputs=i, state=s, training=False) 65 | pred_id[:, l] = o.sample_id 66 | return pred_id 67 | 68 | def train_logits(self, x, y, seq_len): 69 | s = self.encode(x) 70 | dec_in = y[:, :-1] # ignore 71 | dec_emb_in = self.dec_embeddings(dec_in) 72 | o, _, _ = self.decoder_train(dec_emb_in, s, sequence_length=seq_len) 73 | logits = o.rnn_output 74 | return logits 75 | 76 | def step(self, x, y, seq_len): 77 | with tf.GradientTape() as tape: 78 | logits = self.train_logits(x, y, seq_len) 79 | dec_out = y[:, 1:] # ignore 80 | loss = self.cross_entropy(dec_out, logits) 81 | grads = tape.gradient(loss, self.trainable_variables) 82 | self.opt.apply_gradients(zip(grads, self.trainable_variables)) 83 | return loss.numpy() 84 | 85 | 86 | def train(): 87 | # get and process data 88 | data = utils.DateData(4000) 89 | print("Chinese time order: yy/mm/dd ", data.date_cn[:3], "\nEnglish time order: dd/M/yyyy ", data.date_en[:3]) 90 | print("vocabularies: ", data.vocab) 91 | print("x index sample: \n{}\n{}".format(data.idx2str(data.x[0]), data.x[0]), 92 | "\ny index sample: \n{}\n{}".format(data.idx2str(data.y[0]), data.y[0])) 93 | 94 | model = Seq2Seq( 95 | data.num_word, data.num_word, emb_dim=16, units=32, 96 | max_pred_len=11, start_token=data.start_token, end_token=data.end_token) 97 | 98 | # training 99 | for t in range(1500): 100 | bx, by, decoder_len = data.sample(32) 101 | loss = model.step(bx, by, decoder_len) 102 | if t % 70 == 0: 103 | target = data.idx2str(by[0, 1:-1]) 104 | pred = model.inference(bx[0:1]) 105 | res = data.idx2str(pred[0]) 106 | src = data.idx2str(bx[0]) 107 | print( 108 | "t: ", t, 109 | "| loss: %.3f" % loss, 110 | "| input: ", src, 111 | "| target: ", target, 112 | "| inference: ", res, 113 | ) 114 | 115 | 116 | if __name__ == "__main__": 117 | train() 118 | -------------------------------------------------------------------------------- /seq2seq_attention.py: -------------------------------------------------------------------------------- 1 | # [Effective Approaches to Attention-based Neural Machine Translation](https://arxiv.org/pdf/1508.04025.pdf) 2 | import tensorflow as tf 3 | from tensorflow import keras 4 | import numpy as np 5 | import utils # this refers to utils.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/) 6 | import tensorflow_addons as tfa 7 | import pickle 8 | 9 | 10 | class Seq2Seq(keras.Model): 11 | def __init__(self, enc_v_dim, dec_v_dim, emb_dim, units, attention_layer_size, max_pred_len, start_token, end_token): 12 | super().__init__() 13 | self.units = units 14 | 15 | # encoder 16 | self.enc_embeddings = keras.layers.Embedding( 17 | input_dim=enc_v_dim, output_dim=emb_dim, # [enc_n_vocab, emb_dim] 18 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.1), 19 | ) 20 | self.encoder = keras.layers.LSTM(units=units, 
return_sequences=True, return_state=True) 21 | 22 | # decoder 23 | self.attention = tfa.seq2seq.LuongAttention(units, memory=None, memory_sequence_length=None) 24 | self.decoder_cell = tfa.seq2seq.AttentionWrapper( 25 | cell=keras.layers.LSTMCell(units=units), 26 | attention_mechanism=self.attention, 27 | attention_layer_size=attention_layer_size, 28 | alignment_history=True, # for attention visualization 29 | ) 30 | 31 | self.dec_embeddings = keras.layers.Embedding( 32 | input_dim=dec_v_dim, output_dim=emb_dim, # [dec_n_vocab, emb_dim] 33 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.1), 34 | ) 35 | decoder_dense = keras.layers.Dense(dec_v_dim) # output layer 36 | 37 | # train decoder 38 | self.decoder_train = tfa.seq2seq.BasicDecoder( 39 | cell=self.decoder_cell, 40 | sampler=tfa.seq2seq.sampler.TrainingSampler(), # sampler for train 41 | output_layer=decoder_dense 42 | ) 43 | self.cross_entropy = keras.losses.SparseCategoricalCrossentropy(from_logits=True) 44 | self.opt = keras.optimizers.Adam(0.05, clipnorm=5.0) 45 | 46 | # predict decoder 47 | self.decoder_eval = tfa.seq2seq.BasicDecoder( 48 | cell=self.decoder_cell, 49 | sampler=tfa.seq2seq.sampler.GreedyEmbeddingSampler(), # sampler for predict 50 | output_layer=decoder_dense 51 | ) 52 | 53 | # prediction restriction 54 | self.max_pred_len = max_pred_len 55 | self.start_token = start_token 56 | self.end_token = end_token 57 | 58 | def encode(self, x): 59 | o = self.enc_embeddings(x) 60 | init_s = [tf.zeros((x.shape[0], self.units)), tf.zeros((x.shape[0], self.units))] 61 | o, h, c = self.encoder(o, initial_state=init_s) 62 | return o, h, c 63 | 64 | def set_attention(self, x): 65 | o, h, c = self.encode(x) 66 | # encoder output for attention to focus 67 | self.attention.setup_memory(o) 68 | # wrap state by attention wrapper 69 | s = self.decoder_cell.get_initial_state(batch_size=x.shape[0], dtype=tf.float32).clone(cell_state=[h, c]) 70 | return s 71 | 72 | def inference(self, x, return_align=False): 73 | s = self.set_attention(x) 74 | done, i, s = self.decoder_eval.initialize( 75 | self.dec_embeddings.variables[0], 76 | start_tokens=tf.fill([x.shape[0], ], self.start_token), 77 | end_token=self.end_token, 78 | initial_state=s, 79 | ) 80 | pred_id = np.zeros((x.shape[0], self.max_pred_len), dtype=np.int32) 81 | for l in range(self.max_pred_len): 82 | o, s, i, done = self.decoder_eval.step( 83 | time=l, inputs=i, state=s, training=False) 84 | pred_id[:, l] = o.sample_id 85 | if return_align: 86 | return np.transpose(s.alignment_history.stack().numpy(), (1, 0, 2)) 87 | else: 88 | s.alignment_history.mark_used() # otherwise gives warning 89 | return pred_id 90 | 91 | def train_logits(self, x, y, seq_len): 92 | s = self.set_attention(x) 93 | dec_in = y[:, :-1] # ignore 94 | dec_emb_in = self.dec_embeddings(dec_in) 95 | o, _, _ = self.decoder_train(dec_emb_in, s, sequence_length=seq_len) 96 | logits = o.rnn_output 97 | return logits 98 | 99 | def step(self, x, y, seq_len): 100 | with tf.GradientTape() as tape: 101 | logits = self.train_logits(x, y, seq_len) 102 | dec_out = y[:, 1:] # ignore 103 | loss = self.cross_entropy(dec_out, logits) 104 | grads = tape.gradient(loss, self.trainable_variables) 105 | self.opt.apply_gradients(zip(grads, self.trainable_variables)) 106 | return loss.numpy() 107 | 108 | 109 | def train(): 110 | # get and process data 111 | data = utils.DateData(2000) 112 | print("Chinese time order: yy/mm/dd ", data.date_cn[:3], "\nEnglish time order: dd/M/yyyy ", data.date_en[:3]) 113 | print("vocabularies: 
", data.vocab) 114 | print("x index sample: \n{}\n{}".format(data.idx2str(data.x[0]), data.x[0]), 115 | "\ny index sample: \n{}\n{}".format(data.idx2str(data.y[0]), data.y[0])) 116 | 117 | model = Seq2Seq( 118 | data.num_word, data.num_word, emb_dim=12, units=14, attention_layer_size=16, 119 | max_pred_len=11, start_token=data.start_token, end_token=data.end_token) 120 | 121 | # training 122 | for t in range(1000): 123 | bx, by, decoder_len = data.sample(64) 124 | loss = model.step(bx, by, decoder_len) 125 | if t % 70 == 0: 126 | target = data.idx2str(by[0, 1:-1]) 127 | pred = model.inference(bx[0:1]) 128 | res = data.idx2str(pred[0]) 129 | src = data.idx2str(bx[0]) 130 | print( 131 | "t: ", t, 132 | "| loss: %.5f" % loss, 133 | "| input: ", src, 134 | "| target: ", target, 135 | "| inference: ", res, 136 | ) 137 | 138 | pkl_data = {"i2v": data.i2v, "x": data.x[:6], "y": data.y[:6], "align": model.inference(data.x[:6], return_align=True)} 139 | 140 | with open("./visual/tmp/attention_align.pkl", "wb") as f: 141 | pickle.dump(pkl_data, f) 142 | 143 | 144 | if __name__ == "__main__": 145 | train() 146 | -------------------------------------------------------------------------------- /simple_realize/CBOW.py: -------------------------------------------------------------------------------- 1 | # [Efficient Estimation of Word Representations in Vector Space](https://arxiv.org/pdf/1301.3781.pdf) 2 | ''' 3 | created by YuYang github.com/W1Fl 4 | ''' 5 | from io import BytesIO 6 | 7 | import imageio 8 | import matplotlib.pyplot as plt 9 | import tensorflow as tf 10 | from tensorflow import keras 11 | 12 | from utils import process_w2v_data 13 | 14 | Batch_size = 32 15 | Learn_rate = 0.01 16 | Epochs = 256 17 | DataSize = 512 18 | 19 | corpus = [ 20 | # numbers 21 | "5 2 4 8 6 2 3 6 4", 22 | "4 8 5 6 9 5 5 6", 23 | "1 1 5 2 3 3 8", 24 | "3 6 9 6 8 7 4 6 3", 25 | "8 9 9 6 1 4 3 4", 26 | "1 0 2 0 2 1 3 3 3 3 3", 27 | "9 3 3 0 1 4 7 8", 28 | "9 9 8 5 6 7 1 2 3 0 1 0", 29 | 30 | # alphabets, expecting that 9 is close to letters 31 | "a t g q e h 9 u f", 32 | "e q y u o i p s", 33 | "q o 9 p l k j o k k o p", 34 | "h g y i u t t a e q", 35 | "i k d q r e 9 e a d", 36 | "o p d g 9 s a f g a", 37 | "i u y g h k l a s w", 38 | "o l u y a o g f s", 39 | "o p i u y g d a s j d l", 40 | "u k i l o 9 l j s", 41 | "y g i s h k j l f r f", 42 | "i o h n 9 9 d 9 f a 9", 43 | ] 44 | 45 | SkipGram = lambda v_dim, emb_dim: keras.Sequential([ 46 | keras.layers.Embedding( 47 | input_dim=v_dim, output_dim=emb_dim, # [n_vocab, emb_dim] 48 | embeddings_initializer=keras.initializers.RandomNormal(0., 0.1), 49 | ), 50 | keras.layers.Lambda(lambda x:tf.reduce_mean(x,1)) 51 | ]) 52 | 53 | 54 | class myTensorboard(keras.callbacks.TensorBoard): 55 | def __init__(self, data, log_dir='logs/CBOW', histogram_freq=1, write_graph=True, write_images=True, 56 | embeddings_freq=10, **kwargs): 57 | super().__init__(log_dir=log_dir, histogram_freq=histogram_freq, write_graph=write_graph, 58 | write_images=write_images, embeddings_freq=embeddings_freq, **kwargs) 59 | self.buffer = BytesIO() 60 | self.data = data 61 | 62 | 63 | def plot(self, data): 64 | word_emb = model.layers[0].get_weights()[0] 65 | for i in range(data.num_word): 66 | c = "blue" 67 | try: 68 | int(data.i2v[i]) 69 | except ValueError: 70 | c = "red" 71 | plt.text(word_emb[i, 0], word_emb[i, 1], s=data.i2v[i], color=c, weight="bold") 72 | plt.xlim(word_emb[:, 0].min() - .5, word_emb[:, 0].max() + .5) 73 | plt.ylim(word_emb[:, 1].min() - .5, word_emb[:, 1].max() + .5) 74 | 
plt.xticks(()) 75 | plt.yticks(()) 76 | plt.xlabel("embedding dim1") 77 | plt.ylabel("embedding dim2") 78 | plt.savefig(self.buffer, format='png') 79 | plt.close() 80 | self.buffer.seek(0) 81 | 82 | def on_epoch_end(self, epoch, logs=None): 83 | writer = self._get_writer(self._train_run_name) 84 | if (not epoch % 1): 85 | self.plot(self.data) 86 | with writer.as_default(): 87 | tf.summary.image('embedding', imageio.imread(self.buffer)[None, :], step=epoch) 88 | self.buffer.seek(0) 89 | super(myTensorboard, self).on_epoch_end(epoch, logs) 90 | 91 | 92 | class nce_loss(keras.losses.Loss): 93 | # negative sampling: take one positive label and num_sampled negative labels to compute the loss 94 | # in order to reduce the computation of full softmax 95 | def __init__(self, model, v_dim, emb_dim): 96 | super(nce_loss, self).__init__() 97 | # noise-contrastive estimation 98 | self.nce_w = model.add_weight( 99 | name="nce_w", shape=[v_dim, emb_dim], 100 | initializer=keras.initializers.TruncatedNormal(0., 0.1)) # [n_vocab, emb_dim] 101 | self.nce_b = model.add_weight( 102 | name="nce_b", shape=(v_dim,), 103 | initializer=keras.initializers.Constant(0.1)) # [n_vocab, ] 104 | self.v_dim = v_dim 105 | 106 | def call(self, y_true, y_pred): 107 | # return keras.losses.SparseCategoricalCrossentropy()(y_true,y_pred) 108 | return tf.nn.nce_loss( 109 | weights=self.nce_w, biases=self.nce_b, labels=y_true, 110 | inputs=y_pred, num_sampled=5, num_classes=self.v_dim) 111 | 112 | 113 | if __name__ == "__main__": 114 | d = process_w2v_data(corpus, skip_window=2, method="cbow") 115 | bx, by = d.sample(DataSize) 116 | model = SkipGram(d.num_word, 2) 117 | model.compile(optimizer=keras.optimizers.Adam(Learn_rate), loss=nce_loss(model, d.num_word, 2)) 118 | model.fit(bx, by, Batch_size, Epochs, callbacks=[myTensorboard(d)], verbose=2) 119 | 120 | #use tensorboard --logdir logs --samples_per_plugin=images=255 to show all images 121 | -------------------------------------------------------------------------------- /simple_realize/README.md: -------------------------------------------------------------------------------- 1 | # Simplified implementations of the NLP tutorials 2 | 3 | >This directory holds simplified versions of the NLP course code, written in pure Keras, and every script comes with TensorBoard visualization 4 | 5 | With these simplified scripts you can 6 | * see the implementation details of seq2seq, LuongAttention and the transformer at a glance 7 | * watch the embedding animations 8 | * compare seq2seq with attention against plain seq2seq 9 | * extend the models further 10 | 11 | ----- 12 | 13 | ## Code 14 | 1. Understand Word (W2V) 15 | - [Continuous Bag of Words (CBOW)](#Word2Vec) 16 | - [Skip-Gram](#Word2Vec) 17 | 2. Understand Sentence (Seq2Seq) 18 | - [seq2seq](#Seq2Seq) 19 | - [CNN language model](#Seq2Seq) 20 | 3. All about Attention 21 | - [seq2seq with attention](#Seq2SeqAttention) 22 | - [transformer](#Seq2SeqAttention) 23 | 24 | 25 | ## Word2Vec 26 | * [CBOW](CBOW.py) 27 | * [Skip-Gram](skip-gram.py) 28 | 29 | ![](./imgs/skip-gram.gif) 30 | 31 | ## Seq2Seq 32 | * [seq2seq](seq2seq.py) 33 | * [cnn-lm](cnn-lm.py) 34 | 35 | ![Months and digits cluster together, while symbols spread apart](./imgs/seq2seq-embedding.gif) 36 | 37 | ## Seq2SeqAttention 38 | * [seq2seq_attention](seq2seq_attention.py) 39 | * [transformer](transformer.py) 40 | 41 | 42 | ![](./imgs/attention.gif) -------------------------------------------------------------------------------- /simple_realize/cnn-lm.py: -------------------------------------------------------------------------------- 1 | # [Sequence to Sequence Learning with Neural Networks](https://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf) 2 | ''' 3 | created by YuYang github.com/W1Fl 4 | ''' 5 | import tensorflow as tf 6 | from tensorflow import keras 7 | import utils 8 | 9 | Batch_size = 64 10 | Learn_rate = 0.01 11 | Epochs = 15 12 | DataSize = 1600 13 | 14 | 15 | class Seq2Seq(keras.Model): 16 | def __init__(self, enc_v_dim, dec_v_dim, emb_dim, units, max_pred_len, start_token, end_token): 17 | super().__init__() 18 | self.enc_v_dim = enc_v_dim 19 | self.emb_dim = emb_dim 20 | self.units = units 21 | self.dec_v_dim = dec_v_dim 22 | self.max_pred_len = max_pred_len 23 | self.start_token = start_token 24 | self.end_token = end_token 25 | 26 | def build(self, input_shape): 27 | # encoder 28 | self.enc_embeddings = keras.layers.Embedding( 29 | input_dim=self.enc_v_dim, 30 | output_dim=self.emb_dim, # [enc_n_vocab, emb_dim] 31 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.1), 32 | name='encoder/embeddings' 33 | ) 34 | 35 | 36 | self.conv2ds = [ 37 | keras.layers.Conv2D(16, (n, self.emb_dim), padding="valid", activation=keras.activations.relu) 38 | for n in range(2, 5)] 39 | self.max_pools = [keras.layers.MaxPool2D((n, 1)) for n in [7, 6, 5]] 40 | self.encoder = keras.layers.Dense(self.units, activation=keras.activations.relu) 41 | 42 | 43 | # decoder 44 | self.dec_embeddings = keras.layers.Embedding( 45 | input_dim=self.dec_v_dim, output_dim=self.emb_dim, # [dec_n_vocab, emb_dim] 46 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.1), 47 | name='decoder/embeddings' 48 | 49 | ) 50 | self.dec_embeddings.build((None, self.dec_v_dim)) 51 | self.decoder = keras.layers.LSTM(units=self.units, return_state=True, return_sequences=True, 52 | name='decoder/LSTM') 53 | self.decoder_dense = keras.layers.Dense(self.dec_v_dim, activation=keras.activations.softmax, 54 | name='decoder/Dense') 55 | 56 | self.batch = input_shape[0][0] 57 | super(Seq2Seq, self).build([*input_shape]) 58 | 59 | def encode(self, x): 60 | embedded = self.enc_embeddings(x) # [n, step, emb] 61 | o = tf.expand_dims(embedded, axis=3) # [n, step=8, emb=16, 1] 62 | co = [conv2d(o) for conv2d in self.conv2ds] # [n, 7, 1, 16], [n, 6, 1, 16], [n, 5, 1, 16] 63 | co = [self.max_pools[i](co[i]) for i in range(len(co))] # [n, 1, 1, 16] * 3 64 | co = [tf.squeeze(c, axis=[1, 2]) for c in co] # [n, 16] * 3 65 | o = tf.concat(co, axis=1) # [n, 16*3] 66 | h = self.encoder(o) # [n, units] 67 | return h, h 68 | 69 | def decode(self, batch, h, c, y=None, training=None): 70 | if training: # use the previous step's label as the current step's input (teacher forcing) 71 | y = self.dec_embeddings(y) 72 | y, h, c = self.decoder(y, (h, c)) 73 | y = self.decoder_dense(y) 74 | else: # use the previous step's output as the current step's input 75 | y = [] 76 | o = tf.zeros((batch, 1, self.dec_v_dim)) 77 | for i in
range(self.max_pred_len): 78 | o = o @ self.dec_embeddings.weights 79 | o, h, c = self.decoder(o, (h, c)) 80 | o = self.decoder_dense(o) 81 | y.append(o) 82 | y = tf.concat(y, 1) 83 | return y 84 | 85 | # @tf.function 86 | def call(self, inputs, training=None, mask=None): 87 | x = inputs[0] 88 | y = inputs[1] 89 | if training: 90 | y = tf.pad(y[:, :-1], [[0, 0], [1, 0]]) 91 | h, c = self.encode(x) 92 | batch = tf.shape(x)[0] 93 | y = self.decode(batch, h, c, y, training) 94 | return y 95 | 96 | 97 | class myTensorboard(keras.callbacks.TensorBoard): 98 | def __init__(self, data, log_dir='logs/cnn-lm', histogram_freq=1, write_graph=True, write_images=True, 99 | embeddings_freq=10, **kwargs): 100 | self.data = data 101 | super().__init__(log_dir=log_dir, histogram_freq=histogram_freq, write_graph=write_graph, 102 | write_images=write_images, embeddings_freq=embeddings_freq, **kwargs) 103 | def on_epoch_end(self, epoch, logs=None): 104 | if (not epoch % 1): 105 | x, y, l = self.data.sample(1) 106 | y_ = self.model((x, y), training=False) 107 | y_ = tf.argmax(y_, -1).numpy() 108 | target = self.data.idx2str(y[0]) 109 | res = self.data.idx2str(y_[0]) 110 | src = self.data.idx2str(x[0]) 111 | print( 112 | '\n', 113 | "t: ", epoch, 114 | "| input: ", src, 115 | "| target: ", target, 116 | "| inference: ", res, 117 | ) 118 | super(myTensorboard, self).on_epoch_end(epoch, logs) 119 | 120 | 121 | def train(): 122 | # get and process data 123 | data = utils.DateData(DataSize) 124 | train_x, train_y, train_l = data.sample(DataSize) 125 | 126 | print("Chinese time order: yy/mm/dd ", data.date_cn[:3], "\nEnglish time order: dd/M/yyyy ", data.date_en[:3]) 127 | print("vocabularies: ", data.vocab) 128 | print("x index sample: \n{}\n{}".format(data.idx2str(data.x[0]), data.x[0]), 129 | "\ny index sample: \n{}\n{}".format(data.idx2str(data.y[0]), data.y[0])) 130 | 131 | model = Seq2Seq( 132 | data.num_word, data.num_word, emb_dim=16, units=32, 133 | max_pred_len=11, start_token=data.start_token, end_token=data.end_token) 134 | model.compile(optimizer=keras.optimizers.Adam(Learn_rate), loss=keras.losses.SparseCategoricalCrossentropy(False), 135 | metrics=[keras.metrics.sparse_categorical_accuracy]) 136 | model.fit((train_x, train_y), train_y, callbacks=[myTensorboard(data)], batch_size=Batch_size, epochs=Epochs) 137 | 138 | 139 | if __name__ == "__main__": 140 | train() 141 | -------------------------------------------------------------------------------- /simple_realize/imgs/attention.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MorvanZhou/NLP-Tutorials/3aa02a13151b696e8c034da5f7bd2ef4294a5e5f/simple_realize/imgs/attention.gif -------------------------------------------------------------------------------- /simple_realize/imgs/seq2seq-embedding.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MorvanZhou/NLP-Tutorials/3aa02a13151b696e8c034da5f7bd2ef4294a5e5f/simple_realize/imgs/seq2seq-embedding.gif -------------------------------------------------------------------------------- /simple_realize/imgs/skip-gram.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MorvanZhou/NLP-Tutorials/3aa02a13151b696e8c034da5f7bd2ef4294a5e5f/simple_realize/imgs/skip-gram.gif -------------------------------------------------------------------------------- /simple_realize/seq2seq.py: 
-------------------------------------------------------------------------------- 1 | # [Sequence to Sequence Learning with Neural Networks](https://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf) 2 | ''' 3 | created by YuYang github.com/W1Fl 4 | ''' 5 | import tensorflow as tf 6 | from tensorflow import keras 7 | import utils 8 | 9 | Batch_size = 64 10 | Learn_rate = 0.01 11 | Epochs = 15 12 | DataSize = 8192 13 | 14 | 15 | class Seq2Seq(keras.Model): 16 | def __init__(self, enc_v_dim, dec_v_dim, emb_dim, units, max_pred_len, start_token, end_token): 17 | super().__init__() 18 | self.enc_v_dim = enc_v_dim 19 | self.emb_dim = emb_dim 20 | self.units = units 21 | self.dec_v_dim = dec_v_dim 22 | self.max_pred_len = max_pred_len 23 | self.start_token = start_token 24 | self.end_token = end_token 25 | 26 | def build(self, input_shape): 27 | # encoder 28 | self.enc_embeddings = keras.layers.Embedding( 29 | input_dim=self.enc_v_dim, 30 | output_dim=self.emb_dim, # [enc_n_vocab, emb_dim] 31 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.1), 32 | name='encoder/embeddings' 33 | ) 34 | self.encoder = keras.layers.LSTM(units=self.units, return_state=True, name='encoder/LSTM') 35 | 36 | # decoder 37 | self.dec_embeddings = keras.layers.Embedding( 38 | input_dim=self.dec_v_dim, output_dim=self.emb_dim, # [dec_n_vocab, emb_dim] 39 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.1), 40 | name='decoder/embeddings' 41 | 42 | ) 43 | self.dec_embeddings.build((None, self.dec_v_dim)) 44 | self.decoder = keras.layers.LSTM(units=self.units, return_state=True, return_sequences=True, 45 | name='decoder/LSTM') 46 | self.decoder_dense = keras.layers.Dense(self.dec_v_dim, activation=keras.activations.softmax, 47 | name='decoder/Dense') 48 | 49 | self.batch = input_shape[0][0] 50 | super(Seq2Seq, self).build([*input_shape]) 51 | 52 | def encode(self, x): 53 | embedded = self.enc_embeddings(x) 54 | o, h, c = self.encoder(embedded) 55 | return h, c 56 | 57 | def decode(self, batch, h, c, y=None, training=None): 58 | if training: #将上一时刻的标签作为当前时刻的输入 59 | y = self.dec_embeddings(y) 60 | y, h, c = self.decoder(y, (h, c)) 61 | y = self.decoder_dense(y) 62 | else:#将上一时刻的输出作为当前时刻的输入 63 | y = [] 64 | o = tf.zeros((batch, 1, self.dec_v_dim)) 65 | for i in range(self.max_pred_len): 66 | o = o @ self.dec_embeddings.weights 67 | o, h, c = self.decoder(o, (h, c)) 68 | o = self.decoder_dense(o) 69 | y.append(o) 70 | y = tf.concat(y, 1) 71 | return y 72 | 73 | # @tf.function 74 | def call(self, inputs, training=None, mask=None): 75 | x = inputs[0] 76 | y = inputs[1] 77 | if training: 78 | y = tf.pad(y[:, :-1], [[0, 0], [1, 0]]) 79 | h, c = self.encode(x) 80 | batch = tf.shape(x)[0] 81 | y = self.decode(batch, h, c, y, training) 82 | return y 83 | 84 | 85 | class myTensorboard(keras.callbacks.TensorBoard): 86 | def __init__(self, data, log_dir='logs/seq2seq', histogram_freq=1, write_graph=True, write_images=True, 87 | embeddings_freq=10, **kwargs): 88 | self.data = data 89 | super().__init__(log_dir=log_dir, histogram_freq=histogram_freq, write_graph=write_graph, 90 | write_images=write_images, embeddings_freq=embeddings_freq, **kwargs) 91 | def on_epoch_end(self, epoch, logs=None): 92 | if (not epoch % 1): 93 | x, y, l = self.data.sample(1) 94 | y_ = self.model((x, y), training=False) 95 | y_ = tf.argmax(y_, -1).numpy() 96 | target = self.data.idx2str(y[0]) 97 | res = self.data.idx2str(y_[0]) 98 | src = self.data.idx2str(x[0]) 99 | print( 100 | '\n', 101 | "t: ", epoch, 102 | 
"| input: ", src, 103 | "| target: ", target, 104 | "| inference: ", res, 105 | ) 106 | super(myTensorboard, self).on_epoch_end(epoch, logs) 107 | 108 | 109 | def train(): 110 | # get and process data 111 | data = utils.DateData(DataSize) 112 | train_x, train_y, train_l = data.sample(DataSize) 113 | 114 | print("Chinese time order: yy/mm/dd ", data.date_cn[:3], "\nEnglish time order: dd/M/yyyy ", data.date_en[:3]) 115 | print("vocabularies: ", data.vocab) 116 | print("x index sample: \n{}\n{}".format(data.idx2str(data.x[0]), data.x[0]), 117 | "\ny index sample: \n{}\n{}".format(data.idx2str(data.y[0]), data.y[0])) 118 | 119 | model = Seq2Seq( 120 | data.num_word, data.num_word, emb_dim=16, units=32, 121 | max_pred_len=11, start_token=data.start_token, end_token=data.end_token) 122 | model.compile(optimizer=keras.optimizers.Adam(Learn_rate), loss=keras.losses.SparseCategoricalCrossentropy(False), 123 | metrics=[keras.metrics.sparse_categorical_accuracy]) 124 | model.fit((train_x, train_y), train_y, callbacks=[myTensorboard(data)], batch_size=Batch_size, epochs=Epochs) 125 | 126 | 127 | if __name__ == "__main__": 128 | train() 129 | -------------------------------------------------------------------------------- /simple_realize/seq2seq_attention.py: -------------------------------------------------------------------------------- 1 | # [Effective Approaches to Attention-based Neural Machine Translation](https://arxiv.org/pdf/1508.04025.pdf) 2 | ''' 3 | created by YuYang github.com/W1Fl 4 | ''' 5 | import tensorflow as tf 6 | from tensorflow import keras 7 | import utils 8 | 9 | Batch_size = 64 10 | Learn_rate = 0.01 11 | Epochs = 15 12 | DataSize = 8192 13 | use_attention=True 14 | 15 | 16 | 17 | class Attention(keras.layers.Layer): 18 | def __init__(self,dec_v_dim,**kwargs): 19 | super(Attention, self).__init__(**kwargs) 20 | self.dec_v_dim=dec_v_dim 21 | 22 | def build(self, input_shape): 23 | hs_shape,ht_shape=input_shape 24 | units=hs_shape[2] 25 | super(Attention, self).build(input_shape) 26 | self.Wa=self.add_weight('Wa',[units,units],tf.float32,keras.initializers.RandomNormal(),trainable=True) 27 | self.Wc=self.add_weight('Wc',[units*2,units],tf.float32,keras.initializers.RandomNormal(),trainable=True) 28 | self.dense=keras.layers.Dense(self.dec_v_dim,activation=keras.activations.softmax) 29 | 30 | def call(self,inputs, **kwargs): 31 | hs,ht=inputs #encoder输出序列[banch,enc_len,units],decoder输出[b,dec_len,units] 32 | # dec_len = 1 if not training 33 | score=ht@self.Wa@tf.transpose(hs,[0,2,1])#[banch,1,enc_len] 34 | at=tf.nn.softmax(score,name='attentionValue') 35 | ct=at@hs 36 | ht_=tf.nn.tanh(tf.concat([ct,ht],2)@self.Wc) 37 | ht_=ht_ if use_attention else ht 38 | y=self.dense(ht_) #ht_.shape==ht.shape 39 | return y,at 40 | 41 | 42 | 43 | class Seq2Seq(keras.Model): 44 | def __init__(self, enc_v_dim, dec_v_dim, emb_dim, units, max_pred_len, start_token, end_token): 45 | super().__init__() 46 | self.enc_v_dim = enc_v_dim 47 | self.emb_dim = emb_dim 48 | self.units = units 49 | self.dec_v_dim = dec_v_dim 50 | self.max_pred_len = max_pred_len 51 | self.start_token = start_token 52 | self.end_token = end_token 53 | 54 | def build(self, input_shape): 55 | # encoder 56 | self.enc_embeddings = keras.layers.Embedding( 57 | input_dim=self.enc_v_dim, 58 | output_dim=self.emb_dim, # [enc_n_vocab, emb_dim] 59 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.1), 60 | name='encoder/embeddings' 61 | ) 62 | self.encoder = keras.layers.LSTM(units=self.units, 
return_state=True,return_sequences=True, name='encoder/LSTM') 63 | 64 | # decoder 65 | self.dec_embeddings = keras.layers.Embedding( 66 | input_dim=self.dec_v_dim, output_dim=self.emb_dim, # [dec_n_vocab, emb_dim] 67 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.1), 68 | name='decoder/embeddings' 69 | 70 | ) 71 | self.dec_embeddings.build((None, self.dec_v_dim)) 72 | self.decoder = keras.layers.LSTM(units=self.units, return_state=True, return_sequences=True, 73 | name='decoder/LSTM') 74 | self.decoder_dense = keras.layers.Dense(self.dec_v_dim, activation=keras.activations.softmax, 75 | name='decoder/Dense') 76 | self.attention=Attention(self.dec_v_dim) 77 | self.batch = input_shape[0][0] 78 | super(Seq2Seq, self).build([*input_shape]) 79 | 80 | def encode(self, x): 81 | embedded = self.enc_embeddings(x) 82 | o, h, c = self.encoder(embedded) 83 | return o,h, c 84 | 85 | def decode(self, batch,enc_o, h, c, y=None, training=None): 86 | if training: #将上一时刻的标签作为当前时刻的输入 87 | y = self.dec_embeddings(y) 88 | y, h, c = self.decoder(y, (h, c)) 89 | y,at = self.attention((enc_o,y)) 90 | else:#将上一时刻的输出作为当前时刻的输入 91 | y = [] 92 | o = tf.zeros((batch, 1, self.dec_v_dim)) 93 | for i in range(self.max_pred_len): 94 | o = o @ self.dec_embeddings.weights 95 | o, h, c = self.decoder(o, (h, c)) 96 | o,at = self.attention((enc_o,o)) 97 | y.append(o) 98 | y = tf.concat(y, 1) 99 | return y 100 | 101 | # @tf.function 102 | def call(self, inputs, training=None, mask=None): 103 | x = inputs[0] 104 | y = inputs[1] 105 | if training: 106 | y = tf.pad(y[:, :-1], [[0, 0], [1, 0]]) 107 | o,h, c = self.encode(x) 108 | batch = tf.shape(x)[0] 109 | y = self.decode(batch,o, h, c, y, training) 110 | return y 111 | 112 | 113 | class myTensorboard(keras.callbacks.TensorBoard): 114 | def __init__(self, data, log_dir='logs/seq2seq_attention', histogram_freq=1, write_graph=True, write_images=True, 115 | embeddings_freq=10, **kwargs): 116 | self.data = data 117 | super().__init__(log_dir=log_dir, histogram_freq=histogram_freq, write_graph=write_graph, 118 | write_images=write_images, embeddings_freq=embeddings_freq, **kwargs) 119 | def on_epoch_end(self, epoch, logs=None): 120 | if (not epoch % 1): 121 | x, y, l = self.data.sample(1) 122 | y_ = self.model((x, y), training=False) 123 | y_ = tf.argmax(y_, -1).numpy() 124 | target = self.data.idx2str(y[0]) 125 | res = self.data.idx2str(y_[0]) 126 | src = self.data.idx2str(x[0]) 127 | print( 128 | '\n', 129 | "t: ", epoch, 130 | "| input: ", src, 131 | "| target: ", target, 132 | "| inference: ", res, 133 | ) 134 | super(myTensorboard, self).on_epoch_end(epoch, logs) 135 | 136 | 137 | def train(): 138 | # get and process data 139 | data = utils.DateData(DataSize) 140 | train_x, train_y, train_l = data.sample(DataSize) 141 | 142 | print("Chinese time order: yy/mm/dd ", data.date_cn[:3], "\nEnglish time order: dd/M/yyyy ", data.date_en[:3]) 143 | print("vocabularies: ", data.vocab) 144 | print("x index sample: \n{}\n{}".format(data.idx2str(data.x[0]), data.x[0]), 145 | "\ny index sample: \n{}\n{}".format(data.idx2str(data.y[0]), data.y[0])) 146 | 147 | model = Seq2Seq( 148 | data.num_word, data.num_word, emb_dim=16, units=32, 149 | max_pred_len=11, start_token=data.start_token, end_token=data.end_token) 150 | model.compile(optimizer=keras.optimizers.Adam(Learn_rate), loss=keras.losses.SparseCategoricalCrossentropy(False), 151 | metrics=[keras.metrics.sparse_categorical_accuracy]) 152 | model.fit((train_x, train_y), train_y, callbacks=[myTensorboard(data)], 
batch_size=Batch_size, epochs=Epochs) 153 | 154 | 155 | if __name__ == "__main__": 156 | train() -------------------------------------------------------------------------------- /simple_realize/skip-gram.py: -------------------------------------------------------------------------------- 1 | # [Efficient Estimation of Word Representations in Vector Space](https://arxiv.org/pdf/1301.3781.pdf) 2 | ''' 3 | created by YuYang github.com/W1Fl 4 | ''' 5 | from io import BytesIO 6 | 7 | import imageio 8 | import matplotlib.pyplot as plt 9 | import tensorflow as tf 10 | from tensorflow import keras 11 | 12 | from utils import process_w2v_data 13 | 14 | Batch_size = 32 15 | Learn_rate = 0.01 16 | Epochs = 256 17 | DataSize = 512 18 | 19 | corpus = [ 20 | # numbers 21 | "5 2 4 8 6 2 3 6 4", 22 | "4 8 5 6 9 5 5 6", 23 | "1 1 5 2 3 3 8", 24 | "3 6 9 6 8 7 4 6 3", 25 | "8 9 9 6 1 4 3 4", 26 | "1 0 2 0 2 1 3 3 3 3 3", 27 | "9 3 3 0 1 4 7 8", 28 | "9 9 8 5 6 7 1 2 3 0 1 0", 29 | 30 | # alphabets, expecting that 9 is close to letters 31 | "a t g q e h 9 u f", 32 | "e q y u o i p s", 33 | "q o 9 p l k j o k k o p", 34 | "h g y i u t t a e q", 35 | "i k d q r e 9 e a d", 36 | "o p d g 9 s a f g a", 37 | "i u y g h k l a s w", 38 | "o l u y a o g f s", 39 | "o p i u y g d a s j d l", 40 | "u k i l o 9 l j s", 41 | "y g i s h k j l f r f", 42 | "i o h n 9 9 d 9 f a 9", 43 | ] 44 | 45 | SkipGram = lambda v_dim, emb_dim: keras.Sequential([ 46 | keras.layers.Embedding( 47 | input_dim=v_dim, output_dim=emb_dim, # [n_vocab, emb_dim] 48 | embeddings_initializer=keras.initializers.RandomNormal(0., 0.1), 49 | ), 50 | keras.layers.Flatten() 51 | ]) 52 | 53 | 54 | class myTensorboard(keras.callbacks.TensorBoard): 55 | def __init__(self, data, log_dir='logs/skip-gram', histogram_freq=1, write_graph=True, write_images=True, 56 | embeddings_freq=10, **kwargs): 57 | super().__init__(log_dir=log_dir, histogram_freq=histogram_freq, write_graph=write_graph, 58 | write_images=write_images, embeddings_freq=embeddings_freq, **kwargs) 59 | self.buffer = BytesIO() 60 | self.data = data 61 | 62 | 63 | def plot(self, data): 64 | word_emb = model.layers[0].get_weights()[0] 65 | for i in range(data.num_word): 66 | c = "blue" 67 | try: 68 | int(data.i2v[i]) 69 | except ValueError: 70 | c = "red" 71 | plt.text(word_emb[i, 0], word_emb[i, 1], s=data.i2v[i], color=c, weight="bold") 72 | plt.xlim(word_emb[:, 0].min() - .5, word_emb[:, 0].max() + .5) 73 | plt.ylim(word_emb[:, 1].min() - .5, word_emb[:, 1].max() + .5) 74 | plt.xticks(()) 75 | plt.yticks(()) 76 | plt.xlabel("embedding dim1") 77 | plt.ylabel("embedding dim2") 78 | plt.savefig(self.buffer, format='png') 79 | plt.close() 80 | self.buffer.seek(0) 81 | 82 | def on_epoch_end(self, epoch, logs=None): 83 | writer = self._get_writer(self._train_run_name) 84 | if (not epoch % 1): 85 | self.plot(self.data) 86 | with writer.as_default(): 87 | tf.summary.image('embedding', imageio.imread(self.buffer)[None, :], step=epoch) 88 | self.buffer.seek(0) 89 | super(myTensorboard, self).on_epoch_end(epoch, logs) 90 | 91 | 92 | class nce_loss(keras.losses.Loss): 93 | # negative sampling: take one positive label and num_sampled negative labels to compute the loss 94 | # in order to reduce the computation of full softmax 95 | def __init__(self, model, v_dim, emb_dim): 96 | super(nce_loss, self).__init__() 97 | # noise-contrastive estimation 98 | self.nce_w = model.add_weight( 99 | name="nce_w", shape=[v_dim, emb_dim], 100 | initializer=keras.initializers.TruncatedNormal(0., 0.1)) # [n_vocab, emb_dim] 
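# ---- illustrative sketch (standalone; not part of simple_realize/skip-gram.py) ----
# The "nce_w" matrix above and the "nce_b" bias created just below are the output-side
# parameters that tf.nn.nce_loss uses in place of a full softmax: each (centre, context)
# pair is scored against the true context word plus only num_sampled randomly drawn
# negative words. A minimal sketch of that call with made-up toy sizes (v_dim=10,
# emb_dim=2, a batch of 4 centre-word embeddings); all variable names here are
# assumptions for illustration only.
import tensorflow as tf
v_dim, emb_dim = 10, 2
nce_w = tf.random.truncated_normal([v_dim, emb_dim], stddev=0.1)  # per-word output vectors
nce_b = tf.fill([v_dim], 0.1)                                     # per-word output biases
embedded = tf.random.normal([4, emb_dim])                         # centre-word embeddings
labels = tf.constant([[3], [1], [7], [2]], dtype=tf.int64)        # true context word ids
loss = tf.reduce_mean(tf.nn.nce_loss(
    weights=nce_w, biases=nce_b, labels=labels,
    inputs=embedded, num_sampled=5, num_classes=v_dim))
print(float(loss))  # scalar NCE loss for this toy batch
# ------------------------------------------------------------------------------------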
101 | self.nce_b = model.add_weight( 102 | name="nce_b", shape=(v_dim,), 103 | initializer=keras.initializers.Constant(0.1)) # [n_vocab, ] 104 | self.v_dim = v_dim 105 | 106 | def call(self, y_true, y_pred): 107 | # return keras.losses.SparseCategoricalCrossentropy()(y_true,y_pred) 108 | return tf.nn.nce_loss( 109 | weights=self.nce_w, biases=self.nce_b, labels=y_true, 110 | inputs=y_pred, num_sampled=5, num_classes=self.v_dim) 111 | 112 | 113 | if __name__ == "__main__": 114 | d = process_w2v_data(corpus, skip_window=2, method="skip_gram") 115 | bx, by = d.sample(DataSize) 116 | model = SkipGram(d.num_word, 2) 117 | model.compile(optimizer=keras.optimizers.Adam(Learn_rate), loss=nce_loss(model, d.num_word, 2)) 118 | model.fit(bx[..., None], by[..., None], Batch_size, Epochs, callbacks=[myTensorboard(d)], verbose=2) 119 | 120 | #use tensorboard --logdir logs --samples_per_plugin=images=255 to show all images 121 | -------------------------------------------------------------------------------- /simple_realize/transformer.py: -------------------------------------------------------------------------------- 1 | # [Attention Is All You Need](https://arxiv.org/pdf/1706.03762.pdf) 2 | ''' 3 | created by YuYang github.com/W1Fl 4 | ''' 5 | import numpy as np 6 | import tensorflow as tf 7 | from tensorflow import keras 8 | 9 | import utils 10 | 11 | MODEL_DIM = 32 12 | MAX_LEN = 12 13 | N_LAYER = 3 14 | N_HEAD = 4 15 | DATA_SIZE = 6400 16 | BATCH_SIZE = 64 17 | LEARN_RATE = 0.001 18 | EPOCHS = 60 19 | 20 | 21 | class MultiHead(keras.layers.Layer): 22 | def __init__(self, n_head): 23 | super().__init__() 24 | self.n_head = n_head 25 | self.attention = None 26 | 27 | def build(self, input_shape): 28 | (q_b, q_t, q_f), (k_b, k_t, k_f), (v_b, v_t, v_f) = input_shape 29 | self.k_f = tf.cast(q_f, tf.float32) 30 | h_dim = q_f // self.n_head 31 | self.wq = self.add_weight('wq', [self.n_head, q_f, h_dim]) 32 | self.wk = self.add_weight('wk', [self.n_head, k_f, h_dim]) 33 | self.wv = self.add_weight('wv', [self.n_head, v_f, h_dim]) 34 | self.wo = self.add_weight('wo', [self.n_head * h_dim, v_f]) 35 | super(MultiHead, self).build(input_shape) 36 | 37 | def call(self, inputs, mask=None, **kwargs): 38 | i_q, i_k, i_v = [i[:, tf.newaxis, ...] 
for i in inputs] # add multihead axis 39 | q = i_q @ self.wq # [b,h,s,h_dim] 40 | k = i_k @ self.wk 41 | v = i_v @ self.wv 42 | s = q @ tf.transpose(k, [0, 1, 3, 2]) / (tf.math.sqrt(self.k_f) + 1e-8) 43 | if mask is not None: 44 | s += mask * -1e9 45 | a = tf.nn.softmax(s) 46 | self.attention = a 47 | b = a @ v 48 | o = tf.concat(tf.unstack(b, axis=1), 2) @ self.wo 49 | return o 50 | 51 | 52 | class PositionWiseFFN(keras.layers.Layer): 53 | def build(self, input_shape): 54 | model_dim = input_shape[-1] 55 | dff = model_dim * 4 56 | self.l = keras.layers.Dense(dff, activation=keras.activations.relu) 57 | self.o = keras.layers.Dense(model_dim) 58 | super(PositionWiseFFN, self).build(input_shape) 59 | 60 | def call(self, x, **kwargs): 61 | o = self.l(x) 62 | o = self.o(o) 63 | return o # [n, step, dim] 64 | 65 | 66 | class EncodeLayer(keras.layers.Layer): 67 | def __init__(self, n_head): 68 | self.n_head = n_head 69 | super().__init__() 70 | 71 | def build(self, input_shape): 72 | model_dim = input_shape[-1] 73 | self.ln = [keras.layers.LayerNormalization() for _ in range(2)] 74 | self.mh = MultiHead(self.n_head) 75 | self.ffn = PositionWiseFFN(model_dim) 76 | super(EncodeLayer, self).build(input_shape) 77 | 78 | def call(self, inputs, mask=None, **kwargs): 79 | attn = self.mh([inputs] * 3, mask) # [n, step, dim] 80 | o1 = self.ln[0](attn + inputs) 81 | ffn = self.ffn(o1) 82 | o = self.ln[1](ffn + o1) # [n, step, dim] 83 | return o 84 | 85 | 86 | class Encoder(keras.layers.Layer): 87 | def __init__(self, n_head, n_layer): 88 | super().__init__() 89 | self.n_layer = n_layer 90 | self.n_head = n_head 91 | 92 | def build(self, input_shape): 93 | self.ls = [EncodeLayer(self.n_head) for _ in range(self.n_layer)] 94 | super(Encoder, self).build(input_shape) 95 | 96 | def call(self, inputs, mask=None, **kwargs): 97 | xz = inputs 98 | for l in self.ls: 99 | xz = l(xz, mask) 100 | return xz # [n, step, dim] 101 | 102 | 103 | class DecoderLayer(keras.layers.Layer): 104 | def __init__(self, n_head): 105 | super().__init__() 106 | self.n_head = n_head 107 | 108 | def build(self, input_shape): 109 | self.mh = [MultiHead(self.n_head) for _ in range(2)] 110 | self.ffn = PositionWiseFFN(input_shape[-1]) 111 | self.ln = [keras.layers.LayerNormalization() for i in range(3)] 112 | super(DecoderLayer, self).build(input_shape) 113 | 114 | def call(self, inputs, look_ahead_mask=None, pad_mask=None, **kwargs): 115 | xz, yz = inputs 116 | attn = self.mh[0]([yz] * 3, mask=look_ahead_mask) # decoder self attention 117 | o1 = self.ln[0](attn + yz) 118 | attn = self.mh[1]([o1, xz, xz], mask=pad_mask) # decoder + encoder attention 119 | o2 = self.ln[1](attn + o1) 120 | ffn = self.ffn(o2) 121 | o = self.ln[2](ffn + o2) 122 | return o 123 | 124 | 125 | class Decoder(keras.layers.Layer): 126 | def __init__(self, n_head, n_layer): 127 | super().__init__() 128 | self.n_head = n_head 129 | self.n_layer = n_layer 130 | 131 | def build(self, input_shape): 132 | self.ls = [DecoderLayer(self.n_head) for _ in range(self.n_layer)] 133 | super(Decoder, self).build(input_shape) 134 | 135 | def call(self, inputs, look_ahead_mask=None, pad_mask=None): 136 | xz, yz = inputs 137 | for l in self.ls: 138 | yz = l((xz, yz), look_ahead_mask, pad_mask) 139 | return yz 140 | 141 | 142 | class PositionEmbedding(keras.layers.Layer): 143 | def __init__(self, max_len, model_dim, n_vocab): 144 | super().__init__() 145 | self.n_vocab = n_vocab 146 | self.max_len = max_len 147 | self.model_dim = model_dim 148 | 149 | def build(self, input_shape): 150 
| pos = np.arange(self.max_len)[:, None] 151 | pe = pos / np.power(10000, 2. * np.arange(self.model_dim)[None, :] / self.model_dim) # [max_len, dim] 152 | pe[:, 0::2] = np.sin(pe[:, 0::2]) 153 | pe[:, 1::2] = np.cos(pe[:, 1::2]) 154 | pe = pe[None, :, :] # [1, max_len, model_dim] for batch adding 155 | self.pe = tf.constant(pe, dtype=tf.float32) 156 | self.embeddings = keras.layers.Embedding( 157 | input_dim=self.n_vocab, output_dim=self.model_dim, # [n_vocab, dim] 158 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.01), 159 | ) 160 | super(PositionEmbedding, self).build(input_shape) 161 | 162 | def call(self, x, **kwargs): 163 | x_embed = self.embeddings(x) + self.pe # [n, step, dim] 164 | return x_embed 165 | 166 | 167 | class Transformer(keras.Model): 168 | def __init__(self, model_dim, max_len, n_encoder_layer, n_decoder_layer, n_head, n_vocab, padding_idx=0): 169 | super().__init__() 170 | self.n_vocab = n_vocab 171 | self.n_decoder_layer = n_decoder_layer 172 | self.n_encoder_layer = n_encoder_layer 173 | self.n_head = n_head 174 | self.model_dim = model_dim 175 | self.max_len = max_len 176 | self.padding_idx = padding_idx 177 | 178 | def build(self, input_shape): 179 | self.embed = PositionEmbedding(self.max_len, self.model_dim, self.n_vocab) 180 | self.encoder = Encoder(self.n_head, self.n_encoder_layer) 181 | self.decoder = Decoder(self.n_head, self.n_decoder_layer) 182 | self.o = keras.layers.Dense(self.n_vocab) 183 | super(Transformer, self).build(input_shape) 184 | 185 | def call(self, inputs, training=None, **kwargs): 186 | x, y = inputs 187 | x_embed, y_embed = self.embed(x), self.embed(y) 188 | pad_mask = self._pad_mask(x) 189 | encoded_z = self.encoder(x_embed, mask=pad_mask) 190 | decoded_z = self.decoder( 191 | (encoded_z, y_embed), look_ahead_mask=self._look_ahead_mask(y), pad_mask=pad_mask) 192 | o = self.o(decoded_z) 193 | return o 194 | 195 | def _pad_mask(self, seqs): 196 | mask = tf.cast(tf.math.equal(seqs, self.padding_idx), tf.float32) 197 | return mask[:, tf.newaxis, tf.newaxis, :] # (n, 1, 1, step) 198 | 199 | def _look_ahead_mask(self, seqs): 200 | mask = 1. 
- tf.linalg.band_part(tf.ones((self.max_len, self.max_len)), -1, 0) 201 | pad_mask = self._pad_mask(seqs) 202 | mask = tf.sign(pad_mask + mask[tf.newaxis, tf.newaxis, ...]) 203 | return mask # (step, step) 204 | 205 | def translate(self, src, i2v, v2i): 206 | src = tf.reshape(src, (-1, src.shape[-1])) 207 | src_pad = utils.pad_zero(src, self.max_len) 208 | tgt = utils.pad_zero(v2i[""] * tf.ones_like(src), self.max_len + 1) 209 | tgti = 0 210 | x_embed = self.embed(src_pad) 211 | encoded_z = self.encoder(x_embed, mask=self._pad_mask(src_pad)) 212 | while True: 213 | y = tgt[:, :-1] 214 | y_embed = self.embed(y) 215 | decoded_z = self.decoder( 216 | (encoded_z, y_embed), look_ahead_mask=self._look_ahead_mask(y), pad_mask=self._pad_mask(src_pad)) 217 | logit = self.o(decoded_z)[:, tgti, :].numpy() 218 | idx = np.argmax(logit, 1) 219 | tgti += 1 220 | tgt[:, tgti] = idx 221 | if tgti >= self.max_len: 222 | break 223 | return ["".join([i2v[i] for i in tgt[j, 1:tgti]]) for j in range(len(src))] 224 | 225 | 226 | class Loss(keras.losses.Loss): 227 | def __init__(self, padding_idx=0): 228 | super().__init__() 229 | self.padding_idx = padding_idx 230 | self.crossentropy = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none") 231 | 232 | def call(self, y_true, y_pred): 233 | y_true = tf.reshape(y_true, [tf.shape(y_true)[0], tf.shape(y_true)[1]]) 234 | pad_mask = tf.math.not_equal(y_true, self.padding_idx) 235 | loss = tf.reduce_mean(tf.boolean_mask(self.crossentropy(y_true, y_pred), pad_mask)) 236 | return loss 237 | 238 | 239 | class myTensorboard(keras.callbacks.TensorBoard): 240 | def __init__(self, data, log_dir='logs/transformer', histogram_freq=1, write_graph=True, write_images=True, 241 | embeddings_freq=10, **kwargs): 242 | self.data = data 243 | super().__init__(log_dir=log_dir, histogram_freq=histogram_freq, write_graph=write_graph, 244 | write_images=write_images, embeddings_freq=embeddings_freq, **kwargs) 245 | 246 | def on_epoch_end(self, epoch, logs=None): 247 | idx2str=lambda idx:[self.data.idx2str(i) for i in idx] 248 | if (not epoch % 1): 249 | (x, y), _ = load_data(self.data,3) 250 | res = self.model.translate(x, self.data.i2v, self.data.v2i) 251 | target =idx2str(y) 252 | src = idx2str(x) 253 | print( 254 | '\n', 255 | "| input: ", *src,'\n', 256 | "| target: ",*target,'\n', 257 | "| inference: ", *res,'\n', 258 | ) 259 | super(myTensorboard, self).on_epoch_end(epoch, logs) 260 | 261 | 262 | def load_data(data,size): 263 | x, y, seq_len = data.sample(size) 264 | x = utils.pad_zero(x, MAX_LEN) 265 | y = utils.pad_zero(y, MAX_LEN + 1) 266 | return (x, y[:, :-1]), y[:, 1:] 267 | 268 | 269 | def train(model: Transformer, data): 270 | x, y = load_data(data,DATA_SIZE) 271 | tb = myTensorboard(data) 272 | model.compile(keras.optimizers.Adam(LEARN_RATE), loss=Loss()) 273 | model.fit(x, y, batch_size=BATCH_SIZE, epochs=EPOCHS, callbacks=[tb]) 274 | 275 | 276 | if __name__ == "__main__": 277 | d = utils.DateData(DATA_SIZE) 278 | print("Chinese time order: yy/mm/dd ", d.date_cn[:3], "\nEnglish time order: dd/M/yyyy ", d.date_en[:3]) 279 | print("vocabularies: ", d.vocab) 280 | print("x index sample: \n{}\n{}".format(d.idx2str(d.x[0]), d.x[0]), 281 | "\ny index sample: \n{}\n{}".format(d.idx2str(d.y[0]), d.y[0])) 282 | m = Transformer(MODEL_DIM, MAX_LEN, N_LAYER, N_LAYER, N_HEAD, d.num_word) 283 | m.build([[None, 12], [None, 12]]) 284 | train(m, d) 285 | -------------------------------------------------------------------------------- /skip-gram.py: 
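# ---- illustrative sketch (standalone; refers to simple_realize/transformer.py just above) ----
# _look_ahead_mask in the Transformer above merges a strictly upper-triangular causal mask
# with the padding mask, so a decoder position t can attend only to positions <= t and never
# to padding. A minimal check with an assumed max_len=5 and one sequence that carries two
# trailing padding zeros (padding_idx=0):
import tensorflow as tf
max_len = 5
seqs = tf.constant([[4, 7, 2, 0, 0]])                                   # 0 = padding id
causal = 1. - tf.linalg.band_part(tf.ones((max_len, max_len)), -1, 0)   # 1 above the diagonal
pad = tf.cast(tf.math.equal(seqs, 0), tf.float32)[:, tf.newaxis, tf.newaxis, :]
mask = tf.sign(pad + causal[tf.newaxis, tf.newaxis, ...])               # 1 where attention is blocked
print(mask[0, 0])  # row t: zeros up to column t, ones for future positions and for padding
# ---------------------------------------------------------------------------------------------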
-------------------------------------------------------------------------------- 1 | # [Efficient Estimation of Word Representations in Vector Space](https://arxiv.org/pdf/1301.3781.pdf) 2 | import tensorflow as tf 3 | from tensorflow import keras 4 | from utils import process_w2v_data # this refers to utils.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/) 5 | from visual import show_w2v_word_embedding # this refers to visual.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/) 6 | 7 | corpus = [ 8 | # numbers 9 | "5 2 4 8 6 2 3 6 4", 10 | "4 8 5 6 9 5 5 6", 11 | "1 1 5 2 3 3 8", 12 | "3 6 9 6 8 7 4 6 3", 13 | "8 9 9 6 1 4 3 4", 14 | "1 0 2 0 2 1 3 3 3 3 3", 15 | "9 3 3 0 1 4 7 8", 16 | "9 9 8 5 6 7 1 2 3 0 1 0", 17 | 18 | # alphabets, expecting that 9 is close to letters 19 | "a t g q e h 9 u f", 20 | "e q y u o i p s", 21 | "q o 9 p l k j o k k o p", 22 | "h g y i u t t a e q", 23 | "i k d q r e 9 e a d", 24 | "o p d g 9 s a f g a", 25 | "i u y g h k l a s w", 26 | "o l u y a o g f s", 27 | "o p i u y g d a s j d l", 28 | "u k i l o 9 l j s", 29 | "y g i s h k j l f r f", 30 | "i o h n 9 9 d 9 f a 9", 31 | ] 32 | 33 | 34 | class SkipGram(keras.Model): 35 | def __init__(self, v_dim, emb_dim): 36 | super().__init__() 37 | self.v_dim = v_dim 38 | self.embeddings = keras.layers.Embedding( 39 | input_dim=v_dim, output_dim=emb_dim, # [n_vocab, emb_dim] 40 | embeddings_initializer=keras.initializers.RandomNormal(0., 0.1), 41 | ) 42 | 43 | # noise-contrastive estimation 44 | self.nce_w = self.add_weight( 45 | name="nce_w", shape=[v_dim, emb_dim], 46 | initializer=keras.initializers.TruncatedNormal(0., 0.1)) # [n_vocab, emb_dim] 47 | self.nce_b = self.add_weight( 48 | name="nce_b", shape=(v_dim,), 49 | initializer=keras.initializers.Constant(0.1)) # [n_vocab, ] 50 | 51 | self.opt = keras.optimizers.Adam(0.01) 52 | 53 | def call(self, x, training=None, mask=None): 54 | # x.shape = [n, ] 55 | o = self.embeddings(x) # [n, emb_dim] 56 | return o 57 | 58 | # negative sampling: take one positive label and num_sampled negative labels to compute the loss 59 | # in order to reduce the computation of full softmax 60 | def loss(self, x, y, training=None): 61 | embedded = self.call(x, training) 62 | return tf.reduce_mean( 63 | tf.nn.nce_loss( 64 | weights=self.nce_w, biases=self.nce_b, labels=tf.expand_dims(y, axis=1), 65 | inputs=embedded, num_sampled=5, num_classes=self.v_dim)) 66 | 67 | def step(self, x, y): 68 | with tf.GradientTape() as tape: 69 | loss = self.loss(x, y, True) 70 | grads = tape.gradient(loss, self.trainable_variables) 71 | self.opt.apply_gradients(zip(grads, self.trainable_variables)) 72 | return loss.numpy() 73 | 74 | 75 | def train(model, data): 76 | for t in range(2500): 77 | bx, by = data.sample(8) 78 | loss = model.step(bx, by) 79 | if t % 200 == 0: 80 | print("step: {} | loss: {}".format(t, loss)) 81 | 82 | 83 | if __name__ == "__main__": 84 | d = process_w2v_data(corpus, skip_window=2, method="skip_gram") 85 | m = SkipGram(d.num_word, 2) 86 | train(m, d) 87 | 88 | # plotting 89 | show_w2v_word_embedding(m, d, "./visual/results/skipgram.png") -------------------------------------------------------------------------------- /tf_idf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import Counter 3 | import itertools 4 | from visual import show_tfidf # this refers to visual.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/) 5 | 6 | docs = [ 7 | "it is a good day, I like to 
stay here", 8 | "I am happy to be here", 9 | "I am bob", 10 | "it is sunny today", 11 | "I have a party today", 12 | "it is a dog and that is a cat", 13 | "there are dog and cat on the tree", 14 | "I study hard this morning", 15 | "today is a good day", 16 | "tomorrow will be a good day", 17 | "I like coffee, I like book and I like apple", 18 | "I do not like it", 19 | "I am kitty, I like bob", 20 | "I do not care who like bob, but I like kitty", 21 | "It is coffee time, bring your cup", 22 | ] 23 | 24 | docs_words = [d.replace(",", "").split(" ") for d in docs] 25 | vocab = set(itertools.chain(*docs_words)) 26 | v2i = {v: i for i, v in enumerate(vocab)} 27 | i2v = {i: v for v, i in v2i.items()} 28 | 29 | 30 | def safe_log(x): 31 | mask = x != 0 32 | x[mask] = np.log(x[mask]) 33 | return x 34 | 35 | 36 | tf_methods = { 37 | "log": lambda x: np.log(1+x), 38 | "augmented": lambda x: 0.5 + 0.5 * x / np.max(x, axis=1, keepdims=True), 39 | "boolean": lambda x: np.minimum(x, 1), 40 | "log_avg": lambda x: (1 + safe_log(x)) / (1 + safe_log(np.mean(x, axis=1, keepdims=True))), 41 | } 42 | idf_methods = { 43 | "log": lambda x: 1 + np.log(len(docs) / (x+1)), 44 | "prob": lambda x: np.maximum(0, np.log((len(docs) - x) / (x+1))), 45 | "len_norm": lambda x: x / (np.sum(np.square(x))+1), 46 | } 47 | 48 | 49 | def get_tf(method="log"): 50 | # term frequency: how frequent a word appears in a doc 51 | _tf = np.zeros((len(vocab), len(docs)), dtype=np.float64) # [n_vocab, n_doc] 52 | for i, d in enumerate(docs_words): 53 | counter = Counter(d) 54 | for v in counter.keys(): 55 | _tf[v2i[v], i] = counter[v] / counter.most_common(1)[0][1] 56 | 57 | weighted_tf = tf_methods.get(method, None) 58 | if weighted_tf is None: 59 | raise ValueError 60 | return weighted_tf(_tf) 61 | 62 | 63 | def get_idf(method="log"): 64 | # inverse document frequency: low idf for a word appears in more docs, mean less important 65 | df = np.zeros((len(i2v), 1)) 66 | for i in range(len(i2v)): 67 | d_count = 0 68 | for d in docs_words: 69 | d_count += 1 if i2v[i] in d else 0 70 | df[i, 0] = d_count 71 | 72 | idf_fn = idf_methods.get(method, None) 73 | if idf_fn is None: 74 | raise ValueError 75 | return idf_fn(df) 76 | 77 | 78 | def cosine_similarity(q, _tf_idf): 79 | unit_q = q / np.sqrt(np.sum(np.square(q), axis=0, keepdims=True)) 80 | unit_ds = _tf_idf / np.sqrt(np.sum(np.square(_tf_idf), axis=0, keepdims=True)) 81 | similarity = unit_ds.T.dot(unit_q).ravel() 82 | return similarity 83 | 84 | 85 | def docs_score(q, len_norm=False): 86 | q_words = q.replace(",", "").split(" ") 87 | 88 | # add unknown words 89 | unknown_v = 0 90 | for v in set(q_words): 91 | if v not in v2i: 92 | v2i[v] = len(v2i) 93 | i2v[len(v2i)-1] = v 94 | unknown_v += 1 95 | if unknown_v > 0: 96 | _idf = np.concatenate((idf, np.zeros((unknown_v, 1), dtype=np.float)), axis=0) 97 | _tf_idf = np.concatenate((tf_idf, np.zeros((unknown_v, tf_idf.shape[1]), dtype=np.float)), axis=0) 98 | else: 99 | _idf, _tf_idf = idf, tf_idf 100 | counter = Counter(q_words) 101 | q_tf = np.zeros((len(_idf), 1), dtype=np.float) # [n_vocab, 1] 102 | for v in counter.keys(): 103 | q_tf[v2i[v], 0] = counter[v] 104 | 105 | q_vec = q_tf * _idf # [n_vocab, 1] 106 | 107 | q_scores = cosine_similarity(q_vec, _tf_idf) 108 | if len_norm: 109 | len_docs = [len(d) for d in docs_words] 110 | q_scores = q_scores / np.array(len_docs) 111 | return q_scores 112 | 113 | 114 | def get_keywords(n=2): 115 | for c in range(3): 116 | col = tf_idf[:, c] 117 | idx = np.argsort(col)[-n:] 118 | print("doc{}, top{} 
keywords {}".format(c, n, [i2v[i] for i in idx])) 119 | 120 | 121 | tf = get_tf() # [n_vocab, n_doc] 122 | idf = get_idf() # [n_vocab, 1] 123 | tf_idf = tf * idf # [n_vocab, n_doc] 124 | print("tf shape(vecb in each docs): ", tf.shape) 125 | print("\ntf samples:\n", tf[:2]) 126 | print("\nidf shape(vecb in all docs): ", idf.shape) 127 | print("\nidf samples:\n", idf[:2]) 128 | print("\ntf_idf shape: ", tf_idf.shape) 129 | print("\ntf_idf sample:\n", tf_idf[:2]) 130 | 131 | 132 | # test 133 | get_keywords() 134 | q = "I get a coffee cup" 135 | scores = docs_score(q) 136 | d_ids = scores.argsort()[-3:][::-1] 137 | print("\ntop 3 docs for '{}':\n{}".format(q, [docs[i] for i in d_ids])) 138 | 139 | show_tfidf(tf_idf.T, [i2v[i] for i in range(tf_idf.shape[0])], "tfidf_matrix") -------------------------------------------------------------------------------- /tf_idf_sklearn.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import TfidfVectorizer 2 | from sklearn.metrics.pairwise import cosine_similarity 3 | from visual import show_tfidf # this refers to visual.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/) 4 | 5 | 6 | docs = [ 7 | "it is a good day, I like to stay here", 8 | "I am happy to be here", 9 | "I am bob", 10 | "it is sunny today", 11 | "I have a party today", 12 | "it is a dog and that is a cat", 13 | "there are dog and cat on the tree", 14 | "I study hard this morning", 15 | "today is a good day", 16 | "tomorrow will be a good day", 17 | "I like coffee, I like book and I like apple", 18 | "I do not like it", 19 | "I am kitty, I like bob", 20 | "I do not care who like bob, but I like kitty", 21 | "It is coffee time, bring your cup", 22 | ] 23 | 24 | vectorizer = TfidfVectorizer() 25 | tf_idf = vectorizer.fit_transform(docs) 26 | print("idf: ", [(n, idf) for idf, n in zip(vectorizer.idf_, vectorizer.get_feature_names())]) 27 | print("v2i: ", vectorizer.vocabulary_) 28 | 29 | 30 | q = "I get a coffee cup" 31 | qtf_idf = vectorizer.transform([q]) 32 | res = cosine_similarity(tf_idf, qtf_idf) 33 | res = res.ravel().argsort()[-3:] 34 | print("\ntop 3 docs for '{}':\n{}".format(q, [docs[i] for i in res[::-1]])) 35 | 36 | 37 | i2v = {i: v for v, i in vectorizer.vocabulary_.items()} 38 | dense_tfidf = tf_idf.todense() 39 | show_tfidf(dense_tfidf, [i2v[i] for i in range(dense_tfidf.shape[1])], "tfidf_sklearn_matrix") -------------------------------------------------------------------------------- /transformer.py: -------------------------------------------------------------------------------- 1 | # [Attention Is All You Need](https://arxiv.org/pdf/1706.03762.pdf) 2 | import tensorflow as tf 3 | from tensorflow import keras 4 | import numpy as np 5 | import utils # this refers to utils.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/) 6 | import time 7 | import pickle 8 | import os 9 | 10 | MODEL_DIM = 32 11 | MAX_LEN = 12 12 | N_LAYER = 3 13 | N_HEAD = 4 14 | DROP_RATE = 0.1 15 | 16 | 17 | class MultiHead(keras.layers.Layer): 18 | def __init__(self, n_head, model_dim, drop_rate): 19 | super().__init__() 20 | self.head_dim = model_dim // n_head 21 | self.n_head = n_head 22 | self.model_dim = model_dim 23 | self.wq = keras.layers.Dense(n_head * self.head_dim) 24 | self.wk = keras.layers.Dense(n_head * self.head_dim) 25 | self.wv = keras.layers.Dense(n_head * self.head_dim) # [n, step, h*h_dim] 26 | 27 | self.o_dense = keras.layers.Dense(model_dim) 28 | self.o_drop = 
keras.layers.Dropout(rate=drop_rate) 29 | self.attention = None 30 | 31 | def call(self, q, k, v, mask, training): 32 | _q = self.wq(q) # [n, q_step, h*h_dim] 33 | _k, _v = self.wk(k), self.wv(v) # [n, step, h*h_dim] 34 | _q = self.split_heads(_q) # [n, h, q_step, h_dim] 35 | _k, _v = self.split_heads(_k), self.split_heads(_v) # [n, h, step, h_dim] 36 | context = self.scaled_dot_product_attention(_q, _k, _v, mask) # [n, q_step, h*dv] 37 | o = self.o_dense(context) # [n, step, dim] 38 | o = self.o_drop(o, training=training) 39 | return o 40 | 41 | def split_heads(self, x): 42 | x = tf.reshape(x, (x.shape[0], x.shape[1], self.n_head, self.head_dim)) # [n, step, h, h_dim] 43 | return tf.transpose(x, perm=[0, 2, 1, 3]) # [n, h, step, h_dim] 44 | 45 | def scaled_dot_product_attention(self, q, k, v, mask=None): 46 | dk = tf.cast(k.shape[-1], dtype=tf.float32) 47 | score = tf.matmul(q, k, transpose_b=True) / (tf.math.sqrt(dk) + 1e-8) # [n, h_dim, q_step, step] 48 | if mask is not None: 49 | score += mask * -1e9 50 | self.attention = tf.nn.softmax(score, axis=-1) # [n, h, q_step, step] 51 | context = tf.matmul(self.attention, v) # [n, h, q_step, step] @ [n, h, step, dv] = [n, h, q_step, dv] 52 | context = tf.transpose(context, perm=[0, 2, 1, 3]) # [n, q_step, h, dv] 53 | context = tf.reshape(context, (context.shape[0], context.shape[1], -1)) # [n, q_step, h*dv] 54 | return context 55 | 56 | 57 | class PositionWiseFFN(keras.layers.Layer): 58 | def __init__(self, model_dim): 59 | super().__init__() 60 | dff = model_dim * 4 61 | self.l = keras.layers.Dense(dff, activation=keras.activations.relu) 62 | self.o = keras.layers.Dense(model_dim) 63 | 64 | def call(self, x): 65 | o = self.l(x) 66 | o = self.o(o) 67 | return o # [n, step, dim] 68 | 69 | 70 | class EncodeLayer(keras.layers.Layer): 71 | def __init__(self, n_head, model_dim, drop_rate): 72 | super().__init__() 73 | self.ln = [keras.layers.LayerNormalization(axis=-1) for _ in range(2)] # only norm z-dim 74 | self.mh = MultiHead(n_head, model_dim, drop_rate) 75 | self.ffn = PositionWiseFFN(model_dim) 76 | self.drop = keras.layers.Dropout(drop_rate) 77 | 78 | def call(self, xz, training, mask): 79 | attn = self.mh.call(xz, xz, xz, mask, training) # [n, step, dim] 80 | o1 = self.ln[0](attn + xz) 81 | ffn = self.drop(self.ffn.call(o1), training) 82 | o = self.ln[1](ffn + o1) # [n, step, dim] 83 | return o 84 | 85 | 86 | class Encoder(keras.layers.Layer): 87 | def __init__(self, n_head, model_dim, drop_rate, n_layer): 88 | super().__init__() 89 | self.ls = [EncodeLayer(n_head, model_dim, drop_rate) for _ in range(n_layer)] 90 | 91 | def call(self, xz, training, mask): 92 | for l in self.ls: 93 | xz = l.call(xz, training, mask) 94 | return xz # [n, step, dim] 95 | 96 | 97 | class DecoderLayer(keras.layers.Layer): 98 | def __init__(self, n_head, model_dim, drop_rate): 99 | super().__init__() 100 | self.ln = [keras.layers.LayerNormalization(axis=-1) for _ in range(3)] # only norm z-dim 101 | self.drop = keras.layers.Dropout(drop_rate) 102 | self.mh = [MultiHead(n_head, model_dim, drop_rate) for _ in range(2)] 103 | self.ffn = PositionWiseFFN(model_dim) 104 | 105 | def call(self, yz, xz, training, yz_look_ahead_mask, xz_pad_mask): 106 | attn = self.mh[0].call(yz, yz, yz, yz_look_ahead_mask, training) # decoder self attention 107 | o1 = self.ln[0](attn + yz) 108 | attn = self.mh[1].call(o1, xz, xz, xz_pad_mask, training) # decoder + encoder attention 109 | o2 = self.ln[1](attn + o1) 110 | ffn = self.drop(self.ffn.call(o2), training) 111 | o = 
self.ln[2](ffn + o2) 112 | return o 113 | 114 | 115 | class Decoder(keras.layers.Layer): 116 | def __init__(self, n_head, model_dim, drop_rate, n_layer): 117 | super().__init__() 118 | self.ls = [DecoderLayer(n_head, model_dim, drop_rate) for _ in range(n_layer)] 119 | 120 | def call(self, yz, xz, training, yz_look_ahead_mask, xz_pad_mask): 121 | for l in self.ls: 122 | yz = l.call(yz, xz, training, yz_look_ahead_mask, xz_pad_mask) 123 | return yz 124 | 125 | 126 | class PositionEmbedding(keras.layers.Layer): 127 | def __init__(self, max_len, model_dim, n_vocab): 128 | super().__init__() 129 | pos = np.arange(max_len)[:, None] 130 | pe = pos / np.power(10000, 2. * np.arange(model_dim)[None, :] / model_dim) # [max_len, dim] 131 | pe[:, 0::2] = np.sin(pe[:, 0::2]) 132 | pe[:, 1::2] = np.cos(pe[:, 1::2]) 133 | pe = pe[None, :, :] # [1, max_len, model_dim] for batch adding 134 | self.pe = tf.constant(pe, dtype=tf.float32) 135 | self.embeddings = keras.layers.Embedding( 136 | input_dim=n_vocab, output_dim=model_dim, # [n_vocab, dim] 137 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.01), 138 | ) 139 | 140 | def call(self, x): 141 | x_embed = self.embeddings(x) + self.pe # [n, step, dim] 142 | return x_embed 143 | 144 | 145 | class Transformer(keras.Model): 146 | def __init__(self, model_dim, max_len, n_layer, n_head, n_vocab, drop_rate=0.1, padding_idx=0): 147 | super().__init__() 148 | self.max_len = max_len 149 | self.padding_idx = padding_idx 150 | 151 | self.embed = PositionEmbedding(max_len, model_dim, n_vocab) 152 | self.encoder = Encoder(n_head, model_dim, drop_rate, n_layer) 153 | self.decoder = Decoder(n_head, model_dim, drop_rate, n_layer) 154 | self.o = keras.layers.Dense(n_vocab) 155 | 156 | self.cross_entropy = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none") 157 | self.opt = keras.optimizers.Adam(0.002) 158 | 159 | def call(self, x, y, training=None): 160 | x_embed, y_embed = self.embed(x), self.embed(y) 161 | pad_mask = self._pad_mask(x) 162 | encoded_z = self.encoder.call(x_embed, training, mask=pad_mask) 163 | decoded_z = self.decoder.call( 164 | y_embed, encoded_z, training, yz_look_ahead_mask=self._look_ahead_mask(y), xz_pad_mask=pad_mask) 165 | o = self.o(decoded_z) 166 | return o 167 | 168 | def step(self, x, y): 169 | with tf.GradientTape() as tape: 170 | logits = self.call(x, y[:, :-1], training=True) 171 | pad_mask = tf.math.not_equal(y[:, 1:], self.padding_idx) 172 | loss = tf.reduce_mean(tf.boolean_mask(self.cross_entropy(y[:, 1:], logits), pad_mask)) 173 | grads = tape.gradient(loss, self.trainable_variables) 174 | self.opt.apply_gradients(zip(grads, self.trainable_variables)) 175 | return loss, logits 176 | 177 | def _pad_bool(self, seqs): 178 | return tf.math.equal(seqs, self.padding_idx) 179 | 180 | def _pad_mask(self, seqs): 181 | mask = tf.cast(self._pad_bool(seqs), tf.float32) 182 | return mask[:, tf.newaxis, tf.newaxis, :] # (n, 1, 1, step) 183 | 184 | def _look_ahead_mask(self, seqs): 185 | mask = 1 - tf.linalg.band_part(tf.ones((self.max_len, self.max_len)), -1, 0) 186 | mask = tf.where(self._pad_bool(seqs)[:, tf.newaxis, tf.newaxis, :], 1, mask[tf.newaxis, tf.newaxis, :, :]) 187 | return mask # (step, step) 188 | 189 | def translate(self, src, v2i, i2v): 190 | src_pad = utils.pad_zero(src, self.max_len) 191 | tgt = utils.pad_zero(np.array([[v2i[""], ] for _ in range(len(src))]), self.max_len+1) 192 | tgti = 0 193 | x_embed = self.embed(src_pad) 194 | encoded_z = self.encoder.call(x_embed, False, 
mask=self._pad_mask(src_pad)) 195 | while True: 196 | y = tgt[:, :-1] 197 | y_embed = self.embed(y) 198 | decoded_z = self.decoder.call( 199 | y_embed, encoded_z, False, yz_look_ahead_mask=self._look_ahead_mask(y), xz_pad_mask=self._pad_mask(src_pad)) 200 | logits = self.o(decoded_z)[:, tgti, :].numpy() 201 | idx = np.argmax(logits, axis=1) 202 | tgti += 1 203 | tgt[:, tgti] = idx 204 | if tgti >= self.max_len: 205 | break 206 | return ["".join([i2v[i] for i in tgt[j, 1:tgti]]) for j in range(len(src))] 207 | 208 | @property 209 | def attentions(self): 210 | attentions = { 211 | "encoder": [l.mh.attention.numpy() for l in self.encoder.ls], 212 | "decoder": { 213 | "mh1": [l.mh[0].attention.numpy() for l in self.decoder.ls], 214 | "mh2": [l.mh[1].attention.numpy() for l in self.decoder.ls], 215 | }} 216 | return attentions 217 | 218 | 219 | def train(model, data, step): 220 | # training 221 | t0 = time.time() 222 | for t in range(step): 223 | bx, by, seq_len = data.sample(64) 224 | bx, by = utils.pad_zero(bx, max_len=MAX_LEN), utils.pad_zero(by, max_len=MAX_LEN + 1) 225 | loss, logits = model.step(bx, by) 226 | if t % 50 == 0: 227 | logits = logits[0].numpy() 228 | t1 = time.time() 229 | print( 230 | "step: ", t, 231 | "| time: %.2f" % (t1 - t0), 232 | "| loss: %.4f" % loss.numpy(), 233 | "| target: ", "".join([data.i2v[i] for i in by[0, 1:10]]), 234 | "| inference: ", "".join([data.i2v[i] for i in np.argmax(logits, axis=1)[:10]]), 235 | ) 236 | t0 = t1 237 | 238 | os.makedirs("./visual/models/transformer", exist_ok=True) 239 | model.save_weights("./visual/models/transformer/model.ckpt") 240 | os.makedirs("./visual/tmp", exist_ok=True) 241 | with open("./visual/tmp/transformer_v2i_i2v.pkl", "wb") as f: 242 | pickle.dump({"v2i": data.v2i, "i2v": data.i2v}, f) 243 | 244 | 245 | def export_attention(model, data, name="transformer"): 246 | with open("./visual/tmp/transformer_v2i_i2v.pkl", "rb") as f: 247 | dic = pickle.load(f) 248 | model.load_weights("./visual/models/transformer/model.ckpt") 249 | bx, by, seq_len = data.sample(32) 250 | model.translate(bx, dic["v2i"], dic["i2v"]) 251 | attn_data = { 252 | "src": [[data.i2v[i] for i in bx[j]] for j in range(len(bx))], 253 | "tgt": [[data.i2v[i] for i in by[j]] for j in range(len(by))], 254 | "attentions": model.attentions} 255 | path = "./visual/tmp/%s_attention_matrix.pkl" % name 256 | os.makedirs(os.path.dirname(path), exist_ok=True) 257 | with open(path, "wb") as f: 258 | pickle.dump(attn_data, f) 259 | 260 | 261 | if __name__ == "__main__": 262 | utils.set_soft_gpu(True) 263 | d = utils.DateData(4000) 264 | print("Chinese time order: yy/mm/dd ", d.date_cn[:3], "\nEnglish time order: dd/M/yyyy ", d.date_en[:3]) 265 | print("vocabularies: ", d.vocab) 266 | print("x index sample: \n{}\n{}".format(d.idx2str(d.x[0]), d.x[0]), 267 | "\ny index sample: \n{}\n{}".format(d.idx2str(d.y[0]), d.y[0])) 268 | 269 | m = Transformer(MODEL_DIM, MAX_LEN, N_LAYER, N_HEAD, d.num_word, DROP_RATE) 270 | train(m, d, step=800) 271 | export_attention(m, d) -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import datetime 3 | import os 4 | import requests 5 | import pandas as pd 6 | import re 7 | import itertools 8 | 9 | PAD_ID = 0 10 | 11 | 12 | class DateData: 13 | def __init__(self, n): 14 | np.random.seed(1) 15 | self.date_cn = [] 16 | self.date_en = [] 17 | for timestamp in np.random.randint(143835585, 2043835585, 
n): 18 | date = datetime.datetime.fromtimestamp(timestamp) 19 | self.date_cn.append(date.strftime("%y-%m-%d")) 20 | self.date_en.append(date.strftime("%d/%b/%Y")) 21 | self.vocab = set( 22 | [str(i) for i in range(0, 10)] + ["-", "/", "", ""] + [ 23 | i.split("/")[1] for i in self.date_en]) 24 | self.v2i = {v: i for i, v in enumerate(sorted(list(self.vocab)), start=1)} 25 | self.v2i[""] = PAD_ID 26 | self.vocab.add("") 27 | self.i2v = {i: v for v, i in self.v2i.items()} 28 | self.x, self.y = [], [] 29 | for cn, en in zip(self.date_cn, self.date_en): 30 | self.x.append([self.v2i[v] for v in cn]) 31 | self.y.append( 32 | [self.v2i[""], ] + [self.v2i[v] for v in en[:3]] + [ 33 | self.v2i[en[3:6]], ] + [self.v2i[v] for v in en[6:]] + [ 34 | self.v2i[""], ]) 35 | self.x, self.y = np.array(self.x), np.array(self.y) 36 | self.start_token = self.v2i[""] 37 | self.end_token = self.v2i[""] 38 | 39 | def sample(self, n=64): 40 | bi = np.random.randint(0, len(self.x), size=n) 41 | bx, by = self.x[bi], self.y[bi] 42 | decoder_len = np.full((len(bx),), by.shape[1] - 1, dtype=np.int32) 43 | return bx, by, decoder_len 44 | 45 | def idx2str(self, idx): 46 | x = [] 47 | for i in idx: 48 | x.append(self.i2v[i]) 49 | if i == self.end_token: 50 | break 51 | return "".join(x) 52 | 53 | @property 54 | def num_word(self): 55 | return len(self.vocab) 56 | 57 | 58 | def pad_zero(seqs, max_len): 59 | padded = np.full((len(seqs), max_len), fill_value=PAD_ID, dtype=np.long) 60 | for i, seq in enumerate(seqs): 61 | padded[i, :len(seq)] = seq 62 | return padded 63 | 64 | 65 | def maybe_download_mrpc(save_dir="./MRPC/", proxy=None): 66 | train_url = 'https://mofanpy.com/static/files/MRPC/msr_paraphrase_train.txt' 67 | test_url = 'https://mofanpy.com/static/files/MRPC/msr_paraphrase_test.txt' 68 | os.makedirs(save_dir, exist_ok=True) 69 | proxies = {"http": proxy, "https": proxy} 70 | for url in [train_url, test_url]: 71 | raw_path = os.path.join(save_dir, url.split("/")[-1]) 72 | if not os.path.isfile(raw_path): 73 | print("downloading from %s" % url) 74 | r = requests.get(url, proxies=proxies) 75 | with open(raw_path, "w", encoding="utf-8") as f: 76 | f.write(r.text.replace('"', "")) 77 | print("completed") 78 | 79 | 80 | def _text_standardize(text): 81 | text = re.sub(r'—', '-', text) 82 | text = re.sub(r'–', '-', text) 83 | text = re.sub(r'―', '-', text) 84 | text = re.sub(r" \d+(,\d+)?(\.\d+)? 
", " ", text) 85 | text = re.sub(r" \d+-+?\d*", " -", text) 86 | return text.strip() 87 | 88 | 89 | def _process_mrpc(dir="./MRPC", rows=None): 90 | data = {"train": None, "test": None} 91 | files = os.listdir(dir) 92 | for f in files: 93 | df = pd.read_csv(os.path.join(dir, f), sep='\t', nrows=rows) 94 | k = "train" if "train" in f else "test" 95 | data[k] = {"is_same": df.iloc[:, 0].values, "s1": df["#1 String"].values, "s2": df["#2 String"].values} 96 | vocab = set() 97 | for n in ["train", "test"]: 98 | for m in ["s1", "s2"]: 99 | for i in range(len(data[n][m])): 100 | data[n][m][i] = _text_standardize(data[n][m][i].lower()) 101 | cs = data[n][m][i].split(" ") 102 | vocab.update(set(cs)) 103 | v2i = {v: i for i, v in enumerate(sorted(vocab), start=1)} 104 | v2i[""] = PAD_ID 105 | v2i[""] = len(v2i) 106 | v2i[""] = len(v2i) 107 | v2i[""] = len(v2i) 108 | i2v = {i: v for v, i in v2i.items()} 109 | for n in ["train", "test"]: 110 | for m in ["s1", "s2"]: 111 | data[n][m+"id"] = [[v2i[v] for v in c.split(" ")] for c in data[n][m]] 112 | return data, v2i, i2v 113 | 114 | 115 | class MRPCData: 116 | num_seg = 3 117 | pad_id = PAD_ID 118 | 119 | def __init__(self, data_dir="./MRPC/", rows=None, proxy=None): 120 | maybe_download_mrpc(save_dir=data_dir, proxy=proxy) 121 | data, self.v2i, self.i2v = _process_mrpc(data_dir, rows) 122 | self.max_len = max( 123 | [len(s1) + len(s2) + 3 for s1, s2 in zip( 124 | data["train"]["s1id"] + data["test"]["s1id"], data["train"]["s2id"] + data["test"]["s2id"])]) 125 | 126 | self.xlen = np.array([ 127 | [ 128 | len(data["train"]["s1id"][i]), len(data["train"]["s2id"][i]) 129 | ] for i in range(len(data["train"]["s1id"]))], dtype=int) 130 | x = [ 131 | [self.v2i[""]] + data["train"]["s1id"][i] + [self.v2i[""]] + data["train"]["s2id"][i] + [self.v2i[""]] 132 | for i in range(len(self.xlen)) 133 | ] 134 | self.x = pad_zero(x, max_len=self.max_len) 135 | self.nsp_y = data["train"]["is_same"][:, None] 136 | 137 | self.seg = np.full(self.x.shape, self.num_seg-1, np.int32) 138 | for i in range(len(x)): 139 | si = self.xlen[i][0] + 2 140 | self.seg[i, :si] = 0 141 | si_ = si + self.xlen[i][1] + 1 142 | self.seg[i, si:si_] = 1 143 | 144 | self.word_ids = np.array(list(set(self.i2v.keys()).difference( 145 | [self.v2i[v] for v in ["", "", ""]]))) 146 | 147 | def sample(self, n): 148 | bi = np.random.randint(0, self.x.shape[0], size=n) 149 | bx, bs, bl, by = self.x[bi], self.seg[bi], self.xlen[bi], self.nsp_y[bi] 150 | return bx, bs, bl, by 151 | 152 | @property 153 | def num_word(self): 154 | return len(self.v2i) 155 | 156 | @property 157 | def mask_id(self): 158 | return self.v2i[""] 159 | 160 | 161 | class MRPCSingle: 162 | pad_id = PAD_ID 163 | 164 | def __init__(self, data_dir="./MRPC/", rows=None, proxy=None): 165 | maybe_download_mrpc(save_dir=data_dir, proxy=proxy) 166 | data, self.v2i, self.i2v = _process_mrpc(data_dir, rows) 167 | 168 | self.max_len = max([len(s) + 2 for s in data["train"]["s1id"] + data["train"]["s2id"]]) 169 | x = [ 170 | [self.v2i[""]] + data["train"]["s1id"][i] + [self.v2i[""]] 171 | for i in range(len(data["train"]["s1id"])) 172 | ] 173 | x += [ 174 | [self.v2i[""]] + data["train"]["s2id"][i] + [self.v2i[""]] 175 | for i in range(len(data["train"]["s2id"])) 176 | ] 177 | self.x = pad_zero(x, max_len=self.max_len) 178 | self.word_ids = np.array(list(set(self.i2v.keys()).difference([self.v2i[""]]))) 179 | 180 | def sample(self, n): 181 | bi = np.random.randint(0, self.x.shape[0], size=n) 182 | bx = self.x[bi] 183 | return bx 184 | 185 | 
@property 186 | def num_word(self): 187 | return len(self.v2i) 188 | 189 | 190 | class Dataset: 191 | def __init__(self, x, y, v2i, i2v): 192 | self.x, self.y = x, y 193 | self.v2i, self.i2v = v2i, i2v 194 | self.vocab = v2i.keys() 195 | 196 | def sample(self, n): 197 | b_idx = np.random.randint(0, len(self.x), n) 198 | bx, by = self.x[b_idx], self.y[b_idx] 199 | return bx, by 200 | 201 | @property 202 | def num_word(self): 203 | return len(self.v2i) 204 | 205 | 206 | def process_w2v_data(corpus, skip_window=2, method="skip_gram"): 207 | all_words = [sentence.split(" ") for sentence in corpus] 208 | all_words = np.array(list(itertools.chain(*all_words))) 209 | # vocab sort by decreasing frequency for the negative sampling below (nce_loss). 210 | vocab, v_count = np.unique(all_words, return_counts=True) 211 | vocab = vocab[np.argsort(v_count)[::-1]] 212 | 213 | print("all vocabularies sorted from more frequent to less frequent:\n", vocab) 214 | v2i = {v: i for i, v in enumerate(vocab)} 215 | i2v = {i: v for v, i in v2i.items()} 216 | 217 | # pair data 218 | pairs = [] 219 | js = [i for i in range(-skip_window, skip_window + 1) if i != 0] 220 | 221 | for c in corpus: 222 | words = c.split(" ") 223 | w_idx = [v2i[w] for w in words] 224 | if method == "skip_gram": 225 | for i in range(len(w_idx)): 226 | for j in js: 227 | if i + j < 0 or i + j >= len(w_idx): 228 | continue 229 | pairs.append((w_idx[i], w_idx[i + j])) # (center, context) or (feature, target) 230 | elif method.lower() == "cbow": 231 | for i in range(skip_window, len(w_idx) - skip_window): 232 | context = [] 233 | for j in js: 234 | context.append(w_idx[i + j]) 235 | pairs.append(context + [w_idx[i]]) # (contexts, center) or (feature, target) 236 | else: 237 | raise ValueError 238 | pairs = np.array(pairs) 239 | print("5 example pairs:\n", pairs[:5]) 240 | if method.lower() == "skip_gram": 241 | x, y = pairs[:, 0], pairs[:, 1] 242 | elif method.lower() == "cbow": 243 | x, y = pairs[:, :-1], pairs[:, -1] 244 | else: 245 | raise ValueError 246 | return Dataset(x, y, v2i, i2v) 247 | 248 | 249 | def set_soft_gpu(soft_gpu): 250 | import tensorflow as tf 251 | if soft_gpu: 252 | gpus = tf.config.experimental.list_physical_devices('GPU') 253 | if gpus: 254 | # Currently, memory growth needs to be the same across GPUs 255 | for gpu in gpus: 256 | tf.config.experimental.set_memory_growth(gpu, True) 257 | logical_gpus = tf.config.experimental.list_logical_devices('GPU') 258 | print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs") -------------------------------------------------------------------------------- /visual.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import pickle 4 | from matplotlib.pyplot import cm 5 | import os 6 | import utils 7 | 8 | 9 | def show_tfidf(tfidf, vocab, filename): 10 | # [n_doc, n_vocab] 11 | plt.imshow(tfidf, cmap="YlGn", vmin=tfidf.min(), vmax=tfidf.max()) 12 | plt.xticks(np.arange(tfidf.shape[1]), vocab, fontsize=6, rotation=90) 13 | plt.yticks(np.arange(tfidf.shape[0]), np.arange(1, tfidf.shape[0]+1), fontsize=6) 14 | plt.tight_layout() 15 | # creating the output folder 16 | output_folder = './visual/results/' 17 | os.makedirs(output_folder, exist_ok=True) 18 | plt.savefig(os.path.join(output_folder, '%s.png') % filename, format="png", dpi=500) 19 | plt.show() 20 | 21 | 22 | def show_w2v_word_embedding(model, data: utils.Dataset, path): 23 | word_emb = model.embeddings.get_weights()[0] 24 | 
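# ---- illustrative sketch (standalone; refers to process_w2v_data in utils.py above) ----
# process_w2v_data turns every sentence into (centre word, context word) index pairs taken
# from a window of +/- skip_window positions around each centre word. A minimal trace on
# one assumed 5-word sentence with skip_window=2; w_idx is a made-up list of word indices:
skip_window = 2
w_idx = [0, 1, 2, 3, 4]
js = [j for j in range(-skip_window, skip_window + 1) if j != 0]
pairs = [(w_idx[i], w_idx[i + j])
         for i in range(len(w_idx))
         for j in js
         if 0 <= i + j < len(w_idx)]
print(pairs[:4])   # -> [(0, 1), (0, 2), (1, 0), (1, 2)]
# -----------------------------------------------------------------------------------------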
for i in range(data.num_word): 25 | c = "blue" 26 | try: 27 | int(data.i2v[i]) 28 | except ValueError: 29 | c = "red" 30 | plt.text(word_emb[i, 0], word_emb[i, 1], s=data.i2v[i], color=c, weight="bold") 31 | plt.xlim(word_emb[:, 0].min() - .5, word_emb[:, 0].max() + .5) 32 | plt.ylim(word_emb[:, 1].min() - .5, word_emb[:, 1].max() + .5) 33 | plt.xticks(()) 34 | plt.yticks(()) 35 | plt.xlabel("embedding dim1") 36 | plt.ylabel("embedding dim2") 37 | plt.savefig(path, dpi=300, format="png") 38 | plt.show() 39 | 40 | 41 | def seq2seq_attention(): 42 | with open("./visual/tmp/attention_align.pkl", "rb") as f: 43 | data = pickle.load(f) 44 | i2v, x, y, align = data["i2v"], data["x"], data["y"], data["align"] 45 | plt.rcParams['xtick.bottom'] = plt.rcParams['xtick.labelbottom'] = False 46 | plt.rcParams['xtick.top'] = plt.rcParams['xtick.labeltop'] = True 47 | for i in range(6): 48 | plt.subplot(2, 3, i + 1) 49 | x_vocab = [i2v[j] for j in np.ravel(x[i])] 50 | y_vocab = [i2v[j] for j in y[i, 1:]] 51 | plt.imshow(align[i], cmap="YlGn", vmin=0., vmax=1.) 52 | plt.yticks([j for j in range(len(y_vocab))], y_vocab) 53 | plt.xticks([j for j in range(len(x_vocab))], x_vocab) 54 | if i == 0 or i == 3: 55 | plt.ylabel("Output") 56 | if i >= 3: 57 | plt.xlabel("Input") 58 | plt.tight_layout() 59 | plt.savefig("./visual/results/seq2seq_attention.png", format="png", dpi=200) 60 | plt.show() 61 | 62 | 63 | def all_mask_kinds(): 64 | seqs = ["I love you", "My name is M", "This is a very long seq", "Short one"] 65 | vocabs = set((" ".join(seqs)).split(" ")) 66 | i2v = {i: v for i, v in enumerate(vocabs, start=1)} 67 | i2v[""] = 0 # add 0 idx for 68 | v2i = {v: i for i, v in i2v.items()} 69 | 70 | id_seqs = [[v2i[v] for v in seq.split(" ")] for seq in seqs] 71 | padded_id_seqs = np.array([l + [0] * (6 - len(l)) for l in id_seqs]) 72 | 73 | # padding mask 74 | pmask = np.where(padded_id_seqs == 0, np.ones_like(padded_id_seqs), np.zeros_like(padded_id_seqs)) # 0 idx is padding 75 | pmask = np.repeat(pmask[:, None, :], pmask.shape[-1], axis=1) # [n, step, step] 76 | plt.rcParams['xtick.bottom'] = plt.rcParams['xtick.labelbottom'] = False 77 | plt.rcParams['xtick.top'] = plt.rcParams['xtick.labeltop'] = True 78 | for i in range(1, 5): 79 | plt.subplot(2, 2, i) 80 | plt.imshow(pmask[i-1], vmax=1, vmin=0, cmap="YlGn") 81 | plt.xticks(range(6), seqs[i - 1].split(" "), rotation=45) 82 | plt.yticks(range(6), seqs[i - 1].split(" "),) 83 | plt.grid(which="minor", c="w", lw=0.5, linestyle="-") 84 | plt.tight_layout() 85 | plt.savefig("./visual/results/transformer_pad_mask.png", dpi=200) 86 | plt.show() 87 | 88 | # look ahead mask 89 | max_len = pmask.shape[-1] 90 | omask = ~np.triu(np.ones((max_len, max_len), dtype=np.bool), 1) 91 | omask = np.tile(np.expand_dims(omask, axis=0), [np.shape(seqs)[0], 1, 1]) # [n, step, step] 92 | omask = np.where(omask, pmask, 1) 93 | 94 | plt.rcParams['xtick.bottom'] = plt.rcParams['xtick.labelbottom'] = False 95 | plt.rcParams['xtick.top'] = plt.rcParams['xtick.labeltop'] = True 96 | for i in range(1, 5): 97 | plt.subplot(2, 2, i) 98 | plt.imshow(omask[i - 1], vmax=1, vmin=0, cmap="YlGn") 99 | plt.xticks(range(6), seqs[i - 1].split(" "), rotation=45) 100 | plt.yticks(range(6), seqs[i - 1].split(" "), ) 101 | plt.grid(which="minor", c="w", lw=0.5, linestyle="-") 102 | plt.tight_layout() 103 | plt.savefig("./visual/results/transformer_look_ahead_mask.png", dpi=200) 104 | plt.show() 105 | 106 | 107 | def position_embedding(): 108 | max_len = 500 109 | model_dim = 512 110 | pos = 
np.arange(max_len)[:, None] 111 | pe = pos / np.power(10000, 2. * np.arange(model_dim)[None, :] / model_dim) # [max_len, model_dim] 112 | pe[:, 0::2] = np.sin(pe[:, 0::2]) 113 | pe[:, 1::2] = np.cos(pe[:, 1::2]) 114 | plt.imshow(pe, vmax=1, vmin=-1, cmap="rainbow") 115 | plt.ylabel("word position") 116 | plt.xlabel("embedding dim") 117 | plt.savefig("./visual/results/transformer_position_embedding.png", dpi=200) 118 | plt.show() 119 | 120 | 121 | def transformer_attention_matrix(case=0): 122 | with open("./visual/tmp/transformer_attention_matrix.pkl", "rb") as f: 123 | data = pickle.load(f) 124 | src = data["src"][case] 125 | tgt = data["tgt"][case] 126 | attentions = data["attentions"] 127 | 128 | encoder_atten = attentions["encoder"] 129 | decoder_tgt_atten = attentions["decoder"]["mh1"] 130 | decoder_src_atten = attentions["decoder"]["mh2"] 131 | plt.rcParams['xtick.bottom'] = plt.rcParams['xtick.labelbottom'] = False 132 | plt.rcParams['xtick.top'] = plt.rcParams['xtick.labeltop'] = True 133 | 134 | plt.figure(0, (7, 7)) 135 | plt.suptitle("Encoder self-attention") 136 | for i in range(3): 137 | for j in range(4): 138 | plt.subplot(3, 4, i * 4 + j + 1) 139 | plt.imshow(encoder_atten[i][case, j][:len(src), :len(src)], vmax=1, vmin=0, cmap="rainbow") 140 | plt.xticks(range(len(src)), src) 141 | plt.yticks(range(len(src)), src) 142 | if j == 0: 143 | plt.ylabel("layer %i" % (i+1)) 144 | if i == 2: 145 | plt.xlabel("head %i" % (j+1)) 146 | plt.tight_layout() 147 | plt.subplots_adjust(top=0.9) 148 | plt.savefig("./visual/results/transformer%d_encoder_self_attention.png" % case, dpi=200) 149 | plt.show() 150 | 151 | plt.figure(1, (7, 7)) 152 | plt.suptitle("Decoder self-attention") 153 | for i in range(3): 154 | for j in range(4): 155 | plt.subplot(3, 4, i * 4 + j + 1) 156 | plt.imshow(decoder_tgt_atten[i][case, j][:len(tgt), :len(tgt)], vmax=1, vmin=0, cmap="rainbow") 157 | plt.xticks(range(len(tgt)), tgt, rotation=90, fontsize=7) 158 | plt.yticks(range(len(tgt)), tgt, fontsize=7) 159 | if j == 0: 160 | plt.ylabel("layer %i" % (i+1)) 161 | if i == 2: 162 | plt.xlabel("head %i" % (j+1)) 163 | plt.tight_layout() 164 | plt.subplots_adjust(top=0.9) 165 | plt.savefig("./visual/results/transformer%d_decoder_self_attention.png" % case, dpi=200) 166 | plt.show() 167 | 168 | plt.figure(2, (7, 8)) 169 | plt.suptitle("Decoder-Encoder attention") 170 | for i in range(3): 171 | for j in range(4): 172 | plt.subplot(3, 4, i*4+j+1) 173 | plt.imshow(decoder_src_atten[i][case, j][:len(tgt), :len(src)], vmax=1, vmin=0, cmap="rainbow") 174 | plt.xticks(range(len(src)), src, fontsize=7) 175 | plt.yticks(range(len(tgt)), tgt, fontsize=7) 176 | if j == 0: 177 | plt.ylabel("layer %i" % (i+1)) 178 | if i == 2: 179 | plt.xlabel("head %i" % (j+1)) 180 | plt.tight_layout() 181 | plt.subplots_adjust(top=0.9) 182 | plt.savefig("./visual/results/transformer%d_decoder_encoder_attention.png" % case, dpi=200) 183 | plt.show() 184 | 185 | 186 | def transformer_attention_line(case=0): 187 | with open("./visual/tmp/transformer_attention_matrix.pkl", "rb") as f: 188 | data = pickle.load(f) 189 | src = data["src"][case] 190 | tgt = data["tgt"][case] 191 | attentions = data["attentions"] 192 | 193 | decoder_src_atten = attentions["decoder"]["mh2"] 194 | 195 | tgt_label = tgt[1:11][::-1] 196 | src_label = ["" for _ in range(2)] + src[::-1] 197 | fig, ax = plt.subplots(nrows=2, ncols=2, sharex=True, figsize=(7, 14)) 198 | 199 | for i in range(2): 200 | for j in range(2): 201 | ax[i, j].set_yticks(np.arange(len(src_label))) 202 | 


def transformer_attention_matrix(case=0):
    # heatmaps of encoder self-attention, decoder self-attention and decoder-encoder
    # attention for one case, 3 layers x 4 heads each
    with open("./visual/tmp/transformer_attention_matrix.pkl", "rb") as f:
        data = pickle.load(f)
    src = data["src"][case]
    tgt = data["tgt"][case]
    attentions = data["attentions"]

    encoder_atten = attentions["encoder"]
    decoder_tgt_atten = attentions["decoder"]["mh1"]
    decoder_src_atten = attentions["decoder"]["mh2"]
    plt.rcParams['xtick.bottom'] = plt.rcParams['xtick.labelbottom'] = False
    plt.rcParams['xtick.top'] = plt.rcParams['xtick.labeltop'] = True

    plt.figure(0, (7, 7))
    plt.suptitle("Encoder self-attention")
    for i in range(3):
        for j in range(4):
            plt.subplot(3, 4, i * 4 + j + 1)
            plt.imshow(encoder_atten[i][case, j][:len(src), :len(src)], vmax=1, vmin=0, cmap="rainbow")
            plt.xticks(range(len(src)), src)
            plt.yticks(range(len(src)), src)
            if j == 0:
                plt.ylabel("layer %i" % (i + 1))
            if i == 2:
                plt.xlabel("head %i" % (j + 1))
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.savefig("./visual/results/transformer%d_encoder_self_attention.png" % case, dpi=200)
    plt.show()

    plt.figure(1, (7, 7))
    plt.suptitle("Decoder self-attention")
    for i in range(3):
        for j in range(4):
            plt.subplot(3, 4, i * 4 + j + 1)
            plt.imshow(decoder_tgt_atten[i][case, j][:len(tgt), :len(tgt)], vmax=1, vmin=0, cmap="rainbow")
            plt.xticks(range(len(tgt)), tgt, rotation=90, fontsize=7)
            plt.yticks(range(len(tgt)), tgt, fontsize=7)
            if j == 0:
                plt.ylabel("layer %i" % (i + 1))
            if i == 2:
                plt.xlabel("head %i" % (j + 1))
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.savefig("./visual/results/transformer%d_decoder_self_attention.png" % case, dpi=200)
    plt.show()

    plt.figure(2, (7, 8))
    plt.suptitle("Decoder-Encoder attention")
    for i in range(3):
        for j in range(4):
            plt.subplot(3, 4, i * 4 + j + 1)
            plt.imshow(decoder_src_atten[i][case, j][:len(tgt), :len(src)], vmax=1, vmin=0, cmap="rainbow")
            plt.xticks(range(len(src)), src, fontsize=7)
            plt.yticks(range(len(tgt)), tgt, fontsize=7)
            if j == 0:
                plt.ylabel("layer %i" % (i + 1))
            if i == 2:
                plt.xlabel("head %i" % (j + 1))
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.savefig("./visual/results/transformer%d_decoder_encoder_attention.png" % case, dpi=200)
    plt.show()


def transformer_attention_line(case=0):
    # line-style view of the last decoder-encoder attention layer, one panel per head
    with open("./visual/tmp/transformer_attention_matrix.pkl", "rb") as f:
        data = pickle.load(f)
    src = data["src"][case]
    tgt = data["tgt"][case]
    attentions = data["attentions"]

    decoder_src_atten = attentions["decoder"]["mh2"]

    tgt_label = tgt[1:11][::-1]
    src_label = ["" for _ in range(2)] + src[::-1]
    fig, ax = plt.subplots(nrows=2, ncols=2, sharex=True, figsize=(7, 14))

    for i in range(2):
        for j in range(2):
            ax[i, j].set_yticks(np.arange(len(src_label)))
            ax[i, j].set_yticklabels(src_label, fontsize=9)  # src
            ax[i, j].set_ylim(0, len(src_label) - 1)
            ax_ = ax[i, j].twinx()
            ax_.set_yticks(np.linspace(ax_.get_yticks()[0], ax_.get_yticks()[-1], len(ax[i, j].get_yticks())))
            ax_.set_yticklabels(tgt_label, fontsize=9)  # tgt
            img = decoder_src_atten[-1][case, i * 2 + j][:10, :8]  # head index matches the "head %i" label below
            color = cm.rainbow(np.linspace(0, 1, img.shape[0]))
            left_top, right_top = img.shape[1], img.shape[0]
            for ri, c in zip(range(right_top), color):  # tgt
                for li in range(left_top):  # src
                    alpha = (img[ri, li] / img[ri].max()) ** 8
                    ax[i, j].plot([0, 1], [left_top - li + 1, right_top - 1 - ri], alpha=alpha, c=c)
            ax[i, j].set_xticks(())
            ax[i, j].set_xlabel("head %i" % (j + 1 + i * 2))
            ax[i, j].set_xlim(0, 1)
    plt.subplots_adjust(top=0.9)
    plt.tight_layout()
    plt.savefig("./visual/results/transformer%d_encoder_decoder_attention_line.png" % case, dpi=100)


def self_attention_matrix(bert_or_gpt="bert", case=0):
    # heatmaps of the last encoder layer's self-attention for one BERT or GPT case
    with open("./visual/tmp/" + bert_or_gpt + "_attention_matrix.pkl", "rb") as f:
        data = pickle.load(f)
    src = data["src"]
    attentions = data["attentions"]

    encoder_atten = attentions["encoder"]
    plt.rcParams['xtick.bottom'] = plt.rcParams['xtick.labelbottom'] = False
    plt.rcParams['xtick.top'] = plt.rcParams['xtick.labeltop'] = True

    s_len = 0
    for s in src[case]:
        if s == "<SEP>":  # count tokens up to the first separator
            break
        s_len += 1

    plt.figure(0, (7, 28))
    for j in range(4):
        plt.subplot(4, 1, j + 1)
        img = encoder_atten[-1][case, j][:s_len - 1, :s_len - 1]
        plt.imshow(img, vmax=img.max(), vmin=0, cmap="rainbow")
        plt.xticks(range(s_len - 1), src[case][:s_len - 1], rotation=90, fontsize=9)
        plt.yticks(range(s_len - 1), src[case][1:s_len], fontsize=9)
        plt.xlabel("head %i" % (j + 1))
    plt.subplots_adjust(top=0.9)
    plt.tight_layout()
    plt.savefig("./visual/results/" + bert_or_gpt + "%d_self_attention.png" % case, dpi=500)
    # plt.show()


def self_attention_line(bert_or_gpt="bert", case=0):
    # line-style view of the last encoder layer's self-attention, one panel per head
    with open("./visual/tmp/" + bert_or_gpt + "_attention_matrix.pkl", "rb") as f:
        data = pickle.load(f)
    src = data["src"][case]
    attentions = data["attentions"]

    encoder_atten = attentions["encoder"]

    s_len = 0
    print(" ".join(src))
    for s in src:
        if s == "<SEP>":  # count tokens up to the first separator
            break
        s_len += 1
    y_label = src[:s_len][::-1]
    fig, ax = plt.subplots(nrows=2, ncols=2, sharex=True, figsize=(7, 14))

    for i in range(2):
        for j in range(2):
            ax[i, j].set_yticks(np.arange(len(y_label)))
            ax[i, j].tick_params(labelright=True)
            ax[i, j].set_yticklabels(y_label, fontsize=9)  # input

            img = encoder_atten[-1][case, i * 2 + j][:s_len - 1, :s_len - 1]  # head index matches the label below
            color = cm.rainbow(np.linspace(0, 1, img.shape[0]))
            for row, c in zip(range(img.shape[0]), color):
                for col in range(img.shape[1]):
                    alpha = (img[row, col] / img[row].max()) ** 5
                    ax[i, j].plot([0, 1], [img.shape[1] - col, img.shape[0] - row - 1], alpha=alpha, c=c)
            ax[i, j].set_xticks(())
            ax[i, j].set_xlabel("head %i" % (j + 1 + i * 2))
            ax[i, j].set_xlim(0, 1)
    plt.subplots_adjust(top=0.9)
    plt.tight_layout()
    plt.savefig("./visual/results/" + bert_or_gpt + "%d_self_attention_line.png" % case, dpi=100)
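

# The plotting helpers above read pickles from ./visual/tmp/, presumably dumped during
# training (e.g. by transformer.py). A rough, illustrative sketch of the layout they
# expect, inferred only from the keys and indices used above, can be written as a fake
# file to smoke-test the plots; the helper name and all sizes here are made up.
def _demo_dump_fake_transformer_pickle(path="./visual/tmp/transformer_attention_matrix.pkl",
                                       n_case=1, n_layer=3, n_head=4, step=12):
    rng = np.random.RandomState(0)

    def layers():
        return [rng.rand(n_case, n_head, step, step) for _ in range(n_layer)]  # [n, head, step, step] per layer

    data = {
        "src": [["src%d" % t for t in range(step)] for _ in range(n_case)],  # token lists per case
        "tgt": [["tgt%d" % t for t in range(step)] for _ in range(n_case)],
        "attentions": {"encoder": layers(),
                       "decoder": {"mh1": layers(), "mh2": layers()}},
    }
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(data, f)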


if __name__ == "__main__":
    os.makedirs("./visual/results", exist_ok=True)
    # all_mask_kinds()
    # seq2seq_attention()
    # position_embedding()
    transformer_attention_matrix(case=0)
    transformer_attention_line(case=0)

    # model = ["gpt", "bert", "bert_window_mask"][1]
    # case = 6
    # self_attention_matrix(model, case=case)
    # self_attention_line(model, case=case)
--------------------------------------------------------------------------------