├── .gitignore ├── BERT.py ├── BERT_window_mask.py ├── BLEU.py ├── CBOW.py ├── ELMo.py ├── GPT.py ├── LICENSE ├── README.md ├── cnn-lm.py ├── pytorch ├── BERT.py ├── CBOW.py ├── ELMo.py ├── GPT.py ├── README.md ├── __pycache__ │ ├── transformer.cpython-37.pyc │ ├── transformer.cpython-38.pyc │ ├── utils.cpython-37.pyc │ └── utils.cpython-38.pyc ├── cnn_lm.py ├── seq2seq.py ├── seq2seq_attention.py ├── skip_gram.py ├── transformer.py ├── utils.py └── visual.py ├── requirements.txt ├── seq2seq.py ├── seq2seq_attention.py ├── simple_realize ├── CBOW.py ├── README.md ├── cnn-lm.py ├── imgs │ ├── attention.gif │ ├── seq2seq-embedding.gif │ └── skip-gram.gif ├── seq2seq.py ├── seq2seq_attention.py ├── skip-gram.py └── transformer.py ├── skip-gram.py ├── tf_idf.py ├── tf_idf_sklearn.py ├── transformer.py ├── utils.py └── visual.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_* 2 | .idea/ 3 | visual/ 4 | MRPC/ 5 | img/ 6 | -------------------------------------------------------------------------------- /BERT.py: -------------------------------------------------------------------------------- 1 | # [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/pdf/1810.04805.pdf) 2 | import numpy as np 3 | import tensorflow as tf 4 | import utils # this refers to utils.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/) 5 | import time 6 | from GPT import GPT 7 | import os 8 | import pickle 9 | 10 | 11 | class BERT(GPT): 12 | def __init__(self, model_dim, max_len, n_layer, n_head, n_vocab, lr, max_seg=3, drop_rate=0.1, padding_idx=0): 13 | super().__init__(model_dim, max_len, n_layer, n_head, n_vocab, lr, max_seg, drop_rate, padding_idx) 14 | # I think task emb is not necessary for pretraining, 15 | # because the aim of all tasks is to train a universal sentence embedding 16 | # the body encoder is the same across all tasks, 17 | # and different output layer defines different task just like transfer learning. 18 | # finetuning replaces output layer and leaves the body encoder unchanged. 
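# Pretraining in this file combines two objectives (see step() below):
#     loss = mlm_loss + 0.2 * nsp_loss
# the masked-language-model loss is averaged only over the positions picked
# out by loss_mask, while the next-sentence-prediction loss is down-weighted
# by 0.2 -- a weighting chosen for this tutorial rather than taken from the
# original BERT paper.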
19 | 20 | # self.task_emb = keras.layers.Embedding( 21 | # input_dim=n_task, output_dim=model_dim, # [n_task, dim] 22 | # embeddings_initializer=tf.initializers.RandomNormal(0., 0.01), 23 | # ) 24 | 25 | def step(self, seqs, segs, seqs_, loss_mask, nsp_labels): 26 | with tf.GradientTape() as tape: 27 | mlm_logits, nsp_logits = self.call(seqs, segs, training=True) 28 | mlm_loss_batch = tf.boolean_mask(self.cross_entropy(seqs_, mlm_logits), loss_mask) 29 | mlm_loss = tf.reduce_mean(mlm_loss_batch) 30 | nsp_loss = tf.reduce_mean(self.cross_entropy(nsp_labels, nsp_logits)) 31 | loss = mlm_loss + 0.2 * nsp_loss 32 | grads = tape.gradient(loss, self.trainable_variables) 33 | self.opt.apply_gradients(zip(grads, self.trainable_variables)) 34 | return loss, mlm_logits 35 | 36 | def mask(self, seqs): 37 | mask = tf.cast(tf.math.equal(seqs, self.padding_idx), tf.float32) 38 | return mask[:, tf.newaxis, tf.newaxis, :] # [n, 1, 1, step] 39 | 40 | 41 | def _get_loss_mask(len_arange, seq, pad_id): 42 | rand_id = np.random.choice(len_arange, size=max(2, int(MASK_RATE * len(len_arange))), replace=False) 43 | loss_mask = np.full_like(seq, pad_id, dtype=np.bool) 44 | loss_mask[rand_id] = True 45 | return loss_mask[None, :], rand_id 46 | 47 | 48 | def do_mask(seq, len_arange, pad_id, mask_id): 49 | loss_mask, rand_id = _get_loss_mask(len_arange, seq, pad_id) 50 | seq[rand_id] = mask_id 51 | return loss_mask 52 | 53 | 54 | def do_replace(seq, len_arange, pad_id, word_ids): 55 | loss_mask, rand_id = _get_loss_mask(len_arange, seq, pad_id) 56 | seq[rand_id] = np.random.choice(word_ids, size=len(rand_id)) 57 | return loss_mask 58 | 59 | 60 | def do_nothing(seq, len_arange, pad_id): 61 | loss_mask, _ = _get_loss_mask(len_arange, seq, pad_id) 62 | return loss_mask 63 | 64 | 65 | def random_mask_or_replace(data, arange, batch_size): 66 | seqs, segs, xlen, nsp_labels = data.sample(batch_size) 67 | seqs_ = seqs.copy() 68 | p = np.random.random() 69 | if p < 0.7: 70 | # mask 71 | loss_mask = np.concatenate( 72 | [do_mask( 73 | seqs[i], 74 | np.concatenate((arange[:xlen[i, 0]], arange[xlen[i, 0] + 1:xlen[i].sum() + 1])), 75 | data.pad_id, 76 | data.v2i[""]) for i in range(len(seqs))], axis=0) 77 | elif p < 0.85: 78 | # do nothing 79 | loss_mask = np.concatenate( 80 | [do_nothing( 81 | seqs[i], 82 | np.concatenate((arange[:xlen[i, 0]], arange[xlen[i, 0] + 1:xlen[i].sum() + 1])), 83 | data.pad_id) for i in range(len(seqs))], axis=0) 84 | else: 85 | # replace 86 | loss_mask = np.concatenate( 87 | [do_replace( 88 | seqs[i], 89 | np.concatenate((arange[:xlen[i, 0]], arange[xlen[i, 0] + 1:xlen[i].sum() + 1])), 90 | data.pad_id, 91 | data.word_ids) for i in range(len(seqs))], axis=0) 92 | return seqs, segs, seqs_, loss_mask, xlen, nsp_labels 93 | 94 | 95 | def train(model, data, step=10000, name="bert"): 96 | t0 = time.time() 97 | arange = np.arange(0, data.max_len) 98 | for t in range(step): 99 | seqs, segs, seqs_, loss_mask, xlen, nsp_labels = random_mask_or_replace(data, arange, 16) 100 | loss, pred = model.step(seqs, segs, seqs_, loss_mask, nsp_labels) 101 | if t % 100 == 0: 102 | pred = pred[0].numpy().argmax(axis=1) 103 | t1 = time.time() 104 | print( 105 | "\n\nstep: ", t, 106 | "| time: %.2f" % (t1 - t0), 107 | "| loss: %.3f" % loss.numpy(), 108 | "\n| tgt: ", " ".join([data.i2v[i] for i in seqs[0][:xlen[0].sum()+1]]), 109 | "\n| prd: ", " ".join([data.i2v[i] for i in pred[:xlen[0].sum()+1]]), 110 | "\n| tgt word: ", [data.i2v[i] for i in seqs_[0]*loss_mask[0] if i != data.v2i[""]], 111 | "\n| prd word: ", [data.i2v[i] 
for i in pred*loss_mask[0] if i != data.v2i[""]], 112 | ) 113 | t0 = t1 114 | os.makedirs("./visual/models/%s" % name, exist_ok=True) 115 | model.save_weights("./visual/models/%s/model.ckpt" % name) 116 | 117 | 118 | def export_attention(model, data, name="bert"): 119 | model.load_weights("./visual/models/%s/model.ckpt" % name) 120 | 121 | # save attention matrix for visualization 122 | seqs, segs, xlen, nsp_labels = data.sample(32) 123 | model.call(seqs, segs, False) 124 | data = {"src": [[data.i2v[i] for i in seqs[j]] for j in range(len(seqs))], "attentions": model.attentions} 125 | path = "./visual/tmp/%s_attention_matrix.pkl" % name 126 | os.makedirs(os.path.dirname(path), exist_ok=True) 127 | with open(path, "wb") as f: 128 | pickle.dump(data, f) 129 | 130 | 131 | if __name__ == "__main__": 132 | utils.set_soft_gpu(True) 133 | MODEL_DIM = 256 134 | N_LAYER = 4 135 | LEARNING_RATE = 1e-4 136 | MASK_RATE = 0.15 137 | 138 | d = utils.MRPCData("./MRPC", 2000) 139 | print("num word: ", d.num_word) 140 | m = BERT( 141 | model_dim=MODEL_DIM, max_len=d.max_len, n_layer=N_LAYER, n_head=4, n_vocab=d.num_word, 142 | lr=LEARNING_RATE, max_seg=d.num_seg, drop_rate=0.2, padding_idx=d.v2i[""]) 143 | train(m, d, step=10000, name="bert") 144 | export_attention(m, d, "bert") 145 | 146 | -------------------------------------------------------------------------------- /BERT_window_mask.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from GPT import GPT, train, export_attention 3 | import utils # this refers to utils.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/) 4 | 5 | 6 | class BERT(GPT): 7 | def __init__(self, model_dim, max_len, n_layer, n_head, n_vocab, lr, max_seg=3, drop_rate=0.1, padding_idx=0): 8 | super().__init__(model_dim, max_len, n_layer, n_head, n_vocab, lr, max_seg, drop_rate, padding_idx) 9 | 10 | def mask(self, seqs): 11 | """ 12 | abcd-- 13 | a010011 14 | b001011 15 | c000111 16 | d000011 17 | -000001 18 | -000000 19 | 20 | a is a embedding for a-cd 21 | b is a embedding for ab-d 22 | c is a embedding for abc- 23 | later, b embedding will + another b embedding from previous residual input to predict c 24 | """ 25 | eye = tf.eye(self.max_len+1, batch_shape=[len(seqs)], dtype=tf.float32)[:, 1:, :-1] 26 | pad = tf.math.equal(seqs, self.padding_idx) 27 | mask = tf.where(pad[:, tf.newaxis, tf.newaxis, :], 1, eye[:, tf.newaxis, :, :]) 28 | return mask # [n, 1, step, step] 29 | 30 | 31 | if __name__ == "__main__": 32 | utils.set_soft_gpu(True) 33 | MODEL_DIM = 256 34 | N_LAYER = 4 35 | LEARNING_RATE = 1e-4 36 | d = utils.MRPCData("./MRPC", 2000) 37 | print("num word: ", d.num_word) 38 | m = BERT( 39 | model_dim=MODEL_DIM, max_len=d.max_len - 1, n_layer=N_LAYER, n_head=4, n_vocab=d.num_word, 40 | lr=LEARNING_RATE, max_seg=d.num_seg, drop_rate=0.2, padding_idx=d.pad_id) 41 | train(m, d, step=5000, name="bert_window_mask") 42 | export_attention(m, d, "bert_window_mask") 43 | 44 | -------------------------------------------------------------------------------- /BLEU.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import Counter 3 | 4 | 5 | def BLEU(references, generated, max_grams=4, weights=None): 6 | ref_list = [ref.lower().split(" ") for ref in references] 7 | gen = generated.lower().split(" ") 8 | cpn = np.empty((max_grams,), dtype=np.float32) 9 | for n in range(1, max_grams+1): 10 | gen_gram = [" ".join(gen[i:i+n]) for i in 
range(0, len(gen)-n+1)] 11 | refs_gram = [[" ".join(ref[i:i+n]) for i in range(0, len(ref)-n+1)] for ref in ref_list] 12 | g_counter = Counter(gen_gram) 13 | r_counters = [Counter(ref_gram) for ref_gram in refs_gram] 14 | count_clip = 0 15 | for k, v in g_counter.items(): 16 | count_clip += min(v, max([r.get(k, 0) for r in r_counters])) 17 | cpn[n-1] = count_clip/sum(g_counter.values()) 18 | 19 | ls = len(gen) 20 | lc = max([len(ref) for ref in ref_list]) 21 | brevity_penalty = 1 if lc > ls else np.exp(1-ls/lc) 22 | 23 | if weights is None: 24 | weights = np.ones_like(cpn) 25 | bleu = brevity_penalty * np.exp(np.mean(weights * np.log(cpn))) 26 | return bleu 27 | 28 | 29 | bleu = BLEU(["The cat is on the mat", "There is a cat on the mat"], "The cat is on the mat", 3) 30 | print(bleu) -------------------------------------------------------------------------------- /CBOW.py: -------------------------------------------------------------------------------- 1 | # [Efficient Estimation of Word Representations in Vector Space](https://arxiv.org/pdf/1301.3781.pdf) 2 | from tensorflow import keras 3 | import tensorflow as tf 4 | from utils import process_w2v_data # this refers to utils.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/) 5 | from visual import show_w2v_word_embedding # this refers to visual.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/) 6 | 7 | corpus = [ 8 | # numbers 9 | "5 2 4 8 6 2 3 6 4", 10 | "4 8 5 6 9 5 5 6", 11 | "1 1 5 2 3 3 8", 12 | "3 6 9 6 8 7 4 6 3", 13 | "8 9 9 6 1 4 3 4", 14 | "1 0 2 0 2 1 3 3 3 3 3", 15 | "9 3 3 0 1 4 7 8", 16 | "9 9 8 5 6 7 1 2 3 0 1 0", 17 | 18 | # alphabets, expecting that 9 is close to letters 19 | "a t g q e h 9 u f", 20 | "e q y u o i p s", 21 | "q o 9 p l k j o k k o p", 22 | "h g y i u t t a e q", 23 | "i k d q r e 9 e a d", 24 | "o p d g 9 s a f g a", 25 | "i u y g h k l a s w", 26 | "o l u y a o g f s", 27 | "o p i u y g d a s j d l", 28 | "u k i l o 9 l j s", 29 | "y g i s h k j l f r f", 30 | "i o h n 9 9 d 9 f a 9", 31 | ] 32 | 33 | 34 | class CBOW(keras.Model): 35 | def __init__(self, v_dim, emb_dim): 36 | super().__init__() 37 | self.v_dim = v_dim 38 | self.embeddings = keras.layers.Embedding( 39 | input_dim=v_dim, output_dim=emb_dim, # [n_vocab, emb_dim] 40 | embeddings_initializer=keras.initializers.RandomNormal(0., 0.1), 41 | ) 42 | 43 | # noise-contrastive estimation 44 | self.nce_w = self.add_weight( 45 | name="nce_w", shape=[v_dim, emb_dim], 46 | initializer=keras.initializers.TruncatedNormal(0., 0.1)) # [n_vocab, emb_dim] 47 | self.nce_b = self.add_weight( 48 | name="nce_b", shape=(v_dim,), 49 | initializer=keras.initializers.Constant(0.1)) # [n_vocab, ] 50 | 51 | self.opt = keras.optimizers.Adam(0.01) 52 | 53 | def call(self, x, training=None, mask=None): 54 | # x.shape = [n, skip_window*2] 55 | o = self.embeddings(x) # [n, skip_window*2, emb_dim] 56 | o = tf.reduce_mean(o, axis=1) # [n, emb_dim] 57 | return o 58 | 59 | # negative sampling: take one positive label and num_sampled negative labels to compute the loss 60 | # in order to reduce the computation of full softmax 61 | def loss(self, x, y, training=None): 62 | embedded = self.call(x, training) 63 | return tf.reduce_mean( 64 | tf.nn.nce_loss( 65 | weights=self.nce_w, biases=self.nce_b, labels=tf.expand_dims(y, axis=1), 66 | inputs=embedded, num_sampled=5, num_classes=self.v_dim)) 67 | 68 | def step(self, x, y): 69 | with tf.GradientTape() as tape: 70 | loss = self.loss(x, y, True) 71 | grads = tape.gradient(loss, self.trainable_variables) 72 | 
self.opt.apply_gradients(zip(grads, self.trainable_variables)) 73 | return loss.numpy() 74 | 75 | 76 | def train(model, data): 77 | for t in range(2500): 78 | bx, by = data.sample(8) 79 | loss = model.step(bx, by) 80 | if t % 200 == 0: 81 | print("step: {} | loss: {}".format(t, loss)) 82 | 83 | 84 | if __name__ == "__main__": 85 | d = process_w2v_data(corpus, skip_window=2, method="cbow") 86 | m = CBOW(d.num_word, 2) 87 | train(m, d) 88 | 89 | # plotting 90 | show_w2v_word_embedding(m, d, "./visual/results/cbow.png") -------------------------------------------------------------------------------- /ELMo.py: -------------------------------------------------------------------------------- 1 | # [Deep contextualized word representations](https://arxiv.org/pdf/1802.05365.pdf) 2 | from tensorflow import keras 3 | import tensorflow as tf 4 | import utils # this refers to utils.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/) 5 | import time 6 | import os 7 | 8 | 9 | class ELMo(keras.Model): 10 | def __init__(self, v_dim, emb_dim, units, n_layers, lr): 11 | super().__init__() 12 | self.n_layers = n_layers 13 | self.units = units 14 | 15 | # encoder 16 | self.word_embed = keras.layers.Embedding( 17 | input_dim=v_dim, output_dim=emb_dim, # [n_vocab, emb_dim] 18 | embeddings_initializer=keras.initializers.RandomNormal(0., 0.001), 19 | mask_zero=True, 20 | ) 21 | # forward lstm 22 | self.fs = [keras.layers.LSTM(units, return_sequences=True) for _ in range(n_layers)] 23 | self.f_logits = keras.layers.Dense(v_dim) 24 | # backward lstm 25 | self.bs = [keras.layers.LSTM(units, return_sequences=True, go_backwards=True) for _ in range(n_layers)] 26 | self.b_logits = keras.layers.Dense(v_dim) 27 | 28 | self.cross_entropy1 = keras.losses.SparseCategoricalCrossentropy(from_logits=True) 29 | self.cross_entropy2 = keras.losses.SparseCategoricalCrossentropy(from_logits=True) 30 | self.opt = keras.optimizers.Adam(lr) 31 | 32 | def call(self, seqs): 33 | embedded = self.word_embed(seqs) # [n, step, dim] 34 | """ 35 | 0123 forward 36 | 1234 forward predict 37 | 1234 backward 38 | 0123 backward predict 39 | """ 40 | mask = self.word_embed.compute_mask(seqs) 41 | fxs, bxs = [embedded[:, :-1]], [embedded[:, 1:]] 42 | for fl, bl in zip(self.fs, self.bs): 43 | fx = fl( 44 | fxs[-1], mask=mask[:, :-1], initial_state=fl.get_initial_state(fxs[-1]) 45 | ) # [n, step-1, dim] 46 | bx = bl( 47 | bxs[-1], mask=mask[:, 1:], initial_state=bl.get_initial_state(bxs[-1]) 48 | ) # [n, step-1, dim] 49 | fxs.append(fx) # predict 1234 50 | bxs.append(tf.reverse(bx, axis=[1])) # predict 0123 51 | return fxs, bxs 52 | 53 | def step(self, seqs): 54 | with tf.GradientTape() as tape: 55 | fxs, bxs = self.call(seqs) 56 | fo, bo = self.f_logits(fxs[-1]), self.b_logits(bxs[-1]) 57 | loss = (self.cross_entropy1(seqs[:, 1:], fo) + self.cross_entropy2(seqs[:, :-1], bo))/2 58 | grads = tape.gradient(loss, self.trainable_variables) 59 | self.opt.apply_gradients(zip(grads, self.trainable_variables)) 60 | return loss, (fo, bo) 61 | 62 | def get_emb(self, seqs): 63 | fxs, bxs = self.call(seqs) 64 | xs = [ 65 | tf.concat((fxs[0][:, 1:, :], bxs[0][:, :-1, :]), axis=2).numpy() # from word embedding 66 | ] + [ 67 | tf.concat((f[:, :-1, :], b[:, 1:, :]), axis=2).numpy() for f, b in zip(fxs[1:], bxs[1:])] # from sentence embedding 68 | for x in xs: 69 | print("layers shape=", x.shape) 70 | return xs 71 | 72 | 73 | def train(model, data, step): 74 | t0 = time.time() 75 | for t in range(step): 76 | seqs = data.sample(BATCH_SIZE) 77 | loss, (fo, bo) 
= model.step(seqs) 78 | if t % 80 == 0: 79 | fp = fo[0].numpy().argmax(axis=1) 80 | bp = bo[0].numpy().argmax(axis=1) 81 | t1 = time.time() 82 | print( 83 | "\n\nstep: ", t, 84 | "| time: %.2f" % (t1 - t0), 85 | "| loss: %.3f" % loss.numpy(), 86 | "\n| tgt: ", " ".join([data.i2v[i] for i in seqs[0] if i != data.pad_id]), 87 | "\n| f_prd: ", " ".join([data.i2v[i] for i in fp if i != data.pad_id]), 88 | "\n| b_prd: ", " ".join([data.i2v[i] for i in bp if i != data.pad_id]), 89 | ) 90 | t0 = t1 91 | os.makedirs("./visual/models/elmo", exist_ok=True) 92 | model.save_weights("./visual/models/elmo/model.ckpt") 93 | 94 | 95 | def export_w2v(model, data): 96 | model.load_weights("./visual/models/elmo/model.ckpt") 97 | emb = model.get_emb(data.sample(4)) 98 | print(emb) 99 | 100 | 101 | if __name__ == "__main__": 102 | utils.set_soft_gpu(True) 103 | UNITS = 256 104 | N_LAYERS = 2 105 | BATCH_SIZE = 16 106 | LEARNING_RATE = 2e-3 107 | d = utils.MRPCSingle("./MRPC", rows=2000) 108 | print("num word: ", d.num_word) 109 | m = ELMo(d.num_word, emb_dim=UNITS, units=UNITS, n_layers=N_LAYERS, lr=LEARNING_RATE) 110 | train(m, d, 10000) 111 | export_w2v(m, d) -------------------------------------------------------------------------------- /GPT.py: -------------------------------------------------------------------------------- 1 | # [Improving Language Understanding by Generative Pre-Training](https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf) 2 | import tensorflow as tf 3 | from tensorflow import keras 4 | import utils # this refers to utils.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/) 5 | import time 6 | from transformer import Encoder 7 | import pickle 8 | import os 9 | 10 | 11 | class GPT(keras.Model): 12 | def __init__(self, model_dim, max_len, n_layer, n_head, n_vocab, lr, max_seg=3, drop_rate=0.1, padding_idx=0): 13 | super().__init__() 14 | self.padding_idx = padding_idx 15 | self.n_vocab = n_vocab 16 | self.max_len = max_len 17 | 18 | # I think task emb is not necessary for pretraining, 19 | # because the aim of all tasks is to train a universal sentence embedding 20 | # the body encoder is the same across all tasks, 21 | # and different output layer defines different task just like transfer learning. 22 | # finetuning replaces output layer and leaves the body encoder unchanged. 
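# Illustrative sketch (not in the original code) of that fine-tuning idea: a
# hypothetical downstream classifier could keep the pretrained body and only
# attach a fresh head, e.g.
#     clf_head = keras.layers.Dense(n_classes)   # n_classes comes from the downstream task
#     z = self.encoder(self.input_emb(seqs, segs), training=False, mask=self.mask(seqs))
#     clf_logits = clf_head(tf.reshape(z, [z.shape[0], -1]))   # same pooling as task_nsp
# so a per-task embedding is unnecessary during pretraining.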
23 | 24 | # self.task_emb = keras.layers.Embedding( 25 | # input_dim=n_task, output_dim=model_dim, # [n_task, dim] 26 | # embeddings_initializer=tf.initializers.RandomNormal(0., 0.01), 27 | # ) 28 | 29 | self.word_emb = keras.layers.Embedding( 30 | input_dim=n_vocab, output_dim=model_dim, # [n_vocab, dim] 31 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.01), 32 | ) 33 | self.segment_emb = keras.layers.Embedding( 34 | input_dim=max_seg, output_dim=model_dim, # [max_seg, dim] 35 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.01), 36 | ) 37 | self.position_emb = self.add_weight( 38 | name="pos", shape=[1, max_len, model_dim], dtype=tf.float32, # [1, step, dim] 39 | initializer=keras.initializers.RandomNormal(0., 0.01)) 40 | self.encoder = Encoder(n_head, model_dim, drop_rate, n_layer) 41 | self.task_mlm = keras.layers.Dense(n_vocab) 42 | self.task_nsp = keras.layers.Dense(2) 43 | 44 | self.cross_entropy = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none") 45 | self.opt = keras.optimizers.Adam(lr) 46 | 47 | def call(self, seqs, segs, training=False): 48 | embed = self.input_emb(seqs, segs) # [n, step, dim] 49 | z = self.encoder(embed, training=training, mask=self.mask(seqs)) # [n, step, dim] 50 | mlm_logits = self.task_mlm(z) # [n, step, n_vocab] 51 | nsp_logits = self.task_nsp(tf.reshape(z, [z.shape[0], -1])) # [n, n_cls] 52 | return mlm_logits, nsp_logits 53 | 54 | def step(self, seqs, segs, seqs_, nsp_labels): 55 | with tf.GradientTape() as tape: 56 | mlm_logits, nsp_logits = self.call(seqs, segs, training=True) 57 | pad_mask = tf.math.not_equal(seqs_, self.padding_idx) 58 | pred_loss = tf.reduce_mean(tf.boolean_mask(self.cross_entropy(seqs_, mlm_logits), pad_mask)) 59 | nsp_loss = tf.reduce_mean(self.cross_entropy(nsp_labels, nsp_logits)) 60 | loss = pred_loss + 0.2 * nsp_loss 61 | grads = tape.gradient(loss, self.trainable_variables) 62 | self.opt.apply_gradients(zip(grads, self.trainable_variables)) 63 | return loss, mlm_logits 64 | 65 | def input_emb(self, seqs, segs): 66 | return self.word_emb(seqs) + self.segment_emb(segs) + self.position_emb # [n, step, dim] 67 | 68 | def mask(self, seqs): 69 | """ 70 | abcd-- 71 | a011111 72 | b001111 73 | c000111 74 | d000011 75 | -000011 76 | -000011 77 | 78 | force head not to see afterward. eg. 
79 | a is a embedding for a--- 80 | b is a embedding for ab-- 81 | c is a embedding for abc- 82 | later, b embedding will + b another embedding from previous residual input to predict c 83 | """ 84 | mask = 1 - tf.linalg.band_part(tf.ones((self.max_len, self.max_len)), -1, 0) 85 | pad = tf.math.equal(seqs, self.padding_idx) 86 | mask = tf.where(pad[:, tf.newaxis, tf.newaxis, :], 1, mask[tf.newaxis, tf.newaxis, :, :]) 87 | return mask # (step, step) 88 | 89 | @property 90 | def attentions(self): 91 | attentions = { 92 | "encoder": [l.mh.attention.numpy() for l in self.encoder.ls], 93 | } 94 | return attentions 95 | 96 | 97 | def train(model, data, step=10000, name="gpt"): 98 | t0 = time.time() 99 | for t in range(step): 100 | seqs, segs, xlen, nsp_labels = data.sample(16) 101 | loss, pred = model.step(seqs[:, :-1], segs[:, :-1], seqs[:, 1:], nsp_labels) 102 | if t % 100 == 0: 103 | pred = pred[0].numpy().argmax(axis=1) 104 | t1 = time.time() 105 | print( 106 | "\n\nstep: ", t, 107 | "| time: %.2f" % (t1 - t0), 108 | "| loss: %.3f" % loss.numpy(), 109 | "\n| tgt: ", " ".join([data.i2v[i] for i in seqs[0, 1:][:xlen[0].sum()+1]]), 110 | "\n| prd: ", " ".join([data.i2v[i] for i in pred[:xlen[0].sum()+1]]), 111 | ) 112 | t0 = t1 113 | os.makedirs("./visual/models/%s" % name, exist_ok=True) 114 | model.save_weights("./visual/models/%s/model.ckpt" % name) 115 | 116 | 117 | def export_attention(model, data, name="gpt"): 118 | model.load_weights("./visual/models/%s/model.ckpt" % name) 119 | 120 | # save attention matrix for visualization 121 | seqs, segs, xlen, nsp_labels = data.sample(32) 122 | model.call(seqs[:, :-1], segs[:, :-1], False) 123 | data = {"src": [[data.i2v[i] for i in seqs[j]] for j in range(len(seqs))], "attentions": model.attentions} 124 | path = "./visual/tmp/%s_attention_matrix.pkl" % name 125 | os.makedirs(os.path.dirname(path), exist_ok=True) 126 | with open(path, "wb") as f: 127 | pickle.dump(data, f) 128 | 129 | 130 | if __name__ == "__main__": 131 | utils.set_soft_gpu(True) 132 | MODEL_DIM = 256 133 | N_LAYER = 4 134 | LEARNING_RATE = 1e-4 135 | 136 | d = utils.MRPCData("./MRPC", 2000) 137 | print("num word: ", d.num_word) 138 | m = GPT( 139 | model_dim=MODEL_DIM, max_len=d.max_len - 1, n_layer=N_LAYER, n_head=4, n_vocab=d.num_word, 140 | lr=LEARNING_RATE, max_seg=d.num_seg, drop_rate=0.2, padding_idx=d.pad_id) 141 | train(m, d, step=5000, name="gpt") 142 | export_attention(m, d, name="gpt") 143 | 144 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Andrew Gambardella 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Natural Language Processing Tutorial 2 | 3 | Tutorial in Chinese can be found in [mofanpy.com](https://mofanpy.com/tutorials/machine-learning/nlp/). 4 | 5 | This repo includes many simple implementations of models in Neural Language Processing (NLP). 6 | 7 | All code implementations in this tutorial are organized as following: 8 | 9 | 1. Search Engine 10 | - [TF-IDF numpy / TF-IDF skearn](#TF-IDF) 11 | 2. Understand Word (W2V) 12 | - [Continuous Bag of Words (CBOW)](#Word2Vec) 13 | - [Skip-Gram](#Word2Vec) 14 | 3. Understand Sentence (Seq2Seq) 15 | - [seq2seq](#Seq2Seq) 16 | - [CNN language model](#CNNLanguageModel) 17 | 4. All about Attention 18 | - [seq2seq with attention](#Seq2SeqAttention) 19 | - [Transformer](#Transformer) 20 | 5. Pretrained Models 21 | - [ELMo](#ELMO) 22 | - [GPT](#GPT) 23 | - [BERT](#BERT) 24 | 25 | Thanks for the contribution made by [@W1Fl](https://github.com/W1Fl) with a simplified keras codes in [simple_realize](simple_realize). 26 | And the a [pytorch version of this NLP](/pytorch) tutorial made by [@ruifanxu](https://github.com/ruifan831). 27 | 28 | ## Installation 29 | 30 | ```shell script 31 | $ git clone https://github.com/MorvanZhou/NLP-Tutorials 32 | $ cd NLP-Tutorials/ 33 | $ sudo pip3 install -r requirements.txt 34 | ``` 35 | 36 | 37 | ## TF-IDF 38 | 39 | TF-IDF numpy [code](tf_idf.py) 40 | 41 | TF-IDF short sklearn [code](tf_idf_sklearn.py) 42 | 43 | 44 | image 45 | 46 | 47 | 48 | ## Word2Vec 49 | [Efficient Estimation of Word Representations in Vector Space](https://arxiv.org/pdf/1301.3781.pdf) 50 | 51 | Skip-Gram [code](skip-gram.py) 52 | 53 | CBOW [code](CBOW.py) 54 | 55 | 56 | image 57 | 58 | 59 | 60 | image 61 | 62 | 63 | 64 | image 65 | 66 | 67 | 68 | ## Seq2Seq 69 | [Sequence to Sequence Learning with Neural Networks](https://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf) 70 | 71 | Seq2Seq [code](seq2seq.py) 72 | 73 | 74 | image 75 | 76 | 77 | ## CNNLanguageModel 78 | [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/pdf/1408.5882.pdf) 79 | 80 | CNN language model [code](cnn-lm.py) 81 | 82 | 83 | image 84 | 85 | 86 | 87 | ## Seq2SeqAttention 88 | [Effective Approaches to Attention-based Neural Machine Translation](https://arxiv.org/pdf/1508.04025.pdf) 89 | 90 | Seq2Seq Attention [code](seq2seq_attention.py) 91 | 92 | 93 | image 94 | 95 | 96 | image 97 | 98 | 99 | 100 | 101 | ## Transformer 102 | [Attention Is All You Need](https://arxiv.org/pdf/1706.03762.pdf) 103 | 104 | Transformer [code](transformer.py) 105 | 106 | 107 | image 108 | 109 | 110 | image 111 | 112 | 113 | image 114 | 115 | 116 | 117 | ## ELMO 118 | [Deep contextualized word representations](https://arxiv.org/pdf/1802.05365.pdf) 119 | 120 | ELMO [code](ELMo.py) 121 | 122 | 123 | image 124 | 125 | 126 | image 127 | 128 | 129 | 130 | ## GPT 131 | [Improving Language Understanding by Generative Pre-Training](https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf) 132 | 133 | GPT [code](GPT.py) 134 | 135 | 136 | image 137 | 138 | 139 
| image 140 | 141 | 142 | 143 | ## BERT 144 | [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/pdf/1810.04805.pdf) 145 | 146 | BERT [code](BERT.py) 147 | 148 | My new attempt [Bert with window mask](BERT_window_mask.py) 149 | 150 | 151 | image 152 | 153 | 154 | image 155 | 156 | 157 | -------------------------------------------------------------------------------- /cnn-lm.py: -------------------------------------------------------------------------------- 1 | # a modification from [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/pdf/1408.5882.pdf) 2 | 3 | import tensorflow as tf 4 | from tensorflow import keras 5 | import numpy as np 6 | import utils # this refers to utils.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/) 7 | import tensorflow_addons as tfa 8 | 9 | 10 | class CNNTranslation(keras.Model): 11 | def __init__(self, enc_v_dim, dec_v_dim, emb_dim, units, max_pred_len, start_token, end_token): 12 | super().__init__() 13 | self.units = units 14 | 15 | # encoder 16 | self.enc_embeddings = keras.layers.Embedding( 17 | input_dim=enc_v_dim, output_dim=emb_dim, # [enc_n_vocab, emb_dim] 18 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.1), 19 | ) 20 | self.conv2ds = [ 21 | keras.layers.Conv2D(16, (n, emb_dim), padding="valid", activation=keras.activations.relu) 22 | for n in range(2, 5)] 23 | self.max_pools = [keras.layers.MaxPool2D((n, 1)) for n in [7, 6, 5]] 24 | self.encoder = keras.layers.Dense(units, activation=keras.activations.relu) 25 | 26 | # decoder 27 | self.dec_embeddings = keras.layers.Embedding( 28 | input_dim=dec_v_dim, output_dim=emb_dim, # [dec_n_vocab, emb_dim] 29 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.1), 30 | ) 31 | self.decoder_cell = keras.layers.LSTMCell(units=units) 32 | decoder_dense = keras.layers.Dense(dec_v_dim) 33 | # train decoder 34 | self.decoder_train = tfa.seq2seq.BasicDecoder( 35 | cell=self.decoder_cell, 36 | sampler=tfa.seq2seq.sampler.TrainingSampler(), # sampler for train 37 | output_layer=decoder_dense 38 | ) 39 | # predict decoder 40 | self.decoder_eval = tfa.seq2seq.BasicDecoder( 41 | cell=self.decoder_cell, 42 | sampler=tfa.seq2seq.sampler.GreedyEmbeddingSampler(), # sampler for predict 43 | output_layer=decoder_dense 44 | ) 45 | 46 | self.cross_entropy = keras.losses.SparseCategoricalCrossentropy(from_logits=True) 47 | self.opt = keras.optimizers.Adam(0.01) 48 | self.max_pred_len = max_pred_len 49 | self.start_token = start_token 50 | self.end_token = end_token 51 | 52 | def encode(self, x): 53 | embedded = self.enc_embeddings(x) # [n, step, emb] 54 | o = tf.expand_dims(embedded, axis=3) # [n, step=8, emb=16, 1] 55 | co = [conv2d(o) for conv2d in self.conv2ds] # [n, 7, 1, 16], [n, 6, 1, 16], [n, 5, 1, 16] 56 | co = [self.max_pools[i](co[i]) for i in range(len(co))] # [n, 1, 1, 16] * 3 57 | co = [tf.squeeze(c, axis=[1, 2]) for c in co] # [n, 16] * 3 58 | o = tf.concat(co, axis=1) # [n, 16*3] 59 | h = self.encoder(o) # [n, units] 60 | return [h, h] 61 | 62 | def inference(self, x): 63 | s = self.encode(x) 64 | done, i, s = self.decoder_eval.initialize( 65 | self.dec_embeddings.variables[0], 66 | start_tokens=tf.fill([x.shape[0], ], self.start_token), 67 | end_token=self.end_token, 68 | initial_state=s, 69 | ) 70 | pred_id = np.zeros((x.shape[0], self.max_pred_len), dtype=np.int32) 71 | for l in range(self.max_pred_len): 72 | o, s, i, done = self.decoder_eval.step( 73 | time=l, inputs=i, state=s, training=False) 74 | 
pred_id[:, l] = o.sample_id 75 | return pred_id 76 | 77 | def train_logits(self, x, y, seq_len): 78 | s = self.encode(x) 79 | dec_in = y[:, :-1] # ignore 80 | dec_emb_in = self.dec_embeddings(dec_in) 81 | o, _, _ = self.decoder_train(dec_emb_in, s, sequence_length=seq_len) 82 | logits = o.rnn_output 83 | return logits 84 | 85 | def step(self, x, y, seq_len): 86 | with tf.GradientTape() as tape: 87 | logits = self.train_logits(x, y, seq_len) 88 | dec_out = y[:, 1:] # ignore 89 | loss = self.cross_entropy(dec_out, logits) 90 | grads = tape.gradient(loss, self.trainable_variables) 91 | self.opt.apply_gradients(zip(grads, self.trainable_variables)) 92 | return loss.numpy() 93 | 94 | 95 | def train(): 96 | # get and process data 97 | data = utils.DateData(4000) 98 | print("Chinese time order: yy/mm/dd ", data.date_cn[:3], "\nEnglish time order: dd/M/yyyy ", data.date_en[:3]) 99 | print("vocabularies: ", data.vocab) 100 | print("x index sample: \n{}\n{}".format(data.idx2str(data.x[0]), data.x[0]), 101 | "\ny index sample: \n{}\n{}".format(data.idx2str(data.y[0]), data.y[0])) 102 | 103 | model = CNNTranslation( 104 | data.num_word, data.num_word, emb_dim=16, units=32, 105 | max_pred_len=11, start_token=data.start_token, end_token=data.end_token) 106 | 107 | # training 108 | for t in range(1500): 109 | bx, by, decoder_len = data.sample(32) 110 | loss = model.step(bx, by, decoder_len) 111 | if t % 70 == 0: 112 | target = data.idx2str(by[0, 1:-1]) 113 | pred = model.inference(bx[0:1]) 114 | res = data.idx2str(pred[0]) 115 | src = data.idx2str(bx[0]) 116 | print( 117 | "t: ", t, 118 | "| loss: %.3f" % loss, 119 | "| input: ", src, 120 | "| target: ", target, 121 | "| inference: ", res, 122 | ) 123 | 124 | 125 | if __name__ == "__main__": 126 | train() 127 | -------------------------------------------------------------------------------- /pytorch/BERT.py: -------------------------------------------------------------------------------- 1 | from pickle import load 2 | import numpy as np 3 | from torch import nn 4 | import torch 5 | from torch.utils.data import DataLoader 6 | from torch.nn.functional import cross_entropy,softmax, relu 7 | 8 | import utils 9 | from GPT import GPT 10 | import os 11 | import pickle 12 | 13 | MASK_RATE = 0.15 14 | 15 | class BERT(GPT): 16 | 17 | def __init__( 18 | self, model_dim, max_len, num_layer, num_head, n_vocab, lr, 19 | max_seg=3, drop_rate=0.2, padding_idx=0) -> None: 20 | super().__init__(model_dim, max_len, num_layer, num_head, n_vocab, lr, max_seg, drop_rate, padding_idx) 21 | 22 | def step(self,seqs,segs,seqs_, loss_mask,nsp_labels): 23 | device = next(self.parameters()).device 24 | self.opt.zero_grad() 25 | mlm_logits, nsp_logits = self(seqs, segs, training=True) # [n, step, n_vocab], [n, n_cls] 26 | mlm_loss = cross_entropy( 27 | torch.masked_select(mlm_logits,loss_mask).reshape(-1,mlm_logits.shape[2]), 28 | torch.masked_select(seqs_,loss_mask.squeeze(2)) 29 | ) 30 | nsp_loss = cross_entropy(nsp_logits,nsp_labels.reshape(-1)) 31 | loss = mlm_loss + 0.2 * nsp_loss 32 | loss.backward() 33 | self.opt.step() 34 | return loss.cpu().data.numpy(),mlm_logits 35 | 36 | def mask(self, seqs): 37 | mask = torch.eq(seqs,self.padding_idx) 38 | return mask[:, None, None, :] 39 | 40 | def _get_loss_mask(len_arange, seq, pad_id): 41 | rand_id = np.random.choice(len_arange, size=max(2, int(MASK_RATE * len(len_arange))), replace=False) 42 | loss_mask = np.full_like(seq, pad_id, dtype=np.bool) 43 | loss_mask[rand_id] = True 44 | return loss_mask[None, :], rand_id 45 | 46 | def 
do_mask(seq, len_arange, pad_id, mask_id): 47 | loss_mask, rand_id = _get_loss_mask(len_arange, seq, pad_id) 48 | seq[rand_id] = mask_id 49 | return loss_mask 50 | 51 | def do_replace(seq, len_arange, pad_id, word_ids): 52 | loss_mask, rand_id = _get_loss_mask(len_arange, seq, pad_id) 53 | seq[rand_id] = torch.from_numpy(np.random.choice(word_ids, size=len(rand_id))).type(torch.IntTensor) 54 | return loss_mask 55 | 56 | def do_nothing(seq, len_arange, pad_id): 57 | loss_mask, _ = _get_loss_mask(len_arange, seq, pad_id) 58 | return loss_mask 59 | 60 | def random_mask_or_replace(data,arange,dataset): 61 | seqs, segs,xlen,nsp_labels = data 62 | seqs_ = seqs.data.clone() 63 | p = np.random.random() 64 | if p < 0.7: 65 | # mask 66 | loss_mask = np.concatenate([ 67 | do_mask( 68 | seqs[i], 69 | np.concatenate((arange[:xlen[i,0]],arange[xlen[i,0]+1:xlen[i].sum()+1])), 70 | dataset.pad_id, 71 | dataset.mask_id 72 | ) 73 | for i in range(len(seqs))], axis=0) 74 | elif p < 0.85: 75 | # do nothing 76 | loss_mask = np.concatenate([ 77 | do_nothing( 78 | seqs[i], 79 | np.concatenate((arange[:xlen[i,0]],arange[xlen[i,0]+1:xlen[i].sum()+1])), 80 | dataset.pad_id 81 | ) 82 | for i in range(len(seqs))], axis=0) 83 | else: 84 | # replace 85 | loss_mask = np.concatenate([ 86 | do_replace( 87 | seqs[i], 88 | np.concatenate((arange[:xlen[i,0]],arange[xlen[i,0]+1:xlen[i].sum()+1])), 89 | dataset.pad_id, 90 | dataset.word_ids 91 | ) 92 | for i in range(len(seqs))], axis=0) 93 | loss_mask = torch.from_numpy(loss_mask).unsqueeze(2) 94 | return seqs, segs, seqs_, loss_mask, xlen, nsp_labels 95 | 96 | def train(): 97 | MODEL_DIM = 256 98 | N_LAYER = 4 99 | LEARNING_RATE = 1e-4 100 | dataset = utils.MRPCData("./MRPC",2000) 101 | print("num word: ",dataset.num_word) 102 | model = BERT( 103 | model_dim=MODEL_DIM, max_len=dataset.max_len, num_layer=N_LAYER, num_head=4, n_vocab=dataset.num_word, 104 | lr=LEARNING_RATE, max_seg=dataset.num_seg, drop_rate=0.2, padding_idx=dataset.pad_id 105 | ) 106 | if torch.cuda.is_available(): 107 | print("GPU train avaliable") 108 | device =torch.device("cuda") 109 | model = model.cuda() 110 | else: 111 | device = torch.device("cpu") 112 | model = model.cpu() 113 | 114 | loader = DataLoader(dataset,batch_size=32,shuffle=True) 115 | arange = np.arange(0,dataset.max_len) 116 | for epoch in range(500): 117 | for batch_idx, batch in enumerate(loader): 118 | seqs, segs, seqs_, loss_mask, xlen, nsp_labels = random_mask_or_replace(batch,arange,dataset) 119 | seqs, segs, seqs_, nsp_labels, loss_mask = seqs.type(torch.LongTensor).to(device), segs.type(torch.LongTensor).to(device),seqs_.type(torch.LongTensor).to(device),nsp_labels.to(device),loss_mask.to(device) 120 | loss, pred = model.step(seqs, segs, seqs_, loss_mask, nsp_labels) 121 | if batch_idx % 100 == 0: 122 | pred = pred[0].cpu().data.numpy().argmax(axis=1) 123 | print( 124 | "\n\nEpoch: ",epoch, 125 | "|batch: ", batch_idx, 126 | "| loss: %.3f" % loss, 127 | "\n| tgt: ", " ".join([dataset.i2v[i] for i in seqs[0].cpu().data.numpy()[:xlen[0].sum()+1]]), 128 | "\n| prd: ", " ".join([dataset.i2v[i] for i in pred[:xlen[0].sum()+1]]), 129 | "\n| tgt word: ", [dataset.i2v[i] for i in (seqs_[0]*loss_mask[0].view(-1)).cpu().data.numpy() if i != dataset.v2i[""]], 130 | "\n| prd word: ", [dataset.i2v[i] for i in pred*(loss_mask[0].view(-1).cpu().data.numpy()) if i != dataset.v2i[""]], 131 | ) 132 | os.makedirs("./visual/models/bert",exist_ok=True) 133 | torch.save(model.state_dict(),"./visual/models/bert/model.pth") 134 | 
export_attention(model,device,dataset) 135 | 136 | def export_attention(model,device,data,name="bert"): 137 | model.load_state_dict(torch.load("./visual/models/bert/model.pth",map_location=device)) 138 | seqs, segs,xlen,nsp_labels = data[:32] 139 | seqs, segs,xlen,nsp_labels = torch.from_numpy(seqs),torch.from_numpy(segs),torch.from_numpy(xlen),torch.from_numpy(nsp_labels) 140 | seqs, segs,nsp_labels = seqs.type(torch.LongTensor).to(device), segs.type(torch.LongTensor).to(device),nsp_labels.to(device) 141 | model(seqs,segs,False) 142 | seqs = seqs.cpu().data.numpy() 143 | data = {"src": [[data.i2v[i] for i in seqs[j]] for j in range(len(seqs))], "attentions": model.attentions} 144 | path = "./visual/tmp/%s_attention_matrix.pkl" % name 145 | os.makedirs(os.path.dirname(path), exist_ok=True) 146 | with open(path, "wb") as f: 147 | pickle.dump(data, f) 148 | if __name__ == "__main__": 149 | train() -------------------------------------------------------------------------------- /pytorch/CBOW.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch 3 | from torch.nn.functional import cross_entropy,softmax 4 | from utils import Dataset,process_w2v_data 5 | from visual import show_w2v_word_embedding 6 | 7 | corpus = [ 8 | # numbers 9 | "5 2 4 8 6 2 3 6 4", 10 | "4 8 5 6 9 5 5 6", 11 | "1 1 5 2 3 3 8", 12 | "3 6 9 6 8 7 4 6 3", 13 | "8 9 9 6 1 4 3 4", 14 | "1 0 2 0 2 1 3 3 3 3 3", 15 | "9 3 3 0 1 4 7 8", 16 | "9 9 8 5 6 7 1 2 3 0 1 0", 17 | 18 | # alphabets, expecting that 9 is close to letters 19 | "a t g q e h 9 u f", 20 | "e q y u o i p s", 21 | "q o 9 p l k j o k k o p", 22 | "h g y i u t t a e q", 23 | "i k d q r e 9 e a d", 24 | "o p d g 9 s a f g a", 25 | "i u y g h k l a s w", 26 | "o l u y a o g f s", 27 | "o p i u y g d a s j d l", 28 | "u k i l o 9 l j s", 29 | "y g i s h k j l f r f", 30 | "i o h n 9 9 d 9 f a 9", 31 | ] 32 | 33 | class CBOW(nn.Module): 34 | def __init__(self,v_dim,emb_dim): 35 | super().__init__() 36 | self.v_dim = v_dim 37 | self.embeddings = nn.Embedding(v_dim,emb_dim) 38 | self.embeddings.weight.data.normal_(0,0.1) 39 | 40 | # self.opt = torch.optim.Adam(0.01) 41 | self.hidden_out = nn.Linear(emb_dim,v_dim) 42 | self.opt = torch.optim.SGD(self.parameters(),momentum=0.9,lr=0.01) 43 | 44 | def forward(self,x,training=None, mask=None): 45 | # x.shape = [n,skip_window*2] 46 | o = self.embeddings(x) # [n, skip_window*2, emb_dim] 47 | o = torch.mean(o,dim=1) # [n, emb_dim] 48 | return o 49 | 50 | def loss(self, x, y, training=None): 51 | embedded = self(x,training) 52 | pred= self.hidden_out(embedded) 53 | return cross_entropy(pred,y) 54 | 55 | def step(self,x,y): 56 | self.opt.zero_grad() 57 | loss = self.loss(x,y,True) 58 | loss.backward() 59 | self.opt.step() 60 | return loss.detach().numpy() 61 | 62 | def train(model,data): 63 | if torch.cuda.is_available(): 64 | print("GPU train avaliable") 65 | device =torch.device("cuda") 66 | model = model.cuda() 67 | else: 68 | device = torch.device("cpu") 69 | model = model.cpu() 70 | for t in range(2500): 71 | bx,by = data.sample(16) 72 | bx,by = torch.from_numpy(bx).to(device), torch.from_numpy(by).to(device) 73 | loss = model.step(bx,by) 74 | if t%200 == 0: 75 | print(f"step: {t} | loss: {loss}") 76 | 77 | if __name__ == "__main__": 78 | d = process_w2v_data(corpus,skip_window=2, method="cbow") 79 | m = CBOW(d.num_word, 2) 80 | train(m,d) 81 | 82 | show_w2v_word_embedding(m,d,"./visual/results/cbow.png") 
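Below is a minimal usage sketch (not part of the repository) showing how the embedding table of the PyTorch CBOW above can be queried for nearest neighbours after training. It assumes the dataset returned by `process_w2v_data` exposes the `v2i`/`i2v` lookups used elsewhere in this folder; the helper name `nearest_words` is made up for illustration.

```python
import torch

def nearest_words(model, data, word, k=3):
    # cosine similarity between one word vector and the whole embedding table
    emb = model.embeddings.weight.data                # [n_vocab, emb_dim]
    query = emb[data.v2i[word]].unsqueeze(0)          # [1, emb_dim]
    sim = torch.cosine_similarity(query, emb, dim=1)  # [n_vocab]
    top = sim.argsort(descending=True)[1:k + 1]       # skip the word itself
    return [data.i2v[i.item()] for i in top]

# e.g. nearest_words(m, d, "9") is expected to return mostly letters on this toy corpus
```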
-------------------------------------------------------------------------------- /pytorch/ELMo.py: -------------------------------------------------------------------------------- 1 | from torch import nn,optim 2 | import torch 3 | from torch.nn.functional import cross_entropy,softmax 4 | import utils 5 | from torch.utils.data import DataLoader 6 | import os 7 | 8 | 9 | class ELMo(nn.Module): 10 | 11 | def __init__(self, v_dim, emb_dim, units, n_layers, lr): 12 | super().__init__() 13 | self.n_layers = n_layers 14 | self.units = units 15 | self.v_dim = v_dim 16 | 17 | # encoder 18 | self.word_embed = nn.Embedding(num_embeddings= v_dim, embedding_dim= emb_dim,padding_idx=0) 19 | self.word_embed.weight.data.normal_(0,0.1) 20 | 21 | # forward LSTM 22 | self.fs = nn.ModuleList( 23 | [nn.LSTM(input_size = emb_dim, hidden_size = units, batch_first=True) if i==0 else nn.LSTM(input_size = units, hidden_size = units, batch_first=True) for i in range(n_layers)]) 24 | self.f_logits = nn.Linear(in_features=units, out_features=v_dim) 25 | 26 | # backward LSTM 27 | self.bs = nn.ModuleList( 28 | [nn.LSTM(input_size = emb_dim, hidden_size = units, batch_first=True) if i==0 else nn.LSTM(input_size = units, hidden_size = units, batch_first=True) for i in range(n_layers)]) 29 | self.b_logits = nn.Linear(in_features=units, out_features=v_dim) 30 | 31 | self.opt = optim.Adam(self.parameters(),lr = lr) 32 | 33 | def forward(self,seqs): 34 | device = next(self.parameters()).device 35 | embedded = self.word_embed(seqs) # [n, step, emb_dim] 36 | fxs = [embedded[:, :-1, :]] # [n, step-1, emb_dim] 37 | bxs = [embedded[:, 1:, :]] # [n, step-1, emb_dim] 38 | (h_f,c_f) = (torch.zeros(1,seqs.shape[0],self.units).to(device),torch.zeros(1,seqs.shape[0],self.units).to(device)) 39 | (h_b,c_b) = (torch.zeros(1,seqs.shape[0],self.units).to(device),torch.zeros(1,seqs.shape[0],self.units).to(device)) 40 | for fl,bl in zip(self.fs,self.bs): 41 | output_f,(h_f,c_f) = fl(fxs[-1], (h_f,c_f)) # [n, step-1, units], [1, n, units] 42 | fxs.append(output_f) 43 | 44 | output_b,(h_b,c_b) = bl(torch.flip(bxs[-1],dims=[1,]), (h_b,c_b)) # [n, step-1, units], [1, n, units] 45 | bxs.append(torch.flip(output_b,dims=(1,))) 46 | return fxs,bxs 47 | 48 | def step(self,seqs): 49 | self.opt.zero_grad() 50 | fo,bo = self(seqs) 51 | fo = self.f_logits(fo[-1]) # [n, step-1, v_dim] 52 | bo = self.b_logits(bo[-1]) # [n, step-1, v_dim] 53 | loss = ( 54 | cross_entropy(fo.reshape(-1,self.v_dim),seqs[:,1:].reshape(-1)) + 55 | cross_entropy(bo.reshape(-1,self.v_dim),seqs[:,:-1].reshape(-1)))/2 56 | loss.backward() 57 | self.opt.step() 58 | return loss.cpu().detach().numpy(), (fo,bo) 59 | 60 | def get_emb(self,seqs): 61 | fxs,bxs = self(seqs) 62 | xs = [ 63 | torch.cat((fxs[0][:,1:,:],bxs[0][:,:-1,:]),dim=2).cpu().data.numpy() 64 | ] + [ 65 | torch.cat((f[:,1:,:],b[:,:-1,:]),dim=2).cpu().data.numpy() for f,b in zip(fxs[1:],bxs[1:]) 66 | ] 67 | for x in xs: 68 | print("layers shape=",x.shape) 69 | return xs 70 | 71 | 72 | 73 | def train(): 74 | dataset = utils.MRPCSingle("./MRPC",rows=2000) 75 | UNITS = 256 76 | N_LAYERS = 2 77 | BATCH_SIZE = 16 78 | LEARNING_RATE = 2e-3 79 | print('num word: ',dataset.num_word) 80 | model = ELMo(v_dim = dataset.num_word,emb_dim = UNITS, units=UNITS, n_layers=N_LAYERS,lr=LEARNING_RATE) 81 | if torch.cuda.is_available(): 82 | print("GPU train avaliable") 83 | device =torch.device("cuda") 84 | model = model.cuda() 85 | else: 86 | device = torch.device("cpu") 87 | model = model.cpu() 88 | loader = 
DataLoader(dataset,batch_size=BATCH_SIZE,shuffle=True) 89 | for i in range(10): 90 | for batch_idx , batch in enumerate(loader): 91 | batch = batch.type(torch.LongTensor).to(device) 92 | loss, (fo,bo) = model.step(batch) 93 | if batch_idx % 20 ==0: 94 | fp = fo[0].cpu().data.numpy().argmax(axis=1) 95 | bp = bo[0].cpu().data.numpy().argmax(axis=1) 96 | print("\n\nEpoch: ", i, 97 | "| batch: ", batch_idx, 98 | "| loss: %.3f" % loss, 99 | "\n| tgt: ", " ".join([dataset.i2v[i] for i in batch[0].cpu().data.numpy() if i != dataset.pad_id]), 100 | "\n| f_prd: ", " ".join([dataset.i2v[i] for i in fp if i != dataset.pad_id]), 101 | "\n| b_prd: ", " ".join([dataset.i2v[i] for i in bp if i != dataset.pad_id]), 102 | ) 103 | os.makedirs("./visual/models/elmo",exist_ok=True) 104 | torch.save(model.state_dict(),"./visual/models/elmo/model.pth") 105 | export_w2v(model,batch[:4],device) 106 | 107 | def export_w2v(model,data,device): 108 | model.load_state_dict(torch.load("./visual/models/elmo/model.pth",map_location=device)) 109 | emb = model.get_emb(data) 110 | print(emb) 111 | if __name__ == "__main__": 112 | train() 113 | -------------------------------------------------------------------------------- /pytorch/GPT.py: -------------------------------------------------------------------------------- 1 | from transformer import Encoder 2 | from torch import nn,optim 3 | from torch.nn.functional import cross_entropy,softmax, relu 4 | from torch.utils.data import DataLoader 5 | from torch.utils.data.dataloader import default_collate 6 | 7 | import torch 8 | import utils 9 | import os 10 | import pickle 11 | 12 | class GPT(nn.Module): 13 | 14 | def __init__(self, model_dim, max_len, num_layer, num_head, n_vocab, lr, max_seg=3, drop_rate=0.2,padding_idx=0): 15 | super().__init__() 16 | self.padding_idx = padding_idx 17 | self.n_vocab = n_vocab 18 | self.max_len = max_len 19 | 20 | self.word_emb = nn.Embedding(n_vocab,model_dim) 21 | self.word_emb.weight.data.normal_(0,0.1) 22 | 23 | self.segment_emb = nn.Embedding(num_embeddings= max_seg, embedding_dim=model_dim) 24 | self.segment_emb.weight.data.normal_(0,0.1) 25 | self.position_emb = torch.empty(1,max_len,model_dim) 26 | nn.init.kaiming_normal_(self.position_emb,mode='fan_out', nonlinearity='relu') 27 | self.position_emb = nn.Parameter(self.position_emb) 28 | 29 | 30 | self.encoder = Encoder(n_head=num_head, emb_dim=model_dim, drop_rate=drop_rate, n_layer=num_layer) 31 | self.task_mlm = nn.Linear(in_features=model_dim, out_features=n_vocab) 32 | self.task_nsp = nn.Linear(in_features=model_dim*self.max_len, out_features=2) 33 | 34 | self.opt = optim.Adam(self.parameters(),lr) 35 | 36 | def forward(self,seqs, segs, training=False): 37 | embed = self.input_emb(seqs, segs) 38 | z = self.encoder(embed, training, mask = self.mask(seqs)) # [n, step, model_dim] 39 | mlm_logits = self.task_mlm(z) # [n, step, n_vocab] 40 | nsp_logits = self.task_nsp(z.reshape(z.shape[0],-1)) # [n, n_cls] 41 | return mlm_logits, nsp_logits 42 | 43 | def step(self, seqs, segs, seqs_, nsp_labels): 44 | self.opt.zero_grad() 45 | mlm_logits, nsp_logits = self(seqs, segs, training=True) 46 | pred_loss = cross_entropy(mlm_logits.reshape(-1,self.n_vocab),seqs_.reshape(-1)) 47 | nsp_loss = cross_entropy(nsp_logits,nsp_labels.reshape(-1)) 48 | loss = pred_loss + 0.2 * nsp_loss 49 | loss.backward() 50 | self.opt.step() 51 | return loss.cpu().data.numpy(), mlm_logits 52 | 53 | def input_emb(self,seqs, segs): 54 | # device = next(self.parameters()).device 55 | # self.position_emb = 
self.position_emb.to(device) 56 | return self.word_emb(seqs) + self.segment_emb(segs) + self.position_emb 57 | 58 | def mask(self, seqs): 59 | device = next(self.parameters()).device 60 | batch_size, seq_len = seqs.shape 61 | mask = torch.triu(torch.ones((seq_len,seq_len), dtype=torch.long), diagonal=1).to(device) # [seq_len ,seq_len] 62 | pad = torch.eq(seqs,self.padding_idx) # [n, seq_len] 63 | mask = torch.where(pad[:,None,None,:],1,mask[None,None,:,:]).to(device) # [n, 1, seq_len, seq_len] 64 | return mask>0 # [n, 1, seq_len, seq_len] 65 | 66 | @property 67 | def attentions(self): 68 | attentions = { 69 | "encoder": [l.mh.attention.cpu().data.numpy() for l in self.encoder.encoder_layers] 70 | } 71 | return attentions 72 | 73 | def train(): 74 | MODEL_DIM = 256 75 | N_LAYER = 4 76 | LEARNING_RATE = 1e-4 77 | dataset = utils.MRPCData("./MRPC",2000) 78 | print("num word: ",dataset.num_word) 79 | model = GPT( 80 | model_dim=MODEL_DIM, max_len=dataset.max_len-1, num_layer=N_LAYER, num_head=4, n_vocab=dataset.num_word, 81 | lr=LEARNING_RATE, max_seg=dataset.num_seg, drop_rate=0.2, padding_idx=dataset.pad_id 82 | ) 83 | if torch.cuda.is_available(): 84 | print("GPU train avaliable") 85 | device =torch.device("cuda") 86 | model = model.cuda() 87 | else: 88 | device = torch.device("cpu") 89 | model = model.cpu() 90 | 91 | loader = DataLoader(dataset,batch_size=32,shuffle=True) 92 | 93 | for epoch in range(100): 94 | for batch_idx, batch in enumerate(loader): 95 | seqs, segs,xlen,nsp_labels = batch 96 | seqs, segs,nsp_labels = seqs.type(torch.LongTensor).to(device), segs.type(torch.LongTensor).to(device),nsp_labels.to(device) 97 | # pred: [n, step, n_vocab] 98 | loss,pred = model.step(seqs=seqs[:,:-1], segs= segs[:,:-1], seqs_=seqs[:,1:], nsp_labels=nsp_labels) 99 | if batch_idx %100 == 0: 100 | pred = pred[0].cpu().data.numpy().argmax(axis = 1) # [step] 101 | print( 102 | "Epoch: ",epoch, 103 | "|batch: ", batch_idx, 104 | "| loss: %.3f" % loss, 105 | "\n| tgt: ", " ".join([dataset.i2v[i] for i in seqs[0, 1:].cpu().data.numpy()[:xlen[0].sum()+1]]), 106 | "\n| prd: ", " ".join([dataset.i2v[i] for i in pred[:xlen[0].sum()+1]]), 107 | ) 108 | os.makedirs("./visual/models/gpt",exist_ok=True) 109 | torch.save(model.state_dict(),"./visual/models/gpt/model.pth") 110 | export_attention(model,device,dataset) 111 | 112 | def export_attention(model,device,data,name="gpt"): 113 | model.load_state_dict(torch.load("./visual/models/gpt/model.pth",map_location=device)) 114 | seqs, segs,xlen,nsp_labels = data[:32] 115 | seqs, segs,xlen,nsp_labels = torch.from_numpy(seqs),torch.from_numpy(segs),torch.from_numpy(xlen),torch.from_numpy(nsp_labels) 116 | seqs, segs,nsp_labels = seqs.type(torch.LongTensor).to(device), segs.type(torch.LongTensor).to(device),nsp_labels.to(device) 117 | model(seqs[:,:-1],segs[:,:-1],False) 118 | seqs = seqs.cpu().data.numpy() 119 | data = {"src": [[data.i2v[i] for i in seqs[j]] for j in range(len(seqs))], "attentions": model.attentions} 120 | path = "./visual/tmp/%s_attention_matrix.pkl" % name 121 | os.makedirs(os.path.dirname(path), exist_ok=True) 122 | with open(path, "wb") as f: 123 | pickle.dump(data, f) 124 | if __name__ == "__main__": 125 | train() 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /pytorch/README.md: -------------------------------------------------------------------------------- 1 | ### Dependencies: 2 | |Library| Version | 3 | |----- |-----| 4 | |PyTorch|1.7.1| 5 | |NumPy|1.18.1| 6 | 
|Matplotlib|3.3.4| -------------------------------------------------------------------------------- /pytorch/__pycache__/transformer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MorvanZhou/NLP-Tutorials/3aa02a13151b696e8c034da5f7bd2ef4294a5e5f/pytorch/__pycache__/transformer.cpython-37.pyc -------------------------------------------------------------------------------- /pytorch/__pycache__/transformer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MorvanZhou/NLP-Tutorials/3aa02a13151b696e8c034da5f7bd2ef4294a5e5f/pytorch/__pycache__/transformer.cpython-38.pyc -------------------------------------------------------------------------------- /pytorch/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MorvanZhou/NLP-Tutorials/3aa02a13151b696e8c034da5f7bd2ef4294a5e5f/pytorch/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /pytorch/__pycache__/utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MorvanZhou/NLP-Tutorials/3aa02a13151b696e8c034da5f7bd2ef4294a5e5f/pytorch/__pycache__/utils.cpython-38.pyc -------------------------------------------------------------------------------- /pytorch/cnn_lm.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch 3 | import numpy as np 4 | import utils 5 | from torch.utils.data import DataLoader 6 | from torch.nn.functional import cross_entropy,softmax, relu 7 | 8 | 9 | 10 | class CNNTranslation(nn.Module): 11 | 12 | def __init__(self,enc_v_dim, dec_v_dim, emb_dim, units, max_pred_len, start_token, end_token): 13 | super().__init__() 14 | self.units = units 15 | self.dec_v_dim = dec_v_dim 16 | 17 | 18 | # encoder 19 | self.enc_embeddings = nn.Embedding(enc_v_dim,emb_dim) 20 | self.enc_embeddings.weight.data.normal_(0,0.1) 21 | self.conv2ds = [nn.Conv2d(1,16,(n,emb_dim),padding=0) for n in range(2,5)] 22 | self.max_pools = [nn.MaxPool2d((n,1)) for n in [7,6,5]] 23 | self.encoder = nn.Linear(16*3,units) 24 | 25 | # decoder 26 | self.dec_embeddings = nn.Embedding(dec_v_dim,emb_dim) 27 | self.dec_embeddings.weight.data.normal_(0,0.1) 28 | self.decoder_cell = nn.LSTMCell(emb_dim,units) 29 | self.decoder_dense = nn.Linear(units,dec_v_dim) 30 | 31 | self.opt = torch.optim.Adam(self.parameters(),lr=0.001) 32 | self.max_pred_len = max_pred_len 33 | self.start_token = start_token 34 | self.end_token = end_token 35 | 36 | def encode(self,x): 37 | embedded = self.enc_embeddings(x) # [n, step, emb] 38 | o = torch.unsqueeze(embedded,1) # [n, 1, step=8, emb=16] 39 | co = [relu(conv2d(o)) for conv2d in self.conv2ds] # [n, 16, 7, 1], [n, 16, 6, 1], [n, 16, 5, 1] 40 | co = [self.max_pools[i](co[i]) for i in range(len(co))] # [n, 16, 1, 1] * 3 41 | co = [torch.squeeze(torch.squeeze(c,dim=3),dim=2) for c in co] # [n, 16] * 3 42 | o = torch.cat(co,dim=1) # [n, 16*3] 43 | h = self.encoder(o) # [n, units] 44 | return [h,h] 45 | 46 | def inference(self,x): 47 | self.eval() 48 | hx,cx = self.encode(x) 49 | start = torch.ones(x.shape[0],1) 50 | start[:,0] = torch.tensor(self.start_token) 51 | start= start.type(torch.LongTensor) 52 | dec_emb_in = self.dec_embeddings(start) # [n, step, emb] 53 | dec_emb_in 
= dec_emb_in.permute(1,0,2) # [step, n, emb] 54 | dec_in = dec_emb_in[0] # The first word use for decoding 55 | output = [] 56 | for i in range(self.max_pred_len): 57 | hx, cx = self.decoder_cell(dec_in, (hx, cx)) 58 | o = self.decoder_dense(hx) 59 | o = o.argmax(dim=1).view(-1,1) 60 | dec_in=self.dec_embeddings(o).permute(1,0,2)[0] 61 | output.append(o) 62 | output = torch.stack(output,dim=0) # [self.max_pred_len, n, 1] 63 | self.train() 64 | 65 | return output.permute(1,0,2).view(-1,self.max_pred_len) # [n, self.max_pred_len] 66 | 67 | def train_logit(self,x,y): 68 | hx,cx = self.encode(x) #[n, units] 69 | dec_in = y[:,:-1] 70 | dec_emb_in = self.dec_embeddings(dec_in) 71 | dec_emb_in = dec_emb_in.permute(1,0,2) 72 | output = [] 73 | for i in range(dec_emb_in.shape[0]): 74 | hx, cx = self.decoder_cell(dec_emb_in[i], (hx, cx)) 75 | o = self.decoder_dense(hx) 76 | output.append(o) 77 | output = torch.stack(output,dim=0) 78 | return output.permute(1,0,2) 79 | 80 | def step(self,x,y): 81 | self.opt.zero_grad() 82 | batch_size = x.shape[0] 83 | logit = self.train_logit(x,y) 84 | dec_out = y[:,1:] 85 | loss = cross_entropy(logit.reshape(-1,self.dec_v_dim),dec_out.reshape(-1)) 86 | loss.backward() 87 | self.opt.step() 88 | return loss.detach().numpy() 89 | 90 | 91 | def train(): 92 | dataset = utils.DateData(4000) 93 | print("Chinese time order: yy/mm/dd ",dataset.date_cn[:3],"\nEnglish time order: dd/M/yyyy", dataset.date_en[:3]) 94 | print("Vocabularies: ", dataset.vocab) 95 | print(f"x index sample: \n{dataset.idx2str(dataset.x[0])}\n{dataset.x[0]}", 96 | f"\ny index sample: \n{dataset.idx2str(dataset.y[0])}\n{dataset.y[0]}") 97 | loader = DataLoader(dataset,batch_size=32,shuffle=True) 98 | model = CNNTranslation(dataset.num_word,dataset.num_word,emb_dim=16,units=32,max_pred_len=11,start_token=dataset.start_token,end_token=dataset.end_token) 99 | 100 | for i in range(100): 101 | for batch_idx , batch in enumerate(loader): 102 | bx, by, decoder_len = batch 103 | loss = model.step(bx,by) 104 | if batch_idx % 70 == 0: 105 | target = dataset.idx2str(by[0, 1:-1].data.numpy()) 106 | pred = model.inference(bx[0:1]) 107 | res = dataset.idx2str(pred[0].data.numpy()) 108 | src = dataset.idx2str(bx[0].data.numpy()) 109 | print( 110 | "Epoch: ",i, 111 | "| t: ", batch_idx, 112 | "| loss: %.3f" % loss, 113 | "| input: ", src, 114 | "| target: ", target, 115 | "| inference: ", res, 116 | ) 117 | 118 | 119 | if __name__ == "__main__": 120 | train() -------------------------------------------------------------------------------- /pytorch/seq2seq.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch 3 | import numpy as np 4 | import utils 5 | from torch.utils.data import DataLoader 6 | from torch.nn.functional import cross_entropy,softmax 7 | 8 | 9 | class Seq2Seq(nn.Module): 10 | def __init__(self,enc_v_dim, dec_v_dim, emb_dim, units, max_pred_len, start_token, end_token): 11 | super().__init__() 12 | self.units = units 13 | self.dec_v_dim = dec_v_dim 14 | 15 | # encoder 16 | self.enc_embeddings = nn.Embedding(enc_v_dim,emb_dim) 17 | self.enc_embeddings.weight.data.normal_(0,0.1) 18 | self.encoder = nn.LSTM(emb_dim,units,1,batch_first=True) 19 | 20 | 21 | # decoder 22 | self.dec_embeddings = nn.Embedding(dec_v_dim,emb_dim) 23 | self.dec_embeddings.weight.data.normal_(0,0.1) 24 | self.decoder_cell = nn.LSTMCell(emb_dim,units) 25 | self.decoder_dense = nn.Linear(units,dec_v_dim) 26 | 27 | self.opt = 
torch.optim.Adam(self.parameters(),lr=0.001) 28 | self.max_pred_len = max_pred_len 29 | self.start_token = start_token 30 | self.end_token = end_token 31 | 32 | 33 | def encode(self,x): 34 | embedded = self.enc_embeddings(x) # [n, step, emb] 35 | hidden = (torch.zeros(1,x.shape[0],self.units),torch.zeros(1,x.shape[0],self.units)) 36 | o,(h,c) = self.encoder(embedded,hidden) 37 | return h,c 38 | 39 | def inference(self,x): 40 | self.eval() 41 | hx,cx = self.encode(x) 42 | hx,cx = hx[0],cx[0] 43 | start = torch.ones(x.shape[0],1) 44 | start[:,0] = torch.tensor(self.start_token) 45 | start= start.type(torch.LongTensor) 46 | dec_emb_in = self.dec_embeddings(start) 47 | dec_emb_in = dec_emb_in.permute(1,0,2) 48 | dec_in = dec_emb_in[0] 49 | output = [] 50 | for i in range(self.max_pred_len): 51 | hx, cx = self.decoder_cell(dec_in, (hx, cx)) 52 | o = self.decoder_dense(hx) 53 | o = o.argmax(dim=1).view(-1,1) 54 | dec_in=self.dec_embeddings(o).permute(1,0,2)[0] 55 | output.append(o) 56 | output = torch.stack(output,dim=0) 57 | self.train() 58 | 59 | return output.permute(1,0,2).view(-1,self.max_pred_len) 60 | 61 | 62 | def train_logit(self,x,y): 63 | hx,cx = self.encode(x) 64 | hx,cx = hx[0],cx[0] 65 | dec_in = y[:,:-1] 66 | dec_emb_in = self.dec_embeddings(dec_in) 67 | dec_emb_in = dec_emb_in.permute(1,0,2) 68 | output = [] 69 | for i in range(dec_emb_in.shape[0]): 70 | hx, cx = self.decoder_cell(dec_emb_in[i], (hx, cx)) 71 | o = self.decoder_dense(hx) 72 | output.append(o) 73 | output = torch.stack(output,dim=0) 74 | return output.permute(1,0,2) 75 | 76 | def step(self,x,y): 77 | self.opt.zero_grad() 78 | batch_size = x.shape[0] 79 | logit = self.train_logit(x,y) 80 | dec_out = y[:,1:] 81 | loss = cross_entropy(logit.reshape(-1,self.dec_v_dim),dec_out.reshape(-1)) 82 | loss.backward() 83 | self.opt.step() 84 | return loss.detach().numpy() 85 | 86 | def train(): 87 | dataset = utils.DateData(4000) 88 | print("Chinese time order: yy/mm/dd ",dataset.date_cn[:3],"\nEnglish time order: dd/M/yyyy", dataset.date_en[:3]) 89 | print("Vocabularies: ", dataset.vocab) 90 | print(f"x index sample: \n{dataset.idx2str(dataset.x[0])}\n{dataset.x[0]}", 91 | f"\ny index sample: \n{dataset.idx2str(dataset.y[0])}\n{dataset.y[0]}") 92 | loader = DataLoader(dataset,batch_size=32,shuffle=True) 93 | model = Seq2Seq(dataset.num_word,dataset.num_word,emb_dim=16,units=32,max_pred_len=11,start_token=dataset.start_token,end_token=dataset.end_token) 94 | for i in range(100): 95 | for batch_idx , batch in enumerate(loader): 96 | bx, by, decoder_len = batch 97 | bx = bx.type(torch.LongTensor) 98 | by = by.type(torch.LongTensor) 99 | loss = model.step(bx,by) 100 | if batch_idx % 70 == 0: 101 | target = dataset.idx2str(by[0, 1:-1].data.numpy()) 102 | pred = model.inference(bx[0:1]) 103 | res = dataset.idx2str(pred[0].data.numpy()) 104 | src = dataset.idx2str(bx[0].data.numpy()) 105 | print( 106 | "Epoch: ",i, 107 | "| t: ", batch_idx, 108 | "| loss: %.3f" % loss, 109 | "| input: ", src, 110 | "| target: ", target, 111 | "| inference: ", res, 112 | ) 113 | 114 | 115 | if __name__ == "__main__": 116 | train() 117 | 118 | 119 | 120 | -------------------------------------------------------------------------------- /pytorch/seq2seq_attention.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch 3 | import numpy as np 4 | import utils 5 | from torch.utils.data import DataLoader 6 | from torch.nn.functional import cross_entropy,softmax 7 | 8 | class 
Seq2Seq(nn.Module): 9 | def __init__(self,enc_v_dim, dec_v_dim, emb_dim, units, max_pred_len, start_token, end_token): 10 | super().__init__() 11 | self.units = units 12 | self.dec_v_dim = dec_v_dim 13 | 14 | # encoder 15 | self.enc_embeddings = nn.Embedding(enc_v_dim,emb_dim) 16 | self.enc_embeddings.weight.data.normal_(0,0.1) 17 | self.encoder = nn.LSTM(emb_dim,units,1,batch_first=True) 18 | 19 | # decoder 20 | self.dec_embeddings = nn.Embedding(dec_v_dim,emb_dim) 21 | self.attn = nn.Linear(units,units) 22 | self.decoder_cell = nn.LSTMCell(emb_dim,units) 23 | self.decoder_dense = nn.Linear(units*2,dec_v_dim) 24 | 25 | self.opt = torch.optim.Adam(self.parameters(),lr=0.001) 26 | self.max_pred_len = max_pred_len 27 | self.start_token = start_token 28 | self.end_token = end_token 29 | 30 | def encode(self,x): 31 | embedded = self.enc_embeddings(x) # [n, step, emb] 32 | hidden = (torch.zeros(1,x.shape[0],self.units),torch.zeros(1,x.shape[0],self.units)) 33 | o,(h,c) = self.encoder(embedded,hidden) # [n, step, units], [num_layers * num_directions, n, units] 34 | return o,h,c 35 | 36 | def inference(self,x,return_align=False): 37 | self.eval() 38 | o,hx,cx = self.encode(x) # [n, step, units], [num_layers * num_directions, n, units] * 2 39 | hx,cx = hx[0],cx[0] # [n, units] 40 | start = torch.ones(x.shape[0],1) # [n, 1] 41 | start[:,0] = torch.tensor(self.start_token) 42 | start= start.type(torch.LongTensor) 43 | dec_emb_in = self.dec_embeddings(start) # [n, 1, emb_dim] 44 | dec_emb_in = dec_emb_in.permute(1,0,2) # [1, n, emb_dim] 45 | dec_in = dec_emb_in[0] # [n, emb_dim] 46 | output = [] 47 | for i in range(self.max_pred_len): 48 | attn_prod = torch.matmul(self.attn(hx.unsqueeze(1)),o.permute(0,2,1)) # [n, 1, step] 49 | att_weight = softmax(attn_prod, dim=2) # [n, 1, step] 50 | context = torch.matmul(att_weight,o) # [n, 1, units] 51 | # attn_prod = torch.matmul(self.attn(o),hx.unsqueeze(2)) # [n, step, 1] 52 | # attn_weight = softmax(attn_prod,dim=1) # [n, step, 1] 53 | # context = torch.matmul(o.permute(0,2,1),attn_weight) # [n, units, 1] 54 | hx, cx = self.decoder_cell(dec_in, (hx, cx)) 55 | hc = torch.cat([context.squeeze(1),hx],dim=1) # [n, units *2] 56 | # hc = torch.cat([context.squeeze(2),hx],dim=1) # [n, units *2] 57 | result = self.decoder_dense(hc) 58 | result = result.argmax(dim=1).view(-1,1) 59 | dec_in=self.dec_embeddings(result).permute(1,0,2)[0] 60 | output.append(result) 61 | output = torch.stack(output,dim=0) 62 | self.train() 63 | 64 | return output.permute(1,0,2).view(-1,self.max_pred_len) 65 | 66 | def train_logit(self,x,y): 67 | o,hx,cx = self.encode(x) # [n, step, units], [num_layers * num_directions, n, units] * 2 68 | hx,cx = hx[0],cx[0] # [n, units] 69 | dec_in = y[:,:-1] # [n, step] 70 | dec_emb_in = self.dec_embeddings(dec_in) # [n, step, emb_dim] 71 | dec_emb_in = dec_emb_in.permute(1,0,2) # [step, n, emb_dim] 72 | output = [] 73 | for i in range(dec_emb_in.shape[0]): 74 | # General Attention: 75 | # score(ht,hs) = (ht^T)(Wa)hs 76 | # hs is the output from encoder 77 | # ht is the previous hidden state from decoder 78 | # self.attn(o): [n, step, units] 79 | attn_prod = torch.matmul(self.attn(hx.unsqueeze(1)),o.permute(0,2,1)) # [n, 1, step] 80 | att_weight = softmax(attn_prod, dim=2) # [n, 1, step] 81 | context = torch.matmul(att_weight,o) # [n, 1, units] 82 | # attn_prod = torch.matmul(self.attn(o),hx.unsqueeze(2)) # [n, step, 1] 83 | # attn_weight = softmax(attn_prod,dim=1) # [n, step, 1] 84 | # context = torch.matmul(o.permute(0,2,1),attn_weight) # [n, units, 1] 
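# Shape walk-through, using the defaults in train() below (units=32 and 8-character
# source dates, so the encoder output o is [n, 8, 32]):
#   self.attn(hx.unsqueeze(1))  -> [n, 1, 32]   i.e. the (Wa)ht term of the "general" score above
#   o.permute(0, 2, 1)          -> [n, 32, 8]
#   attn_prod                   -> [n, 1, 8]    one score per encoder step
# The commented-out variant computes the same scores with transposed shapes; either way
# softmax runs over the encoder steps, and the resulting context vector is concatenated
# with the new decoder state below before the projection to the vocabulary.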
85 | hx, cx = self.decoder_cell(dec_emb_in[i], (hx, cx)) # [n, units] 86 | hc = torch.cat([context.squeeze(1),hx],dim=1) # [n, units *2] 87 | # hc = torch.cat([context.squeeze(2),hx],dim=1) # [n, units *2] 88 | result = self.decoder_dense(hc) # [n, dec_v_dim] 89 | output.append(result) 90 | output = torch.stack(output,dim=0) # [step, n, dec_v_dim] 91 | return output.permute(1,0,2) # [n, step, dec_v_dim] 92 | 93 | def step(self,x,y): 94 | self.opt.zero_grad() 95 | batch_size = x.shape[0] 96 | logit = self.train_logit(x,y) 97 | dec_out = y[:,1:] 98 | loss = cross_entropy(logit.reshape(-1,self.dec_v_dim),dec_out.reshape(-1)) 99 | loss.backward() 100 | self.opt.step() 101 | return loss.detach().numpy() 102 | 103 | 104 | def train(): 105 | dataset = utils.DateData(4000) 106 | print("Chinese time order: yy/mm/dd ",dataset.date_cn[:3],"\nEnglish time order: dd/M/yyyy", dataset.date_en[:3]) 107 | print("Vocabularies: ", dataset.vocab) 108 | print(f"x index sample: \n{dataset.idx2str(dataset.x[0])}\n{dataset.x[0]}", 109 | f"\ny index sample: \n{dataset.idx2str(dataset.y[0])}\n{dataset.y[0]}") 110 | loader = DataLoader(dataset,batch_size=32,shuffle=True) 111 | model = Seq2Seq(dataset.num_word,dataset.num_word,emb_dim=16,units=32,max_pred_len=11,start_token=dataset.start_token,end_token=dataset.end_token) 112 | for i in range(100): 113 | for batch_idx , batch in enumerate(loader): 114 | bx, by, decoder_len = batch 115 | loss = model.step(bx,by) 116 | if batch_idx % 70 == 0: 117 | target = dataset.idx2str(by[0, 1:-1].data.numpy()) 118 | pred = model.inference(bx[0:1]) 119 | res = dataset.idx2str(pred[0].data.numpy()) 120 | src = dataset.idx2str(bx[0].data.numpy()) 121 | print( 122 | "Epoch: ",i, 123 | "| t: ", batch_idx, 124 | "| loss: %.3f" % loss, 125 | "| input: ", src, 126 | "| target: ", target, 127 | "| inference: ", res, 128 | ) 129 | # pkl_data = {"i2v": dataset.i2v, "x": dataset.x[:6], "y": dataset.y[:6], "align": model.inference(dataset.x[:6], return_align=True)} 130 | 131 | # with open("./visual/tmp/attention_align.pkl", "wb") as f: 132 | # pickle.dump(pkl_data, f) 133 | 134 | if __name__ == "__main__": 135 | train() -------------------------------------------------------------------------------- /pytorch/skip_gram.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch 3 | from torch.nn.functional import cross_entropy,softmax 4 | from utils import Dataset,process_w2v_data 5 | from visual import show_w2v_word_embedding 6 | 7 | corpus = [ 8 | # numbers 9 | "5 2 4 8 6 2 3 6 4", 10 | "4 8 5 6 9 5 5 6", 11 | "1 1 5 2 3 3 8", 12 | "3 6 9 6 8 7 4 6 3", 13 | "8 9 9 6 1 4 3 4", 14 | "1 0 2 0 2 1 3 3 3 3 3", 15 | "9 3 3 0 1 4 7 8", 16 | "9 9 8 5 6 7 1 2 3 0 1 0", 17 | 18 | # alphabets, expecting that 9 is close to letters 19 | "a t g q e h 9 u f", 20 | "e q y u o i p s", 21 | "q o 9 p l k j o k k o p", 22 | "h g y i u t t a e q", 23 | "i k d q r e 9 e a d", 24 | "o p d g 9 s a f g a", 25 | "i u y g h k l a s w", 26 | "o l u y a o g f s", 27 | "o p i u y g d a s j d l", 28 | "u k i l o 9 l j s", 29 | "y g i s h k j l f r f", 30 | "i o h n 9 9 d 9 f a 9", 31 | ] 32 | 33 | 34 | class SkipGram(nn.Module): 35 | 36 | def __init__(self,v_dim,emb_dim): 37 | super().__init__() 38 | self.v_dim = v_dim 39 | self.embeddings = nn.Embedding(v_dim,emb_dim) 40 | self.embeddings.weight.data.normal_(0,0.1) 41 | self.hidden_out = nn.Linear(emb_dim,v_dim) 42 | 43 | self.opt = torch.optim.Adam(self.parameters(),lr=0.01) 44 | 45 | def 
forward(self,x,training=None, mask=None): 46 | # x.shape = [n,] 47 | o = self.embeddings(x) # [n, emb_dim] 48 | return o 49 | 50 | def loss(self,x,y,training=None): 51 | embedded = self(x,training) 52 | pred= self.hidden_out(embedded) 53 | return cross_entropy(pred,y) 54 | 55 | def step(self,x,y): 56 | self.opt.zero_grad() 57 | loss = self.loss(x,y,True) 58 | loss.backward() 59 | self.opt.step() 60 | return loss.detach().numpy() 61 | 62 | def train(model,data): 63 | if torch.cuda.is_available(): 64 | print("GPU train avaliable") 65 | device =torch.device("cuda") 66 | model = model.cuda() 67 | else: 68 | device = torch.device("cpu") 69 | model = model.cpu() 70 | for t in range(2500): 71 | bx,by = data.sample(8) 72 | bx,by = torch.from_numpy(bx).to(device), torch.from_numpy(by).to(device) 73 | loss = model.step(bx,by) 74 | if t%200 == 0: 75 | print(f"step: {t} | loss: {loss}") 76 | 77 | 78 | if __name__ == "__main__": 79 | d = process_w2v_data(corpus,skip_window=2, method="skip_gram") 80 | m = SkipGram(d.num_word, 2) 81 | train(m,d) 82 | 83 | #plotting 84 | show_w2v_word_embedding(m,d,"./visual/results/skipgram.png") 85 | 86 | 87 | -------------------------------------------------------------------------------- /pytorch/transformer.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from torch.nn.functional import cross_entropy,softmax, relu 3 | import numpy as np 4 | import torch 5 | from torch.utils import data 6 | import utils 7 | from torch.utils.data import DataLoader 8 | import argparse 9 | 10 | MAX_LEN = 11 11 | 12 | class MultiHead(nn.Module): 13 | def __init__(self, n_head, model_dim, drop_rate): 14 | super().__init__() 15 | self.head_dim = model_dim // n_head 16 | self.n_head = n_head 17 | self.model_dim = model_dim 18 | self.wq = nn.Linear(model_dim, n_head * self.head_dim) 19 | self.wk = nn.Linear(model_dim, n_head * self.head_dim) 20 | self.wv = nn.Linear(model_dim, n_head * self.head_dim) 21 | 22 | self.o_dense = nn.Linear(model_dim, model_dim) 23 | self.o_drop = nn.Dropout(drop_rate) 24 | self.layer_norm = nn.LayerNorm(model_dim) 25 | self.attention = None 26 | 27 | def forward(self,q,k,v,mask,training): 28 | # residual connect 29 | residual = q 30 | dim_per_head= self.head_dim 31 | num_heads = self.n_head 32 | batch_size = q.size(0) 33 | 34 | # linear projection 35 | key = self.wk(k) # [n, step, num_heads * head_dim] 36 | value = self.wv(v) # [n, step, num_heads * head_dim] 37 | query = self.wq(q) # [n, step, num_heads * head_dim] 38 | 39 | # split by head 40 | query = self.split_heads(query) # [n, n_head, q_step, h_dim] 41 | key = self.split_heads(key) 42 | value = self.split_heads(value) # [n, h, step, h_dim] 43 | context = self.scaled_dot_product_attention(query,key, value, mask) # [n, q_step, h*dv] 44 | o = self.o_dense(context) # [n, step, dim] 45 | o = self.o_drop(o) 46 | 47 | o = self.layer_norm(residual+o) 48 | return o 49 | 50 | def split_heads(self, x): 51 | x = torch.reshape(x,(x.shape[0], x.shape[1], self.n_head, self.head_dim)) 52 | return x.permute(0,2,1,3) 53 | 54 | def scaled_dot_product_attention(self, q, k, v, mask=None): 55 | dk = torch.tensor(k.shape[-1]).type(torch.float) 56 | score = torch.matmul(q,k.permute(0,1,3,2)) / (torch.sqrt(dk) + 1e-8) # [n, n_head, step, step] 57 | if mask is not None: 58 | # change the value at masked position to negative infinity, 59 | # so the attention score at these positions after softmax will close to 0. 
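# (Reference: this is the scaled dot-product attention of "Attention Is All You Need",
#  softmax(Q K^T / sqrt(d_k)) V, with q/k/v already split per head as [n, n_head, step, head_dim]
#  and score as [n, n_head, step, step]; the small 1e-8 only guards the division.
#  Note that masked_fill_ is in-place, so `score` itself is overwritten before the softmax.)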
60 | score = score.masked_fill_(mask,-np.inf) 61 | self.attention = softmax(score,dim=-1) 62 | context = torch.matmul(self.attention,v) # [n, num_head, step, head_dim] 63 | context = context.permute(0,2,1,3) # [n, step, num_head, head_dim] 64 | context = context.reshape((context.shape[0], context.shape[1],-1)) 65 | return context # [n, step, model_dim] 66 | 67 | class PositionWiseFFN(nn.Module): 68 | def __init__(self,model_dim, dropout = 0.0): 69 | super().__init__() 70 | dff = model_dim*4 71 | self.l = nn.Linear(model_dim,dff) 72 | self.o = nn.Linear(dff,model_dim) 73 | self.dropout = nn.Dropout(dropout) 74 | self.layer_norm = nn.LayerNorm(model_dim) 75 | 76 | def forward(self,x): 77 | o = relu(self.l(x)) 78 | o = self.o(o) 79 | o = self.dropout(o) 80 | 81 | o = self.layer_norm(x + o) 82 | return o # [n, step, dim] 83 | 84 | 85 | 86 | class EncoderLayer(nn.Module): 87 | 88 | def __init__(self, n_head, emb_dim, drop_rate): 89 | super().__init__() 90 | self.mh = MultiHead(n_head, emb_dim, drop_rate) 91 | self.ffn = PositionWiseFFN(emb_dim,drop_rate) 92 | 93 | def forward(self, xz, training, mask): 94 | # xz: [n, step, emb_dim] 95 | context = self.mh(xz, xz, xz, mask, training) # [n, step, emb_dim] 96 | o = self.ffn(context) 97 | return o 98 | 99 | class Encoder(nn.Module): 100 | def __init__(self, n_head, emb_dim, drop_rate, n_layer): 101 | super().__init__() 102 | self.encoder_layers = nn.ModuleList( 103 | [EncoderLayer(n_head, emb_dim, drop_rate) for _ in range(n_layer)] 104 | ) 105 | def forward(self, xz, training, mask): 106 | 107 | for encoder in self.encoder_layers: 108 | xz = encoder(xz,training,mask) 109 | return xz # [n, step, emb_dim] 110 | 111 | class DecoderLayer(nn.Module): 112 | def __init__(self,n_head,model_dim,drop_rate): 113 | super().__init__() 114 | self.mh = nn.ModuleList([MultiHead(n_head, model_dim, drop_rate) for _ in range(2)]) 115 | self.ffn = PositionWiseFFN(model_dim,drop_rate) 116 | 117 | def forward(self,yz, xz, training, yz_look_ahead_mask,xz_pad_mask): 118 | dec_output = self.mh[0](yz, yz, yz, yz_look_ahead_mask, training) # [n, step, model_dim] 119 | 120 | dec_output = self.mh[1](dec_output, xz, xz, xz_pad_mask, training) # [n, step, model_dim] 121 | 122 | dec_output = self.ffn(dec_output) # [n, step, model_dim] 123 | 124 | return dec_output 125 | 126 | class Decoder(nn.Module): 127 | def __init__(self, n_head, model_dim, drop_rate, n_layer): 128 | super().__init__() 129 | 130 | self.num_layers = n_layer 131 | 132 | self.decoder_layers = nn.ModuleList( 133 | [DecoderLayer(n_head, model_dim, drop_rate) for _ in range(n_layer)] 134 | ) 135 | 136 | def forward(self, yz, xz, training, yz_look_ahead_mask, xz_pad_mask): 137 | for decoder in self.decoder_layers: 138 | yz = decoder(yz, xz, training, yz_look_ahead_mask, xz_pad_mask) 139 | return yz # [n, step, model_dim] 140 | 141 | class PositionEmbedding(nn.Module): 142 | def __init__(self, max_len, emb_dim, n_vocab): 143 | super().__init__() 144 | pos = np.expand_dims(np.arange(max_len),1) # [max_len, 1] 145 | pe = pos / np.power(1000, 2*np.expand_dims(np.arange(emb_dim)//2,0)/emb_dim) # [max_len, emb_dim] 146 | pe[:, 0::2] = np.sin(pe[:, 0::2]) 147 | pe[:, 1::2] = np.cos(pe[:, 1::2]) 148 | pe = np.expand_dims(pe,0) # [1, max_len, emb_dim] 149 | self.pe = torch.from_numpy(pe).type(torch.float32) 150 | self.embeddings = nn.Embedding(n_vocab,emb_dim) 151 | self.embeddings.weight.data.normal_(0,0.1) 152 | 153 | def forward(self, x): 154 | device = self.embeddings.weight.device 155 | self.pe = self.pe.to(device) 156 
| x_embed = self.embeddings(x) + self.pe # [n, step, emb_dim] 157 | return x_embed # [n, step, emb_dim] 158 | 159 | class Transformer(nn.Module): 160 | def __init__(self, n_vocab, max_len, n_layer = 6, emb_dim=512, n_head = 8, drop_rate=0.1, padding_idx=0): 161 | super().__init__() 162 | self.max_len = max_len 163 | self.padding_idx = torch.tensor(padding_idx) 164 | self.dec_v_emb = n_vocab 165 | 166 | self.embed = PositionEmbedding(max_len, emb_dim, n_vocab) 167 | self.encoder = Encoder(n_head, emb_dim, drop_rate, n_layer) 168 | self.decoder = Decoder(n_head, emb_dim, drop_rate, n_layer) 169 | self.o = nn.Linear(emb_dim,n_vocab) 170 | self.opt = torch.optim.Adam(self.parameters(),lr=0.002) 171 | 172 | def forward(self,x,y,training= None): 173 | x_embed, y_embed = self.embed(x), self.embed(y) # [n, step, emb_dim] * 2 174 | pad_mask = self._pad_mask(x) # [n, 1, step, step] 175 | encoded_z = self.encoder(x_embed,training,pad_mask) # [n, step, emb_dim] 176 | yz_look_ahead_mask = self._look_ahead_mask(y) # [n, 1, step, step] 177 | decoded_z = self.decoder(y_embed,encoded_z, training, yz_look_ahead_mask, pad_mask) # [n, step, emb_dim] 178 | o = self.o(decoded_z) # [n, step, n_vocab] 179 | return o 180 | 181 | def step(self, x, y): 182 | self.opt.zero_grad() 183 | logits = self(x,y[:, :-1],training=True) 184 | pad_mask = ~torch.eq(y[:,1:],self.padding_idx) # [n, seq_len] 185 | loss = cross_entropy(logits.reshape(-1, self.dec_v_emb),y[:,1:].reshape(-1)) 186 | loss.backward() 187 | self.opt.step() 188 | return loss.cpu().data.numpy(), logits 189 | 190 | def _pad_bool(self, seqs): 191 | o = torch.eq(seqs,self.padding_idx) # [n, step] 192 | return o 193 | def _pad_mask(self, seqs): 194 | len_q = seqs.size(1) 195 | mask = self._pad_bool(seqs).unsqueeze(1).expand(-1,len_q,-1) # [n, len_q, step] 196 | return mask.unsqueeze(1) # [n, 1, len_q, step] 197 | 198 | def _look_ahead_mask(self,seqs): 199 | device = next(self.parameters()).device 200 | batch_size, seq_len = seqs.shape 201 | mask = torch.triu(torch.ones((seq_len,seq_len), dtype=torch.long), diagonal=1).to(device) # [seq_len ,seq_len] 202 | mask = torch.where(self._pad_bool(seqs)[:,None,None,:],1,mask[None,None,:,:]).to(device) # [n, 1, seq_len, seq_len] 203 | return mask>0 # [n, 1, seq_len, seq_len] 204 | 205 | def translate(self, src, v2i, i2v): 206 | self.eval() 207 | device = next(self.parameters()).device 208 | src_pad = src 209 | # Initialize Decoder input by constructing a matrix M([n, self.max_len+1]) with initial value: 210 | # M[n,0] = start token id 211 | # M[n,:] = 0 212 | target = torch.from_numpy(utils.pad_zero(np.array([[v2i[""], ] for _ in range(len(src))]), self.max_len+1)).to(device) 213 | x_embed = self.embed(src_pad) 214 | encoded_z = self.encoder(x_embed,False,mask=self._pad_mask(src_pad)) 215 | for i in range(0,self.max_len): 216 | y = target[:,:-1] 217 | y_embed = self.embed(y) 218 | decoded_z = self.decoder(y_embed,encoded_z,False,self._look_ahead_mask(y),self._pad_mask(src_pad)) 219 | o = self.o(decoded_z)[:,i,:] 220 | idx = o.argmax(dim = 1).detach() 221 | # Update the Decoder input, to predict for the next position. 
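# (In other words, translate() decodes greedily: at step i the decoder re-reads everything
#  predicted so far via target[:, :-1], takes the argmax of the output at position i, and
#  writes it into target[:, i+1], spending one forward pass per output position up to max_len.)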
222 | target[:,i+1] = idx 223 | self.train() 224 | return target 225 | 226 | 227 | 228 | 229 | def train(emb_dim=32,n_layer=3,n_head=4): 230 | 231 | dataset = utils.DateData(4000) 232 | print("Chinese time order: yy/mm/dd ",dataset.date_cn[:3],"\nEnglish time order: dd/M/yyyy", dataset.date_en[:3]) 233 | print("Vocabularies: ", dataset.vocab) 234 | print(f"x index sample: \n{dataset.idx2str(dataset.x[0])}\n{dataset.x[0]}", 235 | f"\ny index sample: \n{dataset.idx2str(dataset.y[0])}\n{dataset.y[0]}") 236 | loader = DataLoader(dataset,batch_size=32,shuffle=True) 237 | model = Transformer(n_vocab=dataset.num_word, max_len=MAX_LEN, n_layer = n_layer, emb_dim=emb_dim, n_head = n_head, drop_rate=0.1, padding_idx=0) 238 | if torch.cuda.is_available(): 239 | print("GPU train avaliable") 240 | device =torch.device("cuda") 241 | model = model.cuda() 242 | else: 243 | device = torch.device("cpu") 244 | model = model.cpu() 245 | for i in range(100): 246 | for batch_idx , batch in enumerate(loader): 247 | bx, by, decoder_len = batch 248 | bx, by = torch.from_numpy(utils.pad_zero(bx,max_len = MAX_LEN)).type(torch.LongTensor).to(device), torch.from_numpy(utils.pad_zero(by,MAX_LEN+1)).type(torch.LongTensor).to(device) 249 | loss, logits = model.step(bx,by) 250 | if batch_idx%50 == 0: 251 | target = dataset.idx2str(by[0, 1:-1].cpu().data.numpy()) 252 | pred = model.translate(bx[0:1],dataset.v2i,dataset.i2v) 253 | res = dataset.idx2str(pred[0].cpu().data.numpy()) 254 | src = dataset.idx2str(bx[0].cpu().data.numpy()) 255 | print( 256 | "Epoch: ",i, 257 | "| t: ", batch_idx, 258 | "| loss: %.3f" % loss, 259 | "| input: ", src, 260 | "| target: ", target, 261 | "| inference: ", res, 262 | ) 263 | 264 | if __name__ == "__main__": 265 | parser = argparse.ArgumentParser() 266 | parser.add_argument("--emb_dim",type=int, help="change the model dimension") 267 | parser.add_argument("--n_layer",type=int, help="change the number of layers in Encoder and Decoder") 268 | parser.add_argument("--n_head",type=int, help="change the number of heads in MultiHeadAttention") 269 | 270 | args = parser.parse_args() 271 | args = dict(filter(lambda x: x[1],vars(args).items())) 272 | train(**args) -------------------------------------------------------------------------------- /pytorch/utils.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import numpy as np 3 | from torch.utils.data import Dataset as tDataset 4 | import datetime 5 | import os 6 | import re 7 | import pandas as pd 8 | import requests 9 | import torch 10 | 11 | PAD_ID = 0 12 | class DateData(tDataset): 13 | def __init__(self,n): 14 | np.random.seed(1) 15 | self.date_cn = [] 16 | self.date_en = [] 17 | for timestamp in np.random.randint(143835585, 2043835585, n): 18 | date = datetime.datetime.fromtimestamp(timestamp) 19 | self.date_cn.append(date.strftime("%y-%m-%d")) 20 | self.date_en.append(date.strftime("%d/%b/%Y")) 21 | self.vocab= set( 22 | [str(i) for i in range(0,10)] + ["-","/","",""] + [i.split("/")[1] for i in self.date_en] 23 | ) 24 | self.v2i = {v:i for i,v in enumerate(sorted(list(self.vocab)), start=1)} 25 | self.v2i[""] = PAD_ID 26 | self.vocab.add("") 27 | self.i2v = {i:v for v,i in self.v2i.items()} 28 | self.x,self.y=[],[] 29 | for cn,en in zip(self.date_cn,self.date_en): 30 | self.x.append([self.v2i[v] for v in cn]) 31 | self.y.append([self.v2i[""], ] + [self.v2i[v] for v in en[:3]] + [ 32 | self.v2i[en[3:6]]] + [self.v2i[v] for v in en[6:]] + [self.v2i[""],]) 33 | self.x,self.y = 
np.array(self.x),np.array(self.y) 34 | self.start_token = self.v2i[""] 35 | self.end_token = self.v2i[""] 36 | 37 | def __len__(self): 38 | return len(self.x) 39 | 40 | @property 41 | def num_word(self): 42 | return len(self.vocab) 43 | 44 | def __getitem__(self, index): 45 | return self.x[index],self.y[index], len(self.y[index])-1 46 | 47 | def idx2str(self,idx): 48 | x=[] 49 | for i in idx: 50 | x.append(self.i2v[i]) 51 | if i == self.end_token: 52 | break 53 | return "".join(x) 54 | 55 | def pad_zero(seqs, max_len): 56 | padded = np.full((len(seqs), max_len), fill_value=PAD_ID, dtype=np.int32) 57 | for i, seq in enumerate(seqs): 58 | padded[i, :len(seq)] = seq 59 | return padded 60 | 61 | class Dataset: 62 | def __init__(self,x,y,v2i,i2v): 63 | self.x,self.y = x,y 64 | self.v2i, self.i2v = v2i,i2v 65 | self.vocab = v2i.keys() 66 | 67 | def sample(self,n): 68 | b_idx = np.random.randint(0,len(self.x),n) 69 | bx,by = self.x[b_idx],self.y[b_idx] 70 | return bx,by 71 | @property 72 | def num_word(self): 73 | return len(self.v2i) 74 | 75 | def process_w2v_data(corpus,skip_window=2,method = "skip_gram"): 76 | all_words = [sentence.split(" ") for sentence in corpus] 77 | # groups all the iterables together and produces a single iterable as output 78 | all_words = np.array(list(itertools.chain(*all_words))) 79 | vocab,v_count = np.unique(all_words,return_counts=True) 80 | vocab = vocab[np.argsort(v_count)[::-1]] 81 | 82 | print("All vocabularies are sorted by frequency in decresing oreder") 83 | v2i = {v:i for i,v in enumerate(vocab)} 84 | i2v = {i:v for v,i in v2i.items()} 85 | 86 | pairs = [] 87 | js = [i for i in range(-skip_window,skip_window+1) if i!=0] 88 | 89 | for c in corpus: 90 | words = c.split(" ") 91 | w_idx = [v2i[w] for w in words] 92 | if method == "skip_gram": 93 | for i in range(len(w_idx)): 94 | for j in js: 95 | if i+j<0 or i+j>= len(w_idx): 96 | continue 97 | pairs.append((w_idx[i],w_idx[i+j])) 98 | elif method.lower() == "cbow": 99 | for i in range(skip_window,len(w_idx)-skip_window): 100 | context = [] 101 | for j in js: 102 | context.append(w_idx[i+j]) 103 | pairs.append(context+[w_idx[i]]) 104 | else: 105 | raise ValueError 106 | 107 | pairs = np.array(pairs) 108 | print("5 expample pairs:\n",pairs[:5]) 109 | if method.lower()=="skip_gram": 110 | x,y = pairs[:,0],pairs[:,1] 111 | elif method.lower() == "cbow": 112 | x,y = pairs[:,:-1],pairs[:,-1] 113 | else: 114 | raise ValueError 115 | return Dataset(x,y,v2i,i2v) 116 | 117 | def maybe_download_mrpc(save_dir="./MRPC/", proxy=None): 118 | train_url = 'https://mofanpy.com/static/files/MRPC/msr_paraphrase_train.txt' 119 | test_url = 'https://mofanpy.com/static/files/MRPC/msr_paraphrase_test.txt' 120 | os.makedirs(save_dir, exist_ok=True) 121 | proxies = {"http": proxy, "https": proxy} 122 | for url in [train_url, test_url]: 123 | raw_path = os.path.join(save_dir, url.split("/")[-1]) 124 | if not os.path.isfile(raw_path): 125 | print("downloading from %s" % url) 126 | r = requests.get(url, proxies=proxies) 127 | with open(raw_path, "w", encoding="utf-8") as f: 128 | f.write(r.text.replace('"', "")) 129 | print("completed") 130 | 131 | 132 | def _text_standardize(text): 133 | text = re.sub(r'—', '-', text) 134 | text = re.sub(r'–', '-', text) 135 | text = re.sub(r'―', '-', text) 136 | text = re.sub(r" \d+(,\d+)?(\.\d+)? 
", " ", text) 137 | text = re.sub(r" \d+-+?\d*", " -", text) 138 | return text.strip() 139 | 140 | 141 | def _process_mrpc(dir="./MRPC", rows=None): 142 | data = {"train": None, "test": None} 143 | files = os.listdir(dir) 144 | for f in files: 145 | df = pd.read_csv(os.path.join(dir, f), sep='\t', nrows=rows) 146 | k = "train" if "train" in f else "test" 147 | data[k] = {"is_same": df.iloc[:, 0].values, "s1": df["#1 String"].values, "s2": df["#2 String"].values} 148 | vocab = set() 149 | for n in ["train", "test"]: 150 | for m in ["s1", "s2"]: 151 | for i in range(len(data[n][m])): 152 | data[n][m][i] = _text_standardize(data[n][m][i].lower()) 153 | cs = data[n][m][i].split(" ") 154 | vocab.update(set(cs)) 155 | v2i = {v: i for i, v in enumerate(sorted(vocab), start=1)} 156 | v2i[""] = PAD_ID 157 | v2i[""] = len(v2i) 158 | v2i[""] = len(v2i) 159 | v2i[""] = len(v2i) 160 | i2v = {i: v for v, i in v2i.items()} 161 | for n in ["train", "test"]: 162 | for m in ["s1", "s2"]: 163 | data[n][m+"id"] = [[v2i[v] for v in c.split(" ")] for c in data[n][m]] 164 | return data, v2i, i2v 165 | 166 | class MRPCData(tDataset): 167 | num_seg = 3 168 | pad_id = PAD_ID 169 | 170 | def __init__(self, data_dir="./MRPC/", rows=None, proxy=None): 171 | maybe_download_mrpc(save_dir=data_dir, proxy=proxy) 172 | data, self.v2i, self.i2v = _process_mrpc(data_dir, rows) 173 | self.max_len = max( 174 | [len(s1) + len(s2) + 3 for s1, s2 in zip( 175 | data["train"]["s1id"] + data["test"]["s1id"], data["train"]["s2id"] + data["test"]["s2id"])]) 176 | 177 | self.xlen = np.array([ 178 | [ 179 | len(data["train"]["s1id"][i]), len(data["train"]["s2id"][i]) 180 | ] for i in range(len(data["train"]["s1id"]))], dtype=int) 181 | x = [ 182 | [self.v2i[""]] + data["train"]["s1id"][i] + [self.v2i[""]] + data["train"]["s2id"][i] + [self.v2i[""]] 183 | for i in range(len(self.xlen)) 184 | ] 185 | self.x = pad_zero(x, max_len=self.max_len) 186 | self.nsp_y = data["train"]["is_same"][:, None] 187 | 188 | self.seg = np.full(self.x.shape, self.num_seg-1, np.int32) 189 | for i in range(len(x)): 190 | si = self.xlen[i][0] + 2 191 | self.seg[i, :si] = 0 192 | si_ = si + self.xlen[i][1] + 1 193 | self.seg[i, si:si_] = 1 194 | 195 | self.word_ids = np.array(list(set(self.i2v.keys()).difference( 196 | [self.v2i[v] for v in ["", "", ""]]))) 197 | 198 | def __getitem__(self,idx): 199 | return self.x[idx], self.seg[idx], self.xlen[idx], self.nsp_y[idx] 200 | 201 | def sample(self, n): 202 | bi = np.random.randint(0, self.x.shape[0], size=n) 203 | bx, bs, bl, by = self.x[bi], self.seg[bi], self.xlen[bi], self.nsp_y[bi] 204 | return bx, bs, bl, by 205 | 206 | @property 207 | def num_word(self): 208 | return len(self.v2i) 209 | 210 | def __len__(self): 211 | return len(self.x) 212 | 213 | @property 214 | def mask_id(self): 215 | return self.v2i[""] 216 | 217 | class MRPCSingle(tDataset): 218 | pad_id = PAD_ID 219 | 220 | def __init__(self,data_dir="./MRPC/",rows = None, proxy= None): 221 | maybe_download_mrpc(save_dir=data_dir, proxy=proxy) 222 | 223 | data, self.v2i, self.i2v = _process_mrpc(data_dir, rows) 224 | 225 | self.max_len = max([len(s) + 2 for s in data["train"]["s1id"] + data["train"]["s2id"]]) 226 | x = [ 227 | [self.v2i[""]] + data["train"]["s1id"][i] + [self.v2i[""]] 228 | for i in range(len(data["train"]["s1id"])) 229 | ] 230 | x += [ 231 | [self.v2i[""]] + data["train"]["s2id"][i] + [self.v2i[""]] 232 | for i in range(len(data["train"]["s2id"])) 233 | ] 234 | self.x = pad_zero(x, max_len=self.max_len) 235 | self.word_ids = 
np.array(list(set(self.i2v.keys()).difference([self.v2i[""]]))) 236 | def sample(self, n): 237 | bi = np.random.randint(0, self.x.shape[0], size=n) 238 | bx = self.x[bi] 239 | return bx 240 | 241 | @property 242 | def num_word(self): 243 | return len(self.v2i) 244 | 245 | def __getitem__(self, index): 246 | return self.x[index] 247 | 248 | 249 | def __len__(self): 250 | return len(self.x) 251 | -------------------------------------------------------------------------------- /pytorch/visual.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | def show_w2v_word_embedding(model,data,path): 4 | word_emb = model.embeddings.weight.data.numpy() 5 | for i in range(data.num_word): 6 | c = "blue" 7 | try: 8 | int(data.i2v[i]) 9 | except: 10 | c = "red" 11 | 12 | plt.text(word_emb[i,0],word_emb[i,1], s= data.i2v[i], color=c,weight = "bold") 13 | 14 | plt.xlim(word_emb[:,0].min() - 0.5, word_emb[:,0].max()+0.5) 15 | plt.ylim(word_emb[:,1].min() - 0.5, word_emb[:,1].max()+0.5) 16 | plt.xticks(()) 17 | plt.yticks(()) 18 | plt.xlabel("embedding dim1") 19 | plt.ylabel("embedding dim2") 20 | plt.savefig(path,dpi=300,format="png") 21 | plt.show() -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib==3.2.1 2 | numpy==1.18.5 3 | pandas==1.0.4 4 | requests==2.23.0 5 | sklearn==0.23.0 6 | tensorflow==2.3.1 7 | tensorflow-addons==0.10.0 8 | -------------------------------------------------------------------------------- /seq2seq.py: -------------------------------------------------------------------------------- 1 | # [Sequence to Sequence Learning with Neural Networks](https://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf) 2 | import tensorflow as tf 3 | from tensorflow import keras 4 | import numpy as np 5 | import utils # this refers to utils.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/) 6 | import tensorflow_addons as tfa 7 | 8 | 9 | class Seq2Seq(keras.Model): 10 | def __init__(self, enc_v_dim, dec_v_dim, emb_dim, units, max_pred_len, start_token, end_token): 11 | super().__init__() 12 | self.units = units 13 | 14 | # encoder 15 | self.enc_embeddings = keras.layers.Embedding( 16 | input_dim=enc_v_dim, output_dim=emb_dim, # [enc_n_vocab, emb_dim] 17 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.1), 18 | ) 19 | self.encoder = keras.layers.LSTM(units=units, return_sequences=True, return_state=True) 20 | 21 | # decoder 22 | self.dec_embeddings = keras.layers.Embedding( 23 | input_dim=dec_v_dim, output_dim=emb_dim, # [dec_n_vocab, emb_dim] 24 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.1), 25 | ) 26 | self.decoder_cell = keras.layers.LSTMCell(units=units) 27 | decoder_dense = keras.layers.Dense(dec_v_dim) 28 | # train decoder 29 | self.decoder_train = tfa.seq2seq.BasicDecoder( 30 | cell=self.decoder_cell, 31 | sampler=tfa.seq2seq.sampler.TrainingSampler(), # sampler for train 32 | output_layer=decoder_dense 33 | ) 34 | # predict decoder 35 | self.decoder_eval = tfa.seq2seq.BasicDecoder( 36 | cell=self.decoder_cell, 37 | sampler=tfa.seq2seq.sampler.GreedyEmbeddingSampler(), # sampler for predict 38 | output_layer=decoder_dense 39 | ) 40 | 41 | self.cross_entropy = keras.losses.SparseCategoricalCrossentropy(from_logits=True) 42 | self.opt = keras.optimizers.Adam(0.01) 43 | self.max_pred_len = max_pred_len 44 | 
self.start_token = start_token 45 | self.end_token = end_token 46 | 47 | def encode(self, x): 48 | embedded = self.enc_embeddings(x) 49 | init_s = [tf.zeros((x.shape[0], self.units)), tf.zeros((x.shape[0], self.units))] 50 | o, h, c = self.encoder(embedded, initial_state=init_s) 51 | return [h, c] 52 | 53 | def inference(self, x): 54 | s = self.encode(x) 55 | done, i, s = self.decoder_eval.initialize( 56 | self.dec_embeddings.variables[0], 57 | start_tokens=tf.fill([x.shape[0], ], self.start_token), 58 | end_token=self.end_token, 59 | initial_state=s, 60 | ) 61 | pred_id = np.zeros((x.shape[0], self.max_pred_len), dtype=np.int32) 62 | for l in range(self.max_pred_len): 63 | o, s, i, done = self.decoder_eval.step( 64 | time=l, inputs=i, state=s, training=False) 65 | pred_id[:, l] = o.sample_id 66 | return pred_id 67 | 68 | def train_logits(self, x, y, seq_len): 69 | s = self.encode(x) 70 | dec_in = y[:, :-1] # ignore 71 | dec_emb_in = self.dec_embeddings(dec_in) 72 | o, _, _ = self.decoder_train(dec_emb_in, s, sequence_length=seq_len) 73 | logits = o.rnn_output 74 | return logits 75 | 76 | def step(self, x, y, seq_len): 77 | with tf.GradientTape() as tape: 78 | logits = self.train_logits(x, y, seq_len) 79 | dec_out = y[:, 1:] # ignore 80 | loss = self.cross_entropy(dec_out, logits) 81 | grads = tape.gradient(loss, self.trainable_variables) 82 | self.opt.apply_gradients(zip(grads, self.trainable_variables)) 83 | return loss.numpy() 84 | 85 | 86 | def train(): 87 | # get and process data 88 | data = utils.DateData(4000) 89 | print("Chinese time order: yy/mm/dd ", data.date_cn[:3], "\nEnglish time order: dd/M/yyyy ", data.date_en[:3]) 90 | print("vocabularies: ", data.vocab) 91 | print("x index sample: \n{}\n{}".format(data.idx2str(data.x[0]), data.x[0]), 92 | "\ny index sample: \n{}\n{}".format(data.idx2str(data.y[0]), data.y[0])) 93 | 94 | model = Seq2Seq( 95 | data.num_word, data.num_word, emb_dim=16, units=32, 96 | max_pred_len=11, start_token=data.start_token, end_token=data.end_token) 97 | 98 | # training 99 | for t in range(1500): 100 | bx, by, decoder_len = data.sample(32) 101 | loss = model.step(bx, by, decoder_len) 102 | if t % 70 == 0: 103 | target = data.idx2str(by[0, 1:-1]) 104 | pred = model.inference(bx[0:1]) 105 | res = data.idx2str(pred[0]) 106 | src = data.idx2str(bx[0]) 107 | print( 108 | "t: ", t, 109 | "| loss: %.3f" % loss, 110 | "| input: ", src, 111 | "| target: ", target, 112 | "| inference: ", res, 113 | ) 114 | 115 | 116 | if __name__ == "__main__": 117 | train() 118 | -------------------------------------------------------------------------------- /seq2seq_attention.py: -------------------------------------------------------------------------------- 1 | # [Effective Approaches to Attention-based Neural Machine Translation](https://arxiv.org/pdf/1508.04025.pdf) 2 | import tensorflow as tf 3 | from tensorflow import keras 4 | import numpy as np 5 | import utils # this refers to utils.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/) 6 | import tensorflow_addons as tfa 7 | import pickle 8 | 9 | 10 | class Seq2Seq(keras.Model): 11 | def __init__(self, enc_v_dim, dec_v_dim, emb_dim, units, attention_layer_size, max_pred_len, start_token, end_token): 12 | super().__init__() 13 | self.units = units 14 | 15 | # encoder 16 | self.enc_embeddings = keras.layers.Embedding( 17 | input_dim=enc_v_dim, output_dim=emb_dim, # [enc_n_vocab, emb_dim] 18 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.1), 19 | ) 20 | self.encoder = keras.layers.LSTM(units=units, 
return_sequences=True, return_state=True) 21 | 22 | # decoder 23 | self.attention = tfa.seq2seq.LuongAttention(units, memory=None, memory_sequence_length=None) 24 | self.decoder_cell = tfa.seq2seq.AttentionWrapper( 25 | cell=keras.layers.LSTMCell(units=units), 26 | attention_mechanism=self.attention, 27 | attention_layer_size=attention_layer_size, 28 | alignment_history=True, # for attention visualization 29 | ) 30 | 31 | self.dec_embeddings = keras.layers.Embedding( 32 | input_dim=dec_v_dim, output_dim=emb_dim, # [dec_n_vocab, emb_dim] 33 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.1), 34 | ) 35 | decoder_dense = keras.layers.Dense(dec_v_dim) # output layer 36 | 37 | # train decoder 38 | self.decoder_train = tfa.seq2seq.BasicDecoder( 39 | cell=self.decoder_cell, 40 | sampler=tfa.seq2seq.sampler.TrainingSampler(), # sampler for train 41 | output_layer=decoder_dense 42 | ) 43 | self.cross_entropy = keras.losses.SparseCategoricalCrossentropy(from_logits=True) 44 | self.opt = keras.optimizers.Adam(0.05, clipnorm=5.0) 45 | 46 | # predict decoder 47 | self.decoder_eval = tfa.seq2seq.BasicDecoder( 48 | cell=self.decoder_cell, 49 | sampler=tfa.seq2seq.sampler.GreedyEmbeddingSampler(), # sampler for predict 50 | output_layer=decoder_dense 51 | ) 52 | 53 | # prediction restriction 54 | self.max_pred_len = max_pred_len 55 | self.start_token = start_token 56 | self.end_token = end_token 57 | 58 | def encode(self, x): 59 | o = self.enc_embeddings(x) 60 | init_s = [tf.zeros((x.shape[0], self.units)), tf.zeros((x.shape[0], self.units))] 61 | o, h, c = self.encoder(o, initial_state=init_s) 62 | return o, h, c 63 | 64 | def set_attention(self, x): 65 | o, h, c = self.encode(x) 66 | # encoder output for attention to focus 67 | self.attention.setup_memory(o) 68 | # wrap state by attention wrapper 69 | s = self.decoder_cell.get_initial_state(batch_size=x.shape[0], dtype=tf.float32).clone(cell_state=[h, c]) 70 | return s 71 | 72 | def inference(self, x, return_align=False): 73 | s = self.set_attention(x) 74 | done, i, s = self.decoder_eval.initialize( 75 | self.dec_embeddings.variables[0], 76 | start_tokens=tf.fill([x.shape[0], ], self.start_token), 77 | end_token=self.end_token, 78 | initial_state=s, 79 | ) 80 | pred_id = np.zeros((x.shape[0], self.max_pred_len), dtype=np.int32) 81 | for l in range(self.max_pred_len): 82 | o, s, i, done = self.decoder_eval.step( 83 | time=l, inputs=i, state=s, training=False) 84 | pred_id[:, l] = o.sample_id 85 | if return_align: 86 | return np.transpose(s.alignment_history.stack().numpy(), (1, 0, 2)) 87 | else: 88 | s.alignment_history.mark_used() # otherwise gives warning 89 | return pred_id 90 | 91 | def train_logits(self, x, y, seq_len): 92 | s = self.set_attention(x) 93 | dec_in = y[:, :-1] # ignore 94 | dec_emb_in = self.dec_embeddings(dec_in) 95 | o, _, _ = self.decoder_train(dec_emb_in, s, sequence_length=seq_len) 96 | logits = o.rnn_output 97 | return logits 98 | 99 | def step(self, x, y, seq_len): 100 | with tf.GradientTape() as tape: 101 | logits = self.train_logits(x, y, seq_len) 102 | dec_out = y[:, 1:] # ignore 103 | loss = self.cross_entropy(dec_out, logits) 104 | grads = tape.gradient(loss, self.trainable_variables) 105 | self.opt.apply_gradients(zip(grads, self.trainable_variables)) 106 | return loss.numpy() 107 | 108 | 109 | def train(): 110 | # get and process data 111 | data = utils.DateData(2000) 112 | print("Chinese time order: yy/mm/dd ", data.date_cn[:3], "\nEnglish time order: dd/M/yyyy ", data.date_en[:3]) 113 | print("vocabularies: 
", data.vocab) 114 | print("x index sample: \n{}\n{}".format(data.idx2str(data.x[0]), data.x[0]), 115 | "\ny index sample: \n{}\n{}".format(data.idx2str(data.y[0]), data.y[0])) 116 | 117 | model = Seq2Seq( 118 | data.num_word, data.num_word, emb_dim=12, units=14, attention_layer_size=16, 119 | max_pred_len=11, start_token=data.start_token, end_token=data.end_token) 120 | 121 | # training 122 | for t in range(1000): 123 | bx, by, decoder_len = data.sample(64) 124 | loss = model.step(bx, by, decoder_len) 125 | if t % 70 == 0: 126 | target = data.idx2str(by[0, 1:-1]) 127 | pred = model.inference(bx[0:1]) 128 | res = data.idx2str(pred[0]) 129 | src = data.idx2str(bx[0]) 130 | print( 131 | "t: ", t, 132 | "| loss: %.5f" % loss, 133 | "| input: ", src, 134 | "| target: ", target, 135 | "| inference: ", res, 136 | ) 137 | 138 | pkl_data = {"i2v": data.i2v, "x": data.x[:6], "y": data.y[:6], "align": model.inference(data.x[:6], return_align=True)} 139 | 140 | with open("./visual/tmp/attention_align.pkl", "wb") as f: 141 | pickle.dump(pkl_data, f) 142 | 143 | 144 | if __name__ == "__main__": 145 | train() 146 | -------------------------------------------------------------------------------- /simple_realize/CBOW.py: -------------------------------------------------------------------------------- 1 | # [Efficient Estimation of Word Representations in Vector Space](https://arxiv.org/pdf/1301.3781.pdf) 2 | ''' 3 | created by YuYang github.com/W1Fl 4 | ''' 5 | from io import BytesIO 6 | 7 | import imageio 8 | import matplotlib.pyplot as plt 9 | import tensorflow as tf 10 | from tensorflow import keras 11 | 12 | from utils import process_w2v_data 13 | 14 | Batch_size = 32 15 | Learn_rate = 0.01 16 | Epochs = 256 17 | DataSize = 512 18 | 19 | corpus = [ 20 | # numbers 21 | "5 2 4 8 6 2 3 6 4", 22 | "4 8 5 6 9 5 5 6", 23 | "1 1 5 2 3 3 8", 24 | "3 6 9 6 8 7 4 6 3", 25 | "8 9 9 6 1 4 3 4", 26 | "1 0 2 0 2 1 3 3 3 3 3", 27 | "9 3 3 0 1 4 7 8", 28 | "9 9 8 5 6 7 1 2 3 0 1 0", 29 | 30 | # alphabets, expecting that 9 is close to letters 31 | "a t g q e h 9 u f", 32 | "e q y u o i p s", 33 | "q o 9 p l k j o k k o p", 34 | "h g y i u t t a e q", 35 | "i k d q r e 9 e a d", 36 | "o p d g 9 s a f g a", 37 | "i u y g h k l a s w", 38 | "o l u y a o g f s", 39 | "o p i u y g d a s j d l", 40 | "u k i l o 9 l j s", 41 | "y g i s h k j l f r f", 42 | "i o h n 9 9 d 9 f a 9", 43 | ] 44 | 45 | SkipGram = lambda v_dim, emb_dim: keras.Sequential([ 46 | keras.layers.Embedding( 47 | input_dim=v_dim, output_dim=emb_dim, # [n_vocab, emb_dim] 48 | embeddings_initializer=keras.initializers.RandomNormal(0., 0.1), 49 | ), 50 | keras.layers.Lambda(lambda x:tf.reduce_mean(x,1)) 51 | ]) 52 | 53 | 54 | class myTensorboard(keras.callbacks.TensorBoard): 55 | def __init__(self, data, log_dir='logs/CBOW', histogram_freq=1, write_graph=True, write_images=True, 56 | embeddings_freq=10, **kwargs): 57 | super().__init__(log_dir=log_dir, histogram_freq=histogram_freq, write_graph=write_graph, 58 | write_images=write_images, embeddings_freq=embeddings_freq, **kwargs) 59 | self.buffer = BytesIO() 60 | self.data = data 61 | 62 | 63 | def plot(self, data): 64 | word_emb = model.layers[0].get_weights()[0] 65 | for i in range(data.num_word): 66 | c = "blue" 67 | try: 68 | int(data.i2v[i]) 69 | except ValueError: 70 | c = "red" 71 | plt.text(word_emb[i, 0], word_emb[i, 1], s=data.i2v[i], color=c, weight="bold") 72 | plt.xlim(word_emb[:, 0].min() - .5, word_emb[:, 0].max() + .5) 73 | plt.ylim(word_emb[:, 1].min() - .5, word_emb[:, 1].max() + .5) 74 | 
plt.xticks(()) 75 | plt.yticks(()) 76 | plt.xlabel("embedding dim1") 77 | plt.ylabel("embedding dim2") 78 | plt.savefig(self.buffer, format='png') 79 | plt.close() 80 | self.buffer.seek(0) 81 | 82 | def on_epoch_end(self, epoch, logs=None): 83 | writer = self._get_writer(self._train_run_name) 84 | if (not epoch % 1): 85 | self.plot(self.data) 86 | with writer.as_default(): 87 | tf.summary.image('embedding', imageio.imread(self.buffer)[None, :], step=epoch) 88 | self.buffer.seek(0) 89 | super(myTensorboard, self).on_epoch_end(epoch, logs) 90 | 91 | 92 | class nce_loss(keras.losses.Loss): 93 | # negative sampling: take one positive label and num_sampled negative labels to compute the loss 94 | # in order to reduce the computation of full softmax 95 | def __init__(self, model, v_dim, emb_dim): 96 | super(nce_loss, self).__init__() 97 | # noise-contrastive estimation 98 | self.nce_w = model.add_weight( 99 | name="nce_w", shape=[v_dim, emb_dim], 100 | initializer=keras.initializers.TruncatedNormal(0., 0.1)) # [n_vocab, emb_dim] 101 | self.nce_b = model.add_weight( 102 | name="nce_b", shape=(v_dim,), 103 | initializer=keras.initializers.Constant(0.1)) # [n_vocab, ] 104 | self.v_dim = v_dim 105 | 106 | def call(self, y_true, y_pred): 107 | # return keras.losses.SparseCategoricalCrossentropy()(y_true,y_pred) 108 | return tf.nn.nce_loss( 109 | weights=self.nce_w, biases=self.nce_b, labels=y_true, 110 | inputs=y_pred, num_sampled=5, num_classes=self.v_dim) 111 | 112 | 113 | if __name__ == "__main__": 114 | d = process_w2v_data(corpus, skip_window=2, method="cbow") 115 | bx, by = d.sample(DataSize) 116 | model = SkipGram(d.num_word, 2) 117 | model.compile(optimizer=keras.optimizers.Adam(Learn_rate), loss=nce_loss(model, d.num_word, 2)) 118 | model.fit(bx, by, Batch_size, Epochs, callbacks=[myTensorboard(d)], verbose=2) 119 | 120 | #use tensorboard --logdir logs --samples_per_plugin=images=255 to show all images 121 | -------------------------------------------------------------------------------- /simple_realize/README.md: -------------------------------------------------------------------------------- 1 | # Simplified implementations of the NLP tutorials 2 | 3 | >This directory holds simplified versions of the NLP course code, written in pure Keras, and every script comes with TensorBoard visualization 4 | 5 | With these simplified scripts you can 6 | * see the implementation details of seq2seq, LuongAttention and the transformer at a glance 7 | * watch the embedding animations 8 | * compare seq2seq with attention against plain seq2seq 9 | * extend the models further 10 | 11 | ----- 12 | 13 | ## Code 14 | 1. Understand Word (W2V) 15 | - [Continuous Bag of Words (CBOW)](#Word2Vec) 16 | - [Skip-Gram](#Word2Vec) 17 | 2. Understand Sentence (Seq2Seq) 18 | - [seq2seq](#Seq2Seq) 19 | - [CNN language model](#Seq2Seq) 20 | 3. All about Attention 21 | - [seq2seq with attention](#Seq2SeqAttention) 22 | - [transformer](#Seq2SeqAttention) 23 | 24 | 25 | ## Word2Vec 26 | * [CBOW](CBOW.py) 27 | * [Skip-Gram](skip-gram.py) 28 | 29 | ![](./imgs/skip-gram.gif) 30 | 31 | ## Seq2Seq 32 | * [seq2seq](seq2seq.py) 33 | * [cnn-lm](cnn-lm.py) 34 | 35 | ![Months and digits cluster together, while symbols spread apart](./imgs/seq2seq-embedding.gif) 36 | 37 | ## Seq2SeqAttention 38 | * [seq2seq_attention](seq2seq_attention.py) 39 | * [transformer](transformer.py) 40 | 41 | 42 | ![](./imgs/attention.gif) -------------------------------------------------------------------------------- /simple_realize/cnn-lm.py: -------------------------------------------------------------------------------- 1 | # [Sequence to Sequence Learning with Neural Networks](https://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf) 2 | ''' 3 | created by YuYang github.com/W1Fl 4 | ''' 5 | import tensorflow as tf 6 | from tensorflow import keras 7 | import utils 8 | 9 | Batch_size = 64 10 | Learn_rate = 0.01 11 | Epochs = 15 12 | DataSize = 1600 13 | 14 | 15 | class Seq2Seq(keras.Model): 16 | def __init__(self, enc_v_dim, dec_v_dim, emb_dim, units, max_pred_len, start_token, end_token): 17 | super().__init__() 18 | self.enc_v_dim = enc_v_dim 19 | self.emb_dim = emb_dim 20 | self.units = units 21 | self.dec_v_dim = dec_v_dim 22 | self.max_pred_len = max_pred_len 23 | self.start_token = start_token 24 | self.end_token = end_token 25 | 26 | def build(self, input_shape): 27 | # encoder 28 | self.enc_embeddings = keras.layers.Embedding( 29 | input_dim=self.enc_v_dim, 30 | output_dim=self.emb_dim, # [enc_n_vocab, emb_dim] 31 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.1), 32 | name='encoder/embeddings' 33 | ) 34 | 35 | 36 | self.conv2ds = [ 37 | keras.layers.Conv2D(16, (n, self.emb_dim), padding="valid", activation=keras.activations.relu) 38 | for n in range(2, 5)] 39 | self.max_pools = [keras.layers.MaxPool2D((n, 1)) for n in [7, 6, 5]] 40 | self.encoder = keras.layers.Dense(self.units, activation=keras.activations.relu) 41 | 42 | 43 | # decoder 44 | self.dec_embeddings = keras.layers.Embedding( 45 | input_dim=self.dec_v_dim, output_dim=self.emb_dim, # [dec_n_vocab, emb_dim] 46 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.1), 47 | name='decoder/embeddings' 48 | 49 | ) 50 | self.dec_embeddings.build((None, self.dec_v_dim)) 51 | self.decoder = keras.layers.LSTM(units=self.units, return_state=True, return_sequences=True, 52 | name='decoder/LSTM') 53 | self.decoder_dense = keras.layers.Dense(self.dec_v_dim, activation=keras.activations.softmax, 54 | name='decoder/Dense') 55 | 56 | self.batch = input_shape[0][0] 57 | super(Seq2Seq, self).build([*input_shape]) 58 | 59 | def encode(self, x): 60 | embedded = self.enc_embeddings(x) # [n, step, emb] 61 | o = tf.expand_dims(embedded, axis=3) # [n, step=8, emb=16, 1] 62 | co = [conv2d(o) for conv2d in self.conv2ds] # [n, 7, 1, 16], [n, 6, 1, 16], [n, 5, 1, 16] 63 | co = [self.max_pools[i](co[i]) for i in range(len(co))] # [n, 1, 1, 16] * 3 64 | co = [tf.squeeze(c, axis=[1, 2]) for c in co] # [n, 16] * 3 65 | o = tf.concat(co, axis=1) # [n, 16*3] 66 | h = self.encoder(o) # [n, units] 67 | return h, h 68 | 69 | def decode(self, batch, h, c, y=None, training=None): 70 | if training: # use the previous step's label as the current step's input (teacher forcing) 71 | y = self.dec_embeddings(y) 72 | y, h, c = self.decoder(y, (h, c)) 73 | y = self.decoder_dense(y) 74 | else: # use the previous step's output as the current step's input 75 | y = [] 76 | o = tf.zeros((batch, 1, self.dec_v_dim)) 77 | for i in
range(self.max_pred_len): 78 | o = o @ self.dec_embeddings.weights 79 | o, h, c = self.decoder(o, (h, c)) 80 | o = self.decoder_dense(o) 81 | y.append(o) 82 | y = tf.concat(y, 1) 83 | return y 84 | 85 | # @tf.function 86 | def call(self, inputs, training=None, mask=None): 87 | x = inputs[0] 88 | y = inputs[1] 89 | if training: 90 | y = tf.pad(y[:, :-1], [[0, 0], [1, 0]]) 91 | h, c = self.encode(x) 92 | batch = tf.shape(x)[0] 93 | y = self.decode(batch, h, c, y, training) 94 | return y 95 | 96 | 97 | class myTensorboard(keras.callbacks.TensorBoard): 98 | def __init__(self, data, log_dir='logs/cnn-lm', histogram_freq=1, write_graph=True, write_images=True, 99 | embeddings_freq=10, **kwargs): 100 | self.data = data 101 | super().__init__(log_dir=log_dir, histogram_freq=histogram_freq, write_graph=write_graph, 102 | write_images=write_images, embeddings_freq=embeddings_freq, **kwargs) 103 | def on_epoch_end(self, epoch, logs=None): 104 | if (not epoch % 1): 105 | x, y, l = self.data.sample(1) 106 | y_ = self.model((x, y), training=False) 107 | y_ = tf.argmax(y_, -1).numpy() 108 | target = self.data.idx2str(y[0]) 109 | res = self.data.idx2str(y_[0]) 110 | src = self.data.idx2str(x[0]) 111 | print( 112 | '\n', 113 | "t: ", epoch, 114 | "| input: ", src, 115 | "| target: ", target, 116 | "| inference: ", res, 117 | ) 118 | super(myTensorboard, self).on_epoch_end(epoch, logs) 119 | 120 | 121 | def train(): 122 | # get and process data 123 | data = utils.DateData(DataSize) 124 | train_x, train_y, train_l = data.sample(DataSize) 125 | 126 | print("Chinese time order: yy/mm/dd ", data.date_cn[:3], "\nEnglish time order: dd/M/yyyy ", data.date_en[:3]) 127 | print("vocabularies: ", data.vocab) 128 | print("x index sample: \n{}\n{}".format(data.idx2str(data.x[0]), data.x[0]), 129 | "\ny index sample: \n{}\n{}".format(data.idx2str(data.y[0]), data.y[0])) 130 | 131 | model = Seq2Seq( 132 | data.num_word, data.num_word, emb_dim=16, units=32, 133 | max_pred_len=11, start_token=data.start_token, end_token=data.end_token) 134 | model.compile(optimizer=keras.optimizers.Adam(Learn_rate), loss=keras.losses.SparseCategoricalCrossentropy(False), 135 | metrics=[keras.metrics.sparse_categorical_accuracy]) 136 | model.fit((train_x, train_y), train_y, callbacks=[myTensorboard(data)], batch_size=Batch_size, epochs=Epochs) 137 | 138 | 139 | if __name__ == "__main__": 140 | train() 141 | -------------------------------------------------------------------------------- /simple_realize/imgs/attention.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MorvanZhou/NLP-Tutorials/3aa02a13151b696e8c034da5f7bd2ef4294a5e5f/simple_realize/imgs/attention.gif -------------------------------------------------------------------------------- /simple_realize/imgs/seq2seq-embedding.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MorvanZhou/NLP-Tutorials/3aa02a13151b696e8c034da5f7bd2ef4294a5e5f/simple_realize/imgs/seq2seq-embedding.gif -------------------------------------------------------------------------------- /simple_realize/imgs/skip-gram.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MorvanZhou/NLP-Tutorials/3aa02a13151b696e8c034da5f7bd2ef4294a5e5f/simple_realize/imgs/skip-gram.gif -------------------------------------------------------------------------------- /simple_realize/seq2seq.py: 
-------------------------------------------------------------------------------- 1 | # [Sequence to Sequence Learning with Neural Networks](https://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf) 2 | ''' 3 | created by YuYang github.com/W1Fl 4 | ''' 5 | import tensorflow as tf 6 | from tensorflow import keras 7 | import utils 8 | 9 | Batch_size = 64 10 | Learn_rate = 0.01 11 | Epochs = 15 12 | DataSize = 8192 13 | 14 | 15 | class Seq2Seq(keras.Model): 16 | def __init__(self, enc_v_dim, dec_v_dim, emb_dim, units, max_pred_len, start_token, end_token): 17 | super().__init__() 18 | self.enc_v_dim = enc_v_dim 19 | self.emb_dim = emb_dim 20 | self.units = units 21 | self.dec_v_dim = dec_v_dim 22 | self.max_pred_len = max_pred_len 23 | self.start_token = start_token 24 | self.end_token = end_token 25 | 26 | def build(self, input_shape): 27 | # encoder 28 | self.enc_embeddings = keras.layers.Embedding( 29 | input_dim=self.enc_v_dim, 30 | output_dim=self.emb_dim, # [enc_n_vocab, emb_dim] 31 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.1), 32 | name='encoder/embeddings' 33 | ) 34 | self.encoder = keras.layers.LSTM(units=self.units, return_state=True, name='encoder/LSTM') 35 | 36 | # decoder 37 | self.dec_embeddings = keras.layers.Embedding( 38 | input_dim=self.dec_v_dim, output_dim=self.emb_dim, # [dec_n_vocab, emb_dim] 39 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.1), 40 | name='decoder/embeddings' 41 | 42 | ) 43 | self.dec_embeddings.build((None, self.dec_v_dim)) 44 | self.decoder = keras.layers.LSTM(units=self.units, return_state=True, return_sequences=True, 45 | name='decoder/LSTM') 46 | self.decoder_dense = keras.layers.Dense(self.dec_v_dim, activation=keras.activations.softmax, 47 | name='decoder/Dense') 48 | 49 | self.batch = input_shape[0][0] 50 | super(Seq2Seq, self).build([*input_shape]) 51 | 52 | def encode(self, x): 53 | embedded = self.enc_embeddings(x) 54 | o, h, c = self.encoder(embedded) 55 | return h, c 56 | 57 | def decode(self, batch, h, c, y=None, training=None): 58 | if training: #将上一时刻的标签作为当前时刻的输入 59 | y = self.dec_embeddings(y) 60 | y, h, c = self.decoder(y, (h, c)) 61 | y = self.decoder_dense(y) 62 | else:#将上一时刻的输出作为当前时刻的输入 63 | y = [] 64 | o = tf.zeros((batch, 1, self.dec_v_dim)) 65 | for i in range(self.max_pred_len): 66 | o = o @ self.dec_embeddings.weights 67 | o, h, c = self.decoder(o, (h, c)) 68 | o = self.decoder_dense(o) 69 | y.append(o) 70 | y = tf.concat(y, 1) 71 | return y 72 | 73 | # @tf.function 74 | def call(self, inputs, training=None, mask=None): 75 | x = inputs[0] 76 | y = inputs[1] 77 | if training: 78 | y = tf.pad(y[:, :-1], [[0, 0], [1, 0]]) 79 | h, c = self.encode(x) 80 | batch = tf.shape(x)[0] 81 | y = self.decode(batch, h, c, y, training) 82 | return y 83 | 84 | 85 | class myTensorboard(keras.callbacks.TensorBoard): 86 | def __init__(self, data, log_dir='logs/seq2seq', histogram_freq=1, write_graph=True, write_images=True, 87 | embeddings_freq=10, **kwargs): 88 | self.data = data 89 | super().__init__(log_dir=log_dir, histogram_freq=histogram_freq, write_graph=write_graph, 90 | write_images=write_images, embeddings_freq=embeddings_freq, **kwargs) 91 | def on_epoch_end(self, epoch, logs=None): 92 | if (not epoch % 1): 93 | x, y, l = self.data.sample(1) 94 | y_ = self.model((x, y), training=False) 95 | y_ = tf.argmax(y_, -1).numpy() 96 | target = self.data.idx2str(y[0]) 97 | res = self.data.idx2str(y_[0]) 98 | src = self.data.idx2str(x[0]) 99 | print( 100 | '\n', 101 | "t: ", epoch, 102 | 
"| input: ", src, 103 | "| target: ", target, 104 | "| inference: ", res, 105 | ) 106 | super(myTensorboard, self).on_epoch_end(epoch, logs) 107 | 108 | 109 | def train(): 110 | # get and process data 111 | data = utils.DateData(DataSize) 112 | train_x, train_y, train_l = data.sample(DataSize) 113 | 114 | print("Chinese time order: yy/mm/dd ", data.date_cn[:3], "\nEnglish time order: dd/M/yyyy ", data.date_en[:3]) 115 | print("vocabularies: ", data.vocab) 116 | print("x index sample: \n{}\n{}".format(data.idx2str(data.x[0]), data.x[0]), 117 | "\ny index sample: \n{}\n{}".format(data.idx2str(data.y[0]), data.y[0])) 118 | 119 | model = Seq2Seq( 120 | data.num_word, data.num_word, emb_dim=16, units=32, 121 | max_pred_len=11, start_token=data.start_token, end_token=data.end_token) 122 | model.compile(optimizer=keras.optimizers.Adam(Learn_rate), loss=keras.losses.SparseCategoricalCrossentropy(False), 123 | metrics=[keras.metrics.sparse_categorical_accuracy]) 124 | model.fit((train_x, train_y), train_y, callbacks=[myTensorboard(data)], batch_size=Batch_size, epochs=Epochs) 125 | 126 | 127 | if __name__ == "__main__": 128 | train() 129 | -------------------------------------------------------------------------------- /simple_realize/seq2seq_attention.py: -------------------------------------------------------------------------------- 1 | # [Effective Approaches to Attention-based Neural Machine Translation](https://arxiv.org/pdf/1508.04025.pdf) 2 | ''' 3 | created by YuYang github.com/W1Fl 4 | ''' 5 | import tensorflow as tf 6 | from tensorflow import keras 7 | import utils 8 | 9 | Batch_size = 64 10 | Learn_rate = 0.01 11 | Epochs = 15 12 | DataSize = 8192 13 | use_attention=True 14 | 15 | 16 | 17 | class Attention(keras.layers.Layer): 18 | def __init__(self,dec_v_dim,**kwargs): 19 | super(Attention, self).__init__(**kwargs) 20 | self.dec_v_dim=dec_v_dim 21 | 22 | def build(self, input_shape): 23 | hs_shape,ht_shape=input_shape 24 | units=hs_shape[2] 25 | super(Attention, self).build(input_shape) 26 | self.Wa=self.add_weight('Wa',[units,units],tf.float32,keras.initializers.RandomNormal(),trainable=True) 27 | self.Wc=self.add_weight('Wc',[units*2,units],tf.float32,keras.initializers.RandomNormal(),trainable=True) 28 | self.dense=keras.layers.Dense(self.dec_v_dim,activation=keras.activations.softmax) 29 | 30 | def call(self,inputs, **kwargs): 31 | hs,ht=inputs #encoder输出序列[banch,enc_len,units],decoder输出[b,dec_len,units] 32 | # dec_len = 1 if not training 33 | score=ht@self.Wa@tf.transpose(hs,[0,2,1])#[banch,1,enc_len] 34 | at=tf.nn.softmax(score,name='attentionValue') 35 | ct=at@hs 36 | ht_=tf.nn.tanh(tf.concat([ct,ht],2)@self.Wc) 37 | ht_=ht_ if use_attention else ht 38 | y=self.dense(ht_) #ht_.shape==ht.shape 39 | return y,at 40 | 41 | 42 | 43 | class Seq2Seq(keras.Model): 44 | def __init__(self, enc_v_dim, dec_v_dim, emb_dim, units, max_pred_len, start_token, end_token): 45 | super().__init__() 46 | self.enc_v_dim = enc_v_dim 47 | self.emb_dim = emb_dim 48 | self.units = units 49 | self.dec_v_dim = dec_v_dim 50 | self.max_pred_len = max_pred_len 51 | self.start_token = start_token 52 | self.end_token = end_token 53 | 54 | def build(self, input_shape): 55 | # encoder 56 | self.enc_embeddings = keras.layers.Embedding( 57 | input_dim=self.enc_v_dim, 58 | output_dim=self.emb_dim, # [enc_n_vocab, emb_dim] 59 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.1), 60 | name='encoder/embeddings' 61 | ) 62 | self.encoder = keras.layers.LSTM(units=self.units, 
return_state=True,return_sequences=True, name='encoder/LSTM') 63 | 64 | # decoder 65 | self.dec_embeddings = keras.layers.Embedding( 66 | input_dim=self.dec_v_dim, output_dim=self.emb_dim, # [dec_n_vocab, emb_dim] 67 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.1), 68 | name='decoder/embeddings' 69 | 70 | ) 71 | self.dec_embeddings.build((None, self.dec_v_dim)) 72 | self.decoder = keras.layers.LSTM(units=self.units, return_state=True, return_sequences=True, 73 | name='decoder/LSTM') 74 | self.decoder_dense = keras.layers.Dense(self.dec_v_dim, activation=keras.activations.softmax, 75 | name='decoder/Dense') 76 | self.attention=Attention(self.dec_v_dim) 77 | self.batch = input_shape[0][0] 78 | super(Seq2Seq, self).build([*input_shape]) 79 | 80 | def encode(self, x): 81 | embedded = self.enc_embeddings(x) 82 | o, h, c = self.encoder(embedded) 83 | return o,h, c 84 | 85 | def decode(self, batch,enc_o, h, c, y=None, training=None): 86 | if training: #将上一时刻的标签作为当前时刻的输入 87 | y = self.dec_embeddings(y) 88 | y, h, c = self.decoder(y, (h, c)) 89 | y,at = self.attention((enc_o,y)) 90 | else:#将上一时刻的输出作为当前时刻的输入 91 | y = [] 92 | o = tf.zeros((batch, 1, self.dec_v_dim)) 93 | for i in range(self.max_pred_len): 94 | o = o @ self.dec_embeddings.weights 95 | o, h, c = self.decoder(o, (h, c)) 96 | o,at = self.attention((enc_o,o)) 97 | y.append(o) 98 | y = tf.concat(y, 1) 99 | return y 100 | 101 | # @tf.function 102 | def call(self, inputs, training=None, mask=None): 103 | x = inputs[0] 104 | y = inputs[1] 105 | if training: 106 | y = tf.pad(y[:, :-1], [[0, 0], [1, 0]]) 107 | o,h, c = self.encode(x) 108 | batch = tf.shape(x)[0] 109 | y = self.decode(batch,o, h, c, y, training) 110 | return y 111 | 112 | 113 | class myTensorboard(keras.callbacks.TensorBoard): 114 | def __init__(self, data, log_dir='logs/seq2seq_attention', histogram_freq=1, write_graph=True, write_images=True, 115 | embeddings_freq=10, **kwargs): 116 | self.data = data 117 | super().__init__(log_dir=log_dir, histogram_freq=histogram_freq, write_graph=write_graph, 118 | write_images=write_images, embeddings_freq=embeddings_freq, **kwargs) 119 | def on_epoch_end(self, epoch, logs=None): 120 | if (not epoch % 1): 121 | x, y, l = self.data.sample(1) 122 | y_ = self.model((x, y), training=False) 123 | y_ = tf.argmax(y_, -1).numpy() 124 | target = self.data.idx2str(y[0]) 125 | res = self.data.idx2str(y_[0]) 126 | src = self.data.idx2str(x[0]) 127 | print( 128 | '\n', 129 | "t: ", epoch, 130 | "| input: ", src, 131 | "| target: ", target, 132 | "| inference: ", res, 133 | ) 134 | super(myTensorboard, self).on_epoch_end(epoch, logs) 135 | 136 | 137 | def train(): 138 | # get and process data 139 | data = utils.DateData(DataSize) 140 | train_x, train_y, train_l = data.sample(DataSize) 141 | 142 | print("Chinese time order: yy/mm/dd ", data.date_cn[:3], "\nEnglish time order: dd/M/yyyy ", data.date_en[:3]) 143 | print("vocabularies: ", data.vocab) 144 | print("x index sample: \n{}\n{}".format(data.idx2str(data.x[0]), data.x[0]), 145 | "\ny index sample: \n{}\n{}".format(data.idx2str(data.y[0]), data.y[0])) 146 | 147 | model = Seq2Seq( 148 | data.num_word, data.num_word, emb_dim=16, units=32, 149 | max_pred_len=11, start_token=data.start_token, end_token=data.end_token) 150 | model.compile(optimizer=keras.optimizers.Adam(Learn_rate), loss=keras.losses.SparseCategoricalCrossentropy(False), 151 | metrics=[keras.metrics.sparse_categorical_accuracy]) 152 | model.fit((train_x, train_y), train_y, callbacks=[myTensorboard(data)], 
batch_size=Batch_size, epochs=Epochs) 153 | 154 | 155 | if __name__ == "__main__": 156 | train() -------------------------------------------------------------------------------- /simple_realize/skip-gram.py: -------------------------------------------------------------------------------- 1 | # [Efficient Estimation of Word Representations in Vector Space](https://arxiv.org/pdf/1301.3781.pdf) 2 | ''' 3 | created by YuYang github.com/W1Fl 4 | ''' 5 | from io import BytesIO 6 | 7 | import imageio 8 | import matplotlib.pyplot as plt 9 | import tensorflow as tf 10 | from tensorflow import keras 11 | 12 | from utils import process_w2v_data 13 | 14 | Batch_size = 32 15 | Learn_rate = 0.01 16 | Epochs = 256 17 | DataSize = 512 18 | 19 | corpus = [ 20 | # numbers 21 | "5 2 4 8 6 2 3 6 4", 22 | "4 8 5 6 9 5 5 6", 23 | "1 1 5 2 3 3 8", 24 | "3 6 9 6 8 7 4 6 3", 25 | "8 9 9 6 1 4 3 4", 26 | "1 0 2 0 2 1 3 3 3 3 3", 27 | "9 3 3 0 1 4 7 8", 28 | "9 9 8 5 6 7 1 2 3 0 1 0", 29 | 30 | # alphabets, expecting that 9 is close to letters 31 | "a t g q e h 9 u f", 32 | "e q y u o i p s", 33 | "q o 9 p l k j o k k o p", 34 | "h g y i u t t a e q", 35 | "i k d q r e 9 e a d", 36 | "o p d g 9 s a f g a", 37 | "i u y g h k l a s w", 38 | "o l u y a o g f s", 39 | "o p i u y g d a s j d l", 40 | "u k i l o 9 l j s", 41 | "y g i s h k j l f r f", 42 | "i o h n 9 9 d 9 f a 9", 43 | ] 44 | 45 | SkipGram = lambda v_dim, emb_dim: keras.Sequential([ 46 | keras.layers.Embedding( 47 | input_dim=v_dim, output_dim=emb_dim, # [n_vocab, emb_dim] 48 | embeddings_initializer=keras.initializers.RandomNormal(0., 0.1), 49 | ), 50 | keras.layers.Flatten() 51 | ]) 52 | 53 | 54 | class myTensorboard(keras.callbacks.TensorBoard): 55 | def __init__(self, data, log_dir='logs/skip-gram', histogram_freq=1, write_graph=True, write_images=True, 56 | embeddings_freq=10, **kwargs): 57 | super().__init__(log_dir=log_dir, histogram_freq=histogram_freq, write_graph=write_graph, 58 | write_images=write_images, embeddings_freq=embeddings_freq, **kwargs) 59 | self.buffer = BytesIO() 60 | self.data = data 61 | 62 | 63 | def plot(self, data): 64 | word_emb = model.layers[0].get_weights()[0] 65 | for i in range(data.num_word): 66 | c = "blue" 67 | try: 68 | int(data.i2v[i]) 69 | except ValueError: 70 | c = "red" 71 | plt.text(word_emb[i, 0], word_emb[i, 1], s=data.i2v[i], color=c, weight="bold") 72 | plt.xlim(word_emb[:, 0].min() - .5, word_emb[:, 0].max() + .5) 73 | plt.ylim(word_emb[:, 1].min() - .5, word_emb[:, 1].max() + .5) 74 | plt.xticks(()) 75 | plt.yticks(()) 76 | plt.xlabel("embedding dim1") 77 | plt.ylabel("embedding dim2") 78 | plt.savefig(self.buffer, format='png') 79 | plt.close() 80 | self.buffer.seek(0) 81 | 82 | def on_epoch_end(self, epoch, logs=None): 83 | writer = self._get_writer(self._train_run_name) 84 | if (not epoch % 1): 85 | self.plot(self.data) 86 | with writer.as_default(): 87 | tf.summary.image('embedding', imageio.imread(self.buffer)[None, :], step=epoch) 88 | self.buffer.seek(0) 89 | super(myTensorboard, self).on_epoch_end(epoch, logs) 90 | 91 | 92 | class nce_loss(keras.losses.Loss): 93 | # negative sampling: take one positive label and num_sampled negative labels to compute the loss 94 | # in order to reduce the computation of full softmax 95 | def __init__(self, model, v_dim, emb_dim): 96 | super(nce_loss, self).__init__() 97 | # noise-contrastive estimation 98 | self.nce_w = model.add_weight( 99 | name="nce_w", shape=[v_dim, emb_dim], 100 | initializer=keras.initializers.TruncatedNormal(0., 0.1)) # [n_vocab, emb_dim] 
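# ---- illustrative sketch (standalone; not part of simple_realize/skip-gram.py) ----
# The "nce_w" matrix above and the "nce_b" bias created just below are the output-side
# parameters that tf.nn.nce_loss uses in place of a full softmax: each (centre, context)
# pair is scored against the true context word plus only num_sampled randomly drawn
# negative words. A minimal sketch of that call with made-up toy sizes (v_dim=10,
# emb_dim=2, a batch of 4 centre-word embeddings); all variable names here are
# assumptions for illustration only.
import tensorflow as tf
v_dim, emb_dim = 10, 2
nce_w = tf.random.truncated_normal([v_dim, emb_dim], stddev=0.1)  # per-word output vectors
nce_b = tf.fill([v_dim], 0.1)                                     # per-word output biases
embedded = tf.random.normal([4, emb_dim])                         # centre-word embeddings
labels = tf.constant([[3], [1], [7], [2]], dtype=tf.int64)        # true context word ids
loss = tf.reduce_mean(tf.nn.nce_loss(
    weights=nce_w, biases=nce_b, labels=labels,
    inputs=embedded, num_sampled=5, num_classes=v_dim))
print(float(loss))  # scalar NCE loss for this toy batch
# ------------------------------------------------------------------------------------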
101 | self.nce_b = model.add_weight( 102 | name="nce_b", shape=(v_dim,), 103 | initializer=keras.initializers.Constant(0.1)) # [n_vocab, ] 104 | self.v_dim = v_dim 105 | 106 | def call(self, y_true, y_pred): 107 | # return keras.losses.SparseCategoricalCrossentropy()(y_true,y_pred) 108 | return tf.nn.nce_loss( 109 | weights=self.nce_w, biases=self.nce_b, labels=y_true, 110 | inputs=y_pred, num_sampled=5, num_classes=self.v_dim) 111 | 112 | 113 | if __name__ == "__main__": 114 | d = process_w2v_data(corpus, skip_window=2, method="skip_gram") 115 | bx, by = d.sample(DataSize) 116 | model = SkipGram(d.num_word, 2) 117 | model.compile(optimizer=keras.optimizers.Adam(Learn_rate), loss=nce_loss(model, d.num_word, 2)) 118 | model.fit(bx[..., None], by[..., None], Batch_size, Epochs, callbacks=[myTensorboard(d)], verbose=2) 119 | 120 | #use tensorboard --logdir logs --samples_per_plugin=images=255 to show all images 121 | -------------------------------------------------------------------------------- /simple_realize/transformer.py: -------------------------------------------------------------------------------- 1 | # [Attention Is All You Need](https://arxiv.org/pdf/1706.03762.pdf) 2 | ''' 3 | created by YuYang github.com/W1Fl 4 | ''' 5 | import numpy as np 6 | import tensorflow as tf 7 | from tensorflow import keras 8 | 9 | import utils 10 | 11 | MODEL_DIM = 32 12 | MAX_LEN = 12 13 | N_LAYER = 3 14 | N_HEAD = 4 15 | DATA_SIZE = 6400 16 | BATCH_SIZE = 64 17 | LEARN_RATE = 0.001 18 | EPOCHS = 60 19 | 20 | 21 | class MultiHead(keras.layers.Layer): 22 | def __init__(self, n_head): 23 | super().__init__() 24 | self.n_head = n_head 25 | self.attention = None 26 | 27 | def build(self, input_shape): 28 | (q_b, q_t, q_f), (k_b, k_t, k_f), (v_b, v_t, v_f) = input_shape 29 | self.k_f = tf.cast(q_f, tf.float32) 30 | h_dim = q_f // self.n_head 31 | self.wq = self.add_weight('wq', [self.n_head, q_f, h_dim]) 32 | self.wk = self.add_weight('wk', [self.n_head, k_f, h_dim]) 33 | self.wv = self.add_weight('wv', [self.n_head, v_f, h_dim]) 34 | self.wo = self.add_weight('wo', [self.n_head * h_dim, v_f]) 35 | super(MultiHead, self).build(input_shape) 36 | 37 | def call(self, inputs, mask=None, **kwargs): 38 | i_q, i_k, i_v = [i[:, tf.newaxis, ...] 
for i in inputs] # add multihead axis 39 | q = i_q @ self.wq # [b,h,s,h_dim] 40 | k = i_k @ self.wk 41 | v = i_v @ self.wv 42 | s = q @ tf.transpose(k, [0, 1, 3, 2]) / (tf.math.sqrt(self.k_f) + 1e-8) 43 | if mask is not None: 44 | s += mask * -1e9 45 | a = tf.nn.softmax(s) 46 | self.attention = a 47 | b = a @ v 48 | o = tf.concat(tf.unstack(b, axis=1), 2) @ self.wo 49 | return o 50 | 51 | 52 | class PositionWiseFFN(keras.layers.Layer): 53 | def build(self, input_shape): 54 | model_dim = input_shape[-1] 55 | dff = model_dim * 4 56 | self.l = keras.layers.Dense(dff, activation=keras.activations.relu) 57 | self.o = keras.layers.Dense(model_dim) 58 | super(PositionWiseFFN, self).build(input_shape) 59 | 60 | def call(self, x, **kwargs): 61 | o = self.l(x) 62 | o = self.o(o) 63 | return o # [n, step, dim] 64 | 65 | 66 | class EncodeLayer(keras.layers.Layer): 67 | def __init__(self, n_head): 68 | self.n_head = n_head 69 | super().__init__() 70 | 71 | def build(self, input_shape): 72 | model_dim = input_shape[-1] 73 | self.ln = [keras.layers.LayerNormalization() for _ in range(2)] 74 | self.mh = MultiHead(self.n_head) 75 | self.ffn = PositionWiseFFN(model_dim) 76 | super(EncodeLayer, self).build(input_shape) 77 | 78 | def call(self, inputs, mask=None, **kwargs): 79 | attn = self.mh([inputs] * 3, mask) # [n, step, dim] 80 | o1 = self.ln[0](attn + inputs) 81 | ffn = self.ffn(o1) 82 | o = self.ln[1](ffn + o1) # [n, step, dim] 83 | return o 84 | 85 | 86 | class Encoder(keras.layers.Layer): 87 | def __init__(self, n_head, n_layer): 88 | super().__init__() 89 | self.n_layer = n_layer 90 | self.n_head = n_head 91 | 92 | def build(self, input_shape): 93 | self.ls = [EncodeLayer(self.n_head) for _ in range(self.n_layer)] 94 | super(Encoder, self).build(input_shape) 95 | 96 | def call(self, inputs, mask=None, **kwargs): 97 | xz = inputs 98 | for l in self.ls: 99 | xz = l(xz, mask) 100 | return xz # [n, step, dim] 101 | 102 | 103 | class DecoderLayer(keras.layers.Layer): 104 | def __init__(self, n_head): 105 | super().__init__() 106 | self.n_head = n_head 107 | 108 | def build(self, input_shape): 109 | self.mh = [MultiHead(self.n_head) for _ in range(2)] 110 | self.ffn = PositionWiseFFN(input_shape[-1]) 111 | self.ln = [keras.layers.LayerNormalization() for i in range(3)] 112 | super(DecoderLayer, self).build(input_shape) 113 | 114 | def call(self, inputs, look_ahead_mask=None, pad_mask=None, **kwargs): 115 | xz, yz = inputs 116 | attn = self.mh[0]([yz] * 3, mask=look_ahead_mask) # decoder self attention 117 | o1 = self.ln[0](attn + yz) 118 | attn = self.mh[1]([o1, xz, xz], mask=pad_mask) # decoder + encoder attention 119 | o2 = self.ln[1](attn + o1) 120 | ffn = self.ffn(o2) 121 | o = self.ln[2](ffn + o2) 122 | return o 123 | 124 | 125 | class Decoder(keras.layers.Layer): 126 | def __init__(self, n_head, n_layer): 127 | super().__init__() 128 | self.n_head = n_head 129 | self.n_layer = n_layer 130 | 131 | def build(self, input_shape): 132 | self.ls = [DecoderLayer(self.n_head) for _ in range(self.n_layer)] 133 | super(Decoder, self).build(input_shape) 134 | 135 | def call(self, inputs, look_ahead_mask=None, pad_mask=None): 136 | xz, yz = inputs 137 | for l in self.ls: 138 | yz = l((xz, yz), look_ahead_mask, pad_mask) 139 | return yz 140 | 141 | 142 | class PositionEmbedding(keras.layers.Layer): 143 | def __init__(self, max_len, model_dim, n_vocab): 144 | super().__init__() 145 | self.n_vocab = n_vocab 146 | self.max_len = max_len 147 | self.model_dim = model_dim 148 | 149 | def build(self, input_shape): 150 
| pos = np.arange(self.max_len)[:, None] 151 | pe = pos / np.power(10000, 2. * np.arange(self.model_dim)[None, :] / self.model_dim) # [max_len, dim] 152 | pe[:, 0::2] = np.sin(pe[:, 0::2]) 153 | pe[:, 1::2] = np.cos(pe[:, 1::2]) 154 | pe = pe[None, :, :] # [1, max_len, model_dim] for batch adding 155 | self.pe = tf.constant(pe, dtype=tf.float32) 156 | self.embeddings = keras.layers.Embedding( 157 | input_dim=self.n_vocab, output_dim=self.model_dim, # [n_vocab, dim] 158 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.01), 159 | ) 160 | super(PositionEmbedding, self).build(input_shape) 161 | 162 | def call(self, x, **kwargs): 163 | x_embed = self.embeddings(x) + self.pe # [n, step, dim] 164 | return x_embed 165 | 166 | 167 | class Transformer(keras.Model): 168 | def __init__(self, model_dim, max_len, n_encoder_layer, n_decoder_layer, n_head, n_vocab, padding_idx=0): 169 | super().__init__() 170 | self.n_vocab = n_vocab 171 | self.n_decoder_layer = n_decoder_layer 172 | self.n_encoder_layer = n_encoder_layer 173 | self.n_head = n_head 174 | self.model_dim = model_dim 175 | self.max_len = max_len 176 | self.padding_idx = padding_idx 177 | 178 | def build(self, input_shape): 179 | self.embed = PositionEmbedding(self.max_len, self.model_dim, self.n_vocab) 180 | self.encoder = Encoder(self.n_head, self.n_encoder_layer) 181 | self.decoder = Decoder(self.n_head, self.n_decoder_layer) 182 | self.o = keras.layers.Dense(self.n_vocab) 183 | super(Transformer, self).build(input_shape) 184 | 185 | def call(self, inputs, training=None, **kwargs): 186 | x, y = inputs 187 | x_embed, y_embed = self.embed(x), self.embed(y) 188 | pad_mask = self._pad_mask(x) 189 | encoded_z = self.encoder(x_embed, mask=pad_mask) 190 | decoded_z = self.decoder( 191 | (encoded_z, y_embed), look_ahead_mask=self._look_ahead_mask(y), pad_mask=pad_mask) 192 | o = self.o(decoded_z) 193 | return o 194 | 195 | def _pad_mask(self, seqs): 196 | mask = tf.cast(tf.math.equal(seqs, self.padding_idx), tf.float32) 197 | return mask[:, tf.newaxis, tf.newaxis, :] # (n, 1, 1, step) 198 | 199 | def _look_ahead_mask(self, seqs): 200 | mask = 1. 
- tf.linalg.band_part(tf.ones((self.max_len, self.max_len)), -1, 0) 201 | pad_mask = self._pad_mask(seqs) 202 | mask = tf.sign(pad_mask + mask[tf.newaxis, tf.newaxis, ...]) 203 | return mask # (step, step) 204 | 205 | def translate(self, src, i2v, v2i): 206 | src = tf.reshape(src, (-1, src.shape[-1])) 207 | src_pad = utils.pad_zero(src, self.max_len) 208 | tgt = utils.pad_zero(v2i[""] * tf.ones_like(src), self.max_len + 1) 209 | tgti = 0 210 | x_embed = self.embed(src_pad) 211 | encoded_z = self.encoder(x_embed, mask=self._pad_mask(src_pad)) 212 | while True: 213 | y = tgt[:, :-1] 214 | y_embed = self.embed(y) 215 | decoded_z = self.decoder( 216 | (encoded_z, y_embed), look_ahead_mask=self._look_ahead_mask(y), pad_mask=self._pad_mask(src_pad)) 217 | logit = self.o(decoded_z)[:, tgti, :].numpy() 218 | idx = np.argmax(logit, 1) 219 | tgti += 1 220 | tgt[:, tgti] = idx 221 | if tgti >= self.max_len: 222 | break 223 | return ["".join([i2v[i] for i in tgt[j, 1:tgti]]) for j in range(len(src))] 224 | 225 | 226 | class Loss(keras.losses.Loss): 227 | def __init__(self, padding_idx=0): 228 | super().__init__() 229 | self.padding_idx = padding_idx 230 | self.crossentropy = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none") 231 | 232 | def call(self, y_true, y_pred): 233 | y_true = tf.reshape(y_true, [tf.shape(y_true)[0], tf.shape(y_true)[1]]) 234 | pad_mask = tf.math.not_equal(y_true, self.padding_idx) 235 | loss = tf.reduce_mean(tf.boolean_mask(self.crossentropy(y_true, y_pred), pad_mask)) 236 | return loss 237 | 238 | 239 | class myTensorboard(keras.callbacks.TensorBoard): 240 | def __init__(self, data, log_dir='logs/transformer', histogram_freq=1, write_graph=True, write_images=True, 241 | embeddings_freq=10, **kwargs): 242 | self.data = data 243 | super().__init__(log_dir=log_dir, histogram_freq=histogram_freq, write_graph=write_graph, 244 | write_images=write_images, embeddings_freq=embeddings_freq, **kwargs) 245 | 246 | def on_epoch_end(self, epoch, logs=None): 247 | idx2str=lambda idx:[self.data.idx2str(i) for i in idx] 248 | if (not epoch % 1): 249 | (x, y), _ = load_data(self.data,3) 250 | res = self.model.translate(x, self.data.i2v, self.data.v2i) 251 | target =idx2str(y) 252 | src = idx2str(x) 253 | print( 254 | '\n', 255 | "| input: ", *src,'\n', 256 | "| target: ",*target,'\n', 257 | "| inference: ", *res,'\n', 258 | ) 259 | super(myTensorboard, self).on_epoch_end(epoch, logs) 260 | 261 | 262 | def load_data(data,size): 263 | x, y, seq_len = data.sample(size) 264 | x = utils.pad_zero(x, MAX_LEN) 265 | y = utils.pad_zero(y, MAX_LEN + 1) 266 | return (x, y[:, :-1]), y[:, 1:] 267 | 268 | 269 | def train(model: Transformer, data): 270 | x, y = load_data(data,DATA_SIZE) 271 | tb = myTensorboard(data) 272 | model.compile(keras.optimizers.Adam(LEARN_RATE), loss=Loss()) 273 | model.fit(x, y, batch_size=BATCH_SIZE, epochs=EPOCHS, callbacks=[tb]) 274 | 275 | 276 | if __name__ == "__main__": 277 | d = utils.DateData(DATA_SIZE) 278 | print("Chinese time order: yy/mm/dd ", d.date_cn[:3], "\nEnglish time order: dd/M/yyyy ", d.date_en[:3]) 279 | print("vocabularies: ", d.vocab) 280 | print("x index sample: \n{}\n{}".format(d.idx2str(d.x[0]), d.x[0]), 281 | "\ny index sample: \n{}\n{}".format(d.idx2str(d.y[0]), d.y[0])) 282 | m = Transformer(MODEL_DIM, MAX_LEN, N_LAYER, N_LAYER, N_HEAD, d.num_word) 283 | m.build([[None, 12], [None, 12]]) 284 | train(m, d) 285 | -------------------------------------------------------------------------------- /skip-gram.py: 
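# ---- illustrative sketch (standalone; refers to simple_realize/transformer.py just above) ----
# _look_ahead_mask in the Transformer above merges a strictly upper-triangular causal mask
# with the padding mask, so a decoder position t can attend only to positions <= t and never
# to padding. A minimal check with an assumed max_len=5 and one sequence that carries two
# trailing padding zeros (padding_idx=0):
import tensorflow as tf
max_len = 5
seqs = tf.constant([[4, 7, 2, 0, 0]])                                   # 0 = padding id
causal = 1. - tf.linalg.band_part(tf.ones((max_len, max_len)), -1, 0)   # 1 above the diagonal
pad = tf.cast(tf.math.equal(seqs, 0), tf.float32)[:, tf.newaxis, tf.newaxis, :]
mask = tf.sign(pad + causal[tf.newaxis, tf.newaxis, ...])               # 1 where attention is blocked
print(mask[0, 0])  # row t: zeros up to column t, ones for future positions and for padding
# ---------------------------------------------------------------------------------------------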
-------------------------------------------------------------------------------- 1 | # [Efficient Estimation of Word Representations in Vector Space](https://arxiv.org/pdf/1301.3781.pdf) 2 | import tensorflow as tf 3 | from tensorflow import keras 4 | from utils import process_w2v_data # this refers to utils.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/) 5 | from visual import show_w2v_word_embedding # this refers to visual.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/) 6 | 7 | corpus = [ 8 | # numbers 9 | "5 2 4 8 6 2 3 6 4", 10 | "4 8 5 6 9 5 5 6", 11 | "1 1 5 2 3 3 8", 12 | "3 6 9 6 8 7 4 6 3", 13 | "8 9 9 6 1 4 3 4", 14 | "1 0 2 0 2 1 3 3 3 3 3", 15 | "9 3 3 0 1 4 7 8", 16 | "9 9 8 5 6 7 1 2 3 0 1 0", 17 | 18 | # alphabets, expecting that 9 is close to letters 19 | "a t g q e h 9 u f", 20 | "e q y u o i p s", 21 | "q o 9 p l k j o k k o p", 22 | "h g y i u t t a e q", 23 | "i k d q r e 9 e a d", 24 | "o p d g 9 s a f g a", 25 | "i u y g h k l a s w", 26 | "o l u y a o g f s", 27 | "o p i u y g d a s j d l", 28 | "u k i l o 9 l j s", 29 | "y g i s h k j l f r f", 30 | "i o h n 9 9 d 9 f a 9", 31 | ] 32 | 33 | 34 | class SkipGram(keras.Model): 35 | def __init__(self, v_dim, emb_dim): 36 | super().__init__() 37 | self.v_dim = v_dim 38 | self.embeddings = keras.layers.Embedding( 39 | input_dim=v_dim, output_dim=emb_dim, # [n_vocab, emb_dim] 40 | embeddings_initializer=keras.initializers.RandomNormal(0., 0.1), 41 | ) 42 | 43 | # noise-contrastive estimation 44 | self.nce_w = self.add_weight( 45 | name="nce_w", shape=[v_dim, emb_dim], 46 | initializer=keras.initializers.TruncatedNormal(0., 0.1)) # [n_vocab, emb_dim] 47 | self.nce_b = self.add_weight( 48 | name="nce_b", shape=(v_dim,), 49 | initializer=keras.initializers.Constant(0.1)) # [n_vocab, ] 50 | 51 | self.opt = keras.optimizers.Adam(0.01) 52 | 53 | def call(self, x, training=None, mask=None): 54 | # x.shape = [n, ] 55 | o = self.embeddings(x) # [n, emb_dim] 56 | return o 57 | 58 | # negative sampling: take one positive label and num_sampled negative labels to compute the loss 59 | # in order to reduce the computation of full softmax 60 | def loss(self, x, y, training=None): 61 | embedded = self.call(x, training) 62 | return tf.reduce_mean( 63 | tf.nn.nce_loss( 64 | weights=self.nce_w, biases=self.nce_b, labels=tf.expand_dims(y, axis=1), 65 | inputs=embedded, num_sampled=5, num_classes=self.v_dim)) 66 | 67 | def step(self, x, y): 68 | with tf.GradientTape() as tape: 69 | loss = self.loss(x, y, True) 70 | grads = tape.gradient(loss, self.trainable_variables) 71 | self.opt.apply_gradients(zip(grads, self.trainable_variables)) 72 | return loss.numpy() 73 | 74 | 75 | def train(model, data): 76 | for t in range(2500): 77 | bx, by = data.sample(8) 78 | loss = model.step(bx, by) 79 | if t % 200 == 0: 80 | print("step: {} | loss: {}".format(t, loss)) 81 | 82 | 83 | if __name__ == "__main__": 84 | d = process_w2v_data(corpus, skip_window=2, method="skip_gram") 85 | m = SkipGram(d.num_word, 2) 86 | train(m, d) 87 | 88 | # plotting 89 | show_w2v_word_embedding(m, d, "./visual/results/skipgram.png") -------------------------------------------------------------------------------- /tf_idf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import Counter 3 | import itertools 4 | from visual import show_tfidf # this refers to visual.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/) 5 | 6 | docs = [ 7 | "it is a good day, I like to 
stay here", 8 | "I am happy to be here", 9 | "I am bob", 10 | "it is sunny today", 11 | "I have a party today", 12 | "it is a dog and that is a cat", 13 | "there are dog and cat on the tree", 14 | "I study hard this morning", 15 | "today is a good day", 16 | "tomorrow will be a good day", 17 | "I like coffee, I like book and I like apple", 18 | "I do not like it", 19 | "I am kitty, I like bob", 20 | "I do not care who like bob, but I like kitty", 21 | "It is coffee time, bring your cup", 22 | ] 23 | 24 | docs_words = [d.replace(",", "").split(" ") for d in docs] 25 | vocab = set(itertools.chain(*docs_words)) 26 | v2i = {v: i for i, v in enumerate(vocab)} 27 | i2v = {i: v for v, i in v2i.items()} 28 | 29 | 30 | def safe_log(x): 31 | mask = x != 0 32 | x[mask] = np.log(x[mask]) 33 | return x 34 | 35 | 36 | tf_methods = { 37 | "log": lambda x: np.log(1+x), 38 | "augmented": lambda x: 0.5 + 0.5 * x / np.max(x, axis=1, keepdims=True), 39 | "boolean": lambda x: np.minimum(x, 1), 40 | "log_avg": lambda x: (1 + safe_log(x)) / (1 + safe_log(np.mean(x, axis=1, keepdims=True))), 41 | } 42 | idf_methods = { 43 | "log": lambda x: 1 + np.log(len(docs) / (x+1)), 44 | "prob": lambda x: np.maximum(0, np.log((len(docs) - x) / (x+1))), 45 | "len_norm": lambda x: x / (np.sum(np.square(x))+1), 46 | } 47 | 48 | 49 | def get_tf(method="log"): 50 | # term frequency: how frequent a word appears in a doc 51 | _tf = np.zeros((len(vocab), len(docs)), dtype=np.float64) # [n_vocab, n_doc] 52 | for i, d in enumerate(docs_words): 53 | counter = Counter(d) 54 | for v in counter.keys(): 55 | _tf[v2i[v], i] = counter[v] / counter.most_common(1)[0][1] 56 | 57 | weighted_tf = tf_methods.get(method, None) 58 | if weighted_tf is None: 59 | raise ValueError 60 | return weighted_tf(_tf) 61 | 62 | 63 | def get_idf(method="log"): 64 | # inverse document frequency: low idf for a word appears in more docs, mean less important 65 | df = np.zeros((len(i2v), 1)) 66 | for i in range(len(i2v)): 67 | d_count = 0 68 | for d in docs_words: 69 | d_count += 1 if i2v[i] in d else 0 70 | df[i, 0] = d_count 71 | 72 | idf_fn = idf_methods.get(method, None) 73 | if idf_fn is None: 74 | raise ValueError 75 | return idf_fn(df) 76 | 77 | 78 | def cosine_similarity(q, _tf_idf): 79 | unit_q = q / np.sqrt(np.sum(np.square(q), axis=0, keepdims=True)) 80 | unit_ds = _tf_idf / np.sqrt(np.sum(np.square(_tf_idf), axis=0, keepdims=True)) 81 | similarity = unit_ds.T.dot(unit_q).ravel() 82 | return similarity 83 | 84 | 85 | def docs_score(q, len_norm=False): 86 | q_words = q.replace(",", "").split(" ") 87 | 88 | # add unknown words 89 | unknown_v = 0 90 | for v in set(q_words): 91 | if v not in v2i: 92 | v2i[v] = len(v2i) 93 | i2v[len(v2i)-1] = v 94 | unknown_v += 1 95 | if unknown_v > 0: 96 | _idf = np.concatenate((idf, np.zeros((unknown_v, 1), dtype=np.float)), axis=0) 97 | _tf_idf = np.concatenate((tf_idf, np.zeros((unknown_v, tf_idf.shape[1]), dtype=np.float)), axis=0) 98 | else: 99 | _idf, _tf_idf = idf, tf_idf 100 | counter = Counter(q_words) 101 | q_tf = np.zeros((len(_idf), 1), dtype=np.float) # [n_vocab, 1] 102 | for v in counter.keys(): 103 | q_tf[v2i[v], 0] = counter[v] 104 | 105 | q_vec = q_tf * _idf # [n_vocab, 1] 106 | 107 | q_scores = cosine_similarity(q_vec, _tf_idf) 108 | if len_norm: 109 | len_docs = [len(d) for d in docs_words] 110 | q_scores = q_scores / np.array(len_docs) 111 | return q_scores 112 | 113 | 114 | def get_keywords(n=2): 115 | for c in range(3): 116 | col = tf_idf[:, c] 117 | idx = np.argsort(col)[-n:] 118 | print("doc{}, top{} 
keywords {}".format(c, n, [i2v[i] for i in idx])) 119 | 120 | 121 | tf = get_tf() # [n_vocab, n_doc] 122 | idf = get_idf() # [n_vocab, 1] 123 | tf_idf = tf * idf # [n_vocab, n_doc] 124 | print("tf shape(vecb in each docs): ", tf.shape) 125 | print("\ntf samples:\n", tf[:2]) 126 | print("\nidf shape(vecb in all docs): ", idf.shape) 127 | print("\nidf samples:\n", idf[:2]) 128 | print("\ntf_idf shape: ", tf_idf.shape) 129 | print("\ntf_idf sample:\n", tf_idf[:2]) 130 | 131 | 132 | # test 133 | get_keywords() 134 | q = "I get a coffee cup" 135 | scores = docs_score(q) 136 | d_ids = scores.argsort()[-3:][::-1] 137 | print("\ntop 3 docs for '{}':\n{}".format(q, [docs[i] for i in d_ids])) 138 | 139 | show_tfidf(tf_idf.T, [i2v[i] for i in range(tf_idf.shape[0])], "tfidf_matrix") -------------------------------------------------------------------------------- /tf_idf_sklearn.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import TfidfVectorizer 2 | from sklearn.metrics.pairwise import cosine_similarity 3 | from visual import show_tfidf # this refers to visual.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/) 4 | 5 | 6 | docs = [ 7 | "it is a good day, I like to stay here", 8 | "I am happy to be here", 9 | "I am bob", 10 | "it is sunny today", 11 | "I have a party today", 12 | "it is a dog and that is a cat", 13 | "there are dog and cat on the tree", 14 | "I study hard this morning", 15 | "today is a good day", 16 | "tomorrow will be a good day", 17 | "I like coffee, I like book and I like apple", 18 | "I do not like it", 19 | "I am kitty, I like bob", 20 | "I do not care who like bob, but I like kitty", 21 | "It is coffee time, bring your cup", 22 | ] 23 | 24 | vectorizer = TfidfVectorizer() 25 | tf_idf = vectorizer.fit_transform(docs) 26 | print("idf: ", [(n, idf) for idf, n in zip(vectorizer.idf_, vectorizer.get_feature_names())]) 27 | print("v2i: ", vectorizer.vocabulary_) 28 | 29 | 30 | q = "I get a coffee cup" 31 | qtf_idf = vectorizer.transform([q]) 32 | res = cosine_similarity(tf_idf, qtf_idf) 33 | res = res.ravel().argsort()[-3:] 34 | print("\ntop 3 docs for '{}':\n{}".format(q, [docs[i] for i in res[::-1]])) 35 | 36 | 37 | i2v = {i: v for v, i in vectorizer.vocabulary_.items()} 38 | dense_tfidf = tf_idf.todense() 39 | show_tfidf(dense_tfidf, [i2v[i] for i in range(dense_tfidf.shape[1])], "tfidf_sklearn_matrix") -------------------------------------------------------------------------------- /transformer.py: -------------------------------------------------------------------------------- 1 | # [Attention Is All You Need](https://arxiv.org/pdf/1706.03762.pdf) 2 | import tensorflow as tf 3 | from tensorflow import keras 4 | import numpy as np 5 | import utils # this refers to utils.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/) 6 | import time 7 | import pickle 8 | import os 9 | 10 | MODEL_DIM = 32 11 | MAX_LEN = 12 12 | N_LAYER = 3 13 | N_HEAD = 4 14 | DROP_RATE = 0.1 15 | 16 | 17 | class MultiHead(keras.layers.Layer): 18 | def __init__(self, n_head, model_dim, drop_rate): 19 | super().__init__() 20 | self.head_dim = model_dim // n_head 21 | self.n_head = n_head 22 | self.model_dim = model_dim 23 | self.wq = keras.layers.Dense(n_head * self.head_dim) 24 | self.wk = keras.layers.Dense(n_head * self.head_dim) 25 | self.wv = keras.layers.Dense(n_head * self.head_dim) # [n, step, h*h_dim] 26 | 27 | self.o_dense = keras.layers.Dense(model_dim) 28 | self.o_drop = 
keras.layers.Dropout(rate=drop_rate) 29 | self.attention = None 30 | 31 | def call(self, q, k, v, mask, training): 32 | _q = self.wq(q) # [n, q_step, h*h_dim] 33 | _k, _v = self.wk(k), self.wv(v) # [n, step, h*h_dim] 34 | _q = self.split_heads(_q) # [n, h, q_step, h_dim] 35 | _k, _v = self.split_heads(_k), self.split_heads(_v) # [n, h, step, h_dim] 36 | context = self.scaled_dot_product_attention(_q, _k, _v, mask) # [n, q_step, h*dv] 37 | o = self.o_dense(context) # [n, step, dim] 38 | o = self.o_drop(o, training=training) 39 | return o 40 | 41 | def split_heads(self, x): 42 | x = tf.reshape(x, (x.shape[0], x.shape[1], self.n_head, self.head_dim)) # [n, step, h, h_dim] 43 | return tf.transpose(x, perm=[0, 2, 1, 3]) # [n, h, step, h_dim] 44 | 45 | def scaled_dot_product_attention(self, q, k, v, mask=None): 46 | dk = tf.cast(k.shape[-1], dtype=tf.float32) 47 | score = tf.matmul(q, k, transpose_b=True) / (tf.math.sqrt(dk) + 1e-8) # [n, h_dim, q_step, step] 48 | if mask is not None: 49 | score += mask * -1e9 50 | self.attention = tf.nn.softmax(score, axis=-1) # [n, h, q_step, step] 51 | context = tf.matmul(self.attention, v) # [n, h, q_step, step] @ [n, h, step, dv] = [n, h, q_step, dv] 52 | context = tf.transpose(context, perm=[0, 2, 1, 3]) # [n, q_step, h, dv] 53 | context = tf.reshape(context, (context.shape[0], context.shape[1], -1)) # [n, q_step, h*dv] 54 | return context 55 | 56 | 57 | class PositionWiseFFN(keras.layers.Layer): 58 | def __init__(self, model_dim): 59 | super().__init__() 60 | dff = model_dim * 4 61 | self.l = keras.layers.Dense(dff, activation=keras.activations.relu) 62 | self.o = keras.layers.Dense(model_dim) 63 | 64 | def call(self, x): 65 | o = self.l(x) 66 | o = self.o(o) 67 | return o # [n, step, dim] 68 | 69 | 70 | class EncodeLayer(keras.layers.Layer): 71 | def __init__(self, n_head, model_dim, drop_rate): 72 | super().__init__() 73 | self.ln = [keras.layers.LayerNormalization(axis=-1) for _ in range(2)] # only norm z-dim 74 | self.mh = MultiHead(n_head, model_dim, drop_rate) 75 | self.ffn = PositionWiseFFN(model_dim) 76 | self.drop = keras.layers.Dropout(drop_rate) 77 | 78 | def call(self, xz, training, mask): 79 | attn = self.mh.call(xz, xz, xz, mask, training) # [n, step, dim] 80 | o1 = self.ln[0](attn + xz) 81 | ffn = self.drop(self.ffn.call(o1), training) 82 | o = self.ln[1](ffn + o1) # [n, step, dim] 83 | return o 84 | 85 | 86 | class Encoder(keras.layers.Layer): 87 | def __init__(self, n_head, model_dim, drop_rate, n_layer): 88 | super().__init__() 89 | self.ls = [EncodeLayer(n_head, model_dim, drop_rate) for _ in range(n_layer)] 90 | 91 | def call(self, xz, training, mask): 92 | for l in self.ls: 93 | xz = l.call(xz, training, mask) 94 | return xz # [n, step, dim] 95 | 96 | 97 | class DecoderLayer(keras.layers.Layer): 98 | def __init__(self, n_head, model_dim, drop_rate): 99 | super().__init__() 100 | self.ln = [keras.layers.LayerNormalization(axis=-1) for _ in range(3)] # only norm z-dim 101 | self.drop = keras.layers.Dropout(drop_rate) 102 | self.mh = [MultiHead(n_head, model_dim, drop_rate) for _ in range(2)] 103 | self.ffn = PositionWiseFFN(model_dim) 104 | 105 | def call(self, yz, xz, training, yz_look_ahead_mask, xz_pad_mask): 106 | attn = self.mh[0].call(yz, yz, yz, yz_look_ahead_mask, training) # decoder self attention 107 | o1 = self.ln[0](attn + yz) 108 | attn = self.mh[1].call(o1, xz, xz, xz_pad_mask, training) # decoder + encoder attention 109 | o2 = self.ln[1](attn + o1) 110 | ffn = self.drop(self.ffn.call(o2), training) 111 | o = 
self.ln[2](ffn + o2) 112 | return o 113 | 114 | 115 | class Decoder(keras.layers.Layer): 116 | def __init__(self, n_head, model_dim, drop_rate, n_layer): 117 | super().__init__() 118 | self.ls = [DecoderLayer(n_head, model_dim, drop_rate) for _ in range(n_layer)] 119 | 120 | def call(self, yz, xz, training, yz_look_ahead_mask, xz_pad_mask): 121 | for l in self.ls: 122 | yz = l.call(yz, xz, training, yz_look_ahead_mask, xz_pad_mask) 123 | return yz 124 | 125 | 126 | class PositionEmbedding(keras.layers.Layer): 127 | def __init__(self, max_len, model_dim, n_vocab): 128 | super().__init__() 129 | pos = np.arange(max_len)[:, None] 130 | pe = pos / np.power(10000, 2. * np.arange(model_dim)[None, :] / model_dim) # [max_len, dim] 131 | pe[:, 0::2] = np.sin(pe[:, 0::2]) 132 | pe[:, 1::2] = np.cos(pe[:, 1::2]) 133 | pe = pe[None, :, :] # [1, max_len, model_dim] for batch adding 134 | self.pe = tf.constant(pe, dtype=tf.float32) 135 | self.embeddings = keras.layers.Embedding( 136 | input_dim=n_vocab, output_dim=model_dim, # [n_vocab, dim] 137 | embeddings_initializer=tf.initializers.RandomNormal(0., 0.01), 138 | ) 139 | 140 | def call(self, x): 141 | x_embed = self.embeddings(x) + self.pe # [n, step, dim] 142 | return x_embed 143 | 144 | 145 | class Transformer(keras.Model): 146 | def __init__(self, model_dim, max_len, n_layer, n_head, n_vocab, drop_rate=0.1, padding_idx=0): 147 | super().__init__() 148 | self.max_len = max_len 149 | self.padding_idx = padding_idx 150 | 151 | self.embed = PositionEmbedding(max_len, model_dim, n_vocab) 152 | self.encoder = Encoder(n_head, model_dim, drop_rate, n_layer) 153 | self.decoder = Decoder(n_head, model_dim, drop_rate, n_layer) 154 | self.o = keras.layers.Dense(n_vocab) 155 | 156 | self.cross_entropy = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none") 157 | self.opt = keras.optimizers.Adam(0.002) 158 | 159 | def call(self, x, y, training=None): 160 | x_embed, y_embed = self.embed(x), self.embed(y) 161 | pad_mask = self._pad_mask(x) 162 | encoded_z = self.encoder.call(x_embed, training, mask=pad_mask) 163 | decoded_z = self.decoder.call( 164 | y_embed, encoded_z, training, yz_look_ahead_mask=self._look_ahead_mask(y), xz_pad_mask=pad_mask) 165 | o = self.o(decoded_z) 166 | return o 167 | 168 | def step(self, x, y): 169 | with tf.GradientTape() as tape: 170 | logits = self.call(x, y[:, :-1], training=True) 171 | pad_mask = tf.math.not_equal(y[:, 1:], self.padding_idx) 172 | loss = tf.reduce_mean(tf.boolean_mask(self.cross_entropy(y[:, 1:], logits), pad_mask)) 173 | grads = tape.gradient(loss, self.trainable_variables) 174 | self.opt.apply_gradients(zip(grads, self.trainable_variables)) 175 | return loss, logits 176 | 177 | def _pad_bool(self, seqs): 178 | return tf.math.equal(seqs, self.padding_idx) 179 | 180 | def _pad_mask(self, seqs): 181 | mask = tf.cast(self._pad_bool(seqs), tf.float32) 182 | return mask[:, tf.newaxis, tf.newaxis, :] # (n, 1, 1, step) 183 | 184 | def _look_ahead_mask(self, seqs): 185 | mask = 1 - tf.linalg.band_part(tf.ones((self.max_len, self.max_len)), -1, 0) 186 | mask = tf.where(self._pad_bool(seqs)[:, tf.newaxis, tf.newaxis, :], 1, mask[tf.newaxis, tf.newaxis, :, :]) 187 | return mask # (step, step) 188 | 189 | def translate(self, src, v2i, i2v): 190 | src_pad = utils.pad_zero(src, self.max_len) 191 | tgt = utils.pad_zero(np.array([[v2i[""], ] for _ in range(len(src))]), self.max_len+1) 192 | tgti = 0 193 | x_embed = self.embed(src_pad) 194 | encoded_z = self.encoder.call(x_embed, False, 
mask=self._pad_mask(src_pad)) 195 | while True: 196 | y = tgt[:, :-1] 197 | y_embed = self.embed(y) 198 | decoded_z = self.decoder.call( 199 | y_embed, encoded_z, False, yz_look_ahead_mask=self._look_ahead_mask(y), xz_pad_mask=self._pad_mask(src_pad)) 200 | logits = self.o(decoded_z)[:, tgti, :].numpy() 201 | idx = np.argmax(logits, axis=1) 202 | tgti += 1 203 | tgt[:, tgti] = idx 204 | if tgti >= self.max_len: 205 | break 206 | return ["".join([i2v[i] for i in tgt[j, 1:tgti]]) for j in range(len(src))] 207 | 208 | @property 209 | def attentions(self): 210 | attentions = { 211 | "encoder": [l.mh.attention.numpy() for l in self.encoder.ls], 212 | "decoder": { 213 | "mh1": [l.mh[0].attention.numpy() for l in self.decoder.ls], 214 | "mh2": [l.mh[1].attention.numpy() for l in self.decoder.ls], 215 | }} 216 | return attentions 217 | 218 | 219 | def train(model, data, step): 220 | # training 221 | t0 = time.time() 222 | for t in range(step): 223 | bx, by, seq_len = data.sample(64) 224 | bx, by = utils.pad_zero(bx, max_len=MAX_LEN), utils.pad_zero(by, max_len=MAX_LEN + 1) 225 | loss, logits = model.step(bx, by) 226 | if t % 50 == 0: 227 | logits = logits[0].numpy() 228 | t1 = time.time() 229 | print( 230 | "step: ", t, 231 | "| time: %.2f" % (t1 - t0), 232 | "| loss: %.4f" % loss.numpy(), 233 | "| target: ", "".join([data.i2v[i] for i in by[0, 1:10]]), 234 | "| inference: ", "".join([data.i2v[i] for i in np.argmax(logits, axis=1)[:10]]), 235 | ) 236 | t0 = t1 237 | 238 | os.makedirs("./visual/models/transformer", exist_ok=True) 239 | model.save_weights("./visual/models/transformer/model.ckpt") 240 | os.makedirs("./visual/tmp", exist_ok=True) 241 | with open("./visual/tmp/transformer_v2i_i2v.pkl", "wb") as f: 242 | pickle.dump({"v2i": data.v2i, "i2v": data.i2v}, f) 243 | 244 | 245 | def export_attention(model, data, name="transformer"): 246 | with open("./visual/tmp/transformer_v2i_i2v.pkl", "rb") as f: 247 | dic = pickle.load(f) 248 | model.load_weights("./visual/models/transformer/model.ckpt") 249 | bx, by, seq_len = data.sample(32) 250 | model.translate(bx, dic["v2i"], dic["i2v"]) 251 | attn_data = { 252 | "src": [[data.i2v[i] for i in bx[j]] for j in range(len(bx))], 253 | "tgt": [[data.i2v[i] for i in by[j]] for j in range(len(by))], 254 | "attentions": model.attentions} 255 | path = "./visual/tmp/%s_attention_matrix.pkl" % name 256 | os.makedirs(os.path.dirname(path), exist_ok=True) 257 | with open(path, "wb") as f: 258 | pickle.dump(attn_data, f) 259 | 260 | 261 | if __name__ == "__main__": 262 | utils.set_soft_gpu(True) 263 | d = utils.DateData(4000) 264 | print("Chinese time order: yy/mm/dd ", d.date_cn[:3], "\nEnglish time order: dd/M/yyyy ", d.date_en[:3]) 265 | print("vocabularies: ", d.vocab) 266 | print("x index sample: \n{}\n{}".format(d.idx2str(d.x[0]), d.x[0]), 267 | "\ny index sample: \n{}\n{}".format(d.idx2str(d.y[0]), d.y[0])) 268 | 269 | m = Transformer(MODEL_DIM, MAX_LEN, N_LAYER, N_HEAD, d.num_word, DROP_RATE) 270 | train(m, d, step=800) 271 | export_attention(m, d) -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import datetime 3 | import os 4 | import requests 5 | import pandas as pd 6 | import re 7 | import itertools 8 | 9 | PAD_ID = 0 10 | 11 | 12 | class DateData: 13 | def __init__(self, n): 14 | np.random.seed(1) 15 | self.date_cn = [] 16 | self.date_en = [] 17 | for timestamp in np.random.randint(143835585, 2043835585, 
n): 18 | date = datetime.datetime.fromtimestamp(timestamp) 19 | self.date_cn.append(date.strftime("%y-%m-%d")) 20 | self.date_en.append(date.strftime("%d/%b/%Y")) 21 | self.vocab = set( 22 | [str(i) for i in range(0, 10)] + ["-", "/", "", ""] + [ 23 | i.split("/")[1] for i in self.date_en]) 24 | self.v2i = {v: i for i, v in enumerate(sorted(list(self.vocab)), start=1)} 25 | self.v2i[""] = PAD_ID 26 | self.vocab.add("") 27 | self.i2v = {i: v for v, i in self.v2i.items()} 28 | self.x, self.y = [], [] 29 | for cn, en in zip(self.date_cn, self.date_en): 30 | self.x.append([self.v2i[v] for v in cn]) 31 | self.y.append( 32 | [self.v2i[""], ] + [self.v2i[v] for v in en[:3]] + [ 33 | self.v2i[en[3:6]], ] + [self.v2i[v] for v in en[6:]] + [ 34 | self.v2i[""], ]) 35 | self.x, self.y = np.array(self.x), np.array(self.y) 36 | self.start_token = self.v2i[""] 37 | self.end_token = self.v2i[""] 38 | 39 | def sample(self, n=64): 40 | bi = np.random.randint(0, len(self.x), size=n) 41 | bx, by = self.x[bi], self.y[bi] 42 | decoder_len = np.full((len(bx),), by.shape[1] - 1, dtype=np.int32) 43 | return bx, by, decoder_len 44 | 45 | def idx2str(self, idx): 46 | x = [] 47 | for i in idx: 48 | x.append(self.i2v[i]) 49 | if i == self.end_token: 50 | break 51 | return "".join(x) 52 | 53 | @property 54 | def num_word(self): 55 | return len(self.vocab) 56 | 57 | 58 | def pad_zero(seqs, max_len): 59 | padded = np.full((len(seqs), max_len), fill_value=PAD_ID, dtype=np.long) 60 | for i, seq in enumerate(seqs): 61 | padded[i, :len(seq)] = seq 62 | return padded 63 | 64 | 65 | def maybe_download_mrpc(save_dir="./MRPC/", proxy=None): 66 | train_url = 'https://mofanpy.com/static/files/MRPC/msr_paraphrase_train.txt' 67 | test_url = 'https://mofanpy.com/static/files/MRPC/msr_paraphrase_test.txt' 68 | os.makedirs(save_dir, exist_ok=True) 69 | proxies = {"http": proxy, "https": proxy} 70 | for url in [train_url, test_url]: 71 | raw_path = os.path.join(save_dir, url.split("/")[-1]) 72 | if not os.path.isfile(raw_path): 73 | print("downloading from %s" % url) 74 | r = requests.get(url, proxies=proxies) 75 | with open(raw_path, "w", encoding="utf-8") as f: 76 | f.write(r.text.replace('"', "")) 77 | print("completed") 78 | 79 | 80 | def _text_standardize(text): 81 | text = re.sub(r'—', '-', text) 82 | text = re.sub(r'–', '-', text) 83 | text = re.sub(r'―', '-', text) 84 | text = re.sub(r" \d+(,\d+)?(\.\d+)? 
", " ", text) 85 | text = re.sub(r" \d+-+?\d*", " -", text) 86 | return text.strip() 87 | 88 | 89 | def _process_mrpc(dir="./MRPC", rows=None): 90 | data = {"train": None, "test": None} 91 | files = os.listdir(dir) 92 | for f in files: 93 | df = pd.read_csv(os.path.join(dir, f), sep='\t', nrows=rows) 94 | k = "train" if "train" in f else "test" 95 | data[k] = {"is_same": df.iloc[:, 0].values, "s1": df["#1 String"].values, "s2": df["#2 String"].values} 96 | vocab = set() 97 | for n in ["train", "test"]: 98 | for m in ["s1", "s2"]: 99 | for i in range(len(data[n][m])): 100 | data[n][m][i] = _text_standardize(data[n][m][i].lower()) 101 | cs = data[n][m][i].split(" ") 102 | vocab.update(set(cs)) 103 | v2i = {v: i for i, v in enumerate(sorted(vocab), start=1)} 104 | v2i[""] = PAD_ID 105 | v2i[""] = len(v2i) 106 | v2i[""] = len(v2i) 107 | v2i[""] = len(v2i) 108 | i2v = {i: v for v, i in v2i.items()} 109 | for n in ["train", "test"]: 110 | for m in ["s1", "s2"]: 111 | data[n][m+"id"] = [[v2i[v] for v in c.split(" ")] for c in data[n][m]] 112 | return data, v2i, i2v 113 | 114 | 115 | class MRPCData: 116 | num_seg = 3 117 | pad_id = PAD_ID 118 | 119 | def __init__(self, data_dir="./MRPC/", rows=None, proxy=None): 120 | maybe_download_mrpc(save_dir=data_dir, proxy=proxy) 121 | data, self.v2i, self.i2v = _process_mrpc(data_dir, rows) 122 | self.max_len = max( 123 | [len(s1) + len(s2) + 3 for s1, s2 in zip( 124 | data["train"]["s1id"] + data["test"]["s1id"], data["train"]["s2id"] + data["test"]["s2id"])]) 125 | 126 | self.xlen = np.array([ 127 | [ 128 | len(data["train"]["s1id"][i]), len(data["train"]["s2id"][i]) 129 | ] for i in range(len(data["train"]["s1id"]))], dtype=int) 130 | x = [ 131 | [self.v2i[""]] + data["train"]["s1id"][i] + [self.v2i[""]] + data["train"]["s2id"][i] + [self.v2i[""]] 132 | for i in range(len(self.xlen)) 133 | ] 134 | self.x = pad_zero(x, max_len=self.max_len) 135 | self.nsp_y = data["train"]["is_same"][:, None] 136 | 137 | self.seg = np.full(self.x.shape, self.num_seg-1, np.int32) 138 | for i in range(len(x)): 139 | si = self.xlen[i][0] + 2 140 | self.seg[i, :si] = 0 141 | si_ = si + self.xlen[i][1] + 1 142 | self.seg[i, si:si_] = 1 143 | 144 | self.word_ids = np.array(list(set(self.i2v.keys()).difference( 145 | [self.v2i[v] for v in ["", "", ""]]))) 146 | 147 | def sample(self, n): 148 | bi = np.random.randint(0, self.x.shape[0], size=n) 149 | bx, bs, bl, by = self.x[bi], self.seg[bi], self.xlen[bi], self.nsp_y[bi] 150 | return bx, bs, bl, by 151 | 152 | @property 153 | def num_word(self): 154 | return len(self.v2i) 155 | 156 | @property 157 | def mask_id(self): 158 | return self.v2i[""] 159 | 160 | 161 | class MRPCSingle: 162 | pad_id = PAD_ID 163 | 164 | def __init__(self, data_dir="./MRPC/", rows=None, proxy=None): 165 | maybe_download_mrpc(save_dir=data_dir, proxy=proxy) 166 | data, self.v2i, self.i2v = _process_mrpc(data_dir, rows) 167 | 168 | self.max_len = max([len(s) + 2 for s in data["train"]["s1id"] + data["train"]["s2id"]]) 169 | x = [ 170 | [self.v2i[""]] + data["train"]["s1id"][i] + [self.v2i[""]] 171 | for i in range(len(data["train"]["s1id"])) 172 | ] 173 | x += [ 174 | [self.v2i[""]] + data["train"]["s2id"][i] + [self.v2i[""]] 175 | for i in range(len(data["train"]["s2id"])) 176 | ] 177 | self.x = pad_zero(x, max_len=self.max_len) 178 | self.word_ids = np.array(list(set(self.i2v.keys()).difference([self.v2i[""]]))) 179 | 180 | def sample(self, n): 181 | bi = np.random.randint(0, self.x.shape[0], size=n) 182 | bx = self.x[bi] 183 | return bx 184 | 185 | 
@property 186 | def num_word(self): 187 | return len(self.v2i) 188 | 189 | 190 | class Dataset: 191 | def __init__(self, x, y, v2i, i2v): 192 | self.x, self.y = x, y 193 | self.v2i, self.i2v = v2i, i2v 194 | self.vocab = v2i.keys() 195 | 196 | def sample(self, n): 197 | b_idx = np.random.randint(0, len(self.x), n) 198 | bx, by = self.x[b_idx], self.y[b_idx] 199 | return bx, by 200 | 201 | @property 202 | def num_word(self): 203 | return len(self.v2i) 204 | 205 | 206 | def process_w2v_data(corpus, skip_window=2, method="skip_gram"): 207 | all_words = [sentence.split(" ") for sentence in corpus] 208 | all_words = np.array(list(itertools.chain(*all_words))) 209 | # vocab sort by decreasing frequency for the negative sampling below (nce_loss). 210 | vocab, v_count = np.unique(all_words, return_counts=True) 211 | vocab = vocab[np.argsort(v_count)[::-1]] 212 | 213 | print("all vocabularies sorted from more frequent to less frequent:\n", vocab) 214 | v2i = {v: i for i, v in enumerate(vocab)} 215 | i2v = {i: v for v, i in v2i.items()} 216 | 217 | # pair data 218 | pairs = [] 219 | js = [i for i in range(-skip_window, skip_window + 1) if i != 0] 220 | 221 | for c in corpus: 222 | words = c.split(" ") 223 | w_idx = [v2i[w] for w in words] 224 | if method == "skip_gram": 225 | for i in range(len(w_idx)): 226 | for j in js: 227 | if i + j < 0 or i + j >= len(w_idx): 228 | continue 229 | pairs.append((w_idx[i], w_idx[i + j])) # (center, context) or (feature, target) 230 | elif method.lower() == "cbow": 231 | for i in range(skip_window, len(w_idx) - skip_window): 232 | context = [] 233 | for j in js: 234 | context.append(w_idx[i + j]) 235 | pairs.append(context + [w_idx[i]]) # (contexts, center) or (feature, target) 236 | else: 237 | raise ValueError 238 | pairs = np.array(pairs) 239 | print("5 example pairs:\n", pairs[:5]) 240 | if method.lower() == "skip_gram": 241 | x, y = pairs[:, 0], pairs[:, 1] 242 | elif method.lower() == "cbow": 243 | x, y = pairs[:, :-1], pairs[:, -1] 244 | else: 245 | raise ValueError 246 | return Dataset(x, y, v2i, i2v) 247 | 248 | 249 | def set_soft_gpu(soft_gpu): 250 | import tensorflow as tf 251 | if soft_gpu: 252 | gpus = tf.config.experimental.list_physical_devices('GPU') 253 | if gpus: 254 | # Currently, memory growth needs to be the same across GPUs 255 | for gpu in gpus: 256 | tf.config.experimental.set_memory_growth(gpu, True) 257 | logical_gpus = tf.config.experimental.list_logical_devices('GPU') 258 | print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs") -------------------------------------------------------------------------------- /visual.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import pickle 4 | from matplotlib.pyplot import cm 5 | import os 6 | import utils 7 | 8 | 9 | def show_tfidf(tfidf, vocab, filename): 10 | # [n_doc, n_vocab] 11 | plt.imshow(tfidf, cmap="YlGn", vmin=tfidf.min(), vmax=tfidf.max()) 12 | plt.xticks(np.arange(tfidf.shape[1]), vocab, fontsize=6, rotation=90) 13 | plt.yticks(np.arange(tfidf.shape[0]), np.arange(1, tfidf.shape[0]+1), fontsize=6) 14 | plt.tight_layout() 15 | # creating the output folder 16 | output_folder = './visual/results/' 17 | os.makedirs(output_folder, exist_ok=True) 18 | plt.savefig(os.path.join(output_folder, '%s.png') % filename, format="png", dpi=500) 19 | plt.show() 20 | 21 | 22 | def show_w2v_word_embedding(model, data: utils.Dataset, path): 23 | word_emb = model.embeddings.get_weights()[0] 24 | 
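# ---- illustrative sketch (standalone; refers to process_w2v_data in utils.py above) ----
# process_w2v_data turns every sentence into (centre word, context word) index pairs taken
# from a window of +/- skip_window positions around each centre word. A minimal trace on
# one assumed 5-word sentence with skip_window=2; w_idx is a made-up list of word indices:
skip_window = 2
w_idx = [0, 1, 2, 3, 4]
js = [j for j in range(-skip_window, skip_window + 1) if j != 0]
pairs = [(w_idx[i], w_idx[i + j])
         for i in range(len(w_idx))
         for j in js
         if 0 <= i + j < len(w_idx)]
print(pairs[:4])   # -> [(0, 1), (0, 2), (1, 0), (1, 2)]
# -----------------------------------------------------------------------------------------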
for i in range(data.num_word): 25 | c = "blue" 26 | try: 27 | int(data.i2v[i]) 28 | except ValueError: 29 | c = "red" 30 | plt.text(word_emb[i, 0], word_emb[i, 1], s=data.i2v[i], color=c, weight="bold") 31 | plt.xlim(word_emb[:, 0].min() - .5, word_emb[:, 0].max() + .5) 32 | plt.ylim(word_emb[:, 1].min() - .5, word_emb[:, 1].max() + .5) 33 | plt.xticks(()) 34 | plt.yticks(()) 35 | plt.xlabel("embedding dim1") 36 | plt.ylabel("embedding dim2") 37 | plt.savefig(path, dpi=300, format="png") 38 | plt.show() 39 | 40 | 41 | def seq2seq_attention(): 42 | with open("./visual/tmp/attention_align.pkl", "rb") as f: 43 | data = pickle.load(f) 44 | i2v, x, y, align = data["i2v"], data["x"], data["y"], data["align"] 45 | plt.rcParams['xtick.bottom'] = plt.rcParams['xtick.labelbottom'] = False 46 | plt.rcParams['xtick.top'] = plt.rcParams['xtick.labeltop'] = True 47 | for i in range(6): 48 | plt.subplot(2, 3, i + 1) 49 | x_vocab = [i2v[j] for j in np.ravel(x[i])] 50 | y_vocab = [i2v[j] for j in y[i, 1:]] 51 | plt.imshow(align[i], cmap="YlGn", vmin=0., vmax=1.) 52 | plt.yticks([j for j in range(len(y_vocab))], y_vocab) 53 | plt.xticks([j for j in range(len(x_vocab))], x_vocab) 54 | if i == 0 or i == 3: 55 | plt.ylabel("Output") 56 | if i >= 3: 57 | plt.xlabel("Input") 58 | plt.tight_layout() 59 | plt.savefig("./visual/results/seq2seq_attention.png", format="png", dpi=200) 60 | plt.show() 61 | 62 | 63 | def all_mask_kinds(): 64 | seqs = ["I love you", "My name is M", "This is a very long seq", "Short one"] 65 | vocabs = set((" ".join(seqs)).split(" ")) 66 | i2v = {i: v for i, v in enumerate(vocabs, start=1)} 67 | i2v[""] = 0 # add 0 idx for 68 | v2i = {v: i for i, v in i2v.items()} 69 | 70 | id_seqs = [[v2i[v] for v in seq.split(" ")] for seq in seqs] 71 | padded_id_seqs = np.array([l + [0] * (6 - len(l)) for l in id_seqs]) 72 | 73 | # padding mask 74 | pmask = np.where(padded_id_seqs == 0, np.ones_like(padded_id_seqs), np.zeros_like(padded_id_seqs)) # 0 idx is padding 75 | pmask = np.repeat(pmask[:, None, :], pmask.shape[-1], axis=1) # [n, step, step] 76 | plt.rcParams['xtick.bottom'] = plt.rcParams['xtick.labelbottom'] = False 77 | plt.rcParams['xtick.top'] = plt.rcParams['xtick.labeltop'] = True 78 | for i in range(1, 5): 79 | plt.subplot(2, 2, i) 80 | plt.imshow(pmask[i-1], vmax=1, vmin=0, cmap="YlGn") 81 | plt.xticks(range(6), seqs[i - 1].split(" "), rotation=45) 82 | plt.yticks(range(6), seqs[i - 1].split(" "),) 83 | plt.grid(which="minor", c="w", lw=0.5, linestyle="-") 84 | plt.tight_layout() 85 | plt.savefig("./visual/results/transformer_pad_mask.png", dpi=200) 86 | plt.show() 87 | 88 | # look ahead mask 89 | max_len = pmask.shape[-1] 90 | omask = ~np.triu(np.ones((max_len, max_len), dtype=np.bool), 1) 91 | omask = np.tile(np.expand_dims(omask, axis=0), [np.shape(seqs)[0], 1, 1]) # [n, step, step] 92 | omask = np.where(omask, pmask, 1) 93 | 94 | plt.rcParams['xtick.bottom'] = plt.rcParams['xtick.labelbottom'] = False 95 | plt.rcParams['xtick.top'] = plt.rcParams['xtick.labeltop'] = True 96 | for i in range(1, 5): 97 | plt.subplot(2, 2, i) 98 | plt.imshow(omask[i - 1], vmax=1, vmin=0, cmap="YlGn") 99 | plt.xticks(range(6), seqs[i - 1].split(" "), rotation=45) 100 | plt.yticks(range(6), seqs[i - 1].split(" "), ) 101 | plt.grid(which="minor", c="w", lw=0.5, linestyle="-") 102 | plt.tight_layout() 103 | plt.savefig("./visual/results/transformer_look_ahead_mask.png", dpi=200) 104 | plt.show() 105 | 106 | 107 | def position_embedding(): 108 | max_len = 500 109 | model_dim = 512 110 | pos = 
np.arange(max_len)[:, None] 111 | pe = pos / np.power(10000, 2. * np.arange(model_dim)[None, :] / model_dim) # [max_len, model_dim] 112 | pe[:, 0::2] = np.sin(pe[:, 0::2]) 113 | pe[:, 1::2] = np.cos(pe[:, 1::2]) 114 | plt.imshow(pe, vmax=1, vmin=-1, cmap="rainbow") 115 | plt.ylabel("word position") 116 | plt.xlabel("embedding dim") 117 | plt.savefig("./visual/results/transformer_position_embedding.png", dpi=200) 118 | plt.show() 119 | 120 | 121 | def transformer_attention_matrix(case=0): 122 | with open("./visual/tmp/transformer_attention_matrix.pkl", "rb") as f: 123 | data = pickle.load(f) 124 | src = data["src"][case] 125 | tgt = data["tgt"][case] 126 | attentions = data["attentions"] 127 | 128 | encoder_atten = attentions["encoder"] 129 | decoder_tgt_atten = attentions["decoder"]["mh1"] 130 | decoder_src_atten = attentions["decoder"]["mh2"] 131 | plt.rcParams['xtick.bottom'] = plt.rcParams['xtick.labelbottom'] = False 132 | plt.rcParams['xtick.top'] = plt.rcParams['xtick.labeltop'] = True 133 | 134 | plt.figure(0, (7, 7)) 135 | plt.suptitle("Encoder self-attention") 136 | for i in range(3): 137 | for j in range(4): 138 | plt.subplot(3, 4, i * 4 + j + 1) 139 | plt.imshow(encoder_atten[i][case, j][:len(src), :len(src)], vmax=1, vmin=0, cmap="rainbow") 140 | plt.xticks(range(len(src)), src) 141 | plt.yticks(range(len(src)), src) 142 | if j == 0: 143 | plt.ylabel("layer %i" % (i+1)) 144 | if i == 2: 145 | plt.xlabel("head %i" % (j+1)) 146 | plt.tight_layout() 147 | plt.subplots_adjust(top=0.9) 148 | plt.savefig("./visual/results/transformer%d_encoder_self_attention.png" % case, dpi=200) 149 | plt.show() 150 | 151 | plt.figure(1, (7, 7)) 152 | plt.suptitle("Decoder self-attention") 153 | for i in range(3): 154 | for j in range(4): 155 | plt.subplot(3, 4, i * 4 + j + 1) 156 | plt.imshow(decoder_tgt_atten[i][case, j][:len(tgt), :len(tgt)], vmax=1, vmin=0, cmap="rainbow") 157 | plt.xticks(range(len(tgt)), tgt, rotation=90, fontsize=7) 158 | plt.yticks(range(len(tgt)), tgt, fontsize=7) 159 | if j == 0: 160 | plt.ylabel("layer %i" % (i+1)) 161 | if i == 2: 162 | plt.xlabel("head %i" % (j+1)) 163 | plt.tight_layout() 164 | plt.subplots_adjust(top=0.9) 165 | plt.savefig("./visual/results/transformer%d_decoder_self_attention.png" % case, dpi=200) 166 | plt.show() 167 | 168 | plt.figure(2, (7, 8)) 169 | plt.suptitle("Decoder-Encoder attention") 170 | for i in range(3): 171 | for j in range(4): 172 | plt.subplot(3, 4, i*4+j+1) 173 | plt.imshow(decoder_src_atten[i][case, j][:len(tgt), :len(src)], vmax=1, vmin=0, cmap="rainbow") 174 | plt.xticks(range(len(src)), src, fontsize=7) 175 | plt.yticks(range(len(tgt)), tgt, fontsize=7) 176 | if j == 0: 177 | plt.ylabel("layer %i" % (i+1)) 178 | if i == 2: 179 | plt.xlabel("head %i" % (j+1)) 180 | plt.tight_layout() 181 | plt.subplots_adjust(top=0.9) 182 | plt.savefig("./visual/results/transformer%d_decoder_encoder_attention.png" % case, dpi=200) 183 | plt.show() 184 | 185 | 186 | def transformer_attention_line(case=0): 187 | with open("./visual/tmp/transformer_attention_matrix.pkl", "rb") as f: 188 | data = pickle.load(f) 189 | src = data["src"][case] 190 | tgt = data["tgt"][case] 191 | attentions = data["attentions"] 192 | 193 | decoder_src_atten = attentions["decoder"]["mh2"] 194 | 195 | tgt_label = tgt[1:11][::-1] 196 | src_label = ["" for _ in range(2)] + src[::-1] 197 | fig, ax = plt.subplots(nrows=2, ncols=2, sharex=True, figsize=(7, 14)) 198 | 199 | for i in range(2): 200 | for j in range(2): 201 | ax[i, j].set_yticks(np.arange(len(src_label))) 202 | 


def transformer_attention_matrix(case=0):
    # heatmaps of encoder self-attention, decoder self-attention and decoder-encoder
    # attention for one case, 3 layers x 4 heads each
    with open("./visual/tmp/transformer_attention_matrix.pkl", "rb") as f:
        data = pickle.load(f)
    src = data["src"][case]
    tgt = data["tgt"][case]
    attentions = data["attentions"]

    encoder_atten = attentions["encoder"]
    decoder_tgt_atten = attentions["decoder"]["mh1"]
    decoder_src_atten = attentions["decoder"]["mh2"]
    plt.rcParams['xtick.bottom'] = plt.rcParams['xtick.labelbottom'] = False
    plt.rcParams['xtick.top'] = plt.rcParams['xtick.labeltop'] = True

    plt.figure(0, (7, 7))
    plt.suptitle("Encoder self-attention")
    for i in range(3):
        for j in range(4):
            plt.subplot(3, 4, i * 4 + j + 1)
            plt.imshow(encoder_atten[i][case, j][:len(src), :len(src)], vmax=1, vmin=0, cmap="rainbow")
            plt.xticks(range(len(src)), src)
            plt.yticks(range(len(src)), src)
            if j == 0:
                plt.ylabel("layer %i" % (i + 1))
            if i == 2:
                plt.xlabel("head %i" % (j + 1))
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.savefig("./visual/results/transformer%d_encoder_self_attention.png" % case, dpi=200)
    plt.show()

    plt.figure(1, (7, 7))
    plt.suptitle("Decoder self-attention")
    for i in range(3):
        for j in range(4):
            plt.subplot(3, 4, i * 4 + j + 1)
            plt.imshow(decoder_tgt_atten[i][case, j][:len(tgt), :len(tgt)], vmax=1, vmin=0, cmap="rainbow")
            plt.xticks(range(len(tgt)), tgt, rotation=90, fontsize=7)
            plt.yticks(range(len(tgt)), tgt, fontsize=7)
            if j == 0:
                plt.ylabel("layer %i" % (i + 1))
            if i == 2:
                plt.xlabel("head %i" % (j + 1))
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.savefig("./visual/results/transformer%d_decoder_self_attention.png" % case, dpi=200)
    plt.show()

    plt.figure(2, (7, 8))
    plt.suptitle("Decoder-Encoder attention")
    for i in range(3):
        for j in range(4):
            plt.subplot(3, 4, i * 4 + j + 1)
            plt.imshow(decoder_src_atten[i][case, j][:len(tgt), :len(src)], vmax=1, vmin=0, cmap="rainbow")
            plt.xticks(range(len(src)), src, fontsize=7)
            plt.yticks(range(len(tgt)), tgt, fontsize=7)
            if j == 0:
                plt.ylabel("layer %i" % (i + 1))
            if i == 2:
                plt.xlabel("head %i" % (j + 1))
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.savefig("./visual/results/transformer%d_decoder_encoder_attention.png" % case, dpi=200)
    plt.show()


def transformer_attention_line(case=0):
    # line-style view of the last decoder-encoder attention layer, one panel per head
    with open("./visual/tmp/transformer_attention_matrix.pkl", "rb") as f:
        data = pickle.load(f)
    src = data["src"][case]
    tgt = data["tgt"][case]
    attentions = data["attentions"]

    decoder_src_atten = attentions["decoder"]["mh2"]

    tgt_label = tgt[1:11][::-1]
    src_label = ["" for _ in range(2)] + src[::-1]
    fig, ax = plt.subplots(nrows=2, ncols=2, sharex=True, figsize=(7, 14))

    for i in range(2):
        for j in range(2):
            ax[i, j].set_yticks(np.arange(len(src_label)))
            ax[i, j].set_yticklabels(src_label, fontsize=9)  # src
            ax[i, j].set_ylim(0, len(src_label) - 1)
            ax_ = ax[i, j].twinx()
            ax_.set_yticks(np.linspace(ax_.get_yticks()[0], ax_.get_yticks()[-1], len(ax[i, j].get_yticks())))
            ax_.set_yticklabels(tgt_label, fontsize=9)  # tgt
            img = decoder_src_atten[-1][case, i * 2 + j][:10, :8]  # head index matches the "head %i" label below
            color = cm.rainbow(np.linspace(0, 1, img.shape[0]))
            left_top, right_top = img.shape[1], img.shape[0]
            for ri, c in zip(range(right_top), color):  # tgt
                for li in range(left_top):  # src
                    alpha = (img[ri, li] / img[ri].max()) ** 8
                    ax[i, j].plot([0, 1], [left_top - li + 1, right_top - 1 - ri], alpha=alpha, c=c)
            ax[i, j].set_xticks(())
            ax[i, j].set_xlabel("head %i" % (j + 1 + i * 2))
            ax[i, j].set_xlim(0, 1)
    plt.subplots_adjust(top=0.9)
    plt.tight_layout()
    plt.savefig("./visual/results/transformer%d_encoder_decoder_attention_line.png" % case, dpi=100)


def self_attention_matrix(bert_or_gpt="bert", case=0):
    # heatmaps of the last encoder layer's self-attention for one BERT or GPT case
    with open("./visual/tmp/" + bert_or_gpt + "_attention_matrix.pkl", "rb") as f:
        data = pickle.load(f)
    src = data["src"]
    attentions = data["attentions"]

    encoder_atten = attentions["encoder"]
    plt.rcParams['xtick.bottom'] = plt.rcParams['xtick.labelbottom'] = False
    plt.rcParams['xtick.top'] = plt.rcParams['xtick.labeltop'] = True

    s_len = 0
    for s in src[case]:
        if s == "<SEP>":  # count tokens up to the first separator
            break
        s_len += 1

    plt.figure(0, (7, 28))
    for j in range(4):
        plt.subplot(4, 1, j + 1)
        img = encoder_atten[-1][case, j][:s_len - 1, :s_len - 1]
        plt.imshow(img, vmax=img.max(), vmin=0, cmap="rainbow")
        plt.xticks(range(s_len - 1), src[case][:s_len - 1], rotation=90, fontsize=9)
        plt.yticks(range(s_len - 1), src[case][1:s_len], fontsize=9)
        plt.xlabel("head %i" % (j + 1))
    plt.subplots_adjust(top=0.9)
    plt.tight_layout()
    plt.savefig("./visual/results/" + bert_or_gpt + "%d_self_attention.png" % case, dpi=500)
    # plt.show()


def self_attention_line(bert_or_gpt="bert", case=0):
    # line-style view of the last encoder layer's self-attention, one panel per head
    with open("./visual/tmp/" + bert_or_gpt + "_attention_matrix.pkl", "rb") as f:
        data = pickle.load(f)
    src = data["src"][case]
    attentions = data["attentions"]

    encoder_atten = attentions["encoder"]

    s_len = 0
    print(" ".join(src))
    for s in src:
        if s == "<SEP>":  # count tokens up to the first separator
            break
        s_len += 1
    y_label = src[:s_len][::-1]
    fig, ax = plt.subplots(nrows=2, ncols=2, sharex=True, figsize=(7, 14))

    for i in range(2):
        for j in range(2):
            ax[i, j].set_yticks(np.arange(len(y_label)))
            ax[i, j].tick_params(labelright=True)
            ax[i, j].set_yticklabels(y_label, fontsize=9)  # input

            img = encoder_atten[-1][case, i * 2 + j][:s_len - 1, :s_len - 1]  # head index matches the label below
            color = cm.rainbow(np.linspace(0, 1, img.shape[0]))
            for row, c in zip(range(img.shape[0]), color):
                for col in range(img.shape[1]):
                    alpha = (img[row, col] / img[row].max()) ** 5
                    ax[i, j].plot([0, 1], [img.shape[1] - col, img.shape[0] - row - 1], alpha=alpha, c=c)
            ax[i, j].set_xticks(())
            ax[i, j].set_xlabel("head %i" % (j + 1 + i * 2))
            ax[i, j].set_xlim(0, 1)
    plt.subplots_adjust(top=0.9)
    plt.tight_layout()
    plt.savefig("./visual/results/" + bert_or_gpt + "%d_self_attention_line.png" % case, dpi=100)
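

# The plotting helpers above read pickles from ./visual/tmp/, presumably dumped during
# training (e.g. by transformer.py). A rough, illustrative sketch of the layout they
# expect, inferred only from the keys and indices used above, can be written as a fake
# file to smoke-test the plots; the helper name and all sizes here are made up.
def _demo_dump_fake_transformer_pickle(path="./visual/tmp/transformer_attention_matrix.pkl",
                                       n_case=1, n_layer=3, n_head=4, step=12):
    rng = np.random.RandomState(0)

    def layers():
        return [rng.rand(n_case, n_head, step, step) for _ in range(n_layer)]  # [n, head, step, step] per layer

    data = {
        "src": [["src%d" % t for t in range(step)] for _ in range(n_case)],  # token lists per case
        "tgt": [["tgt%d" % t for t in range(step)] for _ in range(n_case)],
        "attentions": {"encoder": layers(),
                       "decoder": {"mh1": layers(), "mh2": layers()}},
    }
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(data, f)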


if __name__ == "__main__":
    os.makedirs("./visual/results", exist_ok=True)
    # all_mask_kinds()
    # seq2seq_attention()
    # position_embedding()
    transformer_attention_matrix(case=0)
    transformer_attention_line(case=0)

    # model = ["gpt", "bert", "bert_window_mask"][1]
    # case = 6
    # self_attention_matrix(model, case=case)
    # self_attention_line(model, case=case)
--------------------------------------------------------------------------------