├── .gitignore
├── requirements.txt
├── utils.py
├── example_sentence_level.py
├── example_word_level.py
├── elmo.py
└── README.md
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
__pycache__
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
Keras==2.2.0
numpy==1.13.3
tensorflow==1.7.0
tensorflow_hub==0.1.1
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
from keras.preprocessing import sequence
from keras.datasets import imdb

def load_data(max_sequence_length):
    print('Loading data...')
    (x_train, y_train), (x_test, y_test) = imdb.load_data()
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')

    print('Pad sequences (samples x time)')
    x_train = sequence.pad_sequences(x_train, maxlen=max_sequence_length, padding='post', truncating='post')
    x_test = sequence.pad_sequences(x_test, maxlen=max_sequence_length, padding='post', truncating='post')
    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)
    return (x_train, y_train), (x_test, y_test)

def get_idx2word():
    INDEX_FROM = 3  # word index offset used by imdb.load_data

    word_to_id = imdb.get_word_index()
    word_to_id = {k: (v + INDEX_FROM) for k, v in word_to_id.items()}
    # Reserve the special indices that imdb.load_data prepends. Mapping the
    # otherwise-unused id 3 keeps idx2word contiguous, so that the
    # id -> string lookup table built in elmo.py lines up with the ids.
    word_to_id["<PAD>"] = 0
    word_to_id["<START>"] = 1
    word_to_id["<UNK>"] = 2
    word_to_id["<UNUSED>"] = 3

    idx2word = {value: key for key, value in word_to_id.items()}
    return idx2word
--------------------------------------------------------------------------------
/example_sentence_level.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import utils
from keras.models import Model
from keras.layers import *
from keras.optimizers import Adam
from elmo import ELMoEmbedding

MAX_SEQUENCE_LENGTH = 100

(x_train, y_train), (x_test, y_test) = utils.load_data(max_sequence_length=MAX_SEQUENCE_LENGTH)
idx2word = utils.get_idx2word()

print('Build model...')
sentence_input = Input(shape=(x_train.shape[1],), dtype=tf.int64)
sentence_embedding = ELMoEmbedding(idx2word=idx2word, output_mode="default", trainable=True)(sentence_input) # These two are interchangeable
#sentence_embedding = Embedding(len(idx2word), 1024, input_length=MAX_SEQUENCE_LENGTH, trainable=False)(sentence_input) # These two are interchangeable
dropout = Dropout(0.5)(sentence_embedding)
hidden = Dense(50, activation='relu')(dropout)
output = Dense(1, activation='sigmoid')(hidden)

model = Model(inputs=sentence_input, outputs=output)
model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.001), metrics=['accuracy'])
model.summary()

model.fit(x_train, y_train, batch_size=2, epochs=5, validation_data=(x_test, y_test))
--------------------------------------------------------------------------------
/example_word_level.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import utils
from keras.models import Model
from keras.layers import *
from keras.optimizers import Adam
from elmo import ELMoEmbedding

MAX_SEQUENCE_LENGTH = 100

(x_train, y_train), (x_test, y_test) = utils.load_data(max_sequence_length=MAX_SEQUENCE_LENGTH)
idx2word = utils.get_idx2word()

print('Build model...')
sentence_input = Input(shape=(x_train.shape[1],), dtype=tf.int64)
sentence_embedding = ELMoEmbedding(idx2word=idx2word, output_mode="elmo", trainable=True)(sentence_input) # These two are interchangeable
#sentence_embedding = Embedding(len(idx2word), 1024, input_length=MAX_SEQUENCE_LENGTH, trainable=False)(sentence_input) # These two are interchangeable
convolution = Convolution1D(50, 3, padding='same', activation='relu')(sentence_embedding)
convolution = GlobalMaxPooling1D()(convolution)
dropout = Dropout(0.5)(convolution)
hidden = Dense(50, activation='relu')(dropout)
output = Dense(1, activation='sigmoid')(hidden)

model = Model(inputs=sentence_input, outputs=output)
model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.001), metrics=['accuracy'])
model.summary()

model.fit(x_train, y_train, batch_size=2, epochs=5, validation_data=(x_test, y_test))
--------------------------------------------------------------------------------
/elmo.py:
--------------------------------------------------------------------------------
from keras import backend as K
from keras.engine.topology import Layer
import tensorflow as tf
import tensorflow_hub as hub

class ELMoEmbedding(Layer):

    def __init__(self, idx2word, output_mode="default", trainable=True, **kwargs):
        assert output_mode in ["default", "word_emb", "lstm_outputs1", "lstm_outputs2", "elmo"]
        assert trainable in [True, False]
        super(ELMoEmbedding, self).__init__(**kwargs)
        self.idx2word = idx2word
        self.output_mode = output_mode
        # Set after super().__init__(), which would otherwise reset it to True.
        self.trainable = trainable
        self.max_length = None
        self.word_mapping = None
        self.lookup_table = None
        self.elmo_model = None

    def build(self, input_shape):
        self.max_length = input_shape[1]
        # Build an id -> token mapping ordered by id. int() guards against the
        # ids arriving as strings after the layer has been deserialized.
        self.word_mapping = [x[1] for x in sorted(self.idx2word.items(), key=lambda x: int(x[0]))]
        self.lookup_table = tf.contrib.lookup.index_to_string_table_from_tensor(self.word_mapping, default_value="")
        self.lookup_table.init.run(session=K.get_session())
        self.elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=self.trainable, name="{}_module".format(self.name))
        if self.trainable:
            # Register the hub module's trainable variables with the layer,
            # otherwise Keras never hands them to the optimizer.
            self._trainable_weights += [var for var in tf.trainable_variables()
                                        if var.name.startswith("{}_module/".format(self.name))]
        super(ELMoEmbedding, self).build(input_shape)

    def call(self, x):
        x = tf.cast(x, dtype=tf.int64)
        # Padded positions hold id 0, so the count of non-zero ids is the true length.
        sequence_lengths = tf.cast(tf.count_nonzero(x, axis=1), dtype=tf.int32)
        # Map the token ids back to strings, which is what the hub module expects.
        strings = self.lookup_table.lookup(x)
        inputs = {
            "tokens": strings,
            "sequence_len": sequence_lengths
        }
        return self.elmo_model(inputs, signature="tokens", as_dict=True)[self.output_mode]

    def compute_output_shape(self, input_shape):
        if self.output_mode == "default":
            return (input_shape[0], 1024)
        if self.output_mode == "word_emb":
            return (input_shape[0], self.max_length, 512)
        if self.output_mode == "lstm_outputs1":
            return (input_shape[0], self.max_length, 1024)
        if self.output_mode == "lstm_outputs2":
            return (input_shape[0], self.max_length, 1024)
        if self.output_mode == "elmo":
            return (input_shape[0], self.max_length, 1024)

    def get_config(self):
        config = {
            'idx2word': self.idx2word,
            'output_mode': self.output_mode,
            'trainable': self.trainable
        }
        base_config = super(ELMoEmbedding, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
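
# Serialization note: because get_config returns the layer's constructor
# arguments, a model containing this layer can in principle be reloaded by
# passing the class through custom_objects. A sketch ('model.h5' is just a
# placeholder file name):
#
#     from keras.models import load_model
#     model = load_model('model.h5', custom_objects={'ELMoEmbedding': ELMoEmbedding})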
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Keras ELMo Embedding Layer

This is a Keras layer for ELMo embeddings. It is designed to be completely interchangeable with the built-in Keras `Embedding` layer.

Unfortunately, the layer only works with the TensorFlow backend, since it depends on a TensorFlow Hub module (https://www.tensorflow.org/hub/modules/google/elmo/2).

You can find the original paper on ELMo embeddings here: https://arxiv.org/abs/1802.05365.

I've written a blog post with a high-level overview here: https://jordanhart.co.uk/2018/09/09/elmo-embeddings-layer-in-keras/.

## Requirements

* Keras 2.2.0
* NumPy 1.13.3
* TensorFlow 1.7.0
* TensorFlow Hub 0.1.1

## Usage

To import the layer:

```
from elmo import ELMoEmbedding
```

Including the embedding in your architecture is as simple as replacing an existing embedding with this layer:

```
ELMoEmbedding(idx2word=idx2word, output_mode="default", trainable=True)
```

### Arguments

* `idx2word` - a dictionary where the keys are token ids and the values are the corresponding words.
* `output_mode` - a string, one of `"default"`, `"word_emb"`, `"lstm_outputs1"`, `"lstm_outputs2"`, and `"elmo"`.
* `trainable` - a boolean, whether or not the ELMo aggregation weights are trained along with the rest of the model.

### Input

A 2D integer tensor of token ids with shape `(batch_size, max_sequence_length)`. Id `0` is treated as padding when the layer computes sequence lengths.

### Output

* `"default"` output mode - a 2D tensor with shape `(batch_size, 1024)`: a fixed mean-pooling of all contextualized word representations.
* `"word_emb"` output mode - a 3D tensor with shape `(batch_size, max_sequence_length, 512)`: the character-based word representations.
* `"lstm_outputs1"` output mode - a 3D tensor with shape `(batch_size, max_sequence_length, 1024)`: the first LSTM hidden state.
* `"lstm_outputs2"` output mode - a 3D tensor with shape `(batch_size, max_sequence_length, 1024)`: the second LSTM hidden state.
* `"elmo"` output mode - a 3D tensor with shape `(batch_size, max_sequence_length, 1024)`: the weighted sum of the three layers, where the weights are trainable.

A quick way to check these shapes empirically is sketched at the start of the Examples section below.

## Examples

The following are modified examples taken from the examples directory in the Keras repository (https://github.com/keras-team/keras). The `utils` module contains the preprocessing code for the IMDB sentiment dataset they use. This repository contains all of the code needed to run these examples.
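
As a quick sanity check of the output shapes listed above, the layer can be wrapped in a model on its own and the result of `predict` inspected. The following is a minimal sketch built from the same `utils` helpers (the `[:2]` slice just keeps the batch small):

```
import utils
from keras.models import Model
from keras.layers import Input
from elmo import ELMoEmbedding

(x_train, _), _ = utils.load_data(max_sequence_length=100)
idx2word = utils.get_idx2word()

tokens = Input(shape=(100,), dtype='int64')
embedding = ELMoEmbedding(idx2word=idx2word, output_mode="elmo", trainable=False)(tokens)
shape_check_model = Model(inputs=tokens, outputs=embedding)

print(shape_check_model.predict(x_train[:2]).shape)  # expected: (2, 100, 1024)
```

The same pattern works for the other output modes; only the expected shape changes.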

### Sentiment analysis with sentence-level ELMo embeddings

```
import tensorflow as tf
import utils
from keras.models import Model
from keras.layers import *
from keras.optimizers import Adam
from elmo import ELMoEmbedding

MAX_SEQUENCE_LENGTH = 100

(x_train, y_train), (x_test, y_test) = utils.load_data(max_sequence_length=MAX_SEQUENCE_LENGTH)
idx2word = utils.get_idx2word()

sentence_input = Input(shape=(x_train.shape[1],), dtype=tf.int64)
sentence_embedding = ELMoEmbedding(idx2word=idx2word, output_mode="default", trainable=True)(sentence_input) # These two are interchangeable
#sentence_embedding = Embedding(len(idx2word), 1024, input_length=MAX_SEQUENCE_LENGTH, trainable=False)(sentence_input) # These two are interchangeable
dropout = Dropout(0.5)(sentence_embedding)
hidden = Dense(50, activation='relu')(dropout)
output = Dense(1, activation='sigmoid')(hidden)

model = Model(inputs=sentence_input, outputs=output)
model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.001), metrics=['accuracy'])
model.summary()

model.fit(x_train, y_train, batch_size=2, epochs=5, validation_data=(x_test, y_test))
```

### Sentiment analysis with word-level ELMo embeddings

```
import tensorflow as tf
import utils
from keras.models import Model
from keras.layers import *
from keras.optimizers import Adam
from elmo import ELMoEmbedding

MAX_SEQUENCE_LENGTH = 100

(x_train, y_train), (x_test, y_test) = utils.load_data(max_sequence_length=MAX_SEQUENCE_LENGTH)
idx2word = utils.get_idx2word()

sentence_input = Input(shape=(x_train.shape[1],), dtype=tf.int64)
sentence_embedding = ELMoEmbedding(idx2word=idx2word, output_mode="elmo", trainable=False)(sentence_input) # These two are interchangeable
#sentence_embedding = Embedding(len(idx2word), 1024, input_length=MAX_SEQUENCE_LENGTH, trainable=False)(sentence_input) # These two are interchangeable
convolution = Convolution1D(50, 3, padding='same', activation='relu')(sentence_embedding)
convolution = GlobalMaxPooling1D()(convolution)
dropout = Dropout(0.5)(convolution)
hidden = Dense(50, activation='relu')(dropout)
output = Dense(1, activation='sigmoid')(hidden)

model = Model(inputs=sentence_input, outputs=output)
model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.001), metrics=['accuracy'])
model.summary()

model.fit(x_train, y_train, batch_size=2, epochs=5, validation_data=(x_test, y_test))
```
--------------------------------------------------------------------------------