├── .gitignore
├── requirements.txt
├── utils.py
├── example_sentence_level.py
├── example_word_level.py
├── elmo.py
└── README.md
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
__pycache__
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
Keras==2.2.0
numpy==1.13.3
tensorflow==1.7.0
tensorflow_hub==0.1.1
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
from keras.preprocessing import sequence
from keras.datasets import imdb

def load_data(max_sequence_length):
    print('Loading data...')
    (x_train, y_train), (x_test, y_test) = imdb.load_data()
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')

    print('Pad sequences (samples x time)')
    x_train = sequence.pad_sequences(x_train, maxlen=max_sequence_length, padding='post', truncating='post')
    x_test = sequence.pad_sequences(x_test, maxlen=max_sequence_length, padding='post', truncating='post')
    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)
    return (x_train, y_train), (x_test, y_test)

def get_idx2word():
    INDEX_FROM = 3  # word index offset used by imdb.load_data

    word_to_id = imdb.get_word_index()
    word_to_id = {k: (v + INDEX_FROM) for k, v in word_to_id.items()}
    # Reserve the special indices that imdb.load_data prepends. Mapping the
    # otherwise-unused id 3 keeps idx2word contiguous, so that the
    # id -> string lookup table built in elmo.py lines up with the ids.
    word_to_id["<PAD>"] = 0
    word_to_id["<START>"] = 1
    word_to_id["<UNK>"] = 2
    word_to_id["<UNUSED>"] = 3

    idx2word = {value: key for key, value in word_to_id.items()}
    return idx2word
--------------------------------------------------------------------------------
/example_sentence_level.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import utils
from keras.models import Model
from keras.layers import *
from keras.optimizers import Adam
from elmo import ELMoEmbedding

MAX_SEQUENCE_LENGTH = 100

(x_train, y_train), (x_test, y_test) = utils.load_data(max_sequence_length=MAX_SEQUENCE_LENGTH)
idx2word = utils.get_idx2word()

print('Build model...')
sentence_input = Input(shape=(x_train.shape[1],), dtype=tf.int64)
sentence_embedding = ELMoEmbedding(idx2word=idx2word, output_mode="default", trainable=True)(sentence_input) # These two are interchangeable
#sentence_embedding = Embedding(len(idx2word), 1024, input_length=MAX_SEQUENCE_LENGTH, trainable=False)(sentence_input) # These two are interchangeable
dropout = Dropout(0.5)(sentence_embedding)
hidden = Dense(50, activation='relu')(dropout)
output = Dense(1, activation='sigmoid')(hidden)

model = Model(inputs=sentence_input, outputs=output)
model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.001), metrics=['accuracy'])
model.summary()

model.fit(x_train, y_train, batch_size=2, epochs=5, validation_data=(x_test, y_test))
--------------------------------------------------------------------------------
/example_word_level.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import utils
from keras.models import Model
from keras.layers import *
from keras.optimizers import Adam
from elmo import ELMoEmbedding

MAX_SEQUENCE_LENGTH = 100

(x_train, y_train), (x_test, y_test) = utils.load_data(max_sequence_length=MAX_SEQUENCE_LENGTH)
idx2word = utils.get_idx2word()

print('Build model...')
sentence_input = Input(shape=(x_train.shape[1],), dtype=tf.int64)
sentence_embedding = ELMoEmbedding(idx2word=idx2word, output_mode="elmo", trainable=True)(sentence_input) # These two are interchangeable
#sentence_embedding = Embedding(len(idx2word), 1024, input_length=MAX_SEQUENCE_LENGTH, trainable=False)(sentence_input) # These two are interchangeable
convolution = Convolution1D(50, 3, padding='same', activation='relu')(sentence_embedding)
convolution = GlobalMaxPooling1D()(convolution)
dropout = Dropout(0.5)(convolution)
hidden = Dense(50, activation='relu')(dropout)
output = Dense(1, activation='sigmoid')(hidden)

model = Model(inputs=sentence_input, outputs=output)
model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.001), metrics=['accuracy'])
model.summary()

model.fit(x_train, y_train, batch_size=2, epochs=5, validation_data=(x_test, y_test))
--------------------------------------------------------------------------------
/elmo.py:
--------------------------------------------------------------------------------
from keras import backend as K
from keras.engine.topology import Layer
import tensorflow as tf
import tensorflow_hub as hub

class ELMoEmbedding(Layer):

    def __init__(self, idx2word, output_mode="default", trainable=True, **kwargs):
        assert output_mode in ["default", "word_emb", "lstm_outputs1", "lstm_outputs2", "elmo"]
        assert trainable in [True, False]
        super(ELMoEmbedding, self).__init__(**kwargs)
        self.idx2word = idx2word
        self.output_mode = output_mode
        # Set after super().__init__(), which would otherwise reset it to True.
        self.trainable = trainable
        self.max_length = None
        self.word_mapping = None
        self.lookup_table = None
        self.elmo_model = None

    def build(self, input_shape):
        self.max_length = input_shape[1]
        # Build an id -> token mapping ordered by id. int() guards against the
        # ids arriving as strings after the layer has been deserialized.
        self.word_mapping = [x[1] for x in sorted(self.idx2word.items(), key=lambda x: int(x[0]))]
        self.lookup_table = tf.contrib.lookup.index_to_string_table_from_tensor(self.word_mapping, default_value="")
        self.lookup_table.init.run(session=K.get_session())
        self.elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=self.trainable, name="{}_module".format(self.name))
        if self.trainable:
            # Register the hub module's trainable variables with the layer,
            # otherwise Keras never hands them to the optimizer.
            self._trainable_weights += [var for var in tf.trainable_variables()
                                        if var.name.startswith("{}_module/".format(self.name))]
        super(ELMoEmbedding, self).build(input_shape)

    def call(self, x):
        x = tf.cast(x, dtype=tf.int64)
        # Padded positions hold id 0, so the count of non-zero ids is the true length.
        sequence_lengths = tf.cast(tf.count_nonzero(x, axis=1), dtype=tf.int32)
        # Map the token ids back to strings, which is what the hub module expects.
        strings = self.lookup_table.lookup(x)
        inputs = {
            "tokens": strings,
            "sequence_len": sequence_lengths
        }
        return self.elmo_model(inputs, signature="tokens", as_dict=True)[self.output_mode]

    def compute_output_shape(self, input_shape):
        if self.output_mode == "default":
            return (input_shape[0], 1024)
        if self.output_mode == "word_emb":
            return (input_shape[0], self.max_length, 512)
        if self.output_mode == "lstm_outputs1":
            return (input_shape[0], self.max_length, 1024)
        if self.output_mode == "lstm_outputs2":
            return (input_shape[0], self.max_length, 1024)
        if self.output_mode == "elmo":
            return (input_shape[0], self.max_length, 1024)

    def get_config(self):
        config = {
            'idx2word': self.idx2word,
            'output_mode': self.output_mode,
            'trainable': self.trainable
        }
        base_config = super(ELMoEmbedding, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
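
# Serialization note: because get_config returns the layer's constructor
# arguments, a model containing this layer can in principle be reloaded by
# passing the class through custom_objects. A sketch ('model.h5' is just a
# placeholder file name):
#
#     from keras.models import load_model
#     model = load_model('model.h5', custom_objects={'ELMoEmbedding': ELMoEmbedding})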
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Keras ELMo Embedding Layer

This is a Keras layer for ELMo embeddings. It is designed to be completely interchangeable with the built-in Keras `Embedding` layer.

Unfortunately, the layer only works with the TensorFlow backend, since it depends on a TensorFlow Hub module (https://www.tensorflow.org/hub/modules/google/elmo/2).

You can find the original paper on ELMo embeddings here: https://arxiv.org/abs/1802.05365.

I've written a blog post with a high-level overview here: https://jordanhart.co.uk/2018/09/09/elmo-embeddings-layer-in-keras/.

## Requirements

* Keras 2.2.0
* NumPy 1.13.3
* TensorFlow 1.7.0
* TensorFlow Hub 0.1.1

## Usage

To import the layer:

```
from elmo import ELMoEmbedding
```

Including the embedding in your architecture is as simple as replacing an existing embedding with this layer:

```
ELMoEmbedding(idx2word=idx2word, output_mode="default", trainable=True)
```

### Arguments

* `idx2word` - a dictionary where the keys are token ids and the values are the corresponding words.
* `output_mode` - a string, one of `"default"`, `"word_emb"`, `"lstm_outputs1"`, `"lstm_outputs2"`, and `"elmo"`.
* `trainable` - a boolean, whether or not the ELMo aggregation weights are trained along with the rest of the model.

### Input

A 2D integer tensor of token ids with shape `(batch_size, max_sequence_length)`. Id `0` is treated as padding when the layer computes sequence lengths.

### Output

* `"default"` output mode - a 2D tensor with shape `(batch_size, 1024)`: a fixed mean-pooling of all contextualized word representations.
* `"word_emb"` output mode - a 3D tensor with shape `(batch_size, max_sequence_length, 512)`: the character-based word representations.
* `"lstm_outputs1"` output mode - a 3D tensor with shape `(batch_size, max_sequence_length, 1024)`: the first LSTM hidden state.
* `"lstm_outputs2"` output mode - a 3D tensor with shape `(batch_size, max_sequence_length, 1024)`: the second LSTM hidden state.
* `"elmo"` output mode - a 3D tensor with shape `(batch_size, max_sequence_length, 1024)`: the weighted sum of the three layers, where the weights are trainable.

A quick way to check these shapes empirically is sketched at the start of the Examples section below.

## Examples

The following are modified examples taken from the examples directory in the Keras repository (https://github.com/keras-team/keras). The `utils` module contains the preprocessing code for the IMDB sentiment dataset they use. This repository contains all of the code needed to run these examples.
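
As a quick sanity check of the output shapes listed above, the layer can be wrapped in a model on its own and the result of `predict` inspected. The following is a minimal sketch built from the same `utils` helpers (the `[:2]` slice just keeps the batch small):

```
import utils
from keras.models import Model
from keras.layers import Input
from elmo import ELMoEmbedding

(x_train, _), _ = utils.load_data(max_sequence_length=100)
idx2word = utils.get_idx2word()

tokens = Input(shape=(100,), dtype='int64')
embedding = ELMoEmbedding(idx2word=idx2word, output_mode="elmo", trainable=False)(tokens)
shape_check_model = Model(inputs=tokens, outputs=embedding)

print(shape_check_model.predict(x_train[:2]).shape)  # expected: (2, 100, 1024)
```

The same pattern works for the other output modes; only the expected shape changes.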

### Sentiment analysis with sentence-level ELMo embeddings

```
import tensorflow as tf
import utils
from keras.models import Model
from keras.layers import *
from keras.optimizers import Adam
from elmo import ELMoEmbedding

MAX_SEQUENCE_LENGTH = 100

(x_train, y_train), (x_test, y_test) = utils.load_data(max_sequence_length=MAX_SEQUENCE_LENGTH)
idx2word = utils.get_idx2word()

sentence_input = Input(shape=(x_train.shape[1],), dtype=tf.int64)
sentence_embedding = ELMoEmbedding(idx2word=idx2word, output_mode="default", trainable=True)(sentence_input) # These two are interchangeable
#sentence_embedding = Embedding(len(idx2word), 1024, input_length=MAX_SEQUENCE_LENGTH, trainable=False)(sentence_input) # These two are interchangeable
dropout = Dropout(0.5)(sentence_embedding)
hidden = Dense(50, activation='relu')(dropout)
output = Dense(1, activation='sigmoid')(hidden)

model = Model(inputs=sentence_input, outputs=output)
model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.001), metrics=['accuracy'])
model.summary()

model.fit(x_train, y_train, batch_size=2, epochs=5, validation_data=(x_test, y_test))
```

### Sentiment analysis with word-level ELMo embeddings

```
import tensorflow as tf
import utils
from keras.models import Model
from keras.layers import *
from keras.optimizers import Adam
from elmo import ELMoEmbedding

MAX_SEQUENCE_LENGTH = 100

(x_train, y_train), (x_test, y_test) = utils.load_data(max_sequence_length=MAX_SEQUENCE_LENGTH)
idx2word = utils.get_idx2word()

sentence_input = Input(shape=(x_train.shape[1],), dtype=tf.int64)
sentence_embedding = ELMoEmbedding(idx2word=idx2word, output_mode="elmo", trainable=False)(sentence_input) # These two are interchangeable
#sentence_embedding = Embedding(len(idx2word), 1024, input_length=MAX_SEQUENCE_LENGTH, trainable=False)(sentence_input) # These two are interchangeable
convolution = Convolution1D(50, 3, padding='same', activation='relu')(sentence_embedding)
convolution = GlobalMaxPooling1D()(convolution)
dropout = Dropout(0.5)(convolution)
hidden = Dense(50, activation='relu')(dropout)
output = Dense(1, activation='sigmoid')(hidden)

model = Model(inputs=sentence_input, outputs=output)
model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.001), metrics=['accuracy'])
model.summary()

model.fit(x_train, y_train, batch_size=2, epochs=5, validation_data=(x_test, y_test))
```
--------------------------------------------------------------------------------