├── readme.MD
├── data.txt
└── model.py

/readme.MD:
--------------------------------------------------------------------------------
#### Language Modelling and Text Generation using LSTMs — Deep Learning for NLP
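
`model.py` reads the short nursery rhyme in `data.txt`, turns every line into pre-padded n-gram sequences, trains a small two-layer LSTM to predict the next word, and then generates text from a seed phrase. The snippet below is a minimal, self-contained sketch of that data-preparation step for one illustrative line; the example line, variable names, and printouts are for illustration only and are not part of `model.py`:

```python
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

line = "the cat and her kittens"

tokenizer = Tokenizer()
tokenizer.fit_on_texts([line])

# every prefix of the line becomes one training sequence
token_list = tokenizer.texts_to_sequences([line])[0]
sequences = [token_list[:i + 1] for i in range(1, len(token_list))]

# pre-pad to a common length; the last column is the word to predict
padded = pad_sequences(sequences, maxlen=len(token_list), padding='pre')
predictors, labels = padded[:, :-1], padded[:, -1]
print(predictors)
print(labels)
```

To train on the full rhyme and print a few generated words, run `python model.py` from the repository root.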
--------------------------------------------------------------------------------
/data.txt:
--------------------------------------------------------------------------------
The cat and her kittens
They put on their mittens
To eat a christmas pie
The poor little kittens
They lost their mittens
And then they began to cry.

O mother dear, we sadly fear
We cannot go to-day,
For we have lost our mittens
If it be so, ye shall not go
For ye are naughty kittens
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku
import numpy as np

tokenizer = Tokenizer()

def dataset_preparation(data):

    # basic cleanup
    corpus = data.lower().split("\n")

    # tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    # create input sequences using list of tokens
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)

    # pad sequences
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

    # create predictors and label
    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    label = ku.to_categorical(label, num_classes=total_words)

    return predictors, label, max_sequence_len, total_words

def create_model(predictors, label, max_sequence_len, total_words):

    model = Sequential()
    model.add(Embedding(total_words, 10, input_length=max_sequence_len-1))
    model.add(LSTM(150, return_sequences=True))
    # model.add(Dropout(0.2))
    model.add(LSTM(100))
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # monitor training loss: fit() is called without validation data, so 'val_loss' is not available
    earlystop = EarlyStopping(monitor='loss', min_delta=0, patience=5, verbose=0, mode='auto')
    model.fit(predictors, label, epochs=100, verbose=1, callbacks=[earlystop])
    print(model.summary())
    return model

def generate_text(seed_text, next_words, max_sequence_len):
    # uses the module-level tokenizer and trained model
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # take the most probable next word (predict_classes was removed from recent Keras)
        predicted = np.argmax(model.predict(token_list, verbose=0), axis=-1)[0]

        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text


with open('data.txt') as f:
    data = f.read()

predictors, label, max_sequence_len, total_words = dataset_preparation(data)
model = create_model(predictors, label, max_sequence_len, total_words)
print(generate_text("we naughty", 3, max_sequence_len))
--------------------------------------------------------------------------------