├── README.md
├── _config.yml
├── data
│   ├── rt-polarity.neg
│   └── rt-polarity.pos
├── data_helpers.py
└── model.py

/README.md:
--------------------------------------------------------------------------------
# CNN-text-classification-keras

This is a simplified implementation of [Implementing a CNN for Text Classification in TensorFlow](http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/) in Keras, using the functional API.

# Requirements
- [Python 3.5.2](https://www.python.org/)
- [Keras 2.1.2](http://keras.io/)
- [Tensorflow 1.4.1](https://www.tensorflow.org/)

# Training
Run the command below. It trains for 100 epochs by default; to change this, edit [model.py](https://github.com/bhaveshoswal/CNN-text-classification-keras/blob/master/model.py).

`python model.py`

# For new data
You have to rebuild the vocabulary from the new text files and then train.

# For Citation
```
@misc{bhaveshoswal,
  author = {Bhavesh Vinod Oswal},
  title = {CNN-text-classification-keras},
  year = {2016},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/bhaveshoswal/CNN-text-classification-keras}},
}
```
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
theme: jekyll-theme-merlot
--------------------------------------------------------------------------------
/data/rt-polarity.neg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bhaveshoswal/CNN-text-classification-keras/1db39e76894c55d293dd7e1f2eecbb8dcf014dab/data/rt-polarity.neg
--------------------------------------------------------------------------------
/data/rt-polarity.pos:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bhaveshoswal/CNN-text-classification-keras/1db39e76894c55d293dd7e1f2eecbb8dcf014dab/data/rt-polarity.pos
--------------------------------------------------------------------------------
/data_helpers.py:
--------------------------------------------------------------------------------
import numpy as np
import re
import itertools
from collections import Counter


def clean_str(string):
    """
    Tokenization/string cleaning for datasets.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
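
# Illustrative usage of clean_str (added for clarity; not part of the original
# module): punctuation and contractions are split off and the text lowercased.
#   clean_str("It's great, isn't it?") -> "it 's great , is n't it ?"
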
def load_data_and_labels():
    """
    Loads polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    positive_examples = list(open("./data/rt-polarity.pos", "r", encoding='latin-1').readlines())
    positive_examples = [s.strip() for s in positive_examples]
    negative_examples = list(open("./data/rt-polarity.neg", "r", encoding='latin-1').readlines())
    negative_examples = [s.strip() for s in negative_examples]
    # Split by words
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    x_text = [s.split(" ") for s in x_text]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]


def pad_sentences(sentences, padding_word=""):
    """
    Pads all sentences to the same length. The length is defined by the longest sentence.
    Returns padded sentences.
    """
    sequence_length = max(len(x) for x in sentences)
    padded_sentences = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        num_padding = sequence_length - len(sentence)
        new_sentence = sentence + [padding_word] * num_padding
        padded_sentences.append(new_sentence)
    return padded_sentences


def build_vocab(sentences):
    """
    Builds a vocabulary mapping from word to index based on the sentences.
    Returns vocabulary mapping and inverse vocabulary mapping.
    """
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    vocabulary_inv = list(sorted(vocabulary_inv))
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return [vocabulary, vocabulary_inv]


def build_input_data(sentences, labels, vocabulary):
    """
    Maps sentences and labels to vectors based on a vocabulary.
    """
    x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences])
    y = np.array(labels)
    return [x, y]
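
# Illustrative example (hypothetical toy input, added for clarity; not part of
# the original module):
#   vocab, vocab_inv = build_vocab([["good", "film"], ["bad", "film"]])
#   # vocab -> {"bad": 0, "film": 1, "good": 2}
#   x, y = build_input_data([["good", "film"], ["bad", "film"]], [[0, 1], [1, 0]], vocab)
#   # x -> [[2, 1], [0, 1]], y -> [[0, 1], [1, 0]]
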
def load_data():
    """
    Loads and preprocesses data for the dataset.
    Returns input vectors, labels, vocabulary, and inverse vocabulary.
    """
    # Load and preprocess data
    sentences, labels = load_data_and_labels()
    sentences_padded = pad_sentences(sentences)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv]
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.models import Model
from sklearn.model_selection import train_test_split
from data_helpers import load_data

print('Loading data')
x, y, vocabulary, vocabulary_inv = load_data()

# x.shape -> (10662, 56)
# y.shape -> (10662, 2)
# len(vocabulary) -> 18765
# len(vocabulary_inv) -> 18765

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# X_train.shape -> (8529, 56)
# y_train.shape -> (8529, 2)
# X_test.shape -> (2133, 56)
# y_test.shape -> (2133, 2)


sequence_length = x.shape[1]  # 56
vocabulary_size = len(vocabulary_inv)  # 18765
embedding_dim = 256
filter_sizes = [3, 4, 5]
num_filters = 512
drop = 0.5

epochs = 100
batch_size = 30

print("Creating Model...")
# Input layer: a placeholder tensor of word indices for each padded sentence
inputs = Input(shape=(sequence_length,), dtype='int32')
embedding = Embedding(input_dim=vocabulary_size, output_dim=embedding_dim, input_length=sequence_length)(inputs)
# Add a channel dimension so the embedded sentence can be convolved like a 2D image
reshape = Reshape((sequence_length, embedding_dim, 1))(embedding)

# Three parallel convolutions, each spanning the full embedding dimension
conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)

# Max-over-time pooling: keep the strongest activation of each filter
maxpool_0 = MaxPool2D(pool_size=(sequence_length - filter_sizes[0] + 1, 1), strides=(1, 1), padding='valid')(conv_0)
maxpool_1 = MaxPool2D(pool_size=(sequence_length - filter_sizes[1] + 1, 1), strides=(1, 1), padding='valid')(conv_1)
maxpool_2 = MaxPool2D(pool_size=(sequence_length - filter_sizes[2] + 1, 1), strides=(1, 1), padding='valid')(conv_2)

concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(units=2, activation='softmax')(dropout)

# This creates a model that includes the input layer, the three convolutional
# branches and the softmax output
model = Model(inputs=inputs, outputs=output)

checkpoint = ModelCheckpoint('weights.{epoch:03d}-{val_acc:.4f}.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='auto')
adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
print("Training Model...")
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, callbacks=[checkpoint], validation_data=(X_test, y_test))  # starts training
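
# Illustrative follow-up sketch (added for clarity; not part of the original
# script). The ModelCheckpoint above writes files such as
# 'weights.001-0.7500.hdf5' (hypothetical name); one of them can be reloaded
# for evaluation or inference:
#   model.load_weights('weights.001-0.7500.hdf5')
#   loss, acc = model.evaluate(X_test, y_test, batch_size=batch_size)
#   probs = model.predict(X_test[:5])  # softmax scores over [negative, positive]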
--------------------------------------------------------------------------------