├── README.md
├── _config.yml
├── data
│   ├── rt-polarity.neg
│   └── rt-polarity.pos
├── data_helpers.py
└── model.py

/README.md:
--------------------------------------------------------------------------------
# CNN-text-classification-keras

This is a simplified implementation of [Implementing a CNN for Text Classification in TensorFlow](http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/) in Keras, using the functional API.

# Requirements
- [Python 3.5.2](https://www.python.org/)
- [Keras 2.1.2](http://keras.io/)
- [Tensorflow 1.4.1](https://www.tensorflow.org/)

# Training
Run the command below. It trains for 100 epochs by default; to change this, edit [model.py](https://github.com/bhaveshoswal/CNN-text-classification-keras/blob/master/model.py).

`python model.py`

# For new data
You have to rebuild the vocabulary from the new text files and then train.

# For Citation
```
@misc{bhaveshoswal,
  author = {Bhavesh Vinod Oswal},
  title = {CNN-text-classification-keras},
  year = {2016},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/bhaveshoswal/CNN-text-classification-keras}},
}
```
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
theme: jekyll-theme-merlot
--------------------------------------------------------------------------------
/data/rt-polarity.neg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bhaveshoswal/CNN-text-classification-keras/1db39e76894c55d293dd7e1f2eecbb8dcf014dab/data/rt-polarity.neg
--------------------------------------------------------------------------------
/data/rt-polarity.pos:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bhaveshoswal/CNN-text-classification-keras/1db39e76894c55d293dd7e1f2eecbb8dcf014dab/data/rt-polarity.pos
--------------------------------------------------------------------------------
/data_helpers.py:
--------------------------------------------------------------------------------
import numpy as np
import re
import itertools
from collections import Counter


def clean_str(string):
    """
    Tokenization/string cleaning for datasets.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
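
# Illustrative usage of clean_str (added for clarity; not part of the original
# module): punctuation and contractions are split off and the text lowercased.
#   clean_str("It's great, isn't it?") -> "it 's great , is n't it ?"
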
def load_data_and_labels():
    """
    Loads polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    positive_examples = list(open("./data/rt-polarity.pos", "r", encoding='latin-1').readlines())
    positive_examples = [s.strip() for s in positive_examples]
    negative_examples = list(open("./data/rt-polarity.neg", "r", encoding='latin-1').readlines())
    negative_examples = [s.strip() for s in negative_examples]
    # Split by words
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    x_text = [s.split(" ") for s in x_text]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]


def pad_sentences(sentences, padding_word=""):
    """
    Pads all sentences to the same length. The length is defined by the longest sentence.
    Returns padded sentences.
    """
    sequence_length = max(len(x) for x in sentences)
    padded_sentences = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        num_padding = sequence_length - len(sentence)
        new_sentence = sentence + [padding_word] * num_padding
        padded_sentences.append(new_sentence)
    return padded_sentences


def build_vocab(sentences):
    """
    Builds a vocabulary mapping from word to index based on the sentences.
    Returns vocabulary mapping and inverse vocabulary mapping.
    """
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    vocabulary_inv = list(sorted(vocabulary_inv))
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return [vocabulary, vocabulary_inv]


def build_input_data(sentences, labels, vocabulary):
    """
    Maps sentences and labels to vectors based on a vocabulary.
    """
    x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences])
    y = np.array(labels)
    return [x, y]
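
# Illustrative example (hypothetical toy input, added for clarity; not part of
# the original module):
#   vocab, vocab_inv = build_vocab([["good", "film"], ["bad", "film"]])
#   # vocab -> {"bad": 0, "film": 1, "good": 2}
#   x, y = build_input_data([["good", "film"], ["bad", "film"]], [[0, 1], [1, 0]], vocab)
#   # x -> [[2, 1], [0, 1]], y -> [[0, 1], [1, 0]]
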
def load_data():
    """
    Loads and preprocesses data for the dataset.
    Returns input vectors, labels, vocabulary, and inverse vocabulary.
    """
    # Load and preprocess data
    sentences, labels = load_data_and_labels()
    sentences_padded = pad_sentences(sentences)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv]
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.models import Model
from sklearn.model_selection import train_test_split
from data_helpers import load_data

print('Loading data')
x, y, vocabulary, vocabulary_inv = load_data()

# x.shape -> (10662, 56)
# y.shape -> (10662, 2)
# len(vocabulary) -> 18765
# len(vocabulary_inv) -> 18765

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# X_train.shape -> (8529, 56)
# y_train.shape -> (8529, 2)
# X_test.shape -> (2133, 56)
# y_test.shape -> (2133, 2)


sequence_length = x.shape[1]  # 56
vocabulary_size = len(vocabulary_inv)  # 18765
embedding_dim = 256
filter_sizes = [3, 4, 5]
num_filters = 512
drop = 0.5

epochs = 100
batch_size = 30

print("Creating Model...")
# Input layer: a placeholder tensor of word indices for each padded sentence
inputs = Input(shape=(sequence_length,), dtype='int32')
embedding = Embedding(input_dim=vocabulary_size, output_dim=embedding_dim, input_length=sequence_length)(inputs)
# Add a channel dimension so the embedded sentence can be convolved like a 2D image
reshape = Reshape((sequence_length, embedding_dim, 1))(embedding)

# Three parallel convolutions, each spanning the full embedding dimension
conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)

# Max-over-time pooling: keep the strongest activation of each filter
maxpool_0 = MaxPool2D(pool_size=(sequence_length - filter_sizes[0] + 1, 1), strides=(1, 1), padding='valid')(conv_0)
maxpool_1 = MaxPool2D(pool_size=(sequence_length - filter_sizes[1] + 1, 1), strides=(1, 1), padding='valid')(conv_1)
maxpool_2 = MaxPool2D(pool_size=(sequence_length - filter_sizes[2] + 1, 1), strides=(1, 1), padding='valid')(conv_2)

concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(units=2, activation='softmax')(dropout)

# This creates a model that includes the input layer, the three convolutional
# branches and the softmax output
model = Model(inputs=inputs, outputs=output)

checkpoint = ModelCheckpoint('weights.{epoch:03d}-{val_acc:.4f}.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='auto')
adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
print("Training Model...")
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, callbacks=[checkpoint], validation_data=(X_test, y_test))  # starts training
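
# Illustrative follow-up sketch (added for clarity; not part of the original
# script). The ModelCheckpoint above writes files such as
# 'weights.001-0.7500.hdf5' (hypothetical name); one of them can be reloaded
# for evaluation or inference:
#   model.load_weights('weights.001-0.7500.hdf5')
#   loss, acc = model.evaluate(X_test, y_test, batch_size=batch_size)
#   probs = model.predict(X_test[:5])  # softmax scores over [negative, positive]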
--------------------------------------------------------------------------------