├── LICENSE ├── README.md ├── data ├── small_samples.csv └── train.csv.zip ├── data_helper.py ├── predict.py ├── predicted_results_1516404693 └── predictions_all.csv ├── text_cnn_rnn.py ├── train.py ├── trained_results_1516404693 ├── best_model.ckpt.data-00000-of-00001 ├── best_model.ckpt.index ├── best_model.ckpt.meta ├── checkpoint ├── embeddings.pickle ├── labels.json ├── trained_parameters.json └── words_index.json └── training_config.json /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Project: Classify Kaggle San Francisco Crime Description 2 | 3 | ### Highlights: 4 | - This is a **multi-class text classification (sentence classification)** problem. 5 | - The goal of this project is to **classify Kaggle San Francisco Crime Description into 39 classes**. 6 | - This model was built with **CNN, RNN (LSTM and GRU) and Word Embeddings** on **Tensorflow**. 
# ---------------------------------------------------------------------------
# /data_helper.py
# ---------------------------------------------------------------------------
import os
import re
import sys
import json
import pickle
import logging
import itertools
from pprint import pprint
from collections import Counter

import numpy as np
import pandas as pd

# Neither package is used by the helpers in this module. The imports are kept
# so existing `data_helper.gs` / `data_helper.learn` attribute access keeps
# working, but they are optional so the helpers can be imported without them.
try:
    import gensim as gs
except ImportError:
    gs = None
try:
    from tensorflow.contrib import learn
except ImportError:
    learn = None

logging.getLogger().setLevel(logging.INFO)

# Ordered (pattern, replacement) rewrite rules applied by clean_str().
# Order matters: the character whitelist runs first and whitespace collapsing
# runs last.  NOTE: the replacements for "(", ")" and "?" deliberately keep the
# backslash that the original code emitted, so the produced tokens stay
# identical to the ones stored in previously trained vocabularies.
_CLEAN_RULES = (
    (re.compile(r"[^A-Za-z0-9:(),!?\'\`]"), " "),
    (re.compile(r" : "), ":"),
    (re.compile(r"\'s"), " 's"),
    (re.compile(r"\'ve"), " 've"),
    (re.compile(r"n\'t"), " n't"),
    (re.compile(r"\'re"), " 're"),
    (re.compile(r"\'d"), " 'd"),
    (re.compile(r"\'ll"), " 'll"),
    (re.compile(r","), " , "),
    (re.compile(r"!"), " ! "),
    (re.compile(r"\("), r" \( "),
    (re.compile(r"\)"), r" \) "),
    (re.compile(r"\?"), r" \? "),
    (re.compile(r"\s{2,}"), " "),
)


def clean_str(s):
    """Normalize a raw text string into a space-separated, lower-cased form.

    Characters outside ``A-Za-z0-9:(),!?'`` and the backtick are replaced by
    spaces, common English contractions and punctuation are split off as
    separate tokens, runs of whitespace are collapsed, and the result is
    stripped and lower-cased.

    Args:
        s: The text to clean.

    Returns:
        The cleaned string; callers tokenize it with ``split(' ')``.
    """
    for pattern, replacement in _CLEAN_RULES:
        s = pattern.sub(replacement, s)
    return s.strip().lower()


def load_embeddings(vocabulary, embedding_dim=300):
    """Create a random embedding vector for every word in *vocabulary*.

    Args:
        vocabulary: Iterable of words (iterating a dict yields its keys).
        embedding_dim: Length of each vector. Defaults to 300, the value that
            was previously hard-coded.

    Returns:
        A dict mapping each word to a 1-D numpy array of ``embedding_dim``
        values drawn uniformly from [-0.25, 0.25).
    """
    return {
        word: np.random.uniform(-0.25, 0.25, embedding_dim)
        for word in vocabulary
    }


def pad_sentences(sentences, padding_word="", forced_sequence_length=None):
    """Pad (or truncate) tokenized sentences to a common length.

    Args:
        sentences: List of token lists.
        padding_word: Token appended to sentences that are too short.
        forced_sequence_length: When ``None`` (training) the length of the
            longest sentence is used.  Otherwise (prediction) every sentence
            is padded or cut to exactly this length.

    Returns:
        A new list of token lists, all of the same length.  The input lists
        are not modified.  An empty input yields an empty list.
    """
    if forced_sequence_length is None:  # Train
        # default=0 keeps an empty corpus from raising inside max().
        sequence_length = max((len(sentence) for sentence in sentences), default=0)
    else:  # Prediction
        logging.critical('This is prediction, reading the trained sequence length')
        sequence_length = forced_sequence_length
    logging.critical('The maximum length is {}'.format(sequence_length))

    padded_sentences = []
    for sentence in sentences:
        num_padding = sequence_length - len(sentence)
        if num_padding < 0:
            # Prediction only: the sentence is longer than what the model was
            # trained on, so the tail is dropped.
            logging.info('This sentence has to be cut off because it is longer than trained sequence length')
            padded_sentences.append(sentence[0:sequence_length])
        else:
            padded_sentences.append(sentence + [padding_word] * num_padding)
    return padded_sentences


def build_vocab(sentences):
    """Build word <-> index mappings ordered by descending word frequency.

    Args:
        sentences: Iterable of token lists.

    Returns:
        A ``(vocabulary, vocabulary_inv)`` tuple where ``vocabulary_inv`` is
        the list of distinct words, most frequent first, and ``vocabulary``
        maps each word to its position in that list.
    """
    word_counts = Counter(itertools.chain.from_iterable(sentences))
    vocabulary_inv = [word for word, _count in word_counts.most_common()]
    vocabulary = {word: index for index, word in enumerate(vocabulary_inv)}
    return vocabulary, vocabulary_inv


def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """Yield consecutive mini-batches of *data* for a number of epochs.

    Args:
        data: Sequence of examples; it is converted with ``np.array``.
        batch_size: Maximum number of examples per batch.
        num_epochs: How many passes over the data to make.
        shuffle: When true, the examples are re-permuted at the start of
            every epoch.

    Yields:
        Numpy arrays holding up to ``batch_size`` examples.  The last batch
        of an epoch may be smaller, but it is never empty.
    """
    data = np.array(data)
    data_size = len(data)
    # Ceiling division.  The previous `int(data_size / batch_size) + 1`
    # produced an extra, empty batch whenever data_size was an exact multiple
    # of batch_size, which broke callers that unpack each batch.
    num_batches_per_epoch = (data_size + batch_size - 1) // batch_size

    for _epoch in range(num_epochs):
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            epoch_data = data[shuffle_indices]
        else:
            epoch_data = data

        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min(start_index + batch_size, data_size)
            yield epoch_data[start_index:end_index]


def load_data(filename):
    """Load the zipped training CSV and turn it into model-ready arrays.

    Only the ``Category`` (label) and ``Descript`` (text) columns are kept;
    rows missing either are dropped and the remaining rows are shuffled.

    Args:
        filename: Path to a zip-compressed CSV file.

    Returns:
        A tuple ``(x, y, vocabulary, vocabulary_inv, df, labels)``:
        ``x`` holds the padded sentences as word indices, ``y`` the one-hot
        encoded labels, ``vocabulary`` / ``vocabulary_inv`` are the mappings
        from :func:`build_vocab`, ``df`` is the filtered, shuffled data frame
        and ``labels`` is the sorted list of distinct categories.
    """
    df = pd.read_csv(filename, compression='zip')
    selected = ['Category', 'Descript']
    non_selected = list(set(df.columns) - set(selected))

    df = df.drop(non_selected, axis=1)
    df = df.dropna(axis=0, how='any', subset=selected)
    df = df.reindex(np.random.permutation(df.index))

    labels = sorted(set(df[selected[0]].tolist()))
    # Row i of the identity matrix is the one-hot vector of labels[i].
    one_hot = np.eye(len(labels), dtype=int)
    label_dict = dict(zip(labels, one_hot))

    x_raw = df[selected[1]].apply(lambda text: clean_str(text).split(' ')).tolist()
    y_raw = df[selected[0]].apply(lambda label: label_dict[label]).tolist()

    x_raw = pad_sentences(x_raw)
    vocabulary, vocabulary_inv = build_vocab(x_raw)

    x = np.array([[vocabulary[word] for word in sentence] for sentence in x_raw])
    y = np.array(y_raw)
    return x, y, vocabulary, vocabulary_inv, df, labels


if __name__ == "__main__":
    train_file = './data/train.csv.zip'
    load_data(train_file)
# ---------------------------------------------------------------------------
# /predict.py
# ---------------------------------------------------------------------------
import os
import sys
import json
import shutil
import pickle
import logging

import numpy as np
import pandas as pd

# tensorflow, data_helper and text_cnn_rnn are imported inside the functions
# that need them, so the pure helpers below can be imported without them.

logging.getLogger().setLevel(logging.INFO)


def compute_real_len(batches, max_pool_size):
    """Return the post-pooling length of every sentence in *batches*.

    For each row of word indices the position of the first 0 is taken as the
    sentence length and divided by ``max_pool_size``, rounded up.
    NOTE(review): this assumes index 0 marks padding, as the callers do --
    confirm that the padding token always receives index 0.

    A sentinel 0 is appended before searching, so a row without any 0 reports
    its full length.  The previous `batch + [0]` did not append on numpy rows
    (it broadcast-added zero) and returned the position of the smallest word
    index instead.

    Args:
        batches: Iterable of 1-D sequences of word indices.
        max_pool_size: Pooling window/stride used by the model.

    Returns:
        A list with one numpy float per row.
    """
    lengths = []
    for batch in batches:
        first_zero = int(np.argmin(np.append(np.asarray(batch), 0)))
        lengths.append(np.ceil(first_zero * 1.0 / max_pool_size))
    return lengths


def load_trained_params(trained_dir):
    """Load everything train.py stored in *trained_dir*.

    Args:
        trained_dir: Directory containing ``trained_parameters.json``,
            ``words_index.json``, ``labels.json`` and ``embeddings.pickle``.

    Returns:
        ``(params, words_index, labels, embedding_mat)`` where
        ``embedding_mat`` is a float32 numpy array.
    """
    def read_json(name):
        # Context manager so the file handle is closed deterministically.
        with open(os.path.join(trained_dir, name)) as json_file:
            return json.load(json_file)

    params = read_json('trained_parameters.json')
    words_index = read_json('words_index.json')
    labels = read_json('labels.json')

    # SECURITY: pickle executes arbitrary code on load; only point this at
    # directories produced by a trusted training run.
    with open(os.path.join(trained_dir, 'embeddings.pickle'), 'rb') as input_file:
        fetched_embedding = pickle.load(input_file)
    embedding_mat = np.array(fetched_embedding, dtype=np.float32)
    return params, words_index, labels, embedding_mat


def load_test_data(test_file, labels):
    """Read a '|'-separated test file and tokenize its ``Descript`` column.

    Args:
        test_file: Path to the CSV file (separator ``|``).
        labels: List of known category names, used for one-hot encoding.

    Returns:
        ``(test_examples, y_, df)``: the tokenized sentences, the one-hot
        labels (``None`` when the file has no ``Category`` column) and the
        data frame reduced to the columns that were used.

    Raises:
        KeyError: If a ``Category`` value is not contained in *labels*.
    """
    import data_helper

    df = pd.read_csv(test_file, sep='|')
    select = ['Descript']

    df = df.dropna(axis=0, how='any', subset=select)
    test_examples = df[select[0]].apply(
        lambda text: data_helper.clean_str(text).split(' ')).tolist()

    # Row i of the identity matrix is the one-hot vector of labels[i].
    one_hot = np.eye(len(labels), dtype=int)
    label_dict = dict(zip(labels, one_hot))

    y_ = None
    if 'Category' in df.columns:
        select.append('Category')
        y_ = df[select[1]].apply(lambda label: label_dict[label]).tolist()

    not_select = list(set(df.columns) - set(select))
    df = df.drop(not_select, axis=1)
    return test_examples, y_, df


def map_word_to_index(examples, words_index):
    """Replace every word by its index; unknown words become 0.

    Args:
        examples: List of token lists.
        words_index: Dict mapping word -> integer index.

    Returns:
        A list of lists of integer indices, same shape as *examples*.
    """
    return [[words_index.get(word, 0) for word in example] for example in examples]


def predict_unseen_data():
    """Predict categories for a data file using a previously trained model.

    Reads ``sys.argv[1]`` (trained results directory) and ``sys.argv[2]``
    (test file), writes ``predictions_all.csv`` into a fresh
    ``./predicted_results_<timestamp>/`` directory and, when the test file
    contains a ``Category`` column, logs the prediction accuracy.
    """
    import tensorflow as tf
    import data_helper
    from text_cnn_rnn import TextCNNRNN

    trained_dir = sys.argv[1]
    if not trained_dir.endswith('/'):
        trained_dir += '/'
    test_file = sys.argv[2]

    params, words_index, labels, embedding_mat = load_trained_params(trained_dir)
    x_, y_, df = load_test_data(test_file, labels)
    x_ = data_helper.pad_sentences(x_, forced_sequence_length=params['sequence_length'])
    x_ = map_word_to_index(x_, words_index)

    x_test = np.asarray(x_)
    y_test = np.asarray(y_) if y_ is not None else None

    # The output directory reuses the timestamp suffix of the trained directory.
    timestamp = trained_dir.split('/')[-2].split('_')[-1]
    predicted_dir = './predicted_results_' + timestamp + '/'
    if os.path.exists(predicted_dir):
        shutil.rmtree(predicted_dir)
    os.makedirs(predicted_dir)

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(
                embedding_mat=embedding_mat,
                non_static=params['non_static'],
                hidden_unit=params['hidden_unit'],
                sequence_length=len(x_test[0]),
                max_pool_size=params['max_pool_size'],
                # A list rather than a one-shot map() iterator.
                filter_sizes=[int(size) for size in params['filter_sizes'].split(",")],
                num_filters=params['num_filters'],
                num_classes=len(labels),
                embedding_size=params['embedding_dim'],
                l2_reg_lambda=params['l2_reg_lambda'])

            def predict_step(x_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.dropout_keep_prob: 1.0,  # no dropout at inference
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: compute_real_len(x_batch, params['max_pool_size']),
                }
                return sess.run([cnn_rnn.predictions], feed_dict)

            checkpoint_file = trained_dir + 'best_model.ckpt'
            # NOTE(review): the first Saver is immediately replaced by the one
            # returned from import_meta_graph.  The sequence is left untouched
            # because removing it could change op naming in the graph and how
            # the checkpoint is mapped -- verify before simplifying.
            saver = tf.train.Saver(tf.all_variables())
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)
            logging.critical('{} has been loaded'.format(checkpoint_file))

            batches = data_helper.batch_iter(list(x_test), params['batch_size'], 1, shuffle=False)

            predictions, predict_labels = [], []
            for x_batch in batches:
                batch_predictions = predict_step(x_batch)[0]
                for batch_prediction in batch_predictions:
                    predictions.append(batch_prediction)
                    predict_labels.append(labels[batch_prediction])

            # Save the predictions back to file
            df['NEW_PREDICTED'] = predict_labels
            columns = sorted(df.columns, reverse=True)
            df.to_csv(predicted_dir + 'predictions_all.csv', index=False, columns=columns, sep='|')

            if y_test is not None:
                y_test = np.array(np.argmax(y_test, axis=1))
                accuracy = sum(np.array(predictions) == y_test) / float(len(y_test))
                logging.critical('The prediction accuracy is: {}'.format(accuracy))

            logging.critical('Prediction is complete, all files have been saved: {}'.format(predicted_dir))


if __name__ == '__main__':
    # python3 predict.py ./trained_results_1478563595/ ./data/small_samples.csv
    predict_unseen_data()


# ---------------------------------------------------------------------------
# /text_cnn_rnn.py
# ---------------------------------------------------------------------------
import numpy as np


class TextCNNRNN(object):
    """CNN + GRU text classifier graph (TensorFlow 1.x API).

    Word indices are embedded, passed through one convolution + max-pool
    branch per filter size, the pooled feature maps are concatenated and fed
    step by step into a GRU, and the GRU output at each sentence's last real
    step is projected onto the classes.

    Placeholders exposed as attributes: ``input_x``, ``input_y``,
    ``dropout_keep_prob``, ``batch_size``, ``pad`` and ``real_len``.
    Result tensors exposed as attributes: ``scores``, ``predictions``,
    ``loss``, ``accuracy`` and ``num_correct``.
    """

    def __init__(self, embedding_mat, non_static, hidden_unit, sequence_length, max_pool_size,
                 num_classes, embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0):
        """Build the graph in the current default TensorFlow graph.

        Args:
            embedding_mat: Initial embedding matrix (one row per word index).
            non_static: If true the embeddings are a trainable variable,
                otherwise a constant.
            hidden_unit: Number of GRU units.
            sequence_length: Length of every (padded) input sentence.
            max_pool_size: Window and stride of the max-pooling over time.
            num_classes: Number of output classes.
            embedding_size: Width of one embedding vector.
            filter_sizes: Iterable of convolution filter heights.
            num_filters: Number of filters per filter size.
            l2_reg_lambda: Weight of the L2 penalty on the output layer.
        """
        # Imported lazily so the module can be imported without TensorFlow.
        import tensorflow as tf

        # Materialize once: callers may pass a one-shot iterator such as map().
        filter_sizes = list(filter_sizes)

        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name='input_y')
        self.dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')
        self.batch_size = tf.placeholder(tf.int32, [])
        self.pad = tf.placeholder(tf.float32, [None, 1, embedding_size, 1], name='pad')
        self.real_len = tf.placeholder(tf.int32, [None], name='real_len')

        l2_loss = tf.constant(0.0)

        with tf.device('/cpu:0'), tf.name_scope('embedding'):
            if not non_static:
                embedding_W = tf.constant(embedding_mat, name='W')
            else:
                embedding_W = tf.Variable(embedding_mat, name='W')
            self.embedded_chars = tf.nn.embedding_lookup(embedding_W, self.input_x)
            emb = tf.expand_dims(self.embedded_chars, -1)

        pooled_outputs = []
        # Number of time steps left after max-pooling with stride max_pool_size.
        reduced = np.int32(np.ceil((sequence_length) * 1.0 / max_pool_size))

        for filter_size in filter_sizes:
            with tf.name_scope('conv-maxpool-%s' % filter_size):
                # Zero paddings so that the convolution output have dimension
                # batch x sequence_length x emb_size x channel
                num_prio = (filter_size - 1) // 2
                num_post = (filter_size - 1) - num_prio
                pad_prio = tf.concat([self.pad] * num_prio, 1)
                pad_post = tf.concat([self.pad] * num_post, 1)
                emb_pad = tf.concat([pad_prio, emb, pad_post], 1)

                filter_shape = [filter_size, embedding_size, 1, num_filters]
                conv_W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name='W')
                conv_b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name='b')
                conv = tf.nn.conv2d(emb_pad, conv_W, strides=[1, 1, 1, 1], padding='VALID', name='conv')

                h = tf.nn.relu(tf.nn.bias_add(conv, conv_b), name='relu')

                # Maxpooling over the outputs
                pooled = tf.nn.max_pool(h, ksize=[1, max_pool_size, 1, 1],
                                        strides=[1, max_pool_size, 1, 1], padding='SAME', name='pool')
                pooled = tf.reshape(pooled, [-1, reduced, num_filters])
                pooled_outputs.append(pooled)

        pooled_concat = tf.concat(pooled_outputs, 2)
        pooled_concat = tf.nn.dropout(pooled_concat, self.dropout_keep_prob)

        rnn_cell = tf.contrib.rnn.GRUCell(num_units=hidden_unit)
        rnn_cell = tf.contrib.rnn.DropoutWrapper(rnn_cell, output_keep_prob=self.dropout_keep_prob)

        self._initial_state = rnn_cell.zero_state(self.batch_size, tf.float32)
        inputs = [tf.squeeze(input_, [1])
                  for input_ in tf.split(pooled_concat, num_or_size_splits=int(reduced), axis=1)]
        outputs, state = tf.contrib.rnn.static_rnn(
            rnn_cell, inputs, initial_state=self._initial_state, sequence_length=self.real_len)

        # Collect the appropriate last words into variable output
        # (dimension = batch x embedding_size)
        output = outputs[0]
        with tf.variable_scope('Output'):
            tf.get_variable_scope().reuse_variables()
            one = tf.ones([1, hidden_unit], tf.float32)
            for i in range(1, len(outputs)):
                # mat is 1 for rows whose real length ends before step i+1
                # (keep the value collected so far) and 0 otherwise (take
                # this step's output).
                ind = self.real_len < (i + 1)
                ind = tf.to_float(ind)
                ind = tf.expand_dims(ind, -1)
                mat = tf.matmul(ind, one)
                output = tf.add(tf.multiply(output, mat), tf.multiply(outputs[i], 1.0 - mat))

        with tf.name_scope('output'):
            self.W = tf.Variable(tf.truncated_normal([hidden_unit, num_classes], stddev=0.1), name='W')
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name='b')
            # Regularize the output-layer weights.  The previous code passed
            # the loop-local `W`, i.e. the last convolution filter.
            l2_loss += tf.nn.l2_loss(self.W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(output, self.W, b, name='scores')
            self.predictions = tf.argmax(self.scores, 1, name='predictions')

        with tf.name_scope('loss'):
            # only named arguments accepted
            losses = tf.nn.softmax_cross_entropy_with_logits(labels=self.input_y, logits=self.scores)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        with tf.name_scope('accuracy'):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name='accuracy')

        with tf.name_scope('num_correct'):
            correct = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.num_correct = tf.reduce_sum(tf.cast(correct, 'float'))


# ---------------------------------------------------------------------------
# /train.py
# ---------------------------------------------------------------------------
import os
import sys
import json
import time
import shutil
import pickle
import logging

import numpy as np
import pandas as pd

# tensorflow, scikit-learn, data_helper and text_cnn_rnn are imported inside
# train_cnn_rnn(), so this module can be imported without them.

logging.getLogger().setLevel(logging.INFO)


def compute_real_len(batches, max_pool_size):
    """Return the post-pooling length of every sentence in *batches*.

    Same helper as in predict.py, repeated here so each script stays
    self-contained.  The position of the first 0 in a row (or the row length
    when it contains no 0) is divided by ``max_pool_size`` and rounded up.
    NOTE(review): assumes index 0 marks padding -- confirm.

    Args:
        batches: Iterable of 1-D sequences of word indices.
        max_pool_size: Pooling window/stride used by the model.

    Returns:
        A list with one numpy float per row.
    """
    lengths = []
    for batch in batches:
        # Appending a sentinel 0 makes a row without padding report its full
        # length; `batch + [0]` on a numpy row was a no-op broadcast add.
        first_zero = int(np.argmin(np.append(np.asarray(batch), 0)))
        lengths.append(np.ceil(first_zero * 1.0 / max_pool_size))
    return lengths


def train_cnn_rnn():
    """Train the CNN-RNN classifier and store everything predict.py needs.

    Reads ``sys.argv[1]`` (zipped training CSV) and ``sys.argv[2]`` (JSON
    training configuration).  The data is split 90/10 into train+dev/test and
    the first part again 90/10 into train/dev.  Checkpoints go to
    ``./checkpoints_<timestamp>/``; the best model together with the
    vocabulary, embeddings, labels and parameters goes to
    ``./trained_results_<timestamp>/``.
    """
    import tensorflow as tf
    from sklearn.model_selection import train_test_split
    import data_helper
    from text_cnn_rnn import TextCNNRNN

    input_file = sys.argv[1]
    x_, y_, vocabulary, vocabulary_inv, df, labels = data_helper.load_data(input_file)

    training_config = sys.argv[2]
    with open(training_config) as config_file:
        params = json.load(config_file)

    # Assign a 300 dimension vector to each word
    word_embeddings = data_helper.load_embeddings(vocabulary)
    embedding_mat = np.array([word_embeddings[word] for word in vocabulary_inv], dtype=np.float32)

    # Split the original dataset into train set and test set
    x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1)

    # Split the train set into train set and dev set
    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1)

    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(len(y_train), len(y_dev), len(y_test)))

    # Create a directory, everything related to the training will be saved in this directory
    timestamp = str(int(time.time()))
    trained_dir = './trained_results_' + timestamp + '/'
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(
                embedding_mat=embedding_mat,
                sequence_length=x_train.shape[1],
                num_classes=y_train.shape[1],
                non_static=params['non_static'],
                hidden_unit=params['hidden_unit'],
                max_pool_size=params['max_pool_size'],
                # A list rather than a one-shot map() iterator.
                filter_sizes=[int(size) for size in params['filter_sizes'].split(",")],
                num_filters=params['num_filters'],
                embedding_size=params['embedding_dim'],
                l2_reg_lambda=params['l2_reg_lambda'])

            global_step = tf.Variable(0, name='global_step', trainable=False)
            optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # Checkpoint files will be saved in this directory during training
            checkpoint_dir = './checkpoints_' + timestamp + '/'
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def build_feed(x_batch, y_batch, keep_prob):
                # Shared feed construction for the training and evaluation steps.
                return {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: keep_prob,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: compute_real_len(x_batch, params['max_pool_size']),
                }

            def train_step(x_batch, y_batch):
                feed_dict = build_feed(x_batch, y_batch, params['dropout_keep_prob'])
                sess.run([train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy], feed_dict)

            def dev_step(x_batch, y_batch):
                feed_dict = build_feed(x_batch, y_batch, 1.0)  # no dropout when evaluating
                step, loss, accuracy, num_correct, predictions = sess.run(
                    [global_step, cnn_rnn.loss, cnn_rnn.accuracy, cnn_rnn.num_correct, cnn_rnn.predictions],
                    feed_dict)
                return accuracy, loss, num_correct, predictions

            saver = tf.train.Saver()
            sess.run(tf.global_variables_initializer())

            # Training starts here
            train_batches = data_helper.batch_iter(
                list(zip(x_train, y_train)), params['batch_size'], params['num_epochs'])
            best_accuracy, best_at_step = 0, 0

            # Train the model with x_train and y_train
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)

                # Evaluate the model with x_dev and y_dev
                if current_step % params['evaluate_every'] == 0:
                    dev_batches = data_helper.batch_iter(list(zip(x_dev, y_dev)), params['batch_size'], 1)

                    total_dev_correct = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        acc, loss, num_dev_correct, predictions = dev_step(x_dev_batch, y_dev_batch)
                        total_dev_correct += num_dev_correct
                    accuracy = float(total_dev_correct) / len(y_dev)
                    logging.info('Accuracy on dev set: {}'.format(accuracy))

                    if accuracy >= best_accuracy:
                        best_accuracy, best_at_step = accuracy, current_step
                        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        logging.critical('Saved model {} at step {}'.format(path, best_at_step))
                        logging.critical('Best accuracy {} at step {}'.format(best_accuracy, best_at_step))
            logging.critical('Training is complete, testing the best model on x_test and y_test')

            # Restore the best checkpoint BEFORE exporting, so that
            # best_model.ckpt really holds the best weights.  Previously the
            # export happened first and stored the final training step.
            # best_at_step stays 0 when no dev evaluation ever ran; then there
            # is no checkpoint to restore and the final weights are exported.
            if best_at_step:
                saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))

            # Save the model files to trained_dir. predict.py needs trained model files.
            saver.save(sess, trained_dir + "best_model.ckpt")

            # Evaluate x_test and y_test
            test_batches = data_helper.batch_iter(
                list(zip(x_test, y_test)), params['batch_size'], 1, shuffle=False)
            total_test_correct = 0
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                acc, loss, num_test_correct, predictions = dev_step(x_test_batch, y_test_batch)
                total_test_correct += int(num_test_correct)
            logging.critical('Accuracy on test set: {}'.format(float(total_test_correct) / len(y_test)))

    # Save trained parameters and files since predict.py needs them
    with open(trained_dir + 'words_index.json', 'w') as outfile:
        json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
    with open(trained_dir + 'embeddings.pickle', 'wb') as outfile:
        pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
    with open(trained_dir + 'labels.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4, ensure_ascii=False)

    # predict.py pads/truncates new sentences to the trained sequence length.
    params['sequence_length'] = x_train.shape[1]
    with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
        json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)


if __name__ == '__main__':
    # python3 train.py ./data/train.csv.zip ./training_config.json
    train_cnn_rnn()
-------------------------------------------------------------------------------- /trained_results_1516404693/best_model.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegzhan/multi-class-text-classification-cnn-rnn/5db803b7c156655ead270cb9788148dc8c7f6afd/trained_results_1516404693/best_model.ckpt.index -------------------------------------------------------------------------------- /trained_results_1516404693/best_model.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegzhan/multi-class-text-classification-cnn-rnn/5db803b7c156655ead270cb9788148dc8c7f6afd/trained_results_1516404693/best_model.ckpt.meta -------------------------------------------------------------------------------- /trained_results_1516404693/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "best_model.ckpt" 2 | all_model_checkpoint_paths: "../checkpoints_1516404693/model-4200" 3 | all_model_checkpoint_paths: "../checkpoints_1516404693/model-5000" 4 | all_model_checkpoint_paths: "../checkpoints_1516404693/model-5200" 5 | all_model_checkpoint_paths: "../checkpoints_1516404693/model-5400" 6 | all_model_checkpoint_paths: "best_model.ckpt" 7 | -------------------------------------------------------------------------------- /trained_results_1516404693/embeddings.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegzhan/multi-class-text-classification-cnn-rnn/5db803b7c156655ead270cb9788148dc8c7f6afd/trained_results_1516404693/embeddings.pickle -------------------------------------------------------------------------------- /trained_results_1516404693/labels.json: -------------------------------------------------------------------------------- 1 | [ 2 | "ARSON", 3 | "ASSAULT", 4 | "BAD CHECKS", 5 | 
"BRIBERY", 6 | "BURGLARY", 7 | "DISORDERLY CONDUCT", 8 | "DRIVING UNDER THE INFLUENCE", 9 | "DRUG/NARCOTIC", 10 | "DRUNKENNESS", 11 | "EMBEZZLEMENT", 12 | "EXTORTION", 13 | "FAMILY OFFENSES", 14 | "FORGERY/COUNTERFEITING", 15 | "FRAUD", 16 | "GAMBLING", 17 | "KIDNAPPING", 18 | "LARCENY/THEFT", 19 | "LIQUOR LAWS", 20 | "LOITERING", 21 | "MISSING PERSON", 22 | "NON-CRIMINAL", 23 | "OTHER OFFENSES", 24 | "PORNOGRAPHY/OBSCENE MAT", 25 | "PROSTITUTION", 26 | "RECOVERED VEHICLE", 27 | "ROBBERY", 28 | "RUNAWAY", 29 | "SECONDARY CODES", 30 | "SEX OFFENSES FORCIBLE", 31 | "SEX OFFENSES NON FORCIBLE", 32 | "STOLEN PROPERTY", 33 | "SUICIDE", 34 | "SUSPICIOUS OCC", 35 | "TREA", 36 | "TRESPASS", 37 | "VANDALISM", 38 | "VEHICLE THEFT", 39 | "WARRANTS", 40 | "WEAPON LAWS" 41 | ] -------------------------------------------------------------------------------- /trained_results_1516404693/trained_parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_size": 128, 3 | "dropout_keep_prob": 0.5, 4 | "embedding_dim": 300, 5 | "evaluate_every": 200, 6 | "filter_sizes": "3,4,5", 7 | "hidden_unit": 300, 8 | "l2_reg_lambda": 0.0, 9 | "max_pool_size": 4, 10 | "non_static": false, 11 | "num_epochs": 1, 12 | "num_filters": 32, 13 | "sequence_length": 14 14 | } -------------------------------------------------------------------------------- /trained_results_1516404693/words_index.json: -------------------------------------------------------------------------------- 1 | { 2 | "forced": 484, 3 | "written": 775, 4 | "smoked": 526, 5 | "hazardous": 633, 6 | "prowl": 168, 7 | "number": 530, 8 | "warehouse": 357, 9 | "dangerous": 213, 10 | "carjacking": 324, 11 | "ammunition": 463, 12 | "electrical": 691, 13 | "manslaughter": 693, 14 | "red": 551, 15 | "unlawful": 44, 16 | "administering": 549, 17 | "insured": 830, 18 | "sidewalks": 158, 19 | "gambling": 452, 20 | "crimes": 399, 21 | "assisting": 728, 22 | "restraining": 112, 23 | "commit": 
422, 24 | "battery": 24, 25 | "advertising": 603, 26 | "encourage": 801, 27 | "making": 845, 28 | "police": 124, 29 | "box": 692, 30 | "rape": 218, 31 | "furnishing": 434, 32 | "intercourse": 461, 33 | "buses": 380, 34 | "aggravated": 55, 35 | "threating": 252, 36 | "fare": 575, 37 | "suspended": 31, 38 | "house": 72, 39 | "failure": 264, 40 | "counterfeit": 527, 41 | "windows": 121, 42 | "investigation": 93, 43 | "alcohol": 95, 44 | "disguise": 814, 45 | "incl": 682, 46 | "permit": 219, 47 | "heating": 786, 48 | "danger": 327, 49 | "control": 376, 50 | "physical": 216, 51 | "unlocked": 109, 52 | "identify": 901, 53 | "made": 284, 54 | "auction": 784, 55 | "under": 65, 56 | "motorcycle": 159, 57 | "religious": 802, 58 | "community": 502, 59 | "occurence": 427, 60 | "apparel": 902, 61 | "hate": 776, 62 | "methadone": 443, 63 | "dispute": 729, 64 | "services": 578, 65 | "traffic": 37, 66 | "near": 760, 67 | "lodging": 314, 68 | "fraudulent": 149, 69 | "representation": 903, 70 | "incident": 285, 71 | "unauthorized": 624, 72 | "justice": 629, 73 | "bodily": 63, 74 | "accepting": 650, 75 | "commercial": 241, 76 | "mental": 40, 77 | "violation": 13, 78 | "take": 763, 79 | "withhold": 720, 80 | "supervision": 503, 81 | "telegraph": 351, 82 | "possessing": 491, 83 | "elder": 299, 84 | "lights": 816, 85 | "offenses": 581, 86 | "sewers": 904, 87 | "collision": 390, 88 | "statute": 559, 89 | "matter": 509, 90 | "explosive": 466, 91 | "scene": 740, 92 | "wife": 777, 93 | "resulting": 456, 94 | "involved": 273, 95 | "order": 86, 96 | "where": 278, 97 | "post": 504, 98 | "rescuing": 787, 99 | "causing": 488, 100 | "business": 683, 101 | "exhibiting": 247, 102 | "sidewalk": 547, 103 | "paraphernalia": 82, 104 | "flat": 209, 105 | "ingestion": 548, 106 | "hit": 391, 107 | "obtaining": 233, 108 | "meters": 741, 109 | "alarm": 472, 110 | "imprisonment": 196, 111 | "complaint": 400, 112 | "theft": 2, 113 | "oil": 905, 114 | "in": 89, 115 | "plays": 846, 116 | "resisting": 84, 117 | 
"abortion": 788, 118 | "common": 750, 119 | "evidence": 220, 120 | "silencer": 847, 121 | "access": 350, 122 | "operated": 453, 123 | "\\)": 74, 124 | "manufacture": 499, 125 | "tire": 366, 126 | "entry": 22, 127 | "heed": 552, 128 | "care": 474, 129 | "bring": 471, 130 | "adult": 79, 131 | "explosives": 640, 132 | "threats": 60, 133 | "fraudulently": 510, 134 | "damaging": 593, 135 | "911": 778, 136 | "disperse": 654, 137 | "on": 41, 138 | "estate": 684, 139 | "stray": 597, 140 | "claims": 583, 141 | "are": 279, 142 | "entering": 867, 143 | "copying": 584, 144 | "electrically": 869, 145 | "corrections": 151, 146 | "switch": 870, 147 | "run": 392, 148 | "proper": 475, 149 | "instruments": 370, 150 | "barbituates": 694, 151 | "fire": 238, 152 | "schoolyard": 643, 153 | "no": 598, 154 | "hole": 817, 155 | "plate": 108, 156 | "violence": 99, 157 | "hallucinogenic": 379, 158 | "youth": 698, 159 | "posing": 906, 160 | "lewd": 374, 161 | "dismissal": 871, 162 | "amphetamine": 119, 163 | "attempting": 765, 164 | "street": 66, 165 | "riot": 531, 166 | "possibly": 432, 167 | "sufficient": 337, 168 | "alter": 464, 169 | "grand": 5, 170 | "dancehall": 794, 171 | "misconduct": 907, 172 | "racing": 942, 173 | "destitute": 848, 174 | "inflicting": 457, 175 | "bribery": 451, 176 | "inmate": 615, 177 | "distributors": 604, 178 | "unusual": 428, 179 | "keeper": 614, 180 | "affixing": 849, 181 | "camper": 536, 182 | "disturbance": 492, 183 | "change": 599, 184 | "committing": 217, 185 | "placing": 540, 186 | "permission": 317, 187 | "identification": 215, 188 | "tear": 557, 189 | "perjury": 590, 190 | "alcoholic": 908, 191 | "freight": 751, 192 | "tab": 467, 193 | "coins": 207, 194 | "passenger": 752, 195 | "p": 695, 196 | "trespass": 630, 197 | "bar": 803, 198 | "victim": 232, 199 | "object": 477, 200 | "asphyxiation": 605, 201 | "fireworks": 818, 202 | "consent": 626, 203 | "courtesy": 256, 204 | "insurer": 819, 205 | "invasion": 713, 206 | "game": 245, 207 | "during": 550, 208 | 
"bldg": 283, 209 | "embezzled": 429, 210 | "industrial": 790, 211 | "gov't": 909, 212 | "grounds": 439, 213 | "devices": 642, 214 | "federal": 560, 215 | "warrant": 34, 216 | "return": 781, 217 | "hotel": 255, 218 | "gas": 524, 219 | "procuring": 910, 220 | "performances": 850, 221 | "narcotics": 76, 222 | "trafficking": 482, 223 | "documents": 672, 224 | "station": 445, 225 | "switchblade": 500, 226 | "code": 130, 227 | "motor": 534, 228 | "injunction": 696, 229 | "store": 91, 230 | "knife": 131, 231 | "injurious": 730, 232 | "dv": 178, 233 | "roll": 566, 234 | "marker": 721, 235 | "animals": 411, 236 | "truant": 436, 237 | "noxious": 843, 238 | "lost": 17, 239 | "escapee": 655, 240 | "chemicals": 341, 241 | "general": 206, 242 | "cheat": 460, 243 | "calls": 125, 244 | "with": 20, 245 | "falsification": 851, 246 | "soliciting": 423, 247 | "view": 239, 248 | "disturbing": 192, 249 | "licensed": 742, 250 | "selling": 470, 251 | "id": 154, 252 | "30": 773, 253 | "offensive": 528, 254 | "known": 645, 255 | "tickets": 804, 256 | "juvenile": 83, 257 | "stealing": 378, 258 | "tobacco": 493, 259 | "penetration": 485, 260 | "telephone": 343, 261 | "prejudice": 302, 262 | "commission": 648, 263 | "alien": 306, 264 | "scope": 938, 265 | "harbor": 636, 266 | "school": 282, 267 | "visiting": 529, 268 | "plates": 820, 269 | "conceal": 717, 270 | "towards": 231, 271 | "siren": 553, 272 | "bigamy": 911, 273 | "protective": 561, 274 | "tools": 173, 275 | "flammable": 641, 276 | "comply": 876, 277 | "privacy": 712, 278 | "playground": 623, 279 | "checks": 113, 280 | "outside": 54, 281 | "assembly": 567, 282 | "by": 70, 283 | "resist": 407, 284 | "overcharging": 780, 285 | "misuse": 821, 286 | "release": 505, 287 | "against": 57, 288 | "juveniles": 872, 289 | "robbery": 35, 290 | "cloned": 571, 291 | "drowning": 805, 292 | "other": 160, 293 | "metals": 789, 294 | "escape": 585, 295 | "use": 80, 296 | "lines": 344, 297 | "addict": 301, 298 | "mail": 574, 299 | "aerosol": 639, 300 | 
"oral": 362, 301 | "meth": 123, 302 | "unlawfully": 612, 303 | "transfer": 646, 304 | "concealed": 203, 305 | "dispensing": 616, 306 | "turned": 322, 307 | "facility": 913, 308 | "injury": 96, 309 | "vio": 647, 310 | "transportation": 244, 311 | "demonstration": 731, 312 | "grossly": 435, 313 | "related": 162, 314 | "harassing": 186, 315 | "discharge": 251, 316 | "bridges": 873, 317 | "occupant": 681, 318 | "distribution": 852, 319 | "atm": 359, 320 | "misc": 732, 321 | "peyote": 914, 322 | "shots": 345, 323 | "locked": 8, 324 | "deceive": 915, 325 | "aggressive": 430, 326 | "conductive": 874, 327 | "threat": 412, 328 | "prescription": 478, 329 | "operating": 389, 330 | "receiving": 136, 331 | "taxi": 329, 332 | "pornography": 465, 333 | "machine": 444, 334 | "eavesdropping": 853, 335 | "drives": 656, 336 | "lasers": 822, 337 | "trailer": 533, 338 | "comm": 438, 339 | "detention": 115, 340 | "witnesses": 455, 341 | ",": 1, 342 | "occurrence": 36, 343 | "restrictions": 657, 344 | "female": 277, 345 | "uttering": 371, 346 | "assault": 50, 347 | "leading": 328, 348 | "constrt": 297, 349 | "construction": 126, 350 | "age": 764, 351 | "rights": 783, 352 | "bank": 354, 353 | "att": 224, 354 | "teachers": 447, 355 | "mischief": 15, 356 | "habitual": 437, 357 | "dwelling": 229, 358 | "movies": 806, 359 | "worship": 916, 360 | "molest": 449, 361 | "thoroughfare": 416, 362 | "incomplete": 586, 363 | "depositing": 815, 364 | "extortion": 394, 365 | "telecommunication": 753, 366 | "kidnapper": 917, 367 | "wearing": 743, 368 | "not": 320, 369 | "immoral": 325, 370 | "peeping": 580, 371 | "suspicious": 33, 372 | "posted": 791, 373 | "marshall": 424, 374 | "attending": 918, 375 | "breaking": 122, 376 | "planting": 382, 377 | "coin": 454, 378 | "trick": 200, 379 | "broadcast": 875, 380 | "illegal": 807, 381 | "firearms": 543, 382 | "for": 49, 383 | "department": 150, 384 | "minor": 234, 385 | "weapons": 714, 386 | "city": 365, 387 | "suicide": 321, 388 | "embezzlement": 197, 389 | 
"money": 143, 390 | "u": 425, 391 | "after": 266, 392 | "transit": 330, 393 | "occupied": 369, 394 | "lawful": 587, 395 | "decoding": 919, 396 | "drag": 920, 397 | "lynching": 715, 398 | "jurisdiction": 69, 399 | "firearm": 139, 400 | "cabaret": 754, 401 | "copulation": 363, 402 | "dissuading": 413, 403 | "mobile": 733, 404 | "required": 631, 405 | "semi": 823, 406 | "trespassing": 107, 407 | "sodomy": 408, 408 | "barking": 658, 409 | "megan's": 744, 410 | "message": 808, 411 | "owner": 685, 412 | "shooting": 387, 413 | "utility": 592, 414 | "derivative": 576, 415 | "cellular": 541, 416 | "slashing": 367, 417 | "chemical": 617, 418 | "\\(": 75, 419 | "materials": 644, 420 | "200": 627, 421 | "abandonment": 420, 422 | "into": 386, 423 | "prisoner": 755, 424 | "sell": 525, 425 | "s": 146, 426 | "simple": 358, 427 | "records": 313, 428 | "child": 155, 429 | "beverage": 921, 430 | "firecrackers": 495, 431 | "product": 756, 432 | "witness": 414, 433 | "real": 686, 434 | "attempted": 77, 435 | "service": 243, 436 | "estab": 490, 437 | "sending": 809, 438 | "delinquency": 404, 439 | "neglect": 501, 440 | "building": 42, 441 | "jail": 668, 442 | "residence": 97, 443 | "desertion": 468, 444 | "trash": 558, 445 | "solicits": 117, 446 | "through": 824, 447 | "innkeeper": 296, 448 | "impersonating": 483, 449 | "arson": 205, 450 | "judicial": 673, 451 | "mask": 757, 452 | "display": 722, 453 | "spousal": 687, 454 | "officer": 111, 455 | "purchase": 634, 456 | "register": 287, 457 | "driving": 167, 458 | "driver": 419, 459 | "caustic": 331, 460 | "loitering": 157, 461 | "trains": 718, 462 | "jumping": 512, 463 | "shoplifting": 68, 464 | "death": 120, 465 | "meeting": 674, 466 | "engaging": 402, 467 | "procurement": 922, 468 | "bribe": 782, 469 | "emergency": 469, 470 | "felon": 259, 471 | "100": 562, 472 | "hire": 766, 473 | "chain": 311, 474 | "area": 825, 475 | "carrying": 199, 476 | "prohibited": 274, 477 | "throwing": 448, 478 | "destruction": 268, 479 | "medical": 854, 480 
| "injuries": 275, 481 | "acts": 356, 482 | "loudspeaker": 881, 483 | "nature": 923, 484 | "system": 594, 485 | "obstructions": 291, 486 | "refusing": 659, 487 | "sick": 395, 488 | "fighting": 323, 489 | "meetings": 826, 490 | "device": 263, 491 | "": 0, 492 | "bomb": 368, 493 | "grabbing": 745, 494 | "prostitution": 102, 495 | "tarasoff": 304, 496 | "an": 204, 497 | "member": 792, 498 | "reformatory": 735, 499 | "disabled": 572, 500 | "located": 271, 501 | "dies": 827, 502 | "interfering": 660, 503 | "life": 56, 504 | "stolen": 10, 505 | "credit": 71, 506 | "forge": 479, 507 | "removal": 707, 508 | "computers": 309, 509 | "recovered": 45, 510 | "prior": 142, 511 | "speeding": 480, 512 | "push": 924, 513 | "convicted": 262, 514 | "upon": 651, 515 | "stay": 174, 516 | "glue": 877, 517 | "campus": 697, 518 | "syringe": 519, 519 | "maintaining": 202, 520 | "disorderly": 855, 521 | "intercepting": 708, 522 | "secondhand": 925, 523 | "clone": 758, 524 | "park": 332, 525 | "offender": 295, 526 | "heroin": 147, 527 | "dump": 828, 528 | "days": 774, 529 | "vicious": 595, 530 | "injured": 227, 531 | "contributing": 405, 532 | "opiates": 406, 533 | "base": 58, 534 | "over": 878, 535 | "home": 516, 536 | "obstructing": 415, 537 | "furnish": 736, 538 | "booking": 588, 539 | "computer": 515, 540 | "cards": 383, 541 | "drug": 544, 542 | "articles": 573, 543 | "cohabitee": 104, 544 | "strangulation": 517, 545 | "restricted": 879, 546 | "railroads": 880, 547 | "guide": 829, 548 | "parental": 348, 549 | "pursesnatch": 280, 550 | "pimping": 496, 551 | "executive": 417, 552 | "cocaine": 48, 553 | "hypodermic": 520, 554 | "a": 11, 555 | "agents": 608, 556 | "remaining": 704, 557 | "investigative": 116, 558 | "based": 303, 559 | "pickpocket": 118, 560 | "property": 7, 561 | "threatening": 228, 562 | "of": 3, 563 | "violator": 737, 564 | "limits": 364, 565 | "arrest": 19, 566 | "the": 53, 567 | "registration": 222, 568 | "track": 661, 569 | "automated": 211, 570 | "fired": 346, 571 | 
"report": 94, 572 | "accident": 326, 573 | "apt": 446, 574 | "around": 675, 575 | "air": 336, 576 | "satellite": 767, 577 | "patient": 723, 578 | "pass": 187, 579 | "armed": 183, 580 | "amount": 288, 581 | "dumping": 514, 582 | "found": 38, 583 | "load": 882, 584 | "locatn": 793, 585 | "within": 352, 586 | "private": 709, 587 | "entertainment": 618, 588 | "roadway": 856, 589 | "spill": 810, 590 | "legal": 372, 591 | "impounded": 360, 592 | "possible": 347, 593 | "incest": 768, 594 | "forgery": 110, 595 | "vehicles": 47, 596 | "murder": 926, 597 | "misdemeanor": 179, 598 | "poss": 145, 599 | "lab": 927, 600 | "stalking": 257, 601 | "at": 270, 602 | "parents": 883, 603 | "drunk": 532, 604 | "cars": 381, 605 | "labor": 494, 606 | "enroute": 46, 607 | "misplaced": 537, 608 | "cable": 929, 609 | "make": 194, 610 | "taking": 664, 611 | "calif": 930, 612 | "communications": 705, 613 | "notes": 181, 614 | "causes": 236, 615 | "car": 535, 616 | "natural": 235, 617 | "safe": 403, 618 | "judge": 706, 619 | "court": 195, 620 | "conduct": 441, 621 | "influence": 106, 622 | "fight": 932, 623 | "auto": 6, 624 | "400": 568, 625 | "felony": 156, 626 | "dating": 164, 627 | "mayhem": 426, 628 | "accessing": 596, 629 | "malicious": 16, 630 | "inflict": 105, 631 | "peddling": 305, 632 | "contraband": 738, 633 | "seals": 622, 634 | "strongarm": 103, 635 | "spray": 726, 636 | "light": 554, 637 | "personation": 144, 638 | "boat": 759, 639 | "intoxicated": 487, 640 | "negligent": 421, 641 | "carrier": 746, 642 | "vehicle": 27, 643 | "state": 188, 644 | "50": 513, 645 | "abuse": 171, 646 | "non": 189, 647 | "aid": 635, 648 | "substance": 129, 649 | "apparatus": 609, 650 | "bus": 747, 651 | "seizing": 748, 652 | "beyond": 377, 653 | "armor": 885, 654 | "toilet": 676, 655 | "citizenship": 811, 656 | "begging": 333, 657 | "sniping": 933, 658 | "sale": 62, 659 | "cat": 857, 660 | "act": 134, 661 | "massage": 831, 662 | "cultivating": 384, 663 | "command": 653, 664 | "political": 934, 665 | 
"citation": 223, 666 | "defrauding": 253, 667 | "loaded": 184, 668 | "law": 375, 669 | "corporate": 621, 670 | "poles": 858, 671 | "force": 61, 672 | "miscellaneous": 73, 673 | "missing": 52, 674 | "willful": 476, 675 | "vacant": 607, 676 | "funds": 338, 677 | "falsifying": 677, 678 | "visit": 176, 679 | "leased": 663, 680 | "card": 64, 681 | "hot": 169, 682 | "employee": 319, 683 | "opium": 511, 684 | "collector": 859, 685 | "sex": 258, 686 | "putting": 795, 687 | "homicide": 312, 688 | "unemployment": 935, 689 | "needle": 521, 690 | "peddler": 936, 691 | "phone": 114, 692 | "food": 832, 693 | "face": 289, 694 | "hospitalized": 860, 695 | "w": 272, 696 | "away": 175, 697 | "products": 506, 698 | "rob": 886, 699 | "their": 522, 700 | "room": 261, 701 | "provide": 579, 702 | "parole": 101, 703 | "officers": 937, 704 | "b": 699, 705 | "consuming": 240, 706 | "controlled": 127, 707 | "contact": 796, 708 | "minors": 861, 709 | "loiters": 678, 710 | "drivers": 28, 711 | "penetrating": 887, 712 | "ft": 563, 713 | "open": 486, 714 | "disrupts": 410, 715 | "booth": 637, 716 | "information": 361, 717 | "public": 85, 718 | "looking": 833, 719 | "condition": 458, 720 | "paint": 724, 721 | "civil": 182, 722 | "commotion": 431, 723 | "loud": 834, 724 | "receive": 177, 725 | "pens": 725, 726 | "teller": 212, 727 | "place": 132, 728 | "former": 161, 729 | "revenue": 888, 730 | "defraud": 835, 731 | "automobile": 30, 732 | "pretenses": 250, 733 | "tom": 582, 734 | "serious": 260, 735 | "solicitations": 889, 736 | "water": 862, 737 | "cart": 939, 738 | "bite": 214, 739 | "infractions": 489, 740 | "closure": 180, 741 | "vin": 679, 742 | "relationship": 165, 743 | "brookers": 610, 744 | "government": 440, 745 | "throw": 836, 746 | "owning": 931, 747 | "prohibi": 619, 748 | "concealment": 711, 749 | "slugs": 797, 750 | "disturbed": 39, 751 | "possess": 442, 752 | "cordless": 710, 753 | "bookmaking": 890, 754 | "library": 334, 755 | "cash": 620, 756 | "zone": 662, 757 | "short": 600, 
758 | "opposite": 940, 759 | "rock": 59, 760 | "terrorizing": 863, 761 | "sound": 739, 762 | "exposure": 293, 763 | "interferring": 497, 764 | "marking": 941, 765 | "video": 652, 766 | "telling": 703, 767 | "inciting": 602, 768 | "location": 837, 769 | "any": 864, 770 | "prostitute": 591, 771 | "sf": 133, 772 | "nuisance": 172, 773 | "purpose": 221, 774 | "traumatic": 459, 775 | "kidnapping": 307, 776 | "presenting": 589, 777 | "license": 21, 778 | "municipal": 137, 779 | "burns": 891, 780 | "case": 25, 781 | "peace": 148, 782 | "publication": 770, 783 | "counterfeiting": 201, 784 | "children": 401, 785 | "changing": 290, 786 | "call": 249, 787 | "bb": 388, 788 | "authority": 254, 789 | "drugs": 318, 790 | "aided": 26, 791 | "fraud": 688, 792 | "12": 670, 793 | "obscene": 170, 794 | "application": 237, 795 | "crowd": 838, 796 | "dependent": 300, 797 | "payment": 565, 798 | "activities": 409, 799 | "4390": 701, 800 | "electronics": 577, 801 | "suspension": 892, 802 | "repairman": 749, 803 | "streets": 292, 804 | "agent": 538, 805 | "offense": 689, 806 | "priors": 473, 807 | "sniper": 943, 808 | "balloons": 893, 809 | "agricultural": 899, 810 | "deadly": 92, 811 | "strip": 210, 812 | "placard": 570, 813 | "transportaion": 507, 814 | "reckless": 342, 815 | "cruelty": 398, 816 | "burglary": 18, 817 | "along": 665, 818 | "fiduciary": 669, 819 | "liqour": 385, 820 | "damage": 396, 821 | "scalping": 812, 822 | "manner": 225, 823 | "bathroom": 839, 824 | "regulations": 894, 825 | "recordings": 761, 826 | "insurance": 944, 827 | "as": 286, 828 | "spouse": 166, 829 | "suspect": 353, 830 | "sold": 298, 831 | "premise": 340, 832 | "cause": 138, 833 | "while": 163, 834 | "substances": 418, 835 | "poisoning": 702, 836 | "from": 4, 837 | "serial": 546, 838 | "hospital": 945, 839 | "crime": 339, 840 | "health": 946, 841 | "valet": 895, 842 | "about": 625, 843 | "trade": 840, 844 | "possession": 12, 845 | "secrets": 841, 846 | "merchant": 335, 847 | "escapes": 896, 848 | "person": 
43, 849 | "and": 152, 850 | "used": 281, 851 | "using": 897, 852 | "destroying": 865, 853 | "inhabited": 230, 854 | "student": 898, 855 | "railroad": 523, 856 | "encouraging": 716, 857 | "disaster": 947, 858 | "altered": 555, 859 | "fictitious": 193, 860 | "false": 78, 861 | "search": 269, 862 | "human": 518, 863 | "recklessly": 393, 864 | "unknown": 135, 865 | "forged": 373, 866 | "connection": 948, 867 | "remove": 539, 868 | "probation": 90, 869 | "to": 29, 870 | "attention": 771, 871 | "without": 198, 872 | "container": 481, 873 | "cell": 308, 874 | "sales": 128, 875 | "fortune": 700, 876 | "instrument": 498, 877 | "knowledge": 190, 878 | "revoked": 32, 879 | "or": 14, 880 | "marks": 601, 881 | "truck": 88, 882 | "warning": 248, 883 | "animal": 680, 884 | "petty": 9, 885 | "swearing": 433, 886 | "goods": 632, 887 | "domestic": 100, 888 | "discharging": 397, 889 | "financial": 798, 890 | "sharp": 508, 891 | "interception": 900, 892 | "charitable": 884, 893 | "marijuana": 81, 894 | "tampering": 246, 895 | "accidental": 606, 896 | "pandering": 545, 897 | "bicycle": 141, 898 | "annoy": 450, 899 | "machines": 611, 900 | "shelter": 349, 901 | "caretaker": 462, 902 | "apartment": 87, 903 | "stamps": 842, 904 | "credible": 772, 905 | "returns": 315, 906 | "custody": 734, 907 | "laceration": 556, 908 | "advertisments": 866, 909 | "spitting": 628, 910 | "weapon": 67, 911 | "statements": 799, 912 | "indecent": 294, 913 | "parking": 727, 914 | "package": 638, 915 | "activity": 267, 916 | "juror": 785, 917 | "exploitation": 569, 918 | "months": 671, 919 | "sexual": 185, 920 | "intent": 276, 921 | "specific": 690, 922 | "runaway": 191, 923 | "establishment": 242, 924 | "conspiracy": 153, 925 | "aircraft": 564, 926 | "graffiti": 140, 927 | "fireman": 800, 928 | "gang": 265, 929 | "rental": 779, 930 | "tv": 949, 931 | "hours": 666, 932 | "speakers": 844, 933 | "destructive": 950, 934 | "refusal": 951, 935 | "phones": 310, 936 | "notification": 226, 937 | "entries": 316, 938 | 
"highway": 542, 939 | "presence": 649, 940 | "purchasing": 667, 941 | "official": 719, 942 | "document": 813, 943 | "full": 912, 944 | "forcible": 51, 945 | "audiovisual": 762, 946 | "vandalism": 23, 947 | "dog": 208, 948 | "conviction": 613, 949 | "gun": 98, 950 | "keeping": 868, 951 | "evading": 355, 952 | "dealer": 928, 953 | "curfew": 769 954 | } -------------------------------------------------------------------------------- /training_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_size": 128, 3 | "dropout_keep_prob": 0.5, 4 | "embedding_dim": 300, 5 | "evaluate_every": 200, 6 | "filter_sizes": "3,4,5", 7 | "hidden_unit": 300, 8 | "l2_reg_lambda": 0.0, 9 | "max_pool_size": 4, 10 | "non_static": false, 11 | "num_epochs": 1, 12 | "num_filters": 32 13 | } 14 | --------------------------------------------------------------------------------