├── README.md ├── LICENSE ├── .gitignore ├── uncertainty_sampling.py ├── multilabel.py ├── hierarchical.py ├── imballanced_classes.py ├── cost_sensitive_learning.py ├── delicious_loader.py ├── text_model.py └── loaders.py /README.md: -------------------------------------------------------------------------------- 1 | # Advanced-ML-techniques 2 | This repo contains implementations of advanced ML techniques, including model ensembles, cost-sensitive learning, multi-label classification, active learning with uncertainty sampling, and handling of imbalanced classes. 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Alex Gidiotis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /uncertainty_sampling.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from delicious_loader import load_dataset 4 | from sklearn import svm 5 | from sklearn.linear_model import LogisticRegression 6 | import matplotlib.pyplot as plt 7 | 8 | if __name__ == '__main__': 9 | ngram_range = 1 10 | maxlen = 200 11 | 12 | X_train, y_train, X_val, y_val, X_test, y_test, word_index = load_dataset(ngram_range=ngram_range, maxlen=maxlen) 13 | 14 | # Column 19 is the rarest 15 | 16 | # y_data = pd.read_csv('data/delicious/train-label.dat', header=None, sep=' ') 17 | # for col in range(0, 20, 1): 18 | # print y_data[col].sum() 19 | 20 | X_test_unlabeled_pool = X_test[:1992, :] 21 | X_test_test = X_test[1992:, :] 22 | y_test_unlabeled_pool = y_test[:1992, -1] 23 | y_test_test = y_test[1992:, -1] 24 | 25 | acc = []  # error rate (1 - accuracy) on the held-out test split 26 | train_acc = []  # error rate (1 - accuracy) on the growing training set 27 | dim = [] 28 | 29 | for k in range(0,10,1): 30 | clf = LogisticRegression() 31 | print 'Size of X: ', len(X_train), X_train.shape, type(X_train) 32 | clf.fit(X_train, y_train[:, -1]) 33 | 34 | preds = clf.decision_function(X_test_unlabeled_pool) 35 | 36 | values = [] 37 | positions = [] 38 | for i in range(0, len(X_test_unlabeled_pool), 1): 39 | values.append(abs(preds[i])) 40 | positions.append(i) 41 | 42 | for i in range(10): 43 | pos = np.array(values).argmin() 44 | # print np.array(values).min() 45 | X_train_new = np.zeros(((X_train.shape[0] + 1), X_train.shape[1])) 46 | y_train_new = np.zeros(((y_train[:, -1].shape[0] + 1), 1)) 47 | 48 | X_train_new[:X_train.shape[0]] = X_train 49 | X_train_new[X_train.shape[0]:] = X_test_unlabeled_pool[pos, :] 50 | y_train_new[:y_train.shape[0],0] = y_train[:, -1] 51 | y_train_new[y_train.shape[0]:] = y_test_unlabeled_pool[pos] 52 | X_train = X_train_new 53 | y_train = y_train_new 54 | print len(X_train), y_train.shape 55 | X_test_unlabeled_pool = np.delete(X_test_unlabeled_pool, pos, 0)  # remove the queried example from the unlabeled pool 56 | y_test_unlabeled_pool = np.delete(y_test_unlabeled_pool, pos, 0) 57 | del values[pos] 58 | del positions[pos] 59 | 60 | acc.append((1-clf.score(X_test_test, y_test_test))) 61 | train_acc.append((1-clf.score(X_train, y_train))) 62 | dim.append(X_train.shape[0]) 63 | 64 | plt.plot(dim, acc, dim, train_acc) 65 | plt.show() 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /multilabel.py:
-------------------------------------------------------------------------------- 1 | from delicious_loader import load_dataset 2 | 3 | 4 | import numpy as np 5 | from sklearn.metrics import precision_recall_fscore_support, roc_auc_score 6 | 7 | 8 | from keras.models import Model, model_from_json 9 | from keras.layers import Dense, Input, Embedding, GlobalAveragePooling1D 10 | from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback 11 | from keras import regularizers 12 | 13 | import tensorflow as tf 14 | 15 | 16 | 17 | 18 | 19 | 20 | def f1_score(y_true, y_pred): 21 | """ 22 | Compute the micro f(b) score with b=1. 23 | """ 24 | y_true = tf.cast(y_true, "float32") 25 | y_pred = tf.cast(tf.round(y_pred), "float32") # implicit 0.5 threshold via tf.round 26 | y_correct = y_true * y_pred 27 | 28 | 29 | sum_true = tf.reduce_sum(y_true, axis=1) 30 | sum_pred = tf.reduce_sum(y_pred, axis=1) 31 | sum_correct = tf.reduce_sum(y_correct, axis=1) 32 | 33 | 34 | precision = sum_correct / sum_pred 35 | recall = sum_correct / sum_true 36 | f_score = 2 * precision * recall / (precision + recall) 37 | f_score = tf.where(tf.is_nan(f_score), tf.zeros_like(f_score), f_score) 38 | 39 | 40 | return tf.reduce_mean(f_score) 41 | 42 | 43 | 44 | 45 | 46 | 47 | def build_model(num_features, 48 | num_classes, 49 | embedding_dims, 50 | maxlen): 51 | """ 52 | """ 53 | 54 | input_layer = Input(shape=(maxlen,), 55 | dtype='int32') 56 | 57 | 58 | embeddings = Embedding(num_features, 59 | embedding_dims, 60 | input_length=maxlen, 61 | embeddings_regularizer=regularizers.l1(7e-7))(input_layer) 62 | 63 | avg_layer = GlobalAveragePooling1D()(embeddings) 64 | predictions = Dense(num_classes, activation='sigmoid')(avg_layer) 65 | 66 | model = Model(inputs=input_layer, outputs=predictions) 67 | model.compile(loss='binary_crossentropy', 68 | optimizer='adam', 69 | metrics=[f1_score]) 70 | 71 | model.summary() 72 | 73 | return model 74 | 75 | 76 | 77 | 78 | 79 | 80 | def load_model(): 81 | """ 82 | """ 83 | 84 | json_file = open('multilabel_model.json', 'r') 85 | loaded_model_json = json_file.read() 86 | json_file.close() 87 | model = model_from_json(loaded_model_json) 88 | 89 | model.load_weights('multilabel_model.h5') 90 | print("Loaded model from disk") 91 | 92 | model.summary() 93 | 94 | model.compile(loss='binary_crossentropy', 95 | optimizer='adam', 96 | metrics=[f1_score]) 97 | 98 | 99 | return model 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | if __name__ == '__main__': 109 | 110 | ngram_range = 1 111 | maxlen = 200 112 | batch_size = 32 113 | embedding_dims = 50 114 | epochs = 500 115 | num_classes = 20 116 | 117 | 118 | 119 | X_train,y_train,X_val,y_val,X_test,y_test,word_index = load_dataset(ngram_range=ngram_range,maxlen=maxlen) 120 | 121 | num_features = len(word_index) 122 | print('Found %d words' % num_features) 123 | 124 | 125 | model = build_model(num_features,num_classes,embedding_dims,maxlen) 126 | 127 | model_json = model.to_json() 128 | with open("multilabel_model.json", "w") as json_file: 129 | json_file.write(model_json) 130 | 131 | 132 | early_stopping =EarlyStopping(monitor='val_f1_score', 133 | patience=15, 134 | mode='max') 135 | bst_model_path = 'multilabel_model.h5' 136 | model_checkpoint = ModelCheckpoint(bst_model_path, 137 | monitor='val_f1_score', 138 | verbose=1, 139 | save_best_only=True, 140 | mode='max', 141 | save_weights_only=True) 142 | 143 | model.fit(X_train, y_train, 144 | batch_size=batch_size, 145 | epochs=epochs, 146 | validation_data=(X_val, y_val), 147 | 
callbacks=[model_checkpoint,early_stopping]) 148 | 149 | 150 | model = load_model() 151 | y_pred = model.predict(X_test) 152 | 153 | print 'AUC:',roc_auc_score(y_test, y_pred) 154 | y_pred[y_pred > 0.25] = 1 155 | y_pred[y_pred <= 0.25] = 0 156 | 157 | 158 | for i in range(10): 159 | pred,lab = y_pred[i],y_test[i] 160 | print np.where(pred == 1), np.where(lab == 1) 161 | 162 | 163 | print precision_recall_fscore_support(y_test, y_pred, average='micro') 164 | print precision_recall_fscore_support(y_test, y_pred, average='macro') -------------------------------------------------------------------------------- /hierarchical.py: -------------------------------------------------------------------------------- 1 | from delicious_loader import load_dataset_hierarchical 2 | 3 | 4 | import numpy as np 5 | from sklearn.metrics import precision_recall_fscore_support, roc_auc_score 6 | 7 | 8 | from keras.models import Model, model_from_json 9 | from keras.layers import Dense, Input, Embedding, GlobalAveragePooling1D, TimeDistributed, LSTM, Dropout, Flatten 10 | from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback 11 | from keras.layers.wrappers import Bidirectional 12 | from keras import regularizers 13 | 14 | import tensorflow as tf 15 | 16 | 17 | 18 | 19 | 20 | 21 | def f1_score(y_true, y_pred): 22 | """ 23 | Compute the micro f(b) score with b=1. 24 | """ 25 | y_true = tf.cast(y_true, "float32") 26 | y_pred = tf.cast(tf.round(y_pred), "float32") # implicit 0.5 threshold via tf.round 27 | y_correct = y_true * y_pred 28 | 29 | 30 | sum_true = tf.reduce_sum(y_true, axis=1) 31 | sum_pred = tf.reduce_sum(y_pred, axis=1) 32 | sum_correct = tf.reduce_sum(y_correct, axis=1) 33 | 34 | 35 | precision = sum_correct / sum_pred 36 | recall = sum_correct / sum_true 37 | f_score = 2 * precision * recall / (precision + recall) 38 | f_score = tf.where(tf.is_nan(f_score), tf.zeros_like(f_score), f_score) 39 | 40 | 41 | return tf.reduce_mean(f_score) 42 | 43 | 44 | 45 | 46 | 47 | 48 | def build_model(num_features, 49 | num_classes, 50 | embedding_dims, 51 | maxlen, 52 | max_sentence_len): 53 | """ 54 | """ 55 | 56 | input_layer = Input(shape=(maxlen,max_sentence_len,), 57 | dtype='int32') 58 | sentence_input = Input(shape=(max_sentence_len,), 59 | dtype='int32') 60 | 61 | embeddings = Embedding(num_features, 62 | embedding_dims, 63 | input_length=max_sentence_len, 64 | embeddings_regularizer=regularizers.l1(1e-6))(sentence_input) 65 | 66 | avg_layer = GlobalAveragePooling1D()(embeddings) 67 | sentEncoder = Model(inputs=sentence_input, 68 | outputs=avg_layer) 69 | sentEncoder.summary() 70 | textEncoder = TimeDistributed(sentEncoder)(input_layer) 71 | 72 | global_avg_layer = Flatten()(textEncoder) 73 | 74 | global_avg_layer = Dropout(0.5)(global_avg_layer) 75 | predictions = Dense(num_classes, 76 | activation='sigmoid', 77 | kernel_regularizer=regularizers.l1(1e-5))(global_avg_layer) 78 | 79 | model = Model(inputs=input_layer, 80 | outputs=predictions) 81 | model.compile(loss='binary_crossentropy', 82 | optimizer='adam', 83 | metrics=[f1_score]) 84 | 85 | model.summary() 86 | 87 | return model 88 | 89 | 90 | 91 | 92 | 93 | 94 | def load_model(): 95 | """ 96 | """ 97 | 98 | json_file = open('hierarchical_model.json', 'r') 99 | loaded_model_json = json_file.read() 100 | json_file.close() 101 | model = model_from_json(loaded_model_json) 102 | 103 | model.load_weights('hierarchical_model.h5') 104 | print("Loaded model from disk") 105 | 106 | model.summary() 107 | 108 | 
model.compile(loss='binary_crossentropy', 109 | optimizer='adam', 110 | metrics=[f1_score]) 111 | 112 | 113 | return model 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | if __name__ == '__main__': 123 | 124 | ngram_range = 1 125 | maxlen = 20 126 | max_sentence_len = 10 127 | batch_size = 32 128 | embedding_dims = 50 129 | epochs = 500 130 | num_classes = 20 131 | 132 | 133 | X_train,y_train,X_val,y_val,X_test,y_test,word_index = load_dataset_hierarchical(maxlen,max_sentence_len) 134 | 135 | 136 | 137 | num_features = len(word_index) 138 | print('Found %d words' % num_features) 139 | 140 | ''' 141 | model = build_model(num_features,num_classes,embedding_dims,maxlen,max_sentence_len) 142 | 143 | model_json = model.to_json() 144 | with open("hierarchical_model.json", "w") as json_file: 145 | json_file.write(model_json) 146 | 147 | 148 | early_stopping =EarlyStopping(monitor='val_f1_score', 149 | patience=15, 150 | mode='max') 151 | bst_model_path = 'hierarchical_model.h5' 152 | model_checkpoint = ModelCheckpoint(bst_model_path, 153 | monitor='val_f1_score', 154 | verbose=1, 155 | save_best_only=True, 156 | mode='max', 157 | save_weights_only=True) 158 | 159 | model.fit(X_train, y_train, 160 | batch_size=batch_size, 161 | epochs=epochs, 162 | validation_data=(X_val, y_val), 163 | callbacks=[model_checkpoint,early_stopping]) 164 | 165 | ''' 166 | model = load_model() 167 | y_pred = model.predict(X_test) 168 | 169 | print 'AUC:',roc_auc_score(y_test, y_pred) 170 | y_pred[y_pred > 0.25] = 1 171 | y_pred[y_pred <= 0.25] = 0 172 | 173 | 174 | for i in range(10): 175 | pred,lab = y_pred[i],y_test[i] 176 | print np.where(pred == 1), np.where(lab == 1) 177 | 178 | 179 | print precision_recall_fscore_support(y_test, y_pred, average='micro') 180 | print precision_recall_fscore_support(y_test, y_pred, average='macro') 181 | -------------------------------------------------------------------------------- /imballanced_classes.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from sklearn.feature_extraction.text import TfidfVectorizer 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.naive_bayes import GaussianNB 7 | from imblearn.over_sampling import SMOTE 8 | from sklearn.metrics import f1_score 9 | from sklearn.metrics import recall_score, precision_score 10 | from sklearn.ensemble import RandomForestClassifier 11 | from sklearn.svm import LinearSVC 12 | from imblearn.under_sampling import NearMiss 13 | from imblearn.ensemble import EasyEnsemble 14 | 15 | if __name__ == '__main__': 16 | df = pd.read_csv('creditcard.csv') 17 | 18 | Y_data = [] 19 | Y_data = df['Class'].tolist() 20 | df = df.drop('Class',axis=1) 21 | df = (df - df.mean()) / ((df.max() - df.min())) 22 | X_data = df.as_matrix() 23 | 24 | X_train, X_test, y_train, y_test = train_test_split(X_data, 25 | Y_data, 26 | test_size=0.1, 27 | random_state=0) 28 | 29 | X_train, X_val, y_train, y_val = train_test_split(X_train, 30 | y_train, 31 | test_size=0.1, 32 | random_state=0) 33 | 34 | #Oversampling (SMOTE) 35 | sm = SMOTE() 36 | X_smote, y_smote = sm.fit_sample(X_train, y_train) 37 | 38 | #Undersampling (Distance-based Near Miss 1,2,3) 39 | nm1 = NearMiss(version = 1) 40 | X_miss1, y_miss1 = nm1.fit_sample(X_train, y_train) 41 | nm2 = NearMiss(version = 2) 42 | X_miss2, y_miss2 = nm2.fit_sample(X_train, y_train) 43 | nm3 = NearMiss(version = 3) 44 | X_miss3, y_miss3 = nm3.fit_sample(X_train, y_train) 45 | 46 | #Undersampling (EasyEnsemble) 47 | ee = 
EasyEnsemble(n_subsets=30) 48 | X_resampled, y_resampled = ee.fit_sample(X_train, y_train) 49 | 50 | 51 | print "Naive Bayes" 52 | naive_clf = GaussianNB() 53 | naive_clf.fit (X_train, y_train) 54 | y_pred = naive_clf.predict(X_test) 55 | print "initial: ",f1_score (y_test, y_pred) 56 | 57 | naive_clf.fit (X_smote, y_smote) 58 | y_pred = naive_clf.predict(X_test) 59 | print "smote: ",f1_score (y_test, y_pred) 60 | 61 | naive_clf.fit (X_miss1, y_miss1) 62 | y_pred = naive_clf.predict(X_test) 63 | print "near miss-1: ",f1_score (y_test, y_pred) 64 | naive_clf.fit (X_miss2, y_miss2) 65 | y_pred = naive_clf.predict(X_test) 66 | print "near miss-2: ",f1_score (y_test, y_pred) 67 | naive_clf.fit (X_miss3, y_miss3) 68 | y_pred = naive_clf.predict(X_test) 69 | print "near miss-3: ",f1_score (y_test, y_pred) 70 | 71 | NBclassifiers = [] 72 | for i in range(0,10,1): 73 | NBclassifiers.append(GaussianNB().fit(X_resampled[i], y_resampled[i])) 74 | 75 | y_pred = np.asarray([clf.predict(X_test) for clf in NBclassifiers]).T 76 | y_pred = np.apply_along_axis(lambda x: 77 | np.argmax(np.bincount(x)), 78 | axis=1, 79 | arr=y_pred.astype('int')) 80 | print "easy ensemble: ",f1_score (y_test, y_pred) 81 | 82 | 83 | 84 | print "Random Forest" 85 | forest_clf = RandomForestClassifier(n_estimators=50, 86 | max_depth=10, 87 | random_state=0) 88 | forest_clf.fit(X_train, y_train) 89 | y_pred = forest_clf.predict(X_test) 90 | print "initial: ", f1_score (y_test, y_pred) 91 | 92 | forest_clf.fit (X_smote, y_smote) 93 | y_pred = forest_clf.predict(X_test) 94 | print "smote: ",f1_score (y_test, y_pred) 95 | 96 | forest_clf.fit (X_miss1, y_miss1) 97 | y_pred = forest_clf.predict(X_test) 98 | print "near miss-1: ",f1_score (y_test, y_pred) 99 | forest_clf.fit (X_miss2, y_miss2) 100 | y_pred = forest_clf.predict(X_test) 101 | print "near miss-2: ",f1_score (y_test, y_pred) 102 | forest_clf.fit (X_miss3, y_miss3) 103 | y_pred = forest_clf.predict(X_test) 104 | print "near miss-3: ",f1_score (y_test, y_pred) 105 | 106 | forests = [] 107 | for i in range(0,10,1): 108 | forests.append(RandomForestClassifier(n_estimators=20, max_depth=5, 109 | random_state=0).fit(X_resampled[i], y_resampled[i])) 110 | 111 | y_pred = np.asarray([clf.predict(X_test) for clf in forests]).T 112 | y_pred = np.apply_along_axis(lambda x: 113 | np.argmax(np.bincount(x)), 114 | axis=1, 115 | arr=y_pred.astype('int')) 116 | print "easy ensemble: ",f1_score (y_test, y_pred) 117 | 118 | 119 | 120 | print "SVM" 121 | svc_clf = LinearSVC(random_state=0) 122 | svc_clf.fit(X_train,y_train) 123 | y_pred = svc_clf.predict(X_test) 124 | print "initial: ",f1_score (y_test, y_pred) 125 | 126 | svc_clf.fit (X_smote, y_smote) 127 | y_pred = svc_clf.predict(X_test) 128 | print "smote: ",f1_score (y_test, y_pred) 129 | 130 | svc_clf.fit (X_miss1, y_miss1) 131 | y_pred = svc_clf.predict(X_test) 132 | print "near miss-1: ",f1_score (y_test, y_pred) 133 | svc_clf.fit (X_miss2, y_miss2) 134 | y_pred = svc_clf.predict(X_test) 135 | print "near miss-2: ",f1_score (y_test, y_pred) 136 | svc_clf.fit (X_miss3, y_miss3) 137 | y_pred = svc_clf.predict(X_test) 138 | print "near miss-3: ",f1_score (y_test, y_pred) 139 | 140 | svms = [] 141 | for i in range(0,10,1): 142 | svms.append(LinearSVC(random_state=0).fit(X_resampled[i], y_resampled[i])) 143 | 144 | y_pred = np.asarray([clf.predict(X_test) for clf in svms]).T 145 | y_pred = np.apply_along_axis(lambda x: 146 | np.argmax(np.bincount(x)), 147 | axis=1, 148 | arr=y_pred.astype('int')) 149 | print "easy ensemble: ",f1_score 
(y_test, y_pred) 150 | -------------------------------------------------------------------------------- /cost_sensitive_learning.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | 3 | import pandas as pd 4 | import numpy as np 5 | 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.ensemble import RandomForestClassifier 8 | from sklearn.naive_bayes import GaussianNB 9 | from sklearn.svm import LinearSVC 10 | from sklearn.calibration import CalibratedClassifierCV 11 | from sklearn.metrics import confusion_matrix 12 | from sklearn.utils import shuffle, resample 13 | 14 | 15 | def cost_score(y_pred,y_true,costs=[0,5,1,0]): 16 | """ 17 | """ 18 | tn, fp, fn, tp = confusion_matrix(y_true,y_pred).ravel() 19 | 20 | cost_loss = (tn*costs[0] + fp*costs[1] + fn*costs[2] + tp*costs[3]) 21 | 22 | return cost_loss 23 | 24 | 25 | def read_data(): 26 | """ 27 | """ 28 | 29 | df = pd.read_csv('data/heart.dat', 30 | header=None, 31 | delimiter=' ') 32 | 33 | y_data = df[13].as_matrix() 34 | 35 | df = df.drop(13,axis=1) 36 | df = (df - df.mean()) / (df.max() - df.min()) 37 | 38 | X_data = df.as_matrix() 39 | 40 | X_train, X_test, y_train, y_test = train_test_split(X_data, 41 | y_data, 42 | test_size=0.1, 43 | random_state=0) 44 | 45 | X_train, X_val, y_train, y_val = train_test_split(X_train, 46 | y_train, 47 | test_size=0.1, 48 | random_state=0) 49 | 50 | return X_train, y_train, X_val, y_val, X_test, y_test 51 | 52 | 53 | def default_scores(X_train, y_train, X_val, y_val): 54 | """ 55 | """ 56 | 57 | 58 | svc_model = LinearSVC(random_state=0).fit(X_train,y_train) 59 | 60 | y_pred = svc_model.predict(X_val) 61 | print 'SVC loss:',cost_score(y_pred,y_val) 62 | 63 | rf_model = RandomForestClassifier(random_state=0).fit(X_train,y_train) 64 | 65 | y_pred = rf_model.predict(X_val) 66 | print 'Random Forest loss:',cost_score(y_pred,y_val) 67 | 68 | nb_model = GaussianNB().fit(X_train,y_train) 69 | 70 | y_pred = nb_model.predict(X_val) 71 | print 'Naive Bayes loss:',cost_score(y_pred,y_val) 72 | 73 | return 74 | 75 | 76 | def class_weighting(X_train, y_train, X_val, y_val): 77 | """ 78 | """ 79 | 80 | svc_model = LinearSVC(random_state=0, 81 | class_weight={1:5.,2:1.}).fit(X_train,y_train) 82 | 83 | y_pred = svc_model.predict(X_val) 84 | print 'SVC with class weighting loss:',cost_score(y_pred,y_val) 85 | 86 | rf_model = RandomForestClassifier(random_state=0, 87 | class_weight={1:5.,2:1.}).fit(X_train,y_train) 88 | 89 | y_pred = rf_model.predict(X_val) 90 | print 'Random Forest with class weighting loss:',cost_score(y_pred,y_val) 91 | 92 | 93 | sample_weights = [] 94 | for y in y_train: 95 | if y == 1: 96 | sample_weights.append(5) 97 | elif y == 2: 98 | sample_weights.append(1) 99 | 100 | nb_model = GaussianNB().fit(X_train,y_train,sample_weight=sample_weights) 101 | 102 | 103 | y_pred = nb_model.predict(X_val) 104 | print 'Naive Bayes with class weighting loss:',cost_score(y_pred,y_val) 105 | 106 | return 107 | 108 | 109 | def class_oversampling(X_train, y_train, X_val, y_val): 110 | """ 111 | """ 112 | 113 | positives = np.where( y_train == 1) 114 | X_positives = np.repeat(X_train[positives],4,axis=0) 115 | y_positives = np.repeat(y_train[positives],4) 116 | 117 | X_train_new = np.zeros(((X_train.shape[0]+X_positives.shape[0]),X_train.shape[1])) 118 | y_train_new = np.zeros(((y_train.shape[0]+y_positives.shape[0]),)) 119 | 120 | X_train_new[:X_train.shape[0]] = X_train 121 | X_train_new[X_train.shape[0]:] = X_positives 122 | 
y_train_new[:y_train.shape[0]] = y_train 123 | y_train_new[y_train.shape[0]:] = y_positives 124 | 125 | X_train, y_train = shuffle(X_train_new, y_train_new, random_state=0) 126 | 127 | svc_model = LinearSVC(random_state=0).fit(X_train,y_train) 128 | 129 | y_pred = svc_model.predict(X_val) 130 | print 'SVC after oversampling loss:',cost_score(y_pred,y_val) 131 | 132 | rf_model = RandomForestClassifier(random_state=0).fit(X_train,y_train) 133 | 134 | y_pred = rf_model.predict(X_val) 135 | print 'Random Forest after oversampling loss:',cost_score(y_pred,y_val) 136 | 137 | 138 | nb_model = GaussianNB().fit(X_train,y_train) 139 | 140 | y_pred = nb_model.predict(X_val) 141 | print 'Naive Bayes after oversampling loss:',cost_score(y_pred,y_val) 142 | 143 | return 144 | 145 | 146 | def rejection_sampling(X_train, 147 | y_train, 148 | c=[5.,1.], 149 | zeta=5., 150 | random_state=0): 151 | """ 152 | """ 153 | 154 | X_sample = [] 155 | y_sample = [] 156 | for x,y in zip(X_train,y_train): 157 | if y == 1: 158 | prob = c[0] / zeta 159 | elif y == 2: 160 | prob = c[1] / zeta 161 | 162 | sample_item = np.random.choice([True,False], p=[prob, 1-prob]) 163 | 164 | if sample_item: 165 | X_sample.append(x) 166 | y_sample.append(y) 167 | 168 | return np.array(X_sample),np.array(y_sample) 169 | 170 | 171 | def votting(clf_list, 172 | X_val): 173 | """ 174 | """ 175 | 176 | #For hard voting: 177 | pred = np.asarray([clf.predict(X_val) for clf in clf_list]).T 178 | pred = np.apply_along_axis(lambda x: 179 | np.argmax(np.bincount(x)), 180 | axis=1, 181 | arr=pred.astype('int')) 182 | 183 | return pred 184 | 185 | 186 | def costing(X_train, y_train, X_val, y_val): 187 | """ 188 | """ 189 | 190 | svc_models = [] 191 | rf_models = [] 192 | nb_models = [] 193 | for i in range(10): 194 | X_train_sample, y_train_sample = rejection_sampling(X_train, y_train, random_state=0) 195 | svc_models.append(LinearSVC(random_state=0).fit(X_train_sample,y_train_sample)) 196 | rf_models.append(RandomForestClassifier(random_state=0).fit(X_train_sample,y_train_sample)) 197 | nb_models.append(GaussianNB().fit(X_train_sample,y_train_sample)) 198 | 199 | 200 | y_pred = votting(svc_models,X_val) 201 | print 'SVC with costing loss:',cost_score(y_pred,y_val) 202 | 203 | y_pred = votting(rf_models,X_val) 204 | print 'Random Forest with costing loss:',cost_score(y_pred,y_val) 205 | 206 | 207 | y_pred = votting(nb_models,X_val) 208 | print 'Naive Bayes with costing loss:',cost_score(y_pred,y_val) 209 | 210 | return 211 | 212 | 213 | if __name__ == '__main__': 214 | 215 | X_train, y_train, X_val, y_val, X_test, y_test = read_data() 216 | 217 | default_scores(X_train, y_train, X_val, y_val) 218 | class_weighting(X_train, y_train, X_val, y_val) 219 | class_oversampling(X_train, y_train, X_val, y_val) 220 | costing(X_train,y_train,X_val,y_val) 221 | 222 | 223 | -------------------------------------------------------------------------------- /delicious_loader.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import re 4 | import time 5 | 6 | 7 | from keras.preprocessing import sequence 8 | 9 | 10 | 11 | 12 | 13 | def read_data(file, 14 | lab_file): 15 | """ 16 | """ 17 | 18 | X_data = pd.read_csv(file,header=None) 19 | y_data = pd.read_csv(lab_file,header=None) 20 | 21 | X_data = X_data[0].map(lambda x: re.sub('<\d+>','',x) \ 22 | .strip() \ 23 | .split()) 24 | X_data = X_data.map(lambda x: [int(tok.strip()) for tok in x]) 25 | y_data = y_data[0].map(lambda x: 
np.array([int(lab) for lab in x.split()])) 26 | 27 | return X_data.tolist(),np.array(y_data.tolist()) 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | def read_data_sentences(file, 36 | lab_file, 37 | maxlen, 38 | max_sentence_len): 39 | """ 40 | """ 41 | 42 | X_data = pd.read_csv(file,header=None) 43 | y_data = pd.read_csv(lab_file,header=None) 44 | 45 | X_data = X_data[0].map(lambda x: x.strip()) 46 | 47 | X_data = X_data.map(lambda x: re.findall('<\d+>([^<]+)',x)[1:]) 48 | 49 | X_data = X_data.map(lambda x: [[int(tok.strip()) for tok in sent.strip().split()] for sent in x ]) 50 | 51 | y_data = y_data[0].map(lambda x: np.array([int(lab) for lab in x.split()])) 52 | 53 | X_data = X_data.tolist() 54 | X_data_int = np.zeros((len(X_data),maxlen,max_sentence_len)) 55 | for idx,text_bag in enumerate(X_data): 56 | sentences_batch = np.zeros((maxlen,max_sentence_len)) 57 | sentences = sequence.pad_sequences(text_bag, 58 | maxlen=max_sentence_len, 59 | padding='post', 60 | truncating='post', 61 | dtype='int32') 62 | for j,sent in enumerate(sentences): 63 | if j >= max_sentence_len: 64 | break 65 | sentences_batch[j,:] = sent 66 | X_data_int[idx,:,:] = sentences_batch 67 | 68 | X_data = X_data_int 69 | 70 | return X_data,np.array(y_data.tolist()) 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | def create_ngram_set(input_list, ngram_value=2): 79 | """ 80 | Extract a set of n-grams from a list of integers. 81 | >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2) 82 | {(4, 9), (4, 1), (1, 4), (9, 4)} 83 | >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3) 84 | [(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)] 85 | """ 86 | return set(zip(*[input_list[i:] for i in range(ngram_value)])) 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | def add_ngram(sequences, token_indice, ngram_range=2): 95 | """ 96 | Augment the input list of list (sequences) by appending n-grams values. 
97 | Example: adding bi-gram 98 | >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]] 99 | >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017} 100 | >>> add_ngram(sequences, token_indice, ngram_range=2) 101 | [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]] 102 | Example: adding tri-gram 103 | >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]] 104 | >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018} 105 | >>> add_ngram(sequences, token_indice, ngram_range=3) 106 | [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]] 107 | """ 108 | new_sequences = [] 109 | for input_list in sequences: 110 | new_list = input_list[:] 111 | for ngram_value in range(2, ngram_range + 1): 112 | for i in range(len(new_list) - ngram_value + 1): 113 | ngram = tuple(new_list[i:i + ngram_value]) 114 | if ngram in token_indice: 115 | new_list.append(token_indice[ngram]) 116 | new_sequences.append(new_list) 117 | 118 | return new_sequences 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | def load_dataset(maxlen, 127 | ngram_range=1): 128 | """ 129 | """ 130 | train_data = 'data/delicious/train-data.dat' 131 | train_labels = 'data/delicious/train-label.dat' 132 | val_data = 'data/delicious/valid-data.dat' 133 | val_labels = 'data/delicious/valid-label.dat' 134 | test_data = 'data/delicious/test-data.dat' 135 | test_labels = 'data/delicious/test-label.dat' 136 | vocab_file = 'data/delicious/vocabs.txt' 137 | 138 | 139 | print('Loading data...') 140 | X_train, y_train = read_data(train_data,train_labels) 141 | X_val, y_val = read_data(val_data,val_labels) 142 | X_test, y_test = read_data(test_data,test_labels) 143 | print(len(X_train), 'train sequences') 144 | print(len(X_test), 'test sequences') 145 | print('Average train sequence length: {}'.format(np.mean(list(map(len, X_train)), dtype=int))) 146 | print('Average test sequence length: {}'.format(np.mean(list(map(len, X_test)), dtype=int))) 147 | 148 | 149 | word_index = {} 150 | with open(vocab_file,'r') as vf: 151 | for line in vf: 152 | line = line.strip().split(', ') 153 | key = line[0] 154 | value = int(line[1]) 155 | word_index[key] = value 156 | 157 | max_features = len(word_index) 158 | 159 | if ngram_range > 1: 160 | print('Adding {}-gram features'.format(ngram_range)) 161 | # Create set of unique n-gram from the training set. 162 | ngram_set = set() 163 | for input_list in X_train: 164 | for i in range(2, ngram_range + 1): 165 | set_of_ngram = create_ngram_set(input_list, ngram_value=i) 166 | ngram_set.update(set_of_ngram) 167 | 168 | # Dictionary mapping n-gram token to a unique integer. 169 | # Integer values are greater than max_features in order 170 | # to avoid collision with existing features. 171 | start_index = max_features + 1 172 | token_indice = {v: k + start_index for k, v in enumerate(ngram_set)} 173 | indice_token = {token_indice[k]: k for k in token_indice} 174 | 175 | # max_features is the highest integer that could be found in the dataset. 
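# (Illustrative example, not taken from the actual data: if max_features were 8520 and the
# training set contained only the bigrams (12, 7) and (7, 3), start_index would be 8521,
# token_indice would map the two bigrams to 8521 and 8522 in some order, add_ngram would
# append those ids to every sequence in which the bigrams occur, and the recomputed
# max_features below would become 8523.)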
176 | max_features = np.max(list(indice_token.keys())) + 1 177 | 178 | # Augmenting x_train and x_test with n-grams features 179 | X_train = add_ngram(X_train, token_indice, ngram_range) 180 | X_val = add_ngram(X_val, token_indice, ngram_range) 181 | X_test = add_ngram(X_test, token_indice, ngram_range) 182 | print('Average train sequence length: {}'.format(np.mean(list(map(len, X_train)), dtype=int))) 183 | print('Average val sequence length: {}'.format(np.mean(list(map(len, X_val)), dtype=int))) 184 | print('Average test sequence length: {}'.format(np.mean(list(map(len, X_test)), dtype=int))) 185 | 186 | 187 | print('Pad sequences (samples x time)') 188 | X_train = sequence.pad_sequences(X_train, maxlen=maxlen) 189 | X_val = sequence.pad_sequences(X_val, maxlen=maxlen) 190 | X_test = sequence.pad_sequences(X_test, maxlen=maxlen) 191 | print('X_train shape:', X_train.shape) 192 | print('X_val shape:', X_val.shape) 193 | print('X_test shape:', X_test.shape) 194 | 195 | 196 | return X_train,y_train,X_val,y_val,X_test,y_test,word_index 197 | 198 | 199 | 200 | 201 | 202 | 203 | def load_dataset_hierarchical(maxlen, 204 | max_sentence_len): 205 | """ 206 | """ 207 | train_data = 'data/delicious/train-data.dat' 208 | train_labels = 'data/delicious/train-label.dat' 209 | val_data = 'data/delicious/valid-data.dat' 210 | val_labels = 'data/delicious/valid-label.dat' 211 | test_data = 'data/delicious/test-data.dat' 212 | test_labels = 'data/delicious/test-label.dat' 213 | vocab_file = 'data/delicious/vocabs.txt' 214 | 215 | 216 | print('Loading data...') 217 | X_train, y_train = read_data_sentences(train_data,train_labels,maxlen,max_sentence_len) 218 | X_val, y_val = read_data_sentences(val_data,val_labels,maxlen,max_sentence_len) 219 | X_test, y_test = read_data_sentences(test_data,test_labels,maxlen,max_sentence_len) 220 | 221 | 222 | word_index = {} 223 | with open(vocab_file,'r') as vf: 224 | for line in vf: 225 | line = line.strip().split(', ') 226 | key = line[0] 227 | value = int(line[1]) 228 | word_index[key] = value 229 | 230 | max_features = len(word_index) 231 | 232 | print('X_train shape:', X_train.shape) 233 | print('X_val shape:', X_val.shape) 234 | print('X_test shape:', X_test.shape) 235 | 236 | 237 | return X_train,y_train,X_val,y_val,X_test,y_test,word_index -------------------------------------------------------------------------------- /text_model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from sklearn.tree import DecisionTreeClassifier 7 | from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier 8 | from sklearn.naive_bayes import MultinomialNB 9 | from sklearn import metrics 10 | 11 | from loaders import load_20news,load_imdb,load_sms,load_amazon,load_paper_reviews 12 | from loaders import load_yelp,load_youtube,load_reuters8,load_reuters52,load_webkb 13 | 14 | 15 | if __name__ == '__main__': 16 | """ 17 | 20 newsgroup: 18 | Bagging: max_samples=0.8, max_features=0.7, n_estimators=50 19 | AdaBoost: n_estimators=300, learning_rates=1.7 20 | GradientBoostingClassifier: estimator_nums=100, learning_rates=0.5, max_depths=5 21 | RandomForestClassifier: estimator_nums=100, max_depths=7 22 | 23 | 24 | IMDB: 25 | Bagging: max_samples=0.7, max_features=0.95, n_estimators=40 26 | AdaBoost: n_estimators=200, learning_rates=1.0 27 | GradientBoostingClassifier: estimator_nums=, learning_rates=, max_depths= 28 | 
RandomForestClassifier: estimator_nums=, max_depths= 29 | 30 | 31 | SMSSpamCollection: 32 | Bagging: max_samples=0.4, max_features=0.6, n_estimators=60 33 | AdaBoost: n_estimators=30, learning_rates=1.5 34 | GradientBoostingClassifier: estimator_nums=100, learning_rates=0.5, max_depths=3 35 | RandomForestClassifier: estimator_nums=, max_depths= 36 | 37 | 38 | paper reviews: 39 | Bagging: max_samples=0.4, max_features=0.8, n_estimators=20 40 | AdaBoost: n_estimators=10, learning_rates=0.3 41 | GradientBoostingClassifier: estimator_nums=100, learning_rates=1.0, max_depths=2 42 | RandomForestClassifier: estimator_nums=, max_depths= 43 | 44 | 45 | yelp: 46 | Bagging: max_samples=0.3, max_features=0.95, n_estimators=70 47 | AdaBoost: n_estimators=60, learning_rates=1.5 48 | GradientBoostingClassifier: estimator_nums=, learning_rates=, max_depths= 49 | RandomForestClassifier: estimator_nums=, max_depths= 50 | 51 | 52 | amazon: 53 | Bagging: max_samples=0.3, max_features=0.5, n_estimators=20 54 | AdaBoost: n_estimators=30, learning_rates=0.5 55 | GradientBoostingClassifier: estimator_nums=50, learning_rates=0.5, max_depths=7 56 | RandomForestClassifier: estimator_nums=, max_depths= 57 | 58 | 59 | youtube: 60 | Bagging: max_samples=0.3, max_features=0.6, n_estimators=10 61 | AdaBoost: n_estimators=10, learning_rates=0.5 62 | GradientBoostingClassifier: estimator_nums=50, learning_rates=0.7, max_depths=2 63 | RandomForestClassifier: estimator_nums=, max_depths= 64 | 65 | 66 | reuters8: 67 | Bagging: max_samples=0.5, max_features=0.9, n_estimators=100 68 | AdaBoost: n_estimators=100, learning_rates=1.2 69 | GradientBoostingClassifier: estimator_nums=100, learning_rates=0.5, max_depths=5 70 | RandomForestClassifier: estimator_nums=, max_depths= 71 | 72 | 73 | reuters52: 74 | Bagging: max_samples=0.95, max_features=0.95, n_estimators=50 75 | AdaBoost: n_estimators=250, learning_rates=1.0 76 | GradientBoostingClassifier: estimator_nums=, learning_rates=, max_depths= 77 | RandomForestClassifier: estimator_nums=, max_depths= 78 | 79 | 80 | reuterswebkb: 81 | Bagging: max_samples=0.7, max_features=0.5, n_estimators=100 82 | AdaBoost: n_estimators=50, learning_rates=0.95 83 | GradientBoostingClassifier: estimator_nums=, learning_rates=, max_depths= 84 | RandomForestClassifier: estimator_nums=, max_depths= 85 | """ 86 | 87 | dataset = sys.argv[1] 88 | 89 | if dataset == '20news': 90 | X_train, y_train, X_val, y_val, X_test, y_test = load_20news() 91 | elif dataset == 'imdb': 92 | X_train, y_train, X_val, y_val, X_test, y_test = load_imdb() 93 | elif dataset == 'sms': 94 | X_train, y_train, X_val, y_val, X_test, y_test = load_sms() 95 | elif dataset == 'p_reviews': 96 | X_train, y_train, X_val, y_val, X_test, y_test = load_paper_reviews() 97 | elif dataset == 'yelp': 98 | X_train, y_train, X_val, y_val, X_test, y_test = load_yelp() 99 | elif dataset == 'amazon': 100 | X_train, y_train, X_val, y_val, X_test, y_test = load_amazon() 101 | elif dataset == 'youtube': 102 | X_train, y_train, X_val, y_val, X_test, y_test = load_youtube() 103 | elif dataset == 'r8': 104 | X_train, y_train, X_val, y_val, X_test, y_test = load_reuters8() 105 | elif dataset == 'r52': 106 | X_train, y_train, X_val, y_val, X_test, y_test = load_reuters52() 107 | elif dataset == 'webkb': 108 | X_train, y_train, X_val, y_val, X_test, y_test = load_webkb() 109 | 110 | 111 | clf = MultinomialNB(alpha=.01) 112 | 113 | 114 | clf.fit(X_train, y_train) 115 | preds = clf.predict(X_train) 116 | val_preds = clf.predict(X_test) 117 | print 'NB 
training f-score:',metrics.f1_score(y_train, preds, average='macro') 118 | print 'NB test f-score:',metrics.f1_score(y_test, val_preds, average='macro') 119 | 120 | estimator_nums = [100] 121 | max_samps = [0.7] 122 | max_feats = [0.5] 123 | best_fscore = 0.0 124 | for m in max_samps: 125 | for n in estimator_nums: 126 | for f in max_feats: 127 | bagg_clf = BaggingClassifier(clf, 128 | n_estimators=n, 129 | max_samples=m, 130 | max_features=f, 131 | random_state=0) 132 | bagg_clf.fit(X_train, y_train) 133 | 134 | val_preds = bagg_clf.predict(X_val) 135 | val_score = metrics.f1_score(y_val, val_preds, average='macro') 136 | if val_score > best_fscore: 137 | best_fscore = val_score 138 | best_params = (m,n,f) 139 | best_clf = bagg_clf 140 | 141 | print 'best parameters:',best_params 142 | preds = best_clf.predict(X_train) 143 | val_preds = best_clf.predict(X_test) 144 | print 'Bagging training f-score:',metrics.f1_score(y_train, preds, average='macro') 145 | print 'Bagging test f-score:',metrics.f1_score(y_test, val_preds, average='macro') 146 | 147 | 148 | estimator_nums = [50] 149 | learning_rates = [0.95] 150 | best_fscore = 0.0 151 | for n in estimator_nums: 152 | for lr in learning_rates: 153 | ada_clf = AdaBoostClassifier(clf, 154 | n_estimators=n, 155 | learning_rate=lr, 156 | random_state=0) 157 | ada_clf.fit(X_train, y_train) 158 | 159 | val_preds = ada_clf.predict(X_val) 160 | val_score = metrics.f1_score(y_val, val_preds, average='macro') 161 | if val_score > best_fscore: 162 | best_fscore = val_score 163 | best_params = (n,lr) 164 | best_clf = ada_clf 165 | 166 | print 'best parameters:',best_params 167 | preds = best_clf.predict(X_train) 168 | val_preds = best_clf.predict(X_test) 169 | print 'AdaBoost training f-score:',metrics.f1_score(y_train, preds, average='macro') 170 | print 'AdaBoost test f-score:',metrics.f1_score(y_test, val_preds, average='macro') 171 | 172 | 173 | estimator_nums = [100] 174 | learning_rates = [0.5] 175 | max_depths = [5] 176 | best_fscore = 0.0 177 | for n in estimator_nums: 178 | for lr in learning_rates: 179 | for d in max_depths: 180 | gb_clf = GradientBoostingClassifier(n_estimators=n, 181 | max_depth=d, 182 | learning_rate=lr, 183 | random_state=0) 184 | gb_clf.fit(X_train, y_train) 185 | 186 | val_preds = gb_clf.predict(X_val) 187 | val_score = metrics.f1_score(y_val, val_preds, average='macro') 188 | if val_score > best_fscore: 189 | best_fscore = val_score 190 | best_params = (n,lr,d) 191 | best_clf = gb_clf 192 | 193 | print 'best parameters:',best_params 194 | preds = best_clf.predict(X_train) 195 | val_preds = best_clf.predict(X_test) 196 | print 'Gradient Boosting training f-score:',metrics.f1_score(y_train, preds, average='macro') 197 | print 'Gradient Boosting test f-score:',metrics.f1_score(y_test, val_preds, average='macro') 198 | 199 | 200 | estimator_nums = [50] 201 | max_depths = [7] 202 | best_fscore = 0.0 203 | for n in estimator_nums: 204 | for d in max_depths: 205 | forest_clf = RandomForestClassifier(n_estimators=n, 206 | max_depth=d, 207 | random_state=0) 208 | forest_clf.fit(X_train, y_train) 209 | 210 | val_preds = forest_clf.predict(X_val) 211 | val_score = metrics.f1_score(y_val, val_preds, average='macro') 212 | if val_score > best_fscore: 213 | best_fscore = val_score 214 | best_params = (n,d) 215 | best_clf = forest_clf 216 | 217 | print 'best parameters:',best_params 218 | preds = best_clf.predict(X_train) 219 | val_preds = best_clf.predict(X_test) 220 | print 'Random Forest training f-score:',metrics.f1_score(y_train, 
preds, average='macro') 221 | print 'Random Forest test f-score:',metrics.f1_score(y_test, val_preds, average='macro') 222 | 223 | 224 | 225 | -------------------------------------------------------------------------------- /loaders.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from sklearn.datasets import fetch_20newsgroups ,fetch_rcv1 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | from sklearn.model_selection import train_test_split 9 | 10 | from keras.datasets import imdb 11 | 12 | 13 | def load_20news(): 14 | """ 15 | """ 16 | print 'Loading data...' 17 | newsgroups_train = fetch_20newsgroups(subset='train', 18 | remove=('headers', 'footers', 'quotes'), 19 | shuffle=True) 20 | newsgroups_test = fetch_20newsgroups(subset='test', 21 | remove=('headers', 'footers', 'quotes'), 22 | shuffle=True) 23 | 24 | print 'Preprocessing...' 25 | vectorizer = TfidfVectorizer(strip_accents='unicode', 26 | lowercase=True, 27 | stop_words='english', 28 | ngram_range=(1, 2), 29 | max_df=0.5, 30 | min_df=5, 31 | max_features=20000, 32 | norm='l2', 33 | use_idf=True, 34 | smooth_idf=True, 35 | sublinear_tf=False) 36 | 37 | vectorizer.fit(newsgroups_train.data) 38 | 39 | X_train = vectorizer.transform(newsgroups_train.data) 40 | y_train = newsgroups_train.target 41 | X_test = vectorizer.transform(newsgroups_test.data) 42 | y_test = newsgroups_test.target 43 | 44 | X_train, X_val, y_train, y_val = train_test_split(X_train, 45 | y_train, 46 | test_size=0.2, 47 | random_state=0) 48 | 49 | return X_train, y_train, X_val, y_val, X_test, y_test 50 | 51 | 52 | def load_imdb(): 53 | """ 54 | """ 55 | print 'Loading data...' 56 | 57 | word_to_index = imdb.get_word_index() 58 | index_to_word = [None] * (max(word_to_index.values()) + 1) 59 | for w, i in word_to_index.items(): 60 | index_to_word[i] = w 61 | 62 | (X_train, y_train), (X_test, y_test) = imdb.load_data() 63 | 64 | print 'Preprocessing...' 65 | X_train = [ 66 | ' '.join(index_to_word[i] 67 | for i in X_train[i] 68 | if i < len(index_to_word)) 69 | for i in range(X_train.shape[0]) 70 | ] 71 | 72 | X_test = [ 73 | ' '.join(index_to_word[i] 74 | for i in X_test[i] 75 | if i < len(index_to_word)) 76 | for i in range(X_test.shape[0]) 77 | ] 78 | 79 | vectorizer = TfidfVectorizer(strip_accents='unicode', 80 | lowercase=True, 81 | stop_words='english', 82 | ngram_range=(1, 2), 83 | max_df=0.5, 84 | min_df=5, 85 | max_features=50000, 86 | norm='l2', 87 | use_idf=True, 88 | smooth_idf=True, 89 | sublinear_tf=False) 90 | 91 | vectorizer.fit(X_train) 92 | 93 | X_train = vectorizer.transform(X_train) 94 | X_test = vectorizer.transform(X_test) 95 | 96 | 97 | X_train, X_val, y_train, y_val = train_test_split(X_train, 98 | y_train, 99 | test_size=0.2, 100 | random_state=0) 101 | 102 | return X_train, y_train, X_val, y_val, X_test, y_test 103 | 104 | 105 | def load_sms(): 106 | """ 107 | """ 108 | print 'Loading data...' 
109 | 110 | df = pd.read_csv('data/SMSSpamCollection', 111 | header=None, 112 | delimiter='\t') 113 | 114 | classes = dict((k,idx) for idx,k in enumerate(df[0].unique())) 115 | y_data = df[0].map(lambda x: classes[x]).tolist() 116 | X_data = df[1].tolist() 117 | 118 | vectorizer = TfidfVectorizer(strip_accents='unicode', 119 | lowercase=True, 120 | stop_words='english', 121 | ngram_range=(1, 2), 122 | max_df=0.5, 123 | min_df=5, 124 | max_features=50000, 125 | norm='l2', 126 | use_idf=True, 127 | smooth_idf=True, 128 | sublinear_tf=False) 129 | 130 | vectorizer.fit(X_data) 131 | X_data = vectorizer.transform(X_data) 132 | 133 | X_train, X_test, y_train, y_test = train_test_split(X_data, 134 | y_data, 135 | test_size=0.1, 136 | random_state=0) 137 | 138 | X_train, X_val, y_train, y_val = train_test_split(X_train, 139 | y_train, 140 | test_size=0.2, 141 | random_state=0) 142 | 143 | return X_train, y_train, X_val, y_val, X_test, y_test 144 | 145 | 146 | def load_paper_reviews(): 147 | """ 148 | """ 149 | df = pd.read_json('data/reviews.json') 150 | class2id = dict((k,idx) for idx,k in enumerate(df['preliminary_decision'].unique())) 151 | 152 | y_data = df['preliminary_decision'].map(lambda x: class2id[x]).tolist() 153 | 154 | 155 | X_data = df['review'] 156 | X_list = [] 157 | y_list = [] 158 | 159 | for i,(review,lab) in enumerate(zip(X_data,y_data)): 160 | try: 161 | X_list.append(review[0]['text']) 162 | y_list.append(lab) 163 | except: 164 | continue 165 | 166 | y_data = y_list 167 | 168 | print 'Preprocessing...' 169 | vectorizer = TfidfVectorizer(strip_accents='unicode', 170 | lowercase=True, 171 | stop_words='english', 172 | ngram_range=(1, 2), 173 | max_df=0.5, 174 | min_df=5, 175 | max_features=10000, 176 | norm='l2', 177 | use_idf=True, 178 | smooth_idf=True, 179 | sublinear_tf=False) 180 | 181 | vectorizer.fit(X_list) 182 | 183 | X_data = vectorizer.transform(X_list) 184 | 185 | X_train, X_test, y_train, y_test = train_test_split(X_data, 186 | y_data, 187 | test_size=0.1, 188 | random_state=0) 189 | 190 | X_train, X_val, y_train, y_val = train_test_split(X_train, 191 | y_train, 192 | test_size=0.2, 193 | random_state=0) 194 | 195 | return X_train, y_train, X_val, y_val, X_test, y_test 196 | 197 | 198 | def load_yelp(): 199 | """ 200 | """ 201 | df = pd.read_json('data/yelp.json',orient='records',lines=True, encoding='utf-8') 202 | 203 | X_data = df['text'].tolist() 204 | y_data = df['stars'].tolist() 205 | 206 | print 'Preprocessing...' 207 | vectorizer = TfidfVectorizer(strip_accents='unicode', 208 | lowercase=True, 209 | stop_words='english', 210 | ngram_range=(1, 2), 211 | max_df=0.5, 212 | min_df=5, 213 | max_features=20000, 214 | norm='l2', 215 | use_idf=True, 216 | smooth_idf=True, 217 | sublinear_tf=False) 218 | 219 | vectorizer.fit(X_data) 220 | 221 | X_data = vectorizer.transform(X_data) 222 | 223 | X_train, X_test, y_train, y_test = train_test_split(X_data, 224 | y_data, 225 | test_size=0.1, 226 | random_state=0) 227 | 228 | X_train, X_val, y_train, y_val = train_test_split(X_train, 229 | y_train, 230 | test_size=0.2, 231 | random_state=0) 232 | 233 | return X_train, y_train, X_val, y_val, X_test, y_test 234 | 235 | 236 | def load_amazon(): 237 | """ 238 | """ 239 | df = pd.read_csv('data/amazon.txt', 240 | header=None, 241 | delimiter='\t') 242 | 243 | X_data = df[0].tolist() 244 | y_data = df[1].tolist() 245 | 246 | print 'Preprocessing...' 
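# The loaders share a similar TF-IDF setup: unigram and bigram features, terms kept only if
# they appear in at least 5 documents and in at most half of them, L2-normalised weights,
# and a per-dataset cap on the vocabulary size (20,000 here).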
247 | vectorizer = TfidfVectorizer(strip_accents='unicode', 248 | lowercase=True, 249 | stop_words='english', 250 | ngram_range=(1, 2), 251 | max_df=0.5, 252 | min_df=5, 253 | max_features=20000, 254 | norm='l2', 255 | use_idf=True, 256 | smooth_idf=True, 257 | sublinear_tf=False) 258 | 259 | vectorizer.fit(X_data) 260 | 261 | X_data = vectorizer.transform(X_data) 262 | 263 | X_train, X_test, y_train, y_test = train_test_split(X_data, 264 | y_data, 265 | test_size=0.1, 266 | random_state=0) 267 | 268 | X_train, X_val, y_train, y_val = train_test_split(X_train, 269 | y_train, 270 | test_size=0.2, 271 | random_state=0) 272 | 273 | return X_train, y_train, X_val, y_val, X_test, y_test 274 | 275 | 276 | def load_youtube(): 277 | """ 278 | """ 279 | df = pd.read_csv('data/untitled1/Youtube01-Psy.csv') 280 | df = df.append(pd.read_csv('data/untitled1/Youtube02-KatyPerry.csv')) 281 | df = df.append(pd.read_csv('data/untitled1/Youtube03-LMFAO.csv')) 282 | df = df.append(pd.read_csv('data/untitled1/Youtube04-Eminem.csv')) 283 | df = df.append(pd.read_csv('data/untitled1/Youtube05-Shakira.csv')) 284 | 285 | X_data = df["CONTENT"].tolist() 286 | y_data = df["CLASS"].tolist() 287 | 288 | 289 | print 'Preprocessing...' 290 | vectorizer = TfidfVectorizer(strip_accents='unicode', 291 | lowercase=True, 292 | stop_words='english', 293 | ngram_range=(1, 2), 294 | max_df=0.5, 295 | min_df=5, 296 | max_features=20000, 297 | norm='l2', 298 | use_idf=True, 299 | smooth_idf=True, 300 | sublinear_tf=False) 301 | 302 | vectorizer.fit(X_data) 303 | 304 | X_data = vectorizer.transform(X_data) 305 | 306 | X_train, X_test, y_train, y_test = train_test_split(X_data, 307 | y_data, 308 | test_size=0.1, 309 | random_state=0) 310 | 311 | X_train, X_val, y_train, y_val = train_test_split(X_train, 312 | y_train, 313 | test_size=0.2, 314 | random_state=0) 315 | 316 | return X_train, y_train, X_val, y_val, X_test, y_test 317 | 318 | 319 | def load_reuters8(): 320 | """ 321 | """ 322 | 323 | df = pd.read_csv('data/r8-train-all-terms.txt', 324 | header=None, 325 | delimiter='\t') 326 | 327 | test_df = pd.read_csv('data/r8-test-all-terms.txt', 328 | header=None, 329 | delimiter='\t') 330 | 331 | class2id = dict((k,idx) for idx,k in enumerate(df[0].unique())) 332 | 333 | X_train = df[1].tolist() 334 | X_test = test_df[1].tolist() 335 | 336 | y_train = df[0].map(lambda x: class2id[x]).tolist() 337 | y_test = test_df[0].map(lambda x: class2id[x]).tolist() 338 | 339 | print 'Preprocessing...' 
340 | vectorizer = TfidfVectorizer(strip_accents='unicode', 341 | lowercase=True, 342 | stop_words='english', 343 | ngram_range=(1, 2), 344 | max_df=0.5, 345 | min_df=5, 346 | max_features=20000, 347 | norm='l2', 348 | use_idf=True, 349 | smooth_idf=True, 350 | sublinear_tf=False) 351 | 352 | vectorizer.fit(X_train) 353 | 354 | X_train = vectorizer.transform(X_train) 355 | X_test = vectorizer.transform(X_test) 356 | 357 | X_train, X_val, y_train, y_val = train_test_split(X_train, 358 | y_train, 359 | test_size=0.2, 360 | random_state=0) 361 | 362 | return X_train, y_train, X_val, y_val, X_test, y_test 363 | 364 | 365 | def load_reuters52(): 366 | """ 367 | """ 368 | 369 | df = pd.read_csv('data/r52-train-all-terms.txt', 370 | header=None, 371 | delimiter='\t') 372 | 373 | test_df = pd.read_csv('data/r52-test-all-terms.txt', 374 | header=None, 375 | delimiter='\t') 376 | 377 | class2id = dict((k,idx) for idx,k in enumerate(df[0].unique())) 378 | 379 | X_train = df[1].tolist() 380 | X_test = test_df[1].tolist() 381 | 382 | y_train = df[0].map(lambda x: class2id[x]).tolist() 383 | y_test = test_df[0].map(lambda x: class2id[x]).tolist() 384 | 385 | print 'Preprocessing...' 386 | vectorizer = TfidfVectorizer(strip_accents='unicode', 387 | lowercase=True, 388 | stop_words='english', 389 | ngram_range=(1, 2), 390 | max_df=0.5, 391 | min_df=5, 392 | max_features=20000, 393 | norm='l2', 394 | use_idf=True, 395 | smooth_idf=True, 396 | sublinear_tf=False) 397 | 398 | vectorizer.fit(X_train) 399 | 400 | X_train = vectorizer.transform(X_train) 401 | X_test = vectorizer.transform(X_test) 402 | 403 | X_train, X_val, y_train, y_val = train_test_split(X_train, 404 | y_train, 405 | test_size=0.2, 406 | random_state=0) 407 | 408 | return X_train, y_train, X_val, y_val, X_test, y_test 409 | 410 | 411 | def load_webkb(): 412 | """ 413 | """ 414 | 415 | df = pd.read_csv('data/web.txt', 416 | header=None, 417 | delimiter='\t') 418 | 419 | class2id = dict((k,idx) for idx,k in enumerate(df[0].unique())) 420 | 421 | y_data = df[0].map(lambda x: class2id[x]).tolist() 422 | X_list = df[1].tolist() 423 | 424 | 425 | X_data = [] 426 | y_list = [] 427 | for x,y in zip(X_list,y_data): 428 | try: 429 | if np.isnan(x): 430 | continue 431 | except: 432 | pass 433 | X_data.append(x) 434 | y_list.append(y) 435 | 436 | y_data = y_list 437 | 438 | print 'Preprocessing...' 439 | vectorizer = TfidfVectorizer(strip_accents='unicode', 440 | lowercase=True, 441 | stop_words='english', 442 | ngram_range=(1, 2), 443 | max_df=0.5, 444 | min_df=5, 445 | max_features=10000, 446 | norm='l2', 447 | use_idf=True, 448 | smooth_idf=True, 449 | sublinear_tf=False) 450 | 451 | vectorizer.fit(X_data) 452 | 453 | X_data = vectorizer.transform(X_data) 454 | 455 | X_train, X_test, y_train, y_test = train_test_split(X_data, 456 | y_data, 457 | test_size=0.1, 458 | random_state=0) 459 | 460 | X_train, X_val, y_train, y_val = train_test_split(X_train, 461 | y_train, 462 | test_size=0.2, 463 | random_state=0) 464 | 465 | return X_train, y_train, X_val, y_val, X_test, y_test --------------------------------------------------------------------------------
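Usage note (a minimal sketch, assuming the raw datasets referenced by the hard-coded paths in loaders.py are available under data/): text_model.py selects its dataset from the first command-line argument, matching the keys in its __main__ block, and the scripts use Python 2 print statements, e.g.

    python text_model.py 20news
    python text_model.py r8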