├── README.md ├── LICENSE ├── .gitignore ├── uncertainty_sampling.py ├── multilabel.py ├── hierarchical.py ├── imballanced_classes.py ├── cost_sensitive_learning.py ├── delicious_loader.py ├── text_model.py └── loaders.py /README.md: -------------------------------------------------------------------------------- 1 | # Advanced-ML-techniques 2 | This repo contains implementations of advanced ML techniques, including model ensembles, cost-sensitive learning, multi-label classification, active learning with uncertainty sampling, and handling of imbalanced classes. 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Alex Gidiotis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /uncertainty_sampling.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from delicious_loader import load_dataset 4 | from sklearn import svm 5 | from sklearn.linear_model import LogisticRegression 6 | import matplotlib.pyplot as plt 7 | 8 | if __name__ == '__main__': 9 | ngram_range = 1 10 | maxlen = 200 11 | 12 | X_train, y_train, X_val, y_val, X_test, y_test, word_index = load_dataset(ngram_range=ngram_range, maxlen=maxlen) 13 | 14 | # Column 19 is the rarest 15 | 16 | # y_data = pd.read_csv('data/delicious/train-label.dat', header=None, sep=' ') 17 | # for col in range(0, 20, 1): 18 | # print y_data[col].sum() 19 | 20 | X_test_unlabeled_pool = X_test[:1992, :] 21 | X_test_test = X_test[1992:, :] 22 | y_test_unlabeled_pool = y_test[:1992, -1] 23 | y_test_test = y_test[1992:, -1] 24 | 25 | acc = []  # error rate (1 - accuracy) on the held-out test split 26 | train_acc = []  # error rate (1 - accuracy) on the growing training set 27 | dim = [] 28 | 29 | for k in range(0,10,1): 30 | clf = LogisticRegression() 31 | print 'Size of X: ', len(X_train), X_train.shape, type(X_train) 32 | clf.fit(X_train, y_train[:, -1]) 33 | 34 | preds = clf.decision_function(X_test_unlabeled_pool) 35 | 36 | values = [] 37 | positions = [] 38 | for i in range(0, len(X_test_unlabeled_pool), 1): 39 | values.append(abs(preds[i])) 40 | positions.append(i) 41 | 42 | for i in range(10): 43 | pos = np.array(values).argmin() 44 | # print np.array(values).min() 45 | X_train_new = np.zeros(((X_train.shape[0] + 1), X_train.shape[1])) 46 | y_train_new = np.zeros(((y_train[:, -1].shape[0] + 1), 1)) 47 | 48 | X_train_new[:X_train.shape[0]] = X_train 49 | X_train_new[X_train.shape[0]:] = X_test_unlabeled_pool[pos, :] 50 | y_train_new[:y_train.shape[0],0] = y_train[:, -1] 51 | y_train_new[y_train.shape[0]:] = y_test_unlabeled_pool[pos] 52 | X_train = X_train_new 53 | y_train = y_train_new 54 | print len(X_train), y_train.shape 55 | X_test_unlabeled_pool = np.delete(X_test_unlabeled_pool, pos, 0)  # remove the queried example from the unlabeled pool 56 | y_test_unlabeled_pool = np.delete(y_test_unlabeled_pool, pos, 0) 57 | del values[pos] 58 | del positions[pos] 59 | 60 | acc.append((1-clf.score(X_test_test, y_test_test))) 61 | train_acc.append((1-clf.score(X_train, y_train))) 62 | dim.append(X_train.shape[0]) 63 | 64 | plt.plot(dim, acc, dim, train_acc) 65 | plt.show() 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /multilabel.py:
-------------------------------------------------------------------------------- 1 | from delicious_loader import load_dataset 2 | 3 | 4 | import numpy as np 5 | from sklearn.metrics import precision_recall_fscore_support, roc_auc_score 6 | 7 | 8 | from keras.models import Model, model_from_json 9 | from keras.layers import Dense, Input, Embedding, GlobalAveragePooling1D 10 | from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback 11 | from keras import regularizers 12 | 13 | import tensorflow as tf 14 | 15 | 16 | 17 | 18 | 19 | 20 | def f1_score(y_true, y_pred): 21 | """ 22 | Compute the micro f(b) score with b=1. 23 | """ 24 | y_true = tf.cast(y_true, "float32") 25 | y_pred = tf.cast(tf.round(y_pred), "float32") # implicit 0.5 threshold via tf.round 26 | y_correct = y_true * y_pred 27 | 28 | 29 | sum_true = tf.reduce_sum(y_true, axis=1) 30 | sum_pred = tf.reduce_sum(y_pred, axis=1) 31 | sum_correct = tf.reduce_sum(y_correct, axis=1) 32 | 33 | 34 | precision = sum_correct / sum_pred 35 | recall = sum_correct / sum_true 36 | f_score = 2 * precision * recall / (precision + recall) 37 | f_score = tf.where(tf.is_nan(f_score), tf.zeros_like(f_score), f_score) 38 | 39 | 40 | return tf.reduce_mean(f_score) 41 | 42 | 43 | 44 | 45 | 46 | 47 | def build_model(num_features, 48 | num_classes, 49 | embedding_dims, 50 | maxlen): 51 | """ 52 | """ 53 | 54 | input_layer = Input(shape=(maxlen,), 55 | dtype='int32') 56 | 57 | 58 | embeddings = Embedding(num_features, 59 | embedding_dims, 60 | input_length=maxlen, 61 | embeddings_regularizer=regularizers.l1(7e-7))(input_layer) 62 | 63 | avg_layer = GlobalAveragePooling1D()(embeddings) 64 | predictions = Dense(num_classes, activation='sigmoid')(avg_layer) 65 | 66 | model = Model(inputs=input_layer, outputs=predictions) 67 | model.compile(loss='binary_crossentropy', 68 | optimizer='adam', 69 | metrics=[f1_score]) 70 | 71 | model.summary() 72 | 73 | return model 74 | 75 | 76 | 77 | 78 | 79 | 80 | def load_model(): 81 | """ 82 | """ 83 | 84 | json_file = open('multilabel_model.json', 'r') 85 | loaded_model_json = json_file.read() 86 | json_file.close() 87 | model = model_from_json(loaded_model_json) 88 | 89 | model.load_weights('multilabel_model.h5') 90 | print("Loaded model from disk") 91 | 92 | model.summary() 93 | 94 | model.compile(loss='binary_crossentropy', 95 | optimizer='adam', 96 | metrics=[f1_score]) 97 | 98 | 99 | return model 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | if __name__ == '__main__': 109 | 110 | ngram_range = 1 111 | maxlen = 200 112 | batch_size = 32 113 | embedding_dims = 50 114 | epochs = 500 115 | num_classes = 20 116 | 117 | 118 | 119 | X_train,y_train,X_val,y_val,X_test,y_test,word_index = load_dataset(ngram_range=ngram_range,maxlen=maxlen) 120 | 121 | num_features = len(word_index) 122 | print('Found %d words' % num_features) 123 | 124 | 125 | model = build_model(num_features,num_classes,embedding_dims,maxlen) 126 | 127 | model_json = model.to_json() 128 | with open("multilabel_model.json", "w") as json_file: 129 | json_file.write(model_json) 130 | 131 | 132 | early_stopping =EarlyStopping(monitor='val_f1_score', 133 | patience=15, 134 | mode='max') 135 | bst_model_path = 'multilabel_model.h5' 136 | model_checkpoint = ModelCheckpoint(bst_model_path, 137 | monitor='val_f1_score', 138 | verbose=1, 139 | save_best_only=True, 140 | mode='max', 141 | save_weights_only=True) 142 | 143 | model.fit(X_train, y_train, 144 | batch_size=batch_size, 145 | epochs=epochs, 146 | validation_data=(X_val, y_val), 147 | 
callbacks=[model_checkpoint,early_stopping]) 148 | 149 | 150 | model = load_model() 151 | y_pred = model.predict(X_test) 152 | 153 | print 'AUC:',roc_auc_score(y_test, y_pred) 154 | y_pred[y_pred > 0.25] = 1 155 | y_pred[y_pred <= 0.25] = 0 156 | 157 | 158 | for i in range(10): 159 | pred,lab = y_pred[i],y_test[i] 160 | print np.where(pred == 1), np.where(lab == 1) 161 | 162 | 163 | print precision_recall_fscore_support(y_test, y_pred, average='micro') 164 | print precision_recall_fscore_support(y_test, y_pred, average='macro') -------------------------------------------------------------------------------- /hierarchical.py: -------------------------------------------------------------------------------- 1 | from delicious_loader import load_dataset_hierarchical 2 | 3 | 4 | import numpy as np 5 | from sklearn.metrics import precision_recall_fscore_support, roc_auc_score 6 | 7 | 8 | from keras.models import Model, model_from_json 9 | from keras.layers import Dense, Input, Embedding, GlobalAveragePooling1D, TimeDistributed, LSTM, Dropout, Flatten 10 | from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback 11 | from keras.layers.wrappers import Bidirectional 12 | from keras import regularizers 13 | 14 | import tensorflow as tf 15 | 16 | 17 | 18 | 19 | 20 | 21 | def f1_score(y_true, y_pred): 22 | """ 23 | Compute the micro f(b) score with b=1. 24 | """ 25 | y_true = tf.cast(y_true, "float32") 26 | y_pred = tf.cast(tf.round(y_pred), "float32") # implicit 0.5 threshold via tf.round 27 | y_correct = y_true * y_pred 28 | 29 | 30 | sum_true = tf.reduce_sum(y_true, axis=1) 31 | sum_pred = tf.reduce_sum(y_pred, axis=1) 32 | sum_correct = tf.reduce_sum(y_correct, axis=1) 33 | 34 | 35 | precision = sum_correct / sum_pred 36 | recall = sum_correct / sum_true 37 | f_score = 2 * precision * recall / (precision + recall) 38 | f_score = tf.where(tf.is_nan(f_score), tf.zeros_like(f_score), f_score) 39 | 40 | 41 | return tf.reduce_mean(f_score) 42 | 43 | 44 | 45 | 46 | 47 | 48 | def build_model(num_features, 49 | num_classes, 50 | embedding_dims, 51 | maxlen, 52 | max_sentence_len): 53 | """ 54 | """ 55 | 56 | input_layer = Input(shape=(maxlen,max_sentence_len,), 57 | dtype='int32') 58 | sentence_input = Input(shape=(max_sentence_len,), 59 | dtype='int32') 60 | 61 | embeddings = Embedding(num_features, 62 | embedding_dims, 63 | input_length=max_sentence_len, 64 | embeddings_regularizer=regularizers.l1(1e-6))(sentence_input) 65 | 66 | avg_layer = GlobalAveragePooling1D()(embeddings) 67 | sentEncoder = Model(inputs=sentence_input, 68 | outputs=avg_layer) 69 | sentEncoder.summary() 70 | textEncoder = TimeDistributed(sentEncoder)(input_layer) 71 | 72 | global_avg_layer = Flatten()(textEncoder) 73 | 74 | global_avg_layer = Dropout(0.5)(global_avg_layer) 75 | predictions = Dense(num_classes, 76 | activation='sigmoid', 77 | kernel_regularizer=regularizers.l1(1e-5))(global_avg_layer) 78 | 79 | model = Model(inputs=input_layer, 80 | outputs=predictions) 81 | model.compile(loss='binary_crossentropy', 82 | optimizer='adam', 83 | metrics=[f1_score]) 84 | 85 | model.summary() 86 | 87 | return model 88 | 89 | 90 | 91 | 92 | 93 | 94 | def load_model(): 95 | """ 96 | """ 97 | 98 | json_file = open('hierarchical_model.json', 'r') 99 | loaded_model_json = json_file.read() 100 | json_file.close() 101 | model = model_from_json(loaded_model_json) 102 | 103 | model.load_weights('hierarchical_model.h5') 104 | print("Loaded model from disk") 105 | 106 | model.summary() 107 | 108 | 
model.compile(loss='binary_crossentropy', 109 | optimizer='adam', 110 | metrics=[f1_score]) 111 | 112 | 113 | return model 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | if __name__ == '__main__': 123 | 124 | ngram_range = 1 125 | maxlen = 20 126 | max_sentence_len = 10 127 | batch_size = 32 128 | embedding_dims = 50 129 | epochs = 500 130 | num_classes = 20 131 | 132 | 133 | X_train,y_train,X_val,y_val,X_test,y_test,word_index = load_dataset_hierarchical(maxlen,max_sentence_len) 134 | 135 | 136 | 137 | num_features = len(word_index) 138 | print('Found %d words' % num_features) 139 | 140 | ''' 141 | model = build_model(num_features,num_classes,embedding_dims,maxlen,max_sentence_len) 142 | 143 | model_json = model.to_json() 144 | with open("hierarchical_model.json", "w") as json_file: 145 | json_file.write(model_json) 146 | 147 | 148 | early_stopping =EarlyStopping(monitor='val_f1_score', 149 | patience=15, 150 | mode='max') 151 | bst_model_path = 'hierarchical_model.h5' 152 | model_checkpoint = ModelCheckpoint(bst_model_path, 153 | monitor='val_f1_score', 154 | verbose=1, 155 | save_best_only=True, 156 | mode='max', 157 | save_weights_only=True) 158 | 159 | model.fit(X_train, y_train, 160 | batch_size=batch_size, 161 | epochs=epochs, 162 | validation_data=(X_val, y_val), 163 | callbacks=[model_checkpoint,early_stopping]) 164 | 165 | ''' 166 | model = load_model() 167 | y_pred = model.predict(X_test) 168 | 169 | print 'AUC:',roc_auc_score(y_test, y_pred) 170 | y_pred[y_pred > 0.25] = 1 171 | y_pred[y_pred <= 0.25] = 0 172 | 173 | 174 | for i in range(10): 175 | pred,lab = y_pred[i],y_test[i] 176 | print np.where(pred == 1), np.where(lab == 1) 177 | 178 | 179 | print precision_recall_fscore_support(y_test, y_pred, average='micro') 180 | print precision_recall_fscore_support(y_test, y_pred, average='macro') 181 | -------------------------------------------------------------------------------- /imballanced_classes.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from sklearn.feature_extraction.text import TfidfVectorizer 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.naive_bayes import GaussianNB 7 | from imblearn.over_sampling import SMOTE 8 | from sklearn.metrics import f1_score 9 | from sklearn.metrics import recall_score, precision_score 10 | from sklearn.ensemble import RandomForestClassifier 11 | from sklearn.svm import LinearSVC 12 | from imblearn.under_sampling import NearMiss 13 | from imblearn.ensemble import EasyEnsemble 14 | 15 | if __name__ == '__main__': 16 | df = pd.read_csv('creditcard.csv') 17 | 18 | Y_data = [] 19 | Y_data = df['Class'].tolist() 20 | df = df.drop('Class',axis=1) 21 | df = (df - df.mean()) / ((df.max() - df.min())) 22 | X_data = df.as_matrix() 23 | 24 | X_train, X_test, y_train, y_test = train_test_split(X_data, 25 | Y_data, 26 | test_size=0.1, 27 | random_state=0) 28 | 29 | X_train, X_val, y_train, y_val = train_test_split(X_train, 30 | y_train, 31 | test_size=0.1, 32 | random_state=0) 33 | 34 | #Oversampling (SMOTE) 35 | sm = SMOTE() 36 | X_smote, y_smote = sm.fit_sample(X_train, y_train) 37 | 38 | #Undersampling (Distance-based Near Miss 1,2,3) 39 | nm1 = NearMiss(version = 1) 40 | X_miss1, y_miss1 = nm1.fit_sample(X_train, y_train) 41 | nm2 = NearMiss(version = 2) 42 | X_miss2, y_miss2 = nm2.fit_sample(X_train, y_train) 43 | nm3 = NearMiss(version = 3) 44 | X_miss3, y_miss3 = nm3.fit_sample(X_train, y_train) 45 | 46 | #Undersampling (EasyEnsemble) 47 | ee = 
EasyEnsemble(n_subsets=30) 48 | X_resampled, y_resampled = ee.fit_sample(X_train, y_train) 49 | 50 | 51 | print "Naive Bayes" 52 | naive_clf = GaussianNB() 53 | naive_clf.fit (X_train, y_train) 54 | y_pred = naive_clf.predict(X_test) 55 | print "initial: ",f1_score (y_test, y_pred) 56 | 57 | naive_clf.fit (X_smote, y_smote) 58 | y_pred = naive_clf.predict(X_test) 59 | print "smote: ",f1_score (y_test, y_pred) 60 | 61 | naive_clf.fit (X_miss1, y_miss1) 62 | y_pred = naive_clf.predict(X_test) 63 | print "near miss-1: ",f1_score (y_test, y_pred) 64 | naive_clf.fit (X_miss2, y_miss2) 65 | y_pred = naive_clf.predict(X_test) 66 | print "near miss-2: ",f1_score (y_test, y_pred) 67 | naive_clf.fit (X_miss3, y_miss3) 68 | y_pred = naive_clf.predict(X_test) 69 | print "near miss-3: ",f1_score (y_test, y_pred) 70 | 71 | NBclassifiers = [] 72 | for i in range(0,10,1): 73 | NBclassifiers.append(GaussianNB().fit(X_resampled[i], y_resampled[i])) 74 | 75 | y_pred = np.asarray([clf.predict(X_test) for clf in NBclassifiers]).T 76 | y_pred = np.apply_along_axis(lambda x: 77 | np.argmax(np.bincount(x)), 78 | axis=1, 79 | arr=y_pred.astype('int')) 80 | print "easy ensemble: ",f1_score (y_test, y_pred) 81 | 82 | 83 | 84 | print "Random Forest" 85 | forest_clf = RandomForestClassifier(n_estimators=50, 86 | max_depth=10, 87 | random_state=0) 88 | forest_clf.fit(X_train, y_train) 89 | y_pred = forest_clf.predict(X_test) 90 | print "initial: ", f1_score (y_test, y_pred) 91 | 92 | forest_clf.fit (X_smote, y_smote) 93 | y_pred = forest_clf.predict(X_test) 94 | print "smote: ",f1_score (y_test, y_pred) 95 | 96 | forest_clf.fit (X_miss1, y_miss1) 97 | y_pred = forest_clf.predict(X_test) 98 | print "near miss-1: ",f1_score (y_test, y_pred) 99 | forest_clf.fit (X_miss2, y_miss2) 100 | y_pred = forest_clf.predict(X_test) 101 | print "near miss-2: ",f1_score (y_test, y_pred) 102 | forest_clf.fit (X_miss3, y_miss3) 103 | y_pred = forest_clf.predict(X_test) 104 | print "near miss-3: ",f1_score (y_test, y_pred) 105 | 106 | forests = [] 107 | for i in range(0,10,1): 108 | forests.append(RandomForestClassifier(n_estimators=20, max_depth=5, 109 | random_state=0).fit(X_resampled[i], y_resampled[i])) 110 | 111 | y_pred = np.asarray([clf.predict(X_test) for clf in forests]).T 112 | y_pred = np.apply_along_axis(lambda x: 113 | np.argmax(np.bincount(x)), 114 | axis=1, 115 | arr=y_pred.astype('int')) 116 | print "easy ensemble: ",f1_score (y_test, y_pred) 117 | 118 | 119 | 120 | print "SVM" 121 | svc_clf = LinearSVC(random_state=0) 122 | svc_clf.fit(X_train,y_train) 123 | y_pred = svc_clf.predict(X_test) 124 | print "initial: ",f1_score (y_test, y_pred) 125 | 126 | svc_clf.fit (X_smote, y_smote) 127 | y_pred = svc_clf.predict(X_test) 128 | print "smote: ",f1_score (y_test, y_pred) 129 | 130 | svc_clf.fit (X_miss1, y_miss1) 131 | y_pred = svc_clf.predict(X_test) 132 | print "near miss-1: ",f1_score (y_test, y_pred) 133 | svc_clf.fit (X_miss2, y_miss2) 134 | y_pred = svc_clf.predict(X_test) 135 | print "near miss-2: ",f1_score (y_test, y_pred) 136 | svc_clf.fit (X_miss3, y_miss3) 137 | y_pred = svc_clf.predict(X_test) 138 | print "near miss-3: ",f1_score (y_test, y_pred) 139 | 140 | svms = [] 141 | for i in range(0,10,1): 142 | svms.append(LinearSVC(random_state=0).fit(X_resampled[i], y_resampled[i])) 143 | 144 | y_pred = np.asarray([clf.predict(X_test) for clf in svms]).T 145 | y_pred = np.apply_along_axis(lambda x: 146 | np.argmax(np.bincount(x)), 147 | axis=1, 148 | arr=y_pred.astype('int')) 149 | print "easy ensemble: ",f1_score 
(y_test, y_pred) 150 | -------------------------------------------------------------------------------- /cost_sensitive_learning.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | 3 | import pandas as pd 4 | import numpy as np 5 | 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.ensemble import RandomForestClassifier 8 | from sklearn.naive_bayes import GaussianNB 9 | from sklearn.svm import LinearSVC 10 | from sklearn.calibration import CalibratedClassifierCV 11 | from sklearn.metrics import confusion_matrix 12 | from sklearn.utils import shuffle, resample 13 | 14 | 15 | def cost_score(y_pred,y_true,costs=[0,5,1,0]): 16 | """ 17 | """ 18 | tn, fp, fn, tp = confusion_matrix(y_true,y_pred).ravel() 19 | 20 | cost_loss = (tn*costs[0] + fp*costs[1] + fn*costs[2] + tp*costs[3]) 21 | 22 | return cost_loss 23 | 24 | 25 | def read_data(): 26 | """ 27 | """ 28 | 29 | df = pd.read_csv('data/heart.dat', 30 | header=None, 31 | delimiter=' ') 32 | 33 | y_data = df[13].as_matrix() 34 | 35 | df = df.drop(13,axis=1) 36 | df = (df - df.mean()) / (df.max() - df.min()) 37 | 38 | X_data = df.as_matrix() 39 | 40 | X_train, X_test, y_train, y_test = train_test_split(X_data, 41 | y_data, 42 | test_size=0.1, 43 | random_state=0) 44 | 45 | X_train, X_val, y_train, y_val = train_test_split(X_train, 46 | y_train, 47 | test_size=0.1, 48 | random_state=0) 49 | 50 | return X_train, y_train, X_val, y_val, X_test, y_test 51 | 52 | 53 | def default_scores(X_train, y_train, X_val, y_val): 54 | """ 55 | """ 56 | 57 | 58 | svc_model = LinearSVC(random_state=0).fit(X_train,y_train) 59 | 60 | y_pred = svc_model.predict(X_val) 61 | print 'SVC loss:',cost_score(y_pred,y_val) 62 | 63 | rf_model = RandomForestClassifier(random_state=0).fit(X_train,y_train) 64 | 65 | y_pred = rf_model.predict(X_val) 66 | print 'Random Forest loss:',cost_score(y_pred,y_val) 67 | 68 | nb_model = GaussianNB().fit(X_train,y_train) 69 | 70 | y_pred = nb_model.predict(X_val) 71 | print 'Naive Bayes loss:',cost_score(y_pred,y_val) 72 | 73 | return 74 | 75 | 76 | def class_weighting(X_train, y_train, X_val, y_val): 77 | """ 78 | """ 79 | 80 | svc_model = LinearSVC(random_state=0, 81 | class_weight={1:5.,2:1.}).fit(X_train,y_train) 82 | 83 | y_pred = svc_model.predict(X_val) 84 | print 'SVC with class weighting loss:',cost_score(y_pred,y_val) 85 | 86 | rf_model = RandomForestClassifier(random_state=0, 87 | class_weight={1:5.,2:1.}).fit(X_train,y_train) 88 | 89 | y_pred = rf_model.predict(X_val) 90 | print 'Random Forest with class weighting loss:',cost_score(y_pred,y_val) 91 | 92 | 93 | sample_weights = [] 94 | for y in y_train: 95 | if y == 1: 96 | sample_weights.append(5) 97 | elif y == 2: 98 | sample_weights.append(1) 99 | 100 | nb_model = GaussianNB().fit(X_train,y_train,sample_weight=sample_weights) 101 | 102 | 103 | y_pred = nb_model.predict(X_val) 104 | print 'Naive Bayes with class weighting loss:',cost_score(y_pred,y_val) 105 | 106 | return 107 | 108 | 109 | def class_oversampling(X_train, y_train, X_val, y_val): 110 | """ 111 | """ 112 | 113 | positives = np.where( y_train == 1) 114 | X_positives = np.repeat(X_train[positives],4,axis=0) 115 | y_positives = np.repeat(y_train[positives],4) 116 | 117 | X_train_new = np.zeros(((X_train.shape[0]+X_positives.shape[0]),X_train.shape[1])) 118 | y_train_new = np.zeros(((y_train.shape[0]+y_positives.shape[0]),)) 119 | 120 | X_train_new[:X_train.shape[0]] = X_train 121 | X_train_new[X_train.shape[0]:] = X_positives 122 | 
y_train_new[:y_train.shape[0]] = y_train 123 | y_train_new[y_train.shape[0]:] = y_positives 124 | 125 | X_train, y_train = shuffle(X_train_new, y_train_new, random_state=0) 126 | 127 | svc_model = LinearSVC(random_state=0).fit(X_train,y_train) 128 | 129 | y_pred = svc_model.predict(X_val) 130 | print 'SVC after oversampling loss:',cost_score(y_pred,y_val) 131 | 132 | rf_model = RandomForestClassifier(random_state=0).fit(X_train,y_train) 133 | 134 | y_pred = rf_model.predict(X_val) 135 | print 'Random Forest after oversampling loss:',cost_score(y_pred,y_val) 136 | 137 | 138 | nb_model = GaussianNB().fit(X_train,y_train) 139 | 140 | y_pred = nb_model.predict(X_val) 141 | print 'Naive Bayes after oversampling loss:',cost_score(y_pred,y_val) 142 | 143 | return 144 | 145 | 146 | def rejection_sampling(X_train, 147 | y_train, 148 | c=[5.,1.], 149 | zeta=5., 150 | random_state=0): 151 | """ 152 | """ 153 | 154 | X_sample = [] 155 | y_sample = [] 156 | for x,y in zip(X_train,y_train): 157 | if y == 1: 158 | prob = c[0] / zeta 159 | elif y == 2: 160 | prob = c[1] / zeta 161 | 162 | sample_item = np.random.choice([True,False], p=[prob, 1-prob]) 163 | 164 | if sample_item: 165 | X_sample.append(x) 166 | y_sample.append(y) 167 | 168 | return np.array(X_sample),np.array(y_sample) 169 | 170 | 171 | def votting(clf_list, 172 | X_val): 173 | """ 174 | """ 175 | 176 | #For hard voting: 177 | pred = np.asarray([clf.predict(X_val) for clf in clf_list]).T 178 | pred = np.apply_along_axis(lambda x: 179 | np.argmax(np.bincount(x)), 180 | axis=1, 181 | arr=pred.astype('int')) 182 | 183 | return pred 184 | 185 | 186 | def costing(X_train, y_train, X_val, y_val): 187 | """ 188 | """ 189 | 190 | svc_models = [] 191 | rf_models = [] 192 | nb_models = [] 193 | for i in range(10): 194 | X_train_sample, y_train_sample = rejection_sampling(X_train, y_train, random_state=0) 195 | svc_models.append(LinearSVC(random_state=0).fit(X_train_sample,y_train_sample)) 196 | rf_models.append(RandomForestClassifier(random_state=0).fit(X_train_sample,y_train_sample)) 197 | nb_models.append(GaussianNB().fit(X_train_sample,y_train_sample)) 198 | 199 | 200 | y_pred = votting(svc_models,X_val) 201 | print 'SVC with costing loss:',cost_score(y_pred,y_val) 202 | 203 | y_pred = votting(rf_models,X_val) 204 | print 'Random Forest with costing loss:',cost_score(y_pred,y_val) 205 | 206 | 207 | y_pred = votting(nb_models,X_val) 208 | print 'Naive Bayes with costing loss:',cost_score(y_pred,y_val) 209 | 210 | return 211 | 212 | 213 | if __name__ == '__main__': 214 | 215 | X_train, y_train, X_val, y_val, X_test, y_test = read_data() 216 | 217 | default_scores(X_train, y_train, X_val, y_val) 218 | class_weighting(X_train, y_train, X_val, y_val) 219 | class_oversampling(X_train, y_train, X_val, y_val) 220 | costing(X_train,y_train,X_val,y_val) 221 | 222 | 223 | -------------------------------------------------------------------------------- /delicious_loader.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import re 4 | import time 5 | 6 | 7 | from keras.preprocessing import sequence 8 | 9 | 10 | 11 | 12 | 13 | def read_data(file, 14 | lab_file): 15 | """ 16 | """ 17 | 18 | X_data = pd.read_csv(file,header=None) 19 | y_data = pd.read_csv(lab_file,header=None) 20 | 21 | X_data = X_data[0].map(lambda x: re.sub('<\d+>','',x) \ 22 | .strip() \ 23 | .split()) 24 | X_data = X_data.map(lambda x: [int(tok.strip()) for tok in x]) 25 | y_data = y_data[0].map(lambda x: 
np.array([int(lab) for lab in x.split()])) 26 | 27 | return X_data.tolist(),np.array(y_data.tolist()) 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | def read_data_sentences(file, 36 | lab_file, 37 | maxlen, 38 | max_sentence_len): 39 | """ 40 | """ 41 | 42 | X_data = pd.read_csv(file,header=None) 43 | y_data = pd.read_csv(lab_file,header=None) 44 | 45 | X_data = X_data[0].map(lambda x: x.strip()) 46 | 47 | X_data = X_data.map(lambda x: re.findall('<\d+>([^<]+)',x)[1:]) 48 | 49 | X_data = X_data.map(lambda x: [[int(tok.strip()) for tok in sent.strip().split()] for sent in x ]) 50 | 51 | y_data = y_data[0].map(lambda x: np.array([int(lab) for lab in x.split()])) 52 | 53 | X_data = X_data.tolist() 54 | X_data_int = np.zeros((len(X_data),maxlen,max_sentence_len)) 55 | for idx,text_bag in enumerate(X_data): 56 | sentences_batch = np.zeros((maxlen,max_sentence_len)) 57 | sentences = sequence.pad_sequences(text_bag, 58 | maxlen=max_sentence_len, 59 | padding='post', 60 | truncating='post', 61 | dtype='int32') 62 | for j,sent in enumerate(sentences): 63 | if j >= max_sentence_len: 64 | break 65 | sentences_batch[j,:] = sent 66 | X_data_int[idx,:,:] = sentences_batch 67 | 68 | X_data = X_data_int 69 | 70 | return X_data,np.array(y_data.tolist()) 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | def create_ngram_set(input_list, ngram_value=2): 79 | """ 80 | Extract a set of n-grams from a list of integers. 81 | >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2) 82 | {(4, 9), (4, 1), (1, 4), (9, 4)} 83 | >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3) 84 | [(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)] 85 | """ 86 | return set(zip(*[input_list[i:] for i in range(ngram_value)])) 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | def add_ngram(sequences, token_indice, ngram_range=2): 95 | """ 96 | Augment the input list of list (sequences) by appending n-grams values. 
97 | Example: adding bi-gram 98 | >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]] 99 | >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017} 100 | >>> add_ngram(sequences, token_indice, ngram_range=2) 101 | [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]] 102 | Example: adding tri-gram 103 | >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]] 104 | >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018} 105 | >>> add_ngram(sequences, token_indice, ngram_range=3) 106 | [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]] 107 | """ 108 | new_sequences = [] 109 | for input_list in sequences: 110 | new_list = input_list[:] 111 | for ngram_value in range(2, ngram_range + 1): 112 | for i in range(len(new_list) - ngram_value + 1): 113 | ngram = tuple(new_list[i:i + ngram_value]) 114 | if ngram in token_indice: 115 | new_list.append(token_indice[ngram]) 116 | new_sequences.append(new_list) 117 | 118 | return new_sequences 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | def load_dataset(maxlen, 127 | ngram_range=1): 128 | """ 129 | """ 130 | train_data = 'data/delicious/train-data.dat' 131 | train_labels = 'data/delicious/train-label.dat' 132 | val_data = 'data/delicious/valid-data.dat' 133 | val_labels = 'data/delicious/valid-label.dat' 134 | test_data = 'data/delicious/test-data.dat' 135 | test_labels = 'data/delicious/test-label.dat' 136 | vocab_file = 'data/delicious/vocabs.txt' 137 | 138 | 139 | print('Loading data...') 140 | X_train, y_train = read_data(train_data,train_labels) 141 | X_val, y_val = read_data(val_data,val_labels) 142 | X_test, y_test = read_data(test_data,test_labels) 143 | print(len(X_train), 'train sequences') 144 | print(len(X_test), 'test sequences') 145 | print('Average train sequence length: {}'.format(np.mean(list(map(len, X_train)), dtype=int))) 146 | print('Average test sequence length: {}'.format(np.mean(list(map(len, X_test)), dtype=int))) 147 | 148 | 149 | word_index = {} 150 | with open(vocab_file,'r') as vf: 151 | for line in vf: 152 | line = line.strip().split(', ') 153 | key = line[0] 154 | value = int(line[1]) 155 | word_index[key] = value 156 | 157 | max_features = len(word_index) 158 | 159 | if ngram_range > 1: 160 | print('Adding {}-gram features'.format(ngram_range)) 161 | # Create set of unique n-gram from the training set. 162 | ngram_set = set() 163 | for input_list in X_train: 164 | for i in range(2, ngram_range + 1): 165 | set_of_ngram = create_ngram_set(input_list, ngram_value=i) 166 | ngram_set.update(set_of_ngram) 167 | 168 | # Dictionary mapping n-gram token to a unique integer. 169 | # Integer values are greater than max_features in order 170 | # to avoid collision with existing features. 171 | start_index = max_features + 1 172 | token_indice = {v: k + start_index for k, v in enumerate(ngram_set)} 173 | indice_token = {token_indice[k]: k for k in token_indice} 174 | 175 | # max_features is the highest integer that could be found in the dataset. 
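# (Illustrative example, not taken from the actual data: if max_features were 8520 and the
# training set contained only the bigrams (12, 7) and (7, 3), start_index would be 8521,
# token_indice would map the two bigrams to 8521 and 8522 in some order, add_ngram would
# append those ids to every sequence in which the bigrams occur, and the recomputed
# max_features below would become 8523.)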
176 | max_features = np.max(list(indice_token.keys())) + 1 177 | 178 | # Augmenting x_train and x_test with n-grams features 179 | X_train = add_ngram(X_train, token_indice, ngram_range) 180 | X_val = add_ngram(X_val, token_indice, ngram_range) 181 | X_test = add_ngram(X_test, token_indice, ngram_range) 182 | print('Average train sequence length: {}'.format(np.mean(list(map(len, X_train)), dtype=int))) 183 | print('Average val sequence length: {}'.format(np.mean(list(map(len, X_val)), dtype=int))) 184 | print('Average test sequence length: {}'.format(np.mean(list(map(len, X_test)), dtype=int))) 185 | 186 | 187 | print('Pad sequences (samples x time)') 188 | X_train = sequence.pad_sequences(X_train, maxlen=maxlen) 189 | X_val = sequence.pad_sequences(X_val, maxlen=maxlen) 190 | X_test = sequence.pad_sequences(X_test, maxlen=maxlen) 191 | print('X_train shape:', X_train.shape) 192 | print('X_val shape:', X_val.shape) 193 | print('X_test shape:', X_test.shape) 194 | 195 | 196 | return X_train,y_train,X_val,y_val,X_test,y_test,word_index 197 | 198 | 199 | 200 | 201 | 202 | 203 | def load_dataset_hierarchical(maxlen, 204 | max_sentence_len): 205 | """ 206 | """ 207 | train_data = 'data/delicious/train-data.dat' 208 | train_labels = 'data/delicious/train-label.dat' 209 | val_data = 'data/delicious/valid-data.dat' 210 | val_labels = 'data/delicious/valid-label.dat' 211 | test_data = 'data/delicious/test-data.dat' 212 | test_labels = 'data/delicious/test-label.dat' 213 | vocab_file = 'data/delicious/vocabs.txt' 214 | 215 | 216 | print('Loading data...') 217 | X_train, y_train = read_data_sentences(train_data,train_labels,maxlen,max_sentence_len) 218 | X_val, y_val = read_data_sentences(val_data,val_labels,maxlen,max_sentence_len) 219 | X_test, y_test = read_data_sentences(test_data,test_labels,maxlen,max_sentence_len) 220 | 221 | 222 | word_index = {} 223 | with open(vocab_file,'r') as vf: 224 | for line in vf: 225 | line = line.strip().split(', ') 226 | key = line[0] 227 | value = int(line[1]) 228 | word_index[key] = value 229 | 230 | max_features = len(word_index) 231 | 232 | print('X_train shape:', X_train.shape) 233 | print('X_val shape:', X_val.shape) 234 | print('X_test shape:', X_test.shape) 235 | 236 | 237 | return X_train,y_train,X_val,y_val,X_test,y_test,word_index -------------------------------------------------------------------------------- /text_model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from sklearn.tree import DecisionTreeClassifier 7 | from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier 8 | from sklearn.naive_bayes import MultinomialNB 9 | from sklearn import metrics 10 | 11 | from loaders import load_20news,load_imdb,load_sms,load_amazon,load_paper_reviews 12 | from loaders import load_yelp,load_youtube,load_reuters8,load_reuters52,load_webkb 13 | 14 | 15 | if __name__ == '__main__': 16 | """ 17 | 20 newsgroup: 18 | Bagging: max_samples=0.8, max_features=0.7, n_estimators=50 19 | AdaBoost: n_estimators=300, learning_rates=1.7 20 | GradientBoostingClassifier: estimator_nums=100, learning_rates=0.5, max_depths=5 21 | RandomForestClassifier: estimator_nums=100, max_depths=7 22 | 23 | 24 | IMDB: 25 | Bagging: max_samples=0.7, max_features=0.95, n_estimators=40 26 | AdaBoost: n_estimators=200, learning_rates=1.0 27 | GradientBoostingClassifier: estimator_nums=, learning_rates=, max_depths= 28 | 
RandomForestClassifier: estimator_nums=, max_depths= 29 | 30 | 31 | SMSSpamCollection: 32 | Bagging: max_samples=0.4, max_features=0.6, n_estimators=60 33 | AdaBoost: n_estimators=30, learning_rates=1.5 34 | GradientBoostingClassifier: estimator_nums=100, learning_rates=0.5, max_depths=3 35 | RandomForestClassifier: estimator_nums=, max_depths= 36 | 37 | 38 | paper reviews: 39 | Bagging: max_samples=0.4, max_features=0.8, n_estimators=20 40 | AdaBoost: n_estimators=10, learning_rates=0.3 41 | GradientBoostingClassifier: estimator_nums=100, learning_rates=1.0, max_depths=2 42 | RandomForestClassifier: estimator_nums=, max_depths= 43 | 44 | 45 | yelp: 46 | Bagging: max_samples=0.3, max_features=0.95, n_estimators=70 47 | AdaBoost: n_estimators=60, learning_rates=1.5 48 | GradientBoostingClassifier: estimator_nums=, learning_rates=, max_depths= 49 | RandomForestClassifier: estimator_nums=, max_depths= 50 | 51 | 52 | amazon: 53 | Bagging: max_samples=0.3, max_features=0.5, n_estimators=20 54 | AdaBoost: n_estimators=30, learning_rates=0.5 55 | GradientBoostingClassifier: estimator_nums=50, learning_rates=0.5, max_depths=7 56 | RandomForestClassifier: estimator_nums=, max_depths= 57 | 58 | 59 | youtube: 60 | Bagging: max_samples=0.3, max_features=0.6, n_estimators=10 61 | AdaBoost: n_estimators=10, learning_rates=0.5 62 | GradientBoostingClassifier: estimator_nums=50, learning_rates=0.7, max_depths=2 63 | RandomForestClassifier: estimator_nums=, max_depths= 64 | 65 | 66 | reuters8: 67 | Bagging: max_samples=0.5, max_features=0.9, n_estimators=100 68 | AdaBoost: n_estimators=100, learning_rates=1.2 69 | GradientBoostingClassifier: estimator_nums=100, learning_rates=0.5, max_depths=5 70 | RandomForestClassifier: estimator_nums=, max_depths= 71 | 72 | 73 | reuters52: 74 | Bagging: max_samples=0.95, max_features=0.95, n_estimators=50 75 | AdaBoost: n_estimators=250, learning_rates=1.0 76 | GradientBoostingClassifier: estimator_nums=, learning_rates=, max_depths= 77 | RandomForestClassifier: estimator_nums=, max_depths= 78 | 79 | 80 | reuterswebkb: 81 | Bagging: max_samples=0.7, max_features=0.5, n_estimators=100 82 | AdaBoost: n_estimators=50, learning_rates=0.95 83 | GradientBoostingClassifier: estimator_nums=, learning_rates=, max_depths= 84 | RandomForestClassifier: estimator_nums=, max_depths= 85 | """ 86 | 87 | dataset = sys.argv[1] 88 | 89 | if dataset == '20news': 90 | X_train, y_train, X_val, y_val, X_test, y_test = load_20news() 91 | elif dataset == 'imdb': 92 | X_train, y_train, X_val, y_val, X_test, y_test = load_imdb() 93 | elif dataset == 'sms': 94 | X_train, y_train, X_val, y_val, X_test, y_test = load_sms() 95 | elif dataset == 'p_reviews': 96 | X_train, y_train, X_val, y_val, X_test, y_test = load_paper_reviews() 97 | elif dataset == 'yelp': 98 | X_train, y_train, X_val, y_val, X_test, y_test = load_yelp() 99 | elif dataset == 'amazon': 100 | X_train, y_train, X_val, y_val, X_test, y_test = load_amazon() 101 | elif dataset == 'youtube': 102 | X_train, y_train, X_val, y_val, X_test, y_test = load_youtube() 103 | elif dataset == 'r8': 104 | X_train, y_train, X_val, y_val, X_test, y_test = load_reuters8() 105 | elif dataset == 'r52': 106 | X_train, y_train, X_val, y_val, X_test, y_test = load_reuters52() 107 | elif dataset == 'webkb': 108 | X_train, y_train, X_val, y_val, X_test, y_test = load_webkb() 109 | 110 | 111 | clf = MultinomialNB(alpha=.01) 112 | 113 | 114 | clf.fit(X_train, y_train) 115 | preds = clf.predict(X_train) 116 | val_preds = clf.predict(X_test) 117 | print 'NB 
training f-score:',metrics.f1_score(y_train, preds, average='macro') 118 | print 'NB test f-score:',metrics.f1_score(y_test, val_preds, average='macro') 119 | 120 | estimator_nums = [100] 121 | max_samps = [0.7] 122 | max_feats = [0.5] 123 | best_fscore = 0.0 124 | for m in max_samps: 125 | for n in estimator_nums: 126 | for f in max_feats: 127 | bagg_clf = BaggingClassifier(clf, 128 | n_estimators=n, 129 | max_samples=m, 130 | max_features=f, 131 | random_state=0) 132 | bagg_clf.fit(X_train, y_train) 133 | 134 | val_preds = bagg_clf.predict(X_val) 135 | val_score = metrics.f1_score(y_val, val_preds, average='macro') 136 | if val_score > best_fscore: 137 | best_fscore = val_score 138 | best_params = (m,n,f) 139 | best_clf = bagg_clf 140 | 141 | print 'best parameters:',best_params 142 | preds = best_clf.predict(X_train) 143 | val_preds = best_clf.predict(X_test) 144 | print 'Bagging training f-score:',metrics.f1_score(y_train, preds, average='macro') 145 | print 'Bagging test f-score:',metrics.f1_score(y_test, val_preds, average='macro') 146 | 147 | 148 | estimator_nums = [50] 149 | learning_rates = [0.95] 150 | best_fscore = 0.0 151 | for n in estimator_nums: 152 | for lr in learning_rates: 153 | ada_clf = AdaBoostClassifier(clf, 154 | n_estimators=n, 155 | learning_rate=lr, 156 | random_state=0) 157 | ada_clf.fit(X_train, y_train) 158 | 159 | val_preds = ada_clf.predict(X_val) 160 | val_score = metrics.f1_score(y_val, val_preds, average='macro') 161 | if val_score > best_fscore: 162 | best_fscore = val_score 163 | best_params = (n,lr) 164 | best_clf = ada_clf 165 | 166 | print 'best parameters:',best_params 167 | preds = best_clf.predict(X_train) 168 | val_preds = best_clf.predict(X_test) 169 | print 'AdaBoost training f-score:',metrics.f1_score(y_train, preds, average='macro') 170 | print 'AdaBoost test f-score:',metrics.f1_score(y_test, val_preds, average='macro') 171 | 172 | 173 | estimator_nums = [100] 174 | learning_rates = [0.5] 175 | max_depths = [5] 176 | best_fscore = 0.0 177 | for n in estimator_nums: 178 | for lr in learning_rates: 179 | for d in max_depths: 180 | gb_clf = GradientBoostingClassifier(n_estimators=n, 181 | max_depth=d, 182 | learning_rate=lr, 183 | random_state=0) 184 | gb_clf.fit(X_train, y_train) 185 | 186 | val_preds = gb_clf.predict(X_val) 187 | val_score = metrics.f1_score(y_val, val_preds, average='macro') 188 | if val_score > best_fscore: 189 | best_fscore = val_score 190 | best_params = (n,lr,d) 191 | best_clf = gb_clf 192 | 193 | print 'best parameters:',best_params 194 | preds = best_clf.predict(X_train) 195 | val_preds = best_clf.predict(X_test) 196 | print 'Gradient Boosting training f-score:',metrics.f1_score(y_train, preds, average='macro') 197 | print 'Gradient Boosting test f-score:',metrics.f1_score(y_test, val_preds, average='macro') 198 | 199 | 200 | estimator_nums = [50] 201 | max_depths = [7] 202 | best_fscore = 0.0 203 | for n in estimator_nums: 204 | for d in max_depths: 205 | forest_clf = RandomForestClassifier(n_estimators=n, 206 | max_depth=d, 207 | random_state=0) 208 | forest_clf.fit(X_train, y_train) 209 | 210 | val_preds = forest_clf.predict(X_val) 211 | val_score = metrics.f1_score(y_val, val_preds, average='macro') 212 | if val_score > best_fscore: 213 | best_fscore = val_score 214 | best_params = (n,d) 215 | best_clf = forest_clf 216 | 217 | print 'best parameters:',best_params 218 | preds = best_clf.predict(X_train) 219 | val_preds = best_clf.predict(X_test) 220 | print 'Random Forest training f-score:',metrics.f1_score(y_train, 
preds, average='macro') 221 | print 'Random Forest test f-score:',metrics.f1_score(y_test, val_preds, average='macro') 222 | 223 | 224 | 225 | -------------------------------------------------------------------------------- /loaders.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from sklearn.datasets import fetch_20newsgroups ,fetch_rcv1 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | from sklearn.model_selection import train_test_split 9 | 10 | from keras.datasets import imdb 11 | 12 | 13 | def load_20news(): 14 | """ 15 | """ 16 | print 'Loading data...' 17 | newsgroups_train = fetch_20newsgroups(subset='train', 18 | remove=('headers', 'footers', 'quotes'), 19 | shuffle=True) 20 | newsgroups_test = fetch_20newsgroups(subset='test', 21 | remove=('headers', 'footers', 'quotes'), 22 | shuffle=True) 23 | 24 | print 'Preprocessing...' 25 | vectorizer = TfidfVectorizer(strip_accents='unicode', 26 | lowercase=True, 27 | stop_words='english', 28 | ngram_range=(1, 2), 29 | max_df=0.5, 30 | min_df=5, 31 | max_features=20000, 32 | norm='l2', 33 | use_idf=True, 34 | smooth_idf=True, 35 | sublinear_tf=False) 36 | 37 | vectorizer.fit(newsgroups_train.data) 38 | 39 | X_train = vectorizer.transform(newsgroups_train.data) 40 | y_train = newsgroups_train.target 41 | X_test = vectorizer.transform(newsgroups_test.data) 42 | y_test = newsgroups_test.target 43 | 44 | X_train, X_val, y_train, y_val = train_test_split(X_train, 45 | y_train, 46 | test_size=0.2, 47 | random_state=0) 48 | 49 | return X_train, y_train, X_val, y_val, X_test, y_test 50 | 51 | 52 | def load_imdb(): 53 | """ 54 | """ 55 | print 'Loading data...' 56 | 57 | word_to_index = imdb.get_word_index() 58 | index_to_word = [None] * (max(word_to_index.values()) + 1) 59 | for w, i in word_to_index.items(): 60 | index_to_word[i] = w 61 | 62 | (X_train, y_train), (X_test, y_test) = imdb.load_data() 63 | 64 | print 'Preprocessing...' 65 | X_train = [ 66 | ' '.join(index_to_word[i] 67 | for i in X_train[i] 68 | if i < len(index_to_word)) 69 | for i in range(X_train.shape[0]) 70 | ] 71 | 72 | X_test = [ 73 | ' '.join(index_to_word[i] 74 | for i in X_test[i] 75 | if i < len(index_to_word)) 76 | for i in range(X_test.shape[0]) 77 | ] 78 | 79 | vectorizer = TfidfVectorizer(strip_accents='unicode', 80 | lowercase=True, 81 | stop_words='english', 82 | ngram_range=(1, 2), 83 | max_df=0.5, 84 | min_df=5, 85 | max_features=50000, 86 | norm='l2', 87 | use_idf=True, 88 | smooth_idf=True, 89 | sublinear_tf=False) 90 | 91 | vectorizer.fit(X_train) 92 | 93 | X_train = vectorizer.transform(X_train) 94 | X_test = vectorizer.transform(X_test) 95 | 96 | 97 | X_train, X_val, y_train, y_val = train_test_split(X_train, 98 | y_train, 99 | test_size=0.2, 100 | random_state=0) 101 | 102 | return X_train, y_train, X_val, y_val, X_test, y_test 103 | 104 | 105 | def load_sms(): 106 | """ 107 | """ 108 | print 'Loading data...' 
109 | 110 | df = pd.read_csv('data/SMSSpamCollection', 111 | header=None, 112 | delimiter='\t') 113 | 114 | classes = dict((k,idx) for idx,k in enumerate(df[0].unique())) 115 | y_data = df[0].map(lambda x: classes[x]).tolist() 116 | X_data = df[1].tolist() 117 | 118 | vectorizer = TfidfVectorizer(strip_accents='unicode', 119 | lowercase=True, 120 | stop_words='english', 121 | ngram_range=(1, 2), 122 | max_df=0.5, 123 | min_df=5, 124 | max_features=50000, 125 | norm='l2', 126 | use_idf=True, 127 | smooth_idf=True, 128 | sublinear_tf=False) 129 | 130 | vectorizer.fit(X_data) 131 | X_data = vectorizer.transform(X_data) 132 | 133 | X_train, X_test, y_train, y_test = train_test_split(X_data, 134 | y_data, 135 | test_size=0.1, 136 | random_state=0) 137 | 138 | X_train, X_val, y_train, y_val = train_test_split(X_train, 139 | y_train, 140 | test_size=0.2, 141 | random_state=0) 142 | 143 | return X_train, y_train, X_val, y_val, X_test, y_test 144 | 145 | 146 | def load_paper_reviews(): 147 | """ 148 | """ 149 | df = pd.read_json('data/reviews.json') 150 | class2id = dict((k,idx) for idx,k in enumerate(df['preliminary_decision'].unique())) 151 | 152 | y_data = df['preliminary_decision'].map(lambda x: class2id[x]).tolist() 153 | 154 | 155 | X_data = df['review'] 156 | X_list = [] 157 | y_list = [] 158 | 159 | for i,(review,lab) in enumerate(zip(X_data,y_data)): 160 | try: 161 | X_list.append(review[0]['text']) 162 | y_list.append(lab) 163 | except: 164 | continue 165 | 166 | y_data = y_list 167 | 168 | print 'Preprocessing...' 169 | vectorizer = TfidfVectorizer(strip_accents='unicode', 170 | lowercase=True, 171 | stop_words='english', 172 | ngram_range=(1, 2), 173 | max_df=0.5, 174 | min_df=5, 175 | max_features=10000, 176 | norm='l2', 177 | use_idf=True, 178 | smooth_idf=True, 179 | sublinear_tf=False) 180 | 181 | vectorizer.fit(X_list) 182 | 183 | X_data = vectorizer.transform(X_list) 184 | 185 | X_train, X_test, y_train, y_test = train_test_split(X_data, 186 | y_data, 187 | test_size=0.1, 188 | random_state=0) 189 | 190 | X_train, X_val, y_train, y_val = train_test_split(X_train, 191 | y_train, 192 | test_size=0.2, 193 | random_state=0) 194 | 195 | return X_train, y_train, X_val, y_val, X_test, y_test 196 | 197 | 198 | def load_yelp(): 199 | """ 200 | """ 201 | df = pd.read_json('data/yelp.json',orient='records',lines=True, encoding='utf-8') 202 | 203 | X_data = df['text'].tolist() 204 | y_data = df['stars'].tolist() 205 | 206 | print 'Preprocessing...' 207 | vectorizer = TfidfVectorizer(strip_accents='unicode', 208 | lowercase=True, 209 | stop_words='english', 210 | ngram_range=(1, 2), 211 | max_df=0.5, 212 | min_df=5, 213 | max_features=20000, 214 | norm='l2', 215 | use_idf=True, 216 | smooth_idf=True, 217 | sublinear_tf=False) 218 | 219 | vectorizer.fit(X_data) 220 | 221 | X_data = vectorizer.transform(X_data) 222 | 223 | X_train, X_test, y_train, y_test = train_test_split(X_data, 224 | y_data, 225 | test_size=0.1, 226 | random_state=0) 227 | 228 | X_train, X_val, y_train, y_val = train_test_split(X_train, 229 | y_train, 230 | test_size=0.2, 231 | random_state=0) 232 | 233 | return X_train, y_train, X_val, y_val, X_test, y_test 234 | 235 | 236 | def load_amazon(): 237 | """ 238 | """ 239 | df = pd.read_csv('data/amazon.txt', 240 | header=None, 241 | delimiter='\t') 242 | 243 | X_data = df[0].tolist() 244 | y_data = df[1].tolist() 245 | 246 | print 'Preprocessing...' 
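# The loaders share a similar TF-IDF setup: unigram and bigram features, terms kept only if
# they appear in at least 5 documents and in at most half of them, L2-normalised weights,
# and a per-dataset cap on the vocabulary size (20,000 here).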
247 | vectorizer = TfidfVectorizer(strip_accents='unicode', 248 | lowercase=True, 249 | stop_words='english', 250 | ngram_range=(1, 2), 251 | max_df=0.5, 252 | min_df=5, 253 | max_features=20000, 254 | norm='l2', 255 | use_idf=True, 256 | smooth_idf=True, 257 | sublinear_tf=False) 258 | 259 | vectorizer.fit(X_data) 260 | 261 | X_data = vectorizer.transform(X_data) 262 | 263 | X_train, X_test, y_train, y_test = train_test_split(X_data, 264 | y_data, 265 | test_size=0.1, 266 | random_state=0) 267 | 268 | X_train, X_val, y_train, y_val = train_test_split(X_train, 269 | y_train, 270 | test_size=0.2, 271 | random_state=0) 272 | 273 | return X_train, y_train, X_val, y_val, X_test, y_test 274 | 275 | 276 | def load_youtube(): 277 | """ 278 | """ 279 | df = pd.read_csv('data/untitled1/Youtube01-Psy.csv') 280 | df = df.append(pd.read_csv('data/untitled1/Youtube02-KatyPerry.csv')) 281 | df = df.append(pd.read_csv('data/untitled1/Youtube03-LMFAO.csv')) 282 | df = df.append(pd.read_csv('data/untitled1/Youtube04-Eminem.csv')) 283 | df = df.append(pd.read_csv('data/untitled1/Youtube05-Shakira.csv')) 284 | 285 | X_data = df["CONTENT"].tolist() 286 | y_data = df["CLASS"].tolist() 287 | 288 | 289 | print 'Preprocessing...' 290 | vectorizer = TfidfVectorizer(strip_accents='unicode', 291 | lowercase=True, 292 | stop_words='english', 293 | ngram_range=(1, 2), 294 | max_df=0.5, 295 | min_df=5, 296 | max_features=20000, 297 | norm='l2', 298 | use_idf=True, 299 | smooth_idf=True, 300 | sublinear_tf=False) 301 | 302 | vectorizer.fit(X_data) 303 | 304 | X_data = vectorizer.transform(X_data) 305 | 306 | X_train, X_test, y_train, y_test = train_test_split(X_data, 307 | y_data, 308 | test_size=0.1, 309 | random_state=0) 310 | 311 | X_train, X_val, y_train, y_val = train_test_split(X_train, 312 | y_train, 313 | test_size=0.2, 314 | random_state=0) 315 | 316 | return X_train, y_train, X_val, y_val, X_test, y_test 317 | 318 | 319 | def load_reuters8(): 320 | """ 321 | """ 322 | 323 | df = pd.read_csv('data/r8-train-all-terms.txt', 324 | header=None, 325 | delimiter='\t') 326 | 327 | test_df = pd.read_csv('data/r8-test-all-terms.txt', 328 | header=None, 329 | delimiter='\t') 330 | 331 | class2id = dict((k,idx) for idx,k in enumerate(df[0].unique())) 332 | 333 | X_train = df[1].tolist() 334 | X_test = test_df[1].tolist() 335 | 336 | y_train = df[0].map(lambda x: class2id[x]).tolist() 337 | y_test = test_df[0].map(lambda x: class2id[x]).tolist() 338 | 339 | print 'Preprocessing...' 
340 | vectorizer = TfidfVectorizer(strip_accents='unicode', 341 | lowercase=True, 342 | stop_words='english', 343 | ngram_range=(1, 2), 344 | max_df=0.5, 345 | min_df=5, 346 | max_features=20000, 347 | norm='l2', 348 | use_idf=True, 349 | smooth_idf=True, 350 | sublinear_tf=False) 351 | 352 | vectorizer.fit(X_train) 353 | 354 | X_train = vectorizer.transform(X_train) 355 | X_test = vectorizer.transform(X_test) 356 | 357 | X_train, X_val, y_train, y_val = train_test_split(X_train, 358 | y_train, 359 | test_size=0.2, 360 | random_state=0) 361 | 362 | return X_train, y_train, X_val, y_val, X_test, y_test 363 | 364 | 365 | def load_reuters52(): 366 | """ 367 | """ 368 | 369 | df = pd.read_csv('data/r52-train-all-terms.txt', 370 | header=None, 371 | delimiter='\t') 372 | 373 | test_df = pd.read_csv('data/r52-test-all-terms.txt', 374 | header=None, 375 | delimiter='\t') 376 | 377 | class2id = dict((k,idx) for idx,k in enumerate(df[0].unique())) 378 | 379 | X_train = df[1].tolist() 380 | X_test = test_df[1].tolist() 381 | 382 | y_train = df[0].map(lambda x: class2id[x]).tolist() 383 | y_test = test_df[0].map(lambda x: class2id[x]).tolist() 384 | 385 | print 'Preprocessing...' 386 | vectorizer = TfidfVectorizer(strip_accents='unicode', 387 | lowercase=True, 388 | stop_words='english', 389 | ngram_range=(1, 2), 390 | max_df=0.5, 391 | min_df=5, 392 | max_features=20000, 393 | norm='l2', 394 | use_idf=True, 395 | smooth_idf=True, 396 | sublinear_tf=False) 397 | 398 | vectorizer.fit(X_train) 399 | 400 | X_train = vectorizer.transform(X_train) 401 | X_test = vectorizer.transform(X_test) 402 | 403 | X_train, X_val, y_train, y_val = train_test_split(X_train, 404 | y_train, 405 | test_size=0.2, 406 | random_state=0) 407 | 408 | return X_train, y_train, X_val, y_val, X_test, y_test 409 | 410 | 411 | def load_webkb(): 412 | """ 413 | """ 414 | 415 | df = pd.read_csv('data/web.txt', 416 | header=None, 417 | delimiter='\t') 418 | 419 | class2id = dict((k,idx) for idx,k in enumerate(df[0].unique())) 420 | 421 | y_data = df[0].map(lambda x: class2id[x]).tolist() 422 | X_list = df[1].tolist() 423 | 424 | 425 | X_data = [] 426 | y_list = [] 427 | for x,y in zip(X_list,y_data): 428 | try: 429 | if np.isnan(x): 430 | continue 431 | except: 432 | pass 433 | X_data.append(x) 434 | y_list.append(y) 435 | 436 | y_data = y_list 437 | 438 | print 'Preprocessing...' 439 | vectorizer = TfidfVectorizer(strip_accents='unicode', 440 | lowercase=True, 441 | stop_words='english', 442 | ngram_range=(1, 2), 443 | max_df=0.5, 444 | min_df=5, 445 | max_features=10000, 446 | norm='l2', 447 | use_idf=True, 448 | smooth_idf=True, 449 | sublinear_tf=False) 450 | 451 | vectorizer.fit(X_data) 452 | 453 | X_data = vectorizer.transform(X_data) 454 | 455 | X_train, X_test, y_train, y_test = train_test_split(X_data, 456 | y_data, 457 | test_size=0.1, 458 | random_state=0) 459 | 460 | X_train, X_val, y_train, y_val = train_test_split(X_train, 461 | y_train, 462 | test_size=0.2, 463 | random_state=0) 464 | 465 | return X_train, y_train, X_val, y_val, X_test, y_test --------------------------------------------------------------------------------
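Usage note (a minimal sketch, assuming the raw datasets referenced by the hard-coded paths in loaders.py are available under data/): text_model.py selects its dataset from the first command-line argument, matching the keys in its __main__ block, and the scripts use Python 2 print statements, e.g.

    python text_model.py 20news
    python text_model.py r8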