├── LICENSE
├── README.md
├── classify_with_mlp.py
├── classify_with_rf.py
├── classify_with_svm.py
├── data
    └── PLACEHOLDER
├── features
    └── PLACEHOLDER
├── lasso.py
├── relieff.py
└── results
    └── PLACEHOLDER


/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Kıvanç Güçkıran
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Microarray Work
 2 | ## [[Paper]](https://dergipark.org.tr/sdufenbed/issue/39838/453462)[[Datasets]](https://github.com/kivancguckiran/microarray-data)
 3 | 
 4 | DNA Microarray Gene Expression Data Classification Using SVM and MLP with Feature Selection Methods Relief and LASSO.
 5 | 
 6 | If you are planning to use this code in your research, please cite this [paper](https://dergipark.org.tr/sdufenbed/issue/39838/453462).
 7 | 
 8 | ## Dataset
 9 | Datasets are DNA microarray gene expression data. [Dataset Link](https://github.com/kivancguckiran/microarray-data).
10 | 
11 | ## Methods
12 | We are using LASSO and Relief for Feature Selection and SVM and MLP for classification.
13 | 
14 | ## Download
15 | 
16 | ```
17 | git clone https://github.com/kivancguckiran/microarray-classification
18 | ```
19 | 
20 | ## Usage
 21 | The *data* folder must contain the dataset you want to classify, stored as `<name>_inputs.csv` and `<name>_outputs.csv`.
22 | 
23 | ### Examples
24 | Select features using Relief from *alon* dataset.
25 | ```
26 | python relieff.py alon
27 | ```
28 | Select features using LASSO from *borovecki* dataset.
29 | ```
30 | python lasso.py borovecki
31 | ```
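 32 | Classify the *subramanian* dataset using MLP with Relief features (run both `lasso.py` and `relieff.py` on the dataset first: the number of Relief features is matched to the number of features selected by LASSO).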
33 | ```
34 | python classify_with_mlp.py subramanian relief
35 | ```
36 | Classify using SVM with LASSO features with *sun* dataset.
37 | ```
38 | python classify_with_svm.py sun lasso
39 | ```
40 | 


--------------------------------------------------------------------------------
/classify_with_mlp.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | import matplotlib.pyplot as plt
  4 | import sys
  5 | import time
  6 | from sklearn.pipeline import make_pipeline
  7 | from skrebate import ReliefF
  8 | from sklearn.ensemble import RandomForestClassifier
  9 | from sklearn.model_selection import cross_val_score, train_test_split, LeaveOneOut, KFold, StratifiedKFold
 10 | from sklearn import preprocessing
 11 | import keras
 12 | from keras.datasets import mnist
 13 | from keras.models import Sequential
 14 | from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, Activation
 15 | from keras.optimizers import Adam, RMSprop, SGD, Adamax
 16 | 
 17 | 
 18 | # filenames = ['alon', 'borovecki', 'burczynski', 'chiaretti', 'chin', 'chowdary', 'christensen', 'golub', 'gordon', 'gravier', 'khan', 'nakayama', 'pomeroy', 'shipp', 'singh', 'sorlie', 'su', 'subramanian', 'sun', 'tian', 'west', 'yeoh']
 19 | 
 20 | name = sys.argv[1]
 21 | type = sys.argv[2]
 22 | 
 23 | features = pd.read_csv('data/' + name + '_inputs.csv', header = None)
 24 | labels = pd.read_csv('data/' + name + '_outputs.csv', header = None)
 25 | 
 26 | features.fillna(0, inplace = True)
 27 | 
 28 | features = np.asarray(features.values)
 29 | labels = np.transpose(np.asarray(labels.values.ravel() - 1, dtype=int))
 30 | 
 31 | min_max_scaler = preprocessing.MinMaxScaler()
 32 | features = min_max_scaler.fit_transform(features)
 33 | 
 34 | if type == 'lasso':
 35 | 	gains = np.asarray(np.loadtxt('features/' + name + '_lasso.txt'))
 36 | 	indexes = np.where(gains != 0)[0]
 37 | else:
 38 | 	gains = np.asarray(np.loadtxt('features/' + name + '_lasso.txt'))
 39 | 	indexes = np.where(gains != 0)[0]
 40 | 	gains = np.asarray(np.loadtxt('features/' + name + '_relieff.txt')) 
 41 | 	indexes = gains.argsort()[-indexes.shape[0]:][::-1]
 42 | 
 43 | 
 44 | scores = []
 45 | 
 46 | loo = LeaveOneOut()
 47 | 
 48 | startTime = time.time()
 49 | 
 50 | for train_index, test_index in loo.split(features):
 51 | 	x_train, x_test = features[train_index], features[test_index]
 52 | 	y_train, y_test = labels[train_index], labels[test_index]
 53 | 	
 54 | 	X_train = x_train[:, indexes]
 55 | 	X_test = x_test[:, indexes]
 56 | 	Y_train = y_train[:]
 57 | 	Y_test = y_test[:]
 58 | 	
 59 | 	batch_size = 1
 60 | 	num_classes = np.max(labels) + 1
 61 | 	epochs = 50
 62 | 	
 63 | 	X_train = X_train.astype('float32')
 64 | 	X_test = X_test.astype('float32')
 65 | 	Y_train = Y_train[:]
 66 | 	Y_test = Y_test[:]
 67 | 	# print(X_train.shape[0], 'train samples, ', Y_train.shape)
 68 | 	# print(X_test.shape[0], 'test samples, ', Y_test.shape)
 69 | 	
 70 | 	# convert class vectors to binary class matrices
 71 | 	Y_train = keras.utils.to_categorical(Y_train, num_classes)
 72 | 	Y_test = keras.utils.to_categorical(Y_test, num_classes)
 73 | 	
 74 | 	model = Sequential()
 75 | 	
 76 | 	# Dense(64) is a fully-connected layer with 64 hidden units.
 77 | 	# in the first layer, you must specify the expected input data shape:
 78 | 	# here, 20-dimensional vectors.
 79 | 	model.add(Dense(200, input_dim=X_train.shape[1], kernel_initializer='lecun_uniform', activation='relu'))
 80 | 	model.add(Dense(100, kernel_initializer='lecun_uniform', activation='relu'))
 81 | 	model.add(Dense(Y_train.shape[1], kernel_initializer='lecun_uniform', activation='softmax'))
 82 | 	
 83 | 	sgd = SGD(lr=0.005, decay=1e-6, momentum=0.9, nesterov=True)
 84 | 	model.compile(loss='categorical_crossentropy', optimizer=Adamax(), metrics=['accuracy'])
 85 | 	
 86 | 	# model.summary()
 87 | 	
 88 | 	history = model.fit(X_train, Y_train,
 89 |                         batch_size=batch_size,
 90 |                         epochs=epochs,
 91 |                         verbose=0,
 92 |                         validation_data=(X_test, Y_test))
 93 | 
 94 | 	score = model.evaluate(X_test, Y_test, verbose=0)
 95 | 	
 96 | 	scores.append(score[1])
 97 | 
 98 | endTime = time.time()
 99 | 	
100 | with open('results/' + name + '_mlp_' + type + '.txt', 'w') as file:
101 | 	file.write('Score: ' + str(np.average(scores)) + '\n')
102 | 	file.write('Time: ' + str(endTime - startTime))
103 | 	file.close()
104 | 
105 | print('Score: ' + str(np.average(scores)))
106 | print('Time: ' + str(endTime - startTime))
107 | 


--------------------------------------------------------------------------------
/classify_with_rf.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | import matplotlib.pyplot as plt
 4 | import sys
 5 | import time
 6 | from sklearn.pipeline import make_pipeline
 7 | from skrebate import ReliefF
 8 | from sklearn.ensemble import RandomForestClassifier
 9 | from sklearn.model_selection import cross_val_score, train_test_split, LeaveOneOut, KFold, StratifiedKFold
10 | from sklearn import preprocessing
11 | from sklearn.svm import LinearSVC
12 | 
13 | 
14 | # filenames = ['alon', 'borovecki', 'burczynski', 'chiaretti', 'chin', 'chowdary', 'christensen', 'golub', 'gordon', 'gravier', 'khan', 'nakayama', 'pomeroy', 'shipp', 'singh', 'sorlie', 'su', 'subramanian', 'sun', 'tian', 'west', 'yeoh']
15 | 
USAGE = 'usage: python classify_with_rf.py <dataset> <lasso|relief>'


def load_dataset(name):
    """Load the dataset called *name* and return ``(features, labels)``.

    Reads ``data/<name>_inputs.csv`` and ``data/<name>_outputs.csv`` (no
    header row). Missing feature values are replaced by 0 and every feature
    column is min-max scaled. Labels are flattened and shifted down by one
    (presumably the CSV stores 1-based class ids -- confirm against the data).

    NOTE(review): the scaler is fitted on all samples before the
    leave-one-out split, and the feature files written by lasso.py and
    relieff.py are computed on all samples too, so the held-out sample
    influences preprocessing. Reported scores may be optimistic.
    """
    features = pd.read_csv('data/' + name + '_inputs.csv', header=None)
    labels = pd.read_csv('data/' + name + '_outputs.csv', header=None)

    features = features.fillna(0).values
    labels = np.asarray(labels.values.ravel() - 1, dtype=int)

    features = preprocessing.MinMaxScaler().fit_transform(features)
    return features, labels


def select_feature_indexes(method, lasso_gains, relieff_gains=None):
    """Return the column indexes of the features to classify with.

    For ``method == 'lasso'`` these are the features with a non-zero LASSO
    coefficient. For any other method the features are ranked by ReliefF
    importance (highest first) and as many are kept as LASSO selected, so
    both selections have the same size.

    Raises:
        ValueError: if LASSO selected no feature at all, or if a ReliefF
            ranking is requested without ``relieff_gains``.
    """
    lasso_gains = np.atleast_1d(np.asarray(lasso_gains))
    lasso_indexes = np.where(lasso_gains != 0)[0]
    count = lasso_indexes.shape[0]
    if count == 0:
        # Without this guard the ReliefF slice below would be [-0:], which
        # silently keeps *every* feature instead of none.
        raise ValueError('LASSO selected no features; nothing to classify with')

    if method == 'lasso':
        return lasso_indexes

    if relieff_gains is None:
        raise ValueError('ReliefF importances are required for method ' + repr(method))
    ranking = np.argsort(np.atleast_1d(np.asarray(relieff_gains)))
    # Highest importance first, truncated to the LASSO selection size.
    return ranking[::-1][:count]


def main(argv):
    """Run leave-one-out random-forest classification and report the result."""
    if len(argv) < 3:
        sys.exit(USAGE)
    name, method = argv[1], argv[2]

    features, labels = load_dataset(name)

    lasso_gains = np.loadtxt('features/' + name + '_lasso.txt')
    relieff_gains = None
    if method != 'lasso':
        relieff_gains = np.loadtxt('features/' + name + '_relieff.txt')
    indexes = select_feature_indexes(method, lasso_gains, relieff_gains)

    scores = []
    start_time = time.time()

    # Column selection does not depend on the fold, so do it once.
    selected = features[:, indexes].astype('float32')

    for train_index, test_index in LeaveOneOut().split(selected):
        clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
        clf.fit(selected[train_index], labels[train_index])
        # One held-out sample per fold, so each score is either 0.0 or 1.0.
        scores.append(clf.score(selected[test_index], labels[test_index]))

    elapsed = time.time() - start_time
    average = np.average(scores)

    with open('results/' + name + '_rf_' + method + '.txt', 'w') as out:
        out.write('Score: ' + str(average) + '\n')
        out.write('Time: ' + str(elapsed))

    print('Score: ' + str(average))
    print('Time: ' + str(elapsed))


if __name__ == '__main__':
    main(sys.argv)
80 | 


--------------------------------------------------------------------------------
/classify_with_svm.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | import matplotlib.pyplot as plt
 4 | import sys
 5 | import time
 6 | from sklearn.pipeline import make_pipeline
 7 | from skrebate import ReliefF
 8 | from sklearn.ensemble import RandomForestClassifier
 9 | from sklearn.model_selection import cross_val_score, train_test_split, LeaveOneOut, KFold, StratifiedKFold
10 | from sklearn import preprocessing
11 | from sklearn.svm import LinearSVC
12 | 
13 | 
14 | # filenames = ['alon', 'borovecki', 'burczynski', 'chiaretti', 'chin', 'chowdary', 'christensen', 'golub', 'gordon', 'gravier', 'khan', 'nakayama', 'pomeroy', 'shipp', 'singh', 'sorlie', 'su', 'subramanian', 'sun', 'tian', 'west', 'yeoh']
15 | 
USAGE = 'usage: python classify_with_svm.py <dataset> <lasso|relief>'


def load_dataset(name):
    """Load the dataset called *name* and return ``(features, labels)``.

    Reads ``data/<name>_inputs.csv`` and ``data/<name>_outputs.csv`` (no
    header row). Missing feature values are replaced by 0 and every feature
    column is min-max scaled. Labels are flattened and shifted down by one
    (presumably the CSV stores 1-based class ids -- confirm against the data).

    NOTE(review): the scaler is fitted on all samples before the
    leave-one-out split, and the feature files written by lasso.py and
    relieff.py are computed on all samples too, so the held-out sample
    influences preprocessing. Reported scores may be optimistic.
    """
    features = pd.read_csv('data/' + name + '_inputs.csv', header=None)
    labels = pd.read_csv('data/' + name + '_outputs.csv', header=None)

    features = features.fillna(0).values
    labels = np.asarray(labels.values.ravel() - 1, dtype=int)

    features = preprocessing.MinMaxScaler().fit_transform(features)
    return features, labels


def select_feature_indexes(method, lasso_gains, relieff_gains=None):
    """Return the column indexes of the features to classify with.

    For ``method == 'lasso'`` these are the features with a non-zero LASSO
    coefficient. For any other method the features are ranked by ReliefF
    importance (highest first) and as many are kept as LASSO selected, so
    both selections have the same size.

    Raises:
        ValueError: if LASSO selected no feature at all, or if a ReliefF
            ranking is requested without ``relieff_gains``.
    """
    lasso_gains = np.atleast_1d(np.asarray(lasso_gains))
    lasso_indexes = np.where(lasso_gains != 0)[0]
    count = lasso_indexes.shape[0]
    if count == 0:
        # Without this guard the ReliefF slice below would be [-0:], which
        # silently keeps *every* feature instead of none.
        raise ValueError('LASSO selected no features; nothing to classify with')

    if method == 'lasso':
        return lasso_indexes

    if relieff_gains is None:
        raise ValueError('ReliefF importances are required for method ' + repr(method))
    ranking = np.argsort(np.atleast_1d(np.asarray(relieff_gains)))
    # Highest importance first, truncated to the LASSO selection size.
    return ranking[::-1][:count]


def main(argv):
    """Run leave-one-out linear-SVM classification and report the result."""
    if len(argv) < 3:
        sys.exit(USAGE)
    name, method = argv[1], argv[2]

    features, labels = load_dataset(name)

    lasso_gains = np.loadtxt('features/' + name + '_lasso.txt')
    relieff_gains = None
    if method != 'lasso':
        relieff_gains = np.loadtxt('features/' + name + '_relieff.txt')
    indexes = select_feature_indexes(method, lasso_gains, relieff_gains)

    scores = []
    start_time = time.time()

    # Column selection does not depend on the fold, so do it once.
    selected = features[:, indexes].astype('float32')

    for train_index, test_index in LeaveOneOut().split(selected):
        clf = LinearSVC(random_state=0)
        clf.fit(selected[train_index], labels[train_index])
        # One held-out sample per fold, so each score is either 0.0 or 1.0.
        scores.append(clf.score(selected[test_index], labels[test_index]))

    elapsed = time.time() - start_time
    average = np.average(scores)

    with open('results/' + name + '_svm_' + method + '.txt', 'w') as out:
        out.write('Score: ' + str(average) + '\n')
        out.write('Time: ' + str(elapsed))

    print('Score: ' + str(average))
    print('Time: ' + str(elapsed))


if __name__ == '__main__':
    main(sys.argv)
79 | 


--------------------------------------------------------------------------------
/data/PLACEHOLDER:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kivancguckiran/microarray-classification/6d6359e3910d7cfdf8459d52d80ccb3d27444acb/data/PLACEHOLDER


--------------------------------------------------------------------------------
/features/PLACEHOLDER:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kivancguckiran/microarray-classification/6d6359e3910d7cfdf8459d52d80ccb3d27444acb/features/PLACEHOLDER


--------------------------------------------------------------------------------
/lasso.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | from skrebate import ReliefF
 4 | from sklearn import preprocessing
 5 | from sklearn.linear_model import Lasso
 6 | import sys
 7 | 
 8 | 
 9 | # filenames = ['alon', 'borovecki', 'burczynski', 'chiaretti', 'chin', 'chowdary', 'christensen', 'golub', 'gordon', 'gravier', 'khan', 'nakayama', 'pomeroy', 'shipp', 'singh', 'sorlie', 'su', 'subramanian', 'sun', 'tian', 'west', 'yeoh']
10 | 
# The dataset name is the only command-line argument, e.g. `python lasso.py alon`.
name = sys.argv[1]

raw_inputs = pd.read_csv('data/{}_inputs.csv'.format(name), header=None)
raw_outputs = pd.read_csv('data/{}_outputs.csv'.format(name), header=None)

# Missing values become 0, then every column is min-max scaled.
features = preprocessing.MinMaxScaler().fit_transform(raw_inputs.fillna(0).values)

# Flatten the label column and shift it down by one.
labels = np.asarray(raw_outputs.values.ravel() - 1, dtype=int)

# Fit LASSO on the class ids; features with a non-zero coefficient count as selected.
lasso = Lasso(alpha=0.001)
lasso.fit(features, labels)
coefficients = lasso.coef_

# The classify_with_* scripts read this file to decide which features to keep.
np.savetxt('features/{}_lasso.txt'.format(name), coefficients)

# np.nonzero returns a tuple of index arrays, so the printed shape has a
# leading dimension for each coefficient axis followed by the selected count.
indexes = np.asarray(np.nonzero(coefficients))
print(name, ': ', indexes.shape)
32 | 


--------------------------------------------------------------------------------
/relieff.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | from skrebate import ReliefF
 4 | from sklearn import preprocessing
 5 | from sklearn.linear_model import Lasso
 6 | import sys
 7 | 
 8 | 
 9 | # filenames = ['alon', 'borovecki', 'burczynski', 'chiaretti', 'chin', 'chowdary', 'christensen', 'golub', 'gordon', 'gravier', 'khan', 'nakayama', 'pomeroy', 'shipp', 'singh', 'sorlie', 'su', 'subramanian', 'sun', 'tian', 'west', 'yeoh']
10 | 
# The dataset name is the only command-line argument, e.g. `python relieff.py alon`.
name = sys.argv[1]

raw_inputs = pd.read_csv('data/{}_inputs.csv'.format(name), header=None)
raw_outputs = pd.read_csv('data/{}_outputs.csv'.format(name), header=None)

# Missing values become 0, then every column is min-max scaled.
features = preprocessing.MinMaxScaler().fit_transform(raw_inputs.fillna(0).values)

# Flatten the label column and shift it down by one.
labels = np.asarray(raw_outputs.values.ravel() - 1, dtype=int)

# Score every feature with ReliefF using the library defaults.
fs = ReliefF()
fs.fit(features, labels)

# One importance per feature; the classify_with_* scripts rank features by it.
np.savetxt('features/{}_relieff.txt'.format(name), fs.feature_importances_)
28 | 
29 | 


--------------------------------------------------------------------------------
/results/PLACEHOLDER:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kivancguckiran/microarray-classification/6d6359e3910d7cfdf8459d52d80ccb3d27444acb/results/PLACEHOLDER


--------------------------------------------------------------------------------