├── LICENSE ├── README.md ├── classify_with_mlp.py ├── classify_with_rf.py ├── classify_with_svm.py ├── data └── PLACEHOLDER ├── features └── PLACEHOLDER ├── lasso.py ├── relieff.py └── results └── PLACEHOLDER /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Kıvanç Güçkıran 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Microarray Work 2 | ## [[Paper]](https://dergipark.org.tr/sdufenbed/issue/39838/453462)[[Datasets]](https://github.com/kivancguckiran/microarray-data) 3 | 4 | DNA Microarray Gene Expression Data Classification Using SVM and MLP with Feature Selection Methods Relief and LASSO. 
5 | 6 | If you plan to use this code in your research, please cite this [paper](https://dergipark.org.tr/sdufenbed/issue/39838/453462). 7 | 8 | ## Dataset 9 | The datasets are DNA microarray gene expression data. [Dataset Link](https://github.com/kivancguckiran/microarray-data). 10 | 11 | ## Methods 12 | We use LASSO and Relief for feature selection, and SVM and MLP for classification. 13 | 14 | ## Download 15 | 16 | ``` 17 | git clone https://github.com/kivancguckiran/microarray-classification 18 | ``` 19 | 20 | ## Usage 21 | The *data* folder must contain the dataset you want to classify, stored as `<name>_inputs.csv` and `<name>_outputs.csv`. Run feature selection before classification: the classification scripts read the files that `lasso.py` and `relieff.py` write to the *features* folder. Classifying with Relief features needs both files, because the number of Relief features used equals the number of features LASSO selected. 22 | 23 | ### Examples 24 | Select features using Relief from the *alon* dataset. 25 | ``` 26 | python relieff.py alon 27 | ``` 28 | Select features using LASSO from the *borovecki* dataset. 29 | ``` 30 | python lasso.py borovecki 31 | ``` 32 | Classify the *subramanian* dataset using MLP with Relief features. 33 | ``` 34 | python classify_with_mlp.py subramanian relief 35 | ``` 36 | Classify the *sun* dataset using SVM with LASSO features. 
37 | ``` 38 | python classify_with_svm.py sun lasso 39 | ``` 40 | -------------------------------------------------------------------------------- /classify_with_mlp.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import sys 5 | import time 6 | from sklearn.pipeline import make_pipeline 7 | from skrebate import ReliefF 8 | from sklearn.ensemble import RandomForestClassifier 9 | from sklearn.model_selection import cross_val_score, train_test_split, LeaveOneOut, KFold, StratifiedKFold 10 | from sklearn import preprocessing 11 | import keras 12 | from keras.datasets import mnist 13 | from keras.models import Sequential 14 | from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, Activation 15 | from keras.optimizers import Adam, RMSprop, SGD, Adamax 16 | 17 | 18 | # filenames = ['alon', 'borovecki', 'burczynski', 'chiaretti', 'chin', 'chowdary', 'christensen', 'golub', 'gordon', 'gravier', 'khan', 'nakayama', 'pomeroy', 'shipp', 'singh', 'sorlie', 'su', 'subramanian', 'sun', 'tian', 'west', 'yeoh'] 19 | 20 | name = sys.argv[1] 21 | type = sys.argv[2] 22 | 23 | features = pd.read_csv('data/' + name + '_inputs.csv', header = None) 24 | labels = pd.read_csv('data/' + name + '_outputs.csv', header = None) 25 | 26 | features.fillna(0, inplace = True) 27 | 28 | features = np.asarray(features.values) 29 | labels = np.transpose(np.asarray(labels.values.ravel() - 1, dtype=int)) 30 | 31 | min_max_scaler = preprocessing.MinMaxScaler() 32 | features = min_max_scaler.fit_transform(features) 33 | 34 | if type == 'lasso': 35 | gains = np.asarray(np.loadtxt('features/' + name + '_lasso.txt')) 36 | indexes = np.where(gains != 0)[0] 37 | else: 38 | gains = np.asarray(np.loadtxt('features/' + name + '_lasso.txt')) 39 | indexes = np.where(gains != 0)[0] 40 | gains = np.asarray(np.loadtxt('features/' + name + '_relieff.txt')) 41 | indexes = 
gains.argsort()[-indexes.shape[0]:][::-1] 42 | 43 | 44 | scores = [] 45 | 46 | loo = LeaveOneOut() 47 | 48 | startTime = time.time() 49 | 50 | for train_index, test_index in loo.split(features): 51 | x_train, x_test = features[train_index], features[test_index] 52 | y_train, y_test = labels[train_index], labels[test_index] 53 | 54 | X_train = x_train[:, indexes] 55 | X_test = x_test[:, indexes] 56 | Y_train = y_train[:] 57 | Y_test = y_test[:] 58 | 59 | batch_size = 1 60 | num_classes = np.max(labels) + 1 61 | epochs = 50 62 | 63 | X_train = X_train.astype('float32') 64 | X_test = X_test.astype('float32') 65 | Y_train = Y_train[:] 66 | Y_test = Y_test[:] 67 | # print(X_train.shape[0], 'train samples, ', Y_train.shape) 68 | # print(X_test.shape[0], 'test samples, ', Y_test.shape) 69 | 70 | # convert class vectors to binary class matrices 71 | Y_train = keras.utils.to_categorical(Y_train, num_classes) 72 | Y_test = keras.utils.to_categorical(Y_test, num_classes) 73 | 74 | model = Sequential() 75 | 76 | # Dense(64) is a fully-connected layer with 64 hidden units. 77 | # in the first layer, you must specify the expected input data shape: 78 | # here, 20-dimensional vectors. 
79 | model.add(Dense(200, input_dim=X_train.shape[1], kernel_initializer='lecun_uniform', activation='relu')) 80 | model.add(Dense(100, kernel_initializer='lecun_uniform', activation='relu')) 81 | model.add(Dense(Y_train.shape[1], kernel_initializer='lecun_uniform', activation='softmax')) 82 | 83 | sgd = SGD(lr=0.005, decay=1e-6, momentum=0.9, nesterov=True) 84 | model.compile(loss='categorical_crossentropy', optimizer=Adamax(), metrics=['accuracy']) 85 | 86 | # model.summary() 87 | 88 | history = model.fit(X_train, Y_train, 89 | batch_size=batch_size, 90 | epochs=epochs, 91 | verbose=0, 92 | validation_data=(X_test, Y_test)) 93 | 94 | score = model.evaluate(X_test, Y_test, verbose=0) 95 | 96 | scores.append(score[1]) 97 | 98 | endTime = time.time() 99 | 100 | with open('results/' + name + '_mlp_' + type + '.txt', 'w') as file: 101 | file.write('Score: ' + str(np.average(scores)) + '\n') 102 | file.write('Time: ' + str(endTime - startTime)) 103 | file.close() 104 | 105 | print('Score: ' + str(np.average(scores))) 106 | print('Time: ' + str(endTime - startTime)) 107 | -------------------------------------------------------------------------------- /classify_with_rf.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import sys 5 | import time 6 | from sklearn.pipeline import make_pipeline 7 | from skrebate import ReliefF 8 | from sklearn.ensemble import RandomForestClassifier 9 | from sklearn.model_selection import cross_val_score, train_test_split, LeaveOneOut, KFold, StratifiedKFold 10 | from sklearn import preprocessing 11 | from sklearn.svm import LinearSVC 12 | 13 | 14 | # filenames = ['alon', 'borovecki', 'burczynski', 'chiaretti', 'chin', 'chowdary', 'christensen', 'golub', 'gordon', 'gravier', 'khan', 'nakayama', 'pomeroy', 'shipp', 'singh', 'sorlie', 'su', 'subramanian', 'sun', 'tian', 'west', 'yeoh'] 15 | 16 | name = sys.argv[1] 17 | type 
= sys.argv[2] 18 | 19 | features = pd.read_csv('data/' + name + '_inputs.csv', header = None) 20 | labels = pd.read_csv('data/' + name + '_outputs.csv', header = None) 21 | 22 | features.fillna(0, inplace = True) 23 | 24 | features = np.asarray(features.values) 25 | labels = np.transpose(np.asarray(labels.values.ravel() - 1, dtype=int)) 26 | 27 | min_max_scaler = preprocessing.MinMaxScaler() 28 | features = min_max_scaler.fit_transform(features) 29 | 30 | if type == 'lasso': 31 | gains = np.asarray(np.loadtxt('features/' + name + '_lasso.txt')) 32 | indexes = np.where(gains != 0)[0] 33 | else: 34 | gains = np.asarray(np.loadtxt('features/' + name + '_lasso.txt')) 35 | indexes = np.where(gains != 0)[0] 36 | gains = np.asarray(np.loadtxt('features/' + name + '_relieff.txt')) 37 | indexes = gains.argsort()[-indexes.shape[0]:][::-1] 38 | 39 | scores = [] 40 | 41 | loo = LeaveOneOut() 42 | 43 | startTime = time.time() 44 | 45 | for train_index, test_index in loo.split(features): 46 | x_train, x_test = features[train_index], features[test_index] 47 | y_train, y_test = labels[train_index], labels[test_index] 48 | 49 | X_train = x_train[:, indexes] 50 | X_test = x_test[:, indexes] 51 | Y_train = y_train[:] 52 | Y_test = y_test[:] 53 | 54 | batch_size = 1 55 | num_classes = np.max(labels) + 1 56 | epochs = 50 57 | 58 | X_train = X_train.astype('float32') 59 | X_test = X_test.astype('float32') 60 | Y_train = Y_train[:] 61 | Y_test = Y_test[:] 62 | 63 | clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0) 64 | 65 | 66 | clf.fit(X_train, Y_train) 67 | score = clf.score(X_test, Y_test) 68 | 69 | scores.append(score) 70 | 71 | endTime = time.time() 72 | 73 | with open('results/' + name + '_rf_' + type + '.txt', 'w') as file: 74 | file.write('Score: ' + str(np.average(scores)) + '\n') 75 | file.write('Time: ' + str(endTime - startTime)) 76 | file.close() 77 | 78 | print('Score: ' + str(np.average(scores))) 79 | print('Time: ' + str(endTime - startTime)) 80 
| -------------------------------------------------------------------------------- /classify_with_svm.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import sys 5 | import time 6 | from sklearn.pipeline import make_pipeline 7 | from skrebate import ReliefF 8 | from sklearn.ensemble import RandomForestClassifier 9 | from sklearn.model_selection import cross_val_score, train_test_split, LeaveOneOut, KFold, StratifiedKFold 10 | from sklearn import preprocessing 11 | from sklearn.svm import LinearSVC 12 | 13 | 14 | # filenames = ['alon', 'borovecki', 'burczynski', 'chiaretti', 'chin', 'chowdary', 'christensen', 'golub', 'gordon', 'gravier', 'khan', 'nakayama', 'pomeroy', 'shipp', 'singh', 'sorlie', 'su', 'subramanian', 'sun', 'tian', 'west', 'yeoh'] 15 | 16 | name = sys.argv[1] 17 | type = sys.argv[2] 18 | 19 | features = pd.read_csv('data/' + name + '_inputs.csv', header = None) 20 | labels = pd.read_csv('data/' + name + '_outputs.csv', header = None) 21 | 22 | features.fillna(0, inplace = True) 23 | 24 | features = np.asarray(features.values) 25 | labels = np.transpose(np.asarray(labels.values.ravel() - 1, dtype=int)) 26 | 27 | min_max_scaler = preprocessing.MinMaxScaler() 28 | features = min_max_scaler.fit_transform(features) 29 | 30 | if type == 'lasso': 31 | gains = np.asarray(np.loadtxt('features/' + name + '_lasso.txt')) 32 | indexes = np.where(gains != 0)[0] 33 | else: 34 | gains = np.asarray(np.loadtxt('features/' + name + '_lasso.txt')) 35 | indexes = np.where(gains != 0)[0] 36 | gains = np.asarray(np.loadtxt('features/' + name + '_relieff.txt')) 37 | indexes = gains.argsort()[-indexes.shape[0]:][::-1] 38 | 39 | scores = [] 40 | 41 | loo = LeaveOneOut() 42 | 43 | startTime = time.time() 44 | 45 | for train_index, test_index in loo.split(features): 46 | x_train, x_test = features[train_index], features[test_index] 47 | y_train, y_test = 
labels[train_index], labels[test_index] 48 | 49 | X_train = x_train[:, indexes] 50 | X_test = x_test[:, indexes] 51 | Y_train = y_train[:] 52 | Y_test = y_test[:] 53 | 54 | batch_size = 1 55 | num_classes = np.max(labels) + 1 56 | epochs = 50 57 | 58 | X_train = X_train.astype('float32') 59 | X_test = X_test.astype('float32') 60 | Y_train = Y_train[:] 61 | Y_test = Y_test[:] 62 | 63 | clf = LinearSVC(random_state=0) 64 | 65 | clf.fit(X_train, Y_train) 66 | score = clf.score(X_test, Y_test) 67 | 68 | scores.append(score) 69 | 70 | endTime = time.time() 71 | 72 | with open('results/' + name + '_svm_' + type + '.txt', 'w') as file: 73 | file.write('Score: ' + str(np.average(scores)) + '\n') 74 | file.write('Time: ' + str(endTime - startTime)) 75 | file.close() 76 | 77 | print('Score: ' + str(np.average(scores))) 78 | print('Time: ' + str(endTime - startTime)) 79 | -------------------------------------------------------------------------------- /data/PLACEHOLDER: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kivancguckiran/microarray-classification/6d6359e3910d7cfdf8459d52d80ccb3d27444acb/data/PLACEHOLDER -------------------------------------------------------------------------------- /features/PLACEHOLDER: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kivancguckiran/microarray-classification/6d6359e3910d7cfdf8459d52d80ccb3d27444acb/features/PLACEHOLDER -------------------------------------------------------------------------------- /lasso.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from skrebate import ReliefF 4 | from sklearn import preprocessing 5 | from sklearn.linear_model import Lasso 6 | import sys 7 | 8 | 9 | # filenames = ['alon', 'borovecki', 'burczynski', 'chiaretti', 'chin', 'chowdary', 'christensen', 'golub', 'gordon', 'gravier', 
'khan', 'nakayama', 'pomeroy', 'shipp', 'singh', 'sorlie', 'su', 'subramanian', 'sun', 'tian', 'west', 'yeoh'] 10 | 11 | name = sys.argv[1] 12 | 13 | features = pd.read_csv('data/' + name + '_inputs.csv', header = None) 14 | labels = pd.read_csv('data/' + name + '_outputs.csv', header = None) 15 | 16 | features.fillna(0, inplace = True) 17 | 18 | features = np.asarray(features.values) 19 | labels = np.transpose(np.asarray(labels.values.ravel() - 1, dtype=int)) 20 | 21 | min_max_scaler = preprocessing.MinMaxScaler() 22 | features = min_max_scaler.fit_transform(features) 23 | 24 | lasso = Lasso(alpha=0.001) 25 | lasso.fit(features, labels) 26 | 27 | indexes = np.asarray(np.where(lasso.coef_ != 0)) 28 | 29 | np.savetxt('features/' + name + '_lasso.txt', lasso.coef_) 30 | 31 | print(name, ': ', indexes.shape) 32 | -------------------------------------------------------------------------------- /relieff.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from skrebate import ReliefF 4 | from sklearn import preprocessing 5 | from sklearn.linear_model import Lasso 6 | import sys 7 | 8 | 9 | # filenames = ['alon', 'borovecki', 'burczynski', 'chiaretti', 'chin', 'chowdary', 'christensen', 'golub', 'gordon', 'gravier', 'khan', 'nakayama', 'pomeroy', 'shipp', 'singh', 'sorlie', 'su', 'subramanian', 'sun', 'tian', 'west', 'yeoh'] 10 | 11 | name = sys.argv[1] 12 | 13 | features = pd.read_csv('data/' + name + '_inputs.csv', header = None) 14 | labels = pd.read_csv('data/' + name + '_outputs.csv', header = None) 15 | 16 | features.fillna(0, inplace = True) 17 | 18 | features = np.asarray(features.values) 19 | labels = np.transpose(np.asarray(labels.values.ravel() - 1, dtype=int)) 20 | 21 | min_max_scaler = preprocessing.MinMaxScaler() 22 | features = min_max_scaler.fit_transform(features) 23 | 24 | fs = ReliefF() 25 | fs.fit(features, labels) 26 | 27 | np.savetxt('features/' + name + 
'_relieff.txt', fs.feature_importances_) 28 | 29 | -------------------------------------------------------------------------------- /results/PLACEHOLDER: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kivancguckiran/microarray-classification/6d6359e3910d7cfdf8459d52d80ccb3d27444acb/results/PLACEHOLDER --------------------------------------------------------------------------------