├── .gitattributes ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── config ├── experiments │ └── simple_experiment.ini └── main.ini ├── data └── autism.tsv ├── experiments ├── __init__.py ├── classifier.py ├── dataset.py └── experiment.py ├── methods ├── __init__.py ├── selection.py └── selection_wrapper.py ├── requirements.txt ├── run.py ├── tests ├── __init__.py ├── corr.py ├── test_fisher.py ├── test_pearson.py ├── test_statistics.py └── ttest.py └── utils ├── __init__.py ├── data_reader.py ├── log_saver.py └── statistics.py /.gitattributes: -------------------------------------------------------------------------------- 1 | .tsv filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | logs 104 | .idea/ 105 | .pytest_cache/ 106 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.5" 4 | script: 5 | - pytest -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Tomasz Latkowski 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/tlatkowski/tf-feature-selection.svg?branch=master)](https://travis-ci.org/tlatkowski/tf-feature-selection) ![](https://img.shields.io/badge/Status-under--dev-red.svg) ![](https://img.shields.io/badge/Python-3.6-blue.svg) ![](https://img.shields.io/badge/Tensorflow-1.12.2-blue.svg) ![](https://img.shields.io/badge/License-MIT-blue.svg) 2 | # tf-feature-selection 3 | Implementation of feature selection methods using TensorFlow library. 4 | -------------------------------------------------------------------------------- /config/experiments/simple_experiment.ini: -------------------------------------------------------------------------------- 1 | [SELECTION] 2 | num_features = 100 3 | method = fisher 4 | 5 | [CLASSIFIER] 6 | hidden_sizes = 20 7 | -------------------------------------------------------------------------------- /config/main.ini: -------------------------------------------------------------------------------- 1 | [TRAINING] 2 | num_epochs = 1000 3 | eval_every = 10 -------------------------------------------------------------------------------- /data/autism.tsv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b3b7d953d54f6bd08f9860347df05bbacfcccca254400cf4711b2be30e1cde71 3 | size 95496413 4 | -------------------------------------------------------------------------------- /experiments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tlatkowski/tf-feature-selection/910f03764a675841eaf4578415ae697d78860b81/experiments/__init__.py -------------------------------------------------------------------------------- /experiments/classifier.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def ff_neural_network(inputs, units): 5 | layer = tf.layers.dense(inputs, units=units, activation=tf.nn.tanh) 6 | output = tf.layers.dense(layer, units=1) 7 | return output 8 | 9 | 10 | class NeuralNetworkClassifier: 11 | 12 | def __init__(self, num_features, units): 13 | self.x = tf.placeholder(dtype=tf.float64, shape=[None, num_features], name='inputs') 14 | self.y = tf.placeholder(dtype=tf.float64, shape=[None, 1], name='labels') 15 | 16 | output = ff_neural_network(self.x, units=units) 17 | 18 | with tf.name_scope('loss'): 19 | self.loss = tf.losses.sigmoid_cross_entropy(self.y, output) 20 | self.opt = tf.train.AdamOptimizer(learning_rate=0.01).minimize(self.loss) 21 | 22 | with tf.name_scope('metrics'): 23 | self.prediction = tf.nn.sigmoid(output) 24 | 25 | self.correct_predictions = tf.equal(self.prediction, self.y) 26 | self.accuracy = tf.reduce_mean(tf.to_float(self.correct_predictions)) 27 | tf.summary.scalar("accuracy", self.accuracy) 28 | tf.summary.scalar("loss", self.loss) 29 | self.summary_op = tf.summary.merge_all() 30 | -------------------------------------------------------------------------------- /experiments/dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.model_selection import StratifiedKFold 3 | 4 | from utils.data_reader import read 5 | 6 | 7 | class Dataset: 8 | 9 | def __init__(self, data_fn): 10 | self.data = read(data_fn) 11 | # FIXME 12 | self.labels = np.concatenate([np.ones(82, dtype=np.float64), np.zeros(64, dtype=np.float64)]) 13 | self.labels = np.reshape(self.labels, (-1, 1)) 14 | 15 | self.skf = StratifiedKFold(n_splits=10) 16 | 17 | def cross_validation(self): 18 | return enumerate(self.skf.split(self.data, self.labels.reshape(146))) 19 | 20 | def get_data(self, indices): 21 | return self.data[indices, :] 22 | 23 | def get_labels(self, indices): 24 | selected_labels = self.labels[indices] 25 | num_instances = [int(sum(selected_labels == 0)), int(sum(selected_labels == 1))] 26 | return num_instances, selected_labels 27 | -------------------------------------------------------------------------------- /experiments/experiment.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from methods.selection import fisher, feature_correlation_with_class, t_test, random 4 | from methods.selection_wrapper import SelectionWrapper 5 | 6 | methods = { 7 | 'fisher': fisher, 8 | 'corr': feature_correlation_with_class, 9 | 'ttest': t_test, 10 | 'random': random 11 | } 12 | 13 | 14 | class Experiment: 15 | 16 | def __init__(self, experiment_config, num_instances, classifier, dataset): 17 | selection_method = methods[experiment_config['SELECTION']['method']] 18 | num_features = int(experiment_config['SELECTION']['num_features']) 19 | hidden_sizes = int(experiment_config['CLASSIFIER']['hidden_sizes']) 20 | 21 | with tf.name_scope('selection'): 22 | self.selection_wrapper = SelectionWrapper(dataset, 23 | num_instances=num_instances, 24 | selection_method=selection_method, 25 | num_features=num_features) 26 | 27 | with tf.name_scope('classifier'): 28 | self.clf = classifier(num_features, hidden_sizes) 29 | -------------------------------------------------------------------------------- /methods/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tlatkowski/tf-feature-selection/910f03764a675841eaf4578415ae697d78860b81/methods/__init__.py -------------------------------------------------------------------------------- /methods/selection.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def fisher(data, num_instances: list, top_k_features=2): 5 | """ 6 | Performs Fisher feature selection method according to the following formula: 7 | D(f) = (m1(f) - m2(f) / (std1(f) - std2(f)) 8 | 9 | :param data: 10 | :param num_instances: 11 | :param top_k_features: 12 | :return: the list of most significant features. 13 | """ 14 | assert len(num_instances) == 2, "Fisher selection method can be performed for two-class problems." 15 | 16 | data = tf.convert_to_tensor(data) 17 | num_features = data.get_shape().as_list()[-1] 18 | if top_k_features > num_features: 19 | top_k_features = num_features 20 | class1, class2 = tf.split(data, num_instances) 21 | 22 | with tf.name_scope('fisher_selection'): 23 | mean1, std1 = tf.nn.moments(class1, axes=0) 24 | mean2, std2 = tf.nn.moments(class2, axes=0) 25 | fisher_coeffs = tf.abs(mean1 - mean2) / (std1 + std2) 26 | selected_features = tf.nn.top_k(fisher_coeffs, k=top_k_features) 27 | 28 | return selected_features 29 | 30 | 31 | def feature_correlation_with_class(data, num_instances: list, top_k_features=10): 32 | """ 33 | Makes feature correlation with class selection according to the following formula: 34 | D(f) = [(m1(f) - m(f))^2 + (m2(f) - m(f))^2] / 2*sigma(f)^2 35 | :return: the list of most significant features. 36 | """ 37 | data = tf.convert_to_tensor(data) 38 | num_features = data.get_shape().as_list()[-1] 39 | if top_k_features > num_features: 40 | top_k_features = num_features 41 | class1, class2 = tf.split(data, num_instances) 42 | 43 | with tf.name_scope('corr_selection'): 44 | mean1, std1 = tf.nn.moments(class1, axes=0) 45 | mean2, std2 = tf.nn.moments(class2, axes=0) 46 | mean, std = tf.nn.moments(data, axes=0) 47 | corr_coeffs = (tf.square(mean1 - mean) + tf.square(mean2 - mean)) / 2 * tf.square(std) 48 | selected_features = tf.nn.top_k(corr_coeffs, k=top_k_features) 49 | 50 | return selected_features 51 | 52 | 53 | def t_test(data, num_instances: list, top_k_features=10): 54 | """ 55 | Makes feature correlation with class selection according to the following formula: 56 | D(f) = [(m1(f) - m(f))^2 + (m2(f) - m(f))^2] / 2*sigma(f)^2 57 | :return: the list of most significant features. 58 | """ 59 | data = tf.convert_to_tensor(data) 60 | num_features = data.get_shape().as_list()[-1] 61 | if top_k_features > num_features: 62 | top_k_features = num_features 63 | class1, class2 = tf.split(data, num_instances) 64 | 65 | with tf.name_scope('t_test_selection'): 66 | mean1, std1 = tf.nn.moments(class1, axes=0) 67 | mean2, std2 = tf.nn.moments(class2, axes=0) 68 | t_test_coeffs = tf.abs(mean1 - mean2) / tf.sqrt( 69 | tf.square(std1) / num_instances[0] + tf.square(std2) / num_instances[1]) 70 | selected_features = tf.nn.top_k(t_test_coeffs, k=top_k_features) 71 | 72 | return selected_features 73 | 74 | 75 | def random(data, num_instances: list, top_k_features=10): 76 | data = tf.convert_to_tensor(data) 77 | num_features = data.get_shape().as_list()[-1] 78 | if top_k_features > num_features: 79 | top_k_features = num_features 80 | class1, class2 = tf.split(data, num_instances) 81 | 82 | with tf.name_scope('random_selection'): 83 | pass -------------------------------------------------------------------------------- /methods/selection_wrapper.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | class SelectionWrapper: 5 | 6 | def __init__(self, data, num_instances, selection_method=None, num_features=None): 7 | if data is None: 8 | raise ValueError('Provide data to make selection.') 9 | 10 | if selection_method is None: 11 | raise ValueError('Provide selection method.') 12 | 13 | if num_features is None: 14 | data = tf.convert_to_tensor(data) 15 | num_features = data.get_shape().as_list()[-1] 16 | 17 | self.values, self.indices = selection_method(data, num_instances, num_features) 18 | self.selected_data = tf.gather(data, self.indices, axis=1) 19 | 20 | def select(self, data): 21 | return tf.gather(data, self.indices, axis=1) 22 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | bleach==1.5.0 2 | enum34==1.1.6 3 | html5lib==0.9999999 4 | Markdown==2.6.11 5 | numpy==1.13.3 6 | pandas==0.19.0 7 | protobuf==3.5.1 8 | python-dateutil==2.6.1 9 | pytz==2018.3 10 | scikit-learn==0.19.1 11 | scipy==1.0.0 12 | six==1.11.0 13 | sklearn==0.0 14 | tensorflow==1.12.2 15 | tensorflow-tensorboard==0.4.0 16 | tqdm==4.19.6 17 | Werkzeug==0.15.3 18 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | from argparse import ArgumentParser 3 | 4 | import tensorflow as tf 5 | from tqdm import tqdm 6 | 7 | from experiments.classifier import NeuralNetworkClassifier 8 | from experiments.dataset import Dataset 9 | from experiments.experiment import Experiment 10 | from utils.log_saver import LogSaver 11 | 12 | 13 | def run_experiment(experiment_config): 14 | dataset = Dataset('data/autism.tsv') 15 | num_epochs = 1000 16 | eval_every = 10 17 | 18 | for fold_id, (train_idxs, test_idxs) in dataset.cross_validation(): 19 | 20 | data_train_fold = dataset.get_data(train_idxs) 21 | num_instances, labels_train_fold = dataset.get_labels(train_idxs) 22 | 23 | data_test_fold = dataset.get_data(test_idxs) 24 | _, labels_test_fold = dataset.get_labels(test_idxs) 25 | 26 | with tf.Graph().as_default() as graph: 27 | 28 | experiment = Experiment(experiment_config, num_instances, NeuralNetworkClassifier, data_train_fold) 29 | 30 | with tf.Session() as session: 31 | 32 | global_step = 0 33 | session.run(tf.global_variables_initializer()) 34 | 35 | log_saver = LogSaver('logs', 'fisher_fold{}'.format(fold_id), session.graph) 36 | 37 | train_selected_data = session.run(experiment.selection_wrapper.selected_data) 38 | test_selected_data = session.run(experiment.selection_wrapper.select(data_test_fold)) 39 | 40 | tqdm_iter = tqdm(range(num_epochs), desc='Epochs') 41 | 42 | for epoch in tqdm_iter: 43 | feed_dict = {experiment.clf.x: train_selected_data, experiment.clf.y: labels_train_fold} 44 | loss, _ = session.run([experiment.clf.loss, experiment.clf.opt], 45 | feed_dict=feed_dict) 46 | 47 | if epoch % eval_every == 0: 48 | summary = session.run(experiment.clf.summary_op, feed_dict=feed_dict) 49 | log_saver.log_train(summary, epoch) 50 | 51 | feed_dict = {experiment.clf.x: test_selected_data, experiment.clf.y: labels_test_fold} 52 | summary = session.run(experiment.clf.summary_op, feed_dict=feed_dict) 53 | log_saver.log_test(summary, epoch) 54 | 55 | tqdm_iter.set_postfix(loss='{:.2f}'.format(float(loss)), epoch=epoch) 56 | 57 | 58 | def main(): 59 | parser = ArgumentParser() 60 | parser.add_argument('experiment', 61 | default='simple_experiment', 62 | choices=['simple_experiment'], 63 | help='model used during training (default: %(default))') 64 | 65 | args = parser.parse_args() 66 | experiment_config = configparser.ConfigParser() 67 | experiment_config.read('config/experiments/{}.ini'.format(args.experiment)) 68 | 69 | run_experiment(experiment_config) 70 | 71 | 72 | if __name__ == '__main__': 73 | main() 74 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tlatkowski/tf-feature-selection/910f03764a675841eaf4578415ae697d78860b81/tests/__init__.py -------------------------------------------------------------------------------- /tests/corr.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | class CorrelationWithClassSelectionTest(tf.test.TestCase): 5 | 6 | def testCorrelationWithClassCorrectScore(self): 7 | raise NotImplementedError 8 | 9 | 10 | if __name__ == '__main__': 11 | tf.test.main() 12 | -------------------------------------------------------------------------------- /tests/test_fisher.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from methods.selection import fisher 5 | from methods.selection_wrapper import SelectionWrapper 6 | 7 | 8 | class TestFisherSelection(tf.test.TestCase): 9 | 10 | def testFisherCorrectScore(self): 11 | with self.test_session() as test_session: 12 | data = np.array([[2, 2], 13 | [4, 4], 14 | [3, 6], 15 | [5, 6]]) 16 | num_instances = [2, 2] 17 | top_k = 2 18 | actual_most_significant_features, _ = test_session.run(fisher(data, num_instances, top_k)) 19 | correct_most_significant_features = [3., .5] 20 | 21 | self.assertAllEqual(actual_most_significant_features, correct_most_significant_features) 22 | 23 | def testFisherPickFirstSignificantFeature(self): 24 | with self.test_session() as test_session: 25 | data = np.array([[2, 2], 26 | [4, 4], 27 | [3, 6], 28 | [5, 6]]) 29 | 30 | num_instances = [2, 2] 31 | top_k = 1 32 | selection_wrapper = SelectionWrapper(data, 33 | num_instances, 34 | fisher, 35 | num_features=top_k) 36 | actual_most_significant_features = test_session.run(selection_wrapper.selected_data) 37 | correct_most_significant_features = [[2.], [4.], [6.], [6.]] 38 | 39 | self.assertAllEqual(actual_most_significant_features, correct_most_significant_features) 40 | 41 | def testFisherCorrectOrderOfFeatures(self): 42 | with self.test_session() as test_session: 43 | data = np.array([[2, 2], 44 | [4, 4], 45 | [3, 6], 46 | [5, 6]]) 47 | num_instances = [2, 2] 48 | top_k = 2 49 | _, actual_most_significant_features = test_session.run(fisher(data, num_instances, top_k)) 50 | correct_most_significant_features = [1., 0.] 51 | 52 | self.assertAllEqual(actual_most_significant_features, correct_most_significant_features) 53 | 54 | def testMoreThan2ClassesIsNotAllowed(self): 55 | with self.test_session() as test_session: 56 | data = np.array([[2, 2], 57 | [4, 4], 58 | [3, 6], 59 | [5, 6]]) 60 | num_instances = [2, 2, 2] 61 | top_k = 2 62 | with self.assertRaises(AssertionError): 63 | _, actual_most_significant_features = test_session.run(fisher(data, num_instances, top_k)) 64 | 65 | 66 | if __name__ == '__main__': 67 | tf.test.main() 68 | -------------------------------------------------------------------------------- /tests/test_pearson.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from utils.statistics import pearson_correlation 5 | 6 | 7 | class TestPearson(tf.test.TestCase): 8 | 9 | def testPearsonCoefficientValueForTwoVectors(self): 10 | with self.test_session() as test_session: 11 | x1 = np.array([2., 3., 4.]) 12 | x2 = np.array([3., 1., 5.]) 13 | 14 | actual_pearson_coefficient = test_session.run(pearson_correlation(x1, x2)) 15 | correct_pearson_coefficient = [.5] 16 | 17 | self.assertEqual(actual_pearson_coefficient, correct_pearson_coefficient) 18 | 19 | def testNegativePearsonCoefficientValueForTwoVectors(self): 20 | with self.test_session() as test_session: 21 | x1 = np.array([1., 2., 3.]) 22 | x2 = np.array([-1., -2., -3.]) 23 | 24 | actual_pearson_coefficient = test_session.run(pearson_correlation(x1, x2)) 25 | correct_pearson_coefficient = [-1.] 26 | 27 | self.assertEqual(actual_pearson_coefficient, correct_pearson_coefficient) 28 | 29 | def testPositivePearsonCoefficientValueForTwoVectors(self): 30 | with self.test_session() as test_session: 31 | x1 = np.array([1., 2., 3.]) 32 | x2 = np.array([1., 2., 3.]) 33 | 34 | actual_pearson_coefficient = test_session.run(pearson_correlation(x1, x2)) 35 | correct_pearson_coefficient = [1.] 36 | 37 | self.assertEqual(actual_pearson_coefficient, correct_pearson_coefficient) 38 | 39 | 40 | if __name__ == '__main__': 41 | tf.test.main() 42 | -------------------------------------------------------------------------------- /tests/test_statistics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from utils.statistics import pooled_variance 5 | 6 | 7 | class TestStatistics(tf.test.TestCase): 8 | 9 | def testPooledVariance(self): 10 | with self.test_session() as test_session: 11 | data = np.array([[2., 3., 4., 5.], 12 | [2., 3., 4., 5.], 13 | [2., 3., 4., 5.], 14 | [2., 3., 4., 5.]]) 15 | num_instances = [2, 2] 16 | actual_pooled_variance = test_session.run(pooled_variance(data, num_instances)) 17 | correct_pooled_variance = [.0, .0, .0, .0] 18 | 19 | self.assertAllEqual(actual_pooled_variance, correct_pooled_variance) 20 | 21 | 22 | if __name__ == '__main__': 23 | tf.test.main() 24 | -------------------------------------------------------------------------------- /tests/ttest.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | class TtestSelectionTest(tf.test.TestCase): 5 | 6 | def testTtestCorrectScore(self): 7 | raise NotImplementedError 8 | 9 | 10 | if __name__ == '__main__': 11 | tf.test.main() 12 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tlatkowski/tf-feature-selection/910f03764a675841eaf4578415ae697d78860b81/utils/__init__.py -------------------------------------------------------------------------------- /utils/data_reader.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def read(file_name): 5 | data = pd.read_csv(file_name, sep='\t', header=None, index_col=0).T 6 | return data.as_matrix() 7 | -------------------------------------------------------------------------------- /utils/log_saver.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import tensorflow as tf 4 | 5 | 6 | class LogSaver: 7 | 8 | def __init__(self, logs_path, model_name, graph: tf.Graph): 9 | if not os.path.isdir(logs_path): 10 | os.makedirs(logs_path) 11 | self.test_summary_writer = tf.summary.FileWriter('{}/{}/test/'.format(logs_path, model_name), graph=graph) 12 | self.train_summary_writer = tf.summary.FileWriter('{}/{}/train/'.format(logs_path, model_name), graph=graph) 13 | 14 | def log_test(self, summary, global_step): 15 | self.test_summary_writer.add_summary(summary, global_step) 16 | 17 | def log_train(self, summary, global_step): 18 | self.test_summary_writer.add_summary(summary, global_step) 19 | -------------------------------------------------------------------------------- /utils/statistics.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def pearson_correlation(x1, x2): 5 | x1 = tf.convert_to_tensor(x1) 6 | x2 = tf.convert_to_tensor(x2) 7 | m1, std1 = tf.nn.moments(x1, axes=0) 8 | m2, std2 = tf.nn.moments(x2, axes=0) 9 | l = tf.reduce_sum((x1 - m1) * (x2 - m2)) 10 | i = tf.reduce_sum((x1 - m1) ** 2) * tf.reduce_sum((x2 - m2) ** 2) 11 | p = tf.sqrt(i) 12 | return l / p 13 | 14 | 15 | def f_test(data, num_instances): 16 | """ 17 | Performs F-statistic between the genes and the classification variable h 18 | as the score of maximum relevance. 19 | 20 | :param data: 21 | :param num_instances: 22 | :return: 23 | """ 24 | 25 | data = tf.convert_to_tensor(data) 26 | class1, class2 = tf.split(data, num_instances) 27 | K = 2 28 | with tf.name_scope('f_statistic'): 29 | mean1, var1 = tf.nn.moments(class1, axes=0) 30 | mean2, var2 = tf.nn.moments(class2, axes=0) 31 | mean, var = tf.nn.moments(data, axes=0) 32 | 33 | pooled_var = pooled_variance(data, num_instances) 34 | tf.reduce_sum(((mean1 - mean) + (mean2 - mean))/(K-1))/pooled_var 35 | 36 | 37 | def pooled_variance(data, num_instances): 38 | K = len(num_instances) 39 | n = sum(num_instances) 40 | data = tf.convert_to_tensor(data, dtype=tf.float32) 41 | split_classes = tf.split(data, num_instances) 42 | vars = [] 43 | for i in range(len(split_classes)): 44 | _, var = tf.nn.moments(split_classes[i], axes=0) 45 | vars.append(var) 46 | 47 | n_k = tf.to_float(tf.reshape(num_instances, [K, -1])) 48 | stacked_var = tf.stack(vars) 49 | pooled_var = tf.reduce_sum(stacked_var * (n_k - 1), axis=0) / (n - K) 50 | return pooled_var --------------------------------------------------------------------------------