├── .github └── ISSUE_TEMPLATE │ ├── bug-performance-issue.md │ ├── documentation-issue.md │ └── feature-request.md ├── .gitignore ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.md ├── build └── lib │ └── genoml │ ├── __init__.py │ ├── __main__.py │ ├── cli │ ├── __init__.py │ ├── continuous_supervised_test.py │ ├── continuous_supervised_train.py │ ├── continuous_supervised_tune.py │ ├── discrete_supervised_test.py │ ├── discrete_supervised_train.py │ ├── discrete_supervised_tune.py │ ├── harmonizing.py │ └── munging.py │ ├── continuous │ ├── __init__.py │ └── supervised │ │ ├── __init__.py │ │ ├── testing.py │ │ ├── training.py │ │ └── tuning.py │ ├── dependencies.py │ ├── discrete │ ├── __init__.py │ └── supervised │ │ ├── __init__.py │ │ ├── testing.py │ │ ├── training.py │ │ └── tuning.py │ ├── misc │ └── descriptions.json │ ├── preprocessing │ ├── __init__.py │ ├── adjuster.py │ ├── featureselection.py │ ├── harmonizing.py │ ├── munging.py │ └── vif.py │ └── utils.py ├── dist └── genoml2-1.0.1.tar.gz ├── docs ├── GettingStarted.sh └── current_file_structure.txt ├── examples ├── continuous │ ├── example_GWAS.csv │ ├── to_adjust.txt │ ├── training.bed │ ├── training.bim │ ├── training.fam │ ├── training_addit.csv │ ├── training_addit_confounder_example.csv │ ├── training_pheno.csv │ ├── validation.bed │ ├── validation.bim │ ├── validation.fam │ ├── validation_addit.csv │ └── validation_pheno.csv └── discrete │ ├── example_GWAS.csv │ ├── to_adjust.txt │ ├── training.bed │ ├── training.bim │ ├── training.fam │ ├── training_addit.csv │ ├── training_addit_confounder_example.csv │ ├── training_pheno.csv │ ├── validation.bed │ ├── validation.bim │ ├── validation.fam │ ├── validation_addit.csv │ └── validation_pheno.csv ├── genoml ├── __init__.py ├── __main__.py ├── cli │ ├── __init__.py │ ├── continuous_supervised_test.py │ ├── continuous_supervised_train.py │ ├── continuous_supervised_tune.py │ ├── discrete_supervised_test.py │ ├── 
discrete_supervised_train.py │ ├── discrete_supervised_tune.py │ ├── harmonizing.py │ └── munging.py ├── continuous │ ├── __init__.py │ └── supervised │ │ ├── __init__.py │ │ ├── testing.py │ │ ├── training.py │ │ └── tuning.py ├── dependencies.py ├── discrete │ ├── __init__.py │ ├── supervised │ │ ├── __init__.py │ │ ├── testing.py │ │ ├── training.py │ │ └── tuning.py │ └── utils.py ├── misc │ └── descriptions.json ├── preprocessing │ ├── __init__.py │ ├── adjuster.py │ ├── featureselection.py │ ├── harmonizing.py │ ├── munging.py │ └── vif.py └── utils.py ├── genoml2.egg-info ├── PKG-INFO ├── SOURCES.txt ├── dependency_links.txt ├── entry_points.txt ├── requires.txt └── top_level.txt ├── logo.png ├── outputs ├── test_discrete_geno.approx_feature_importance.txt ├── test_discrete_geno.best_algorithm.txt ├── test_discrete_geno.dataForML.h5 ├── test_discrete_geno.list_features.txt ├── test_discrete_geno.trainedModel.joblib ├── test_discrete_geno.trainedModel_trainingSample_Predictions.csv ├── test_discrete_geno.trainedModel_withheldSample_Predictions.csv ├── test_discrete_geno.trainedModel_withheldSample_ROC.png ├── test_discrete_geno.trainedModel_withheldSample_probabilities.png ├── test_discrete_geno.training_withheldSamples_performanceMetrics.csv ├── test_discrete_geno.tunedModel.joblib ├── test_discrete_geno.tunedModel_CV_Summary.csv ├── test_discrete_geno.tunedModel_allSample_Predictions.csv ├── test_discrete_geno.tunedModel_allSample_probabilities.png ├── test_discrete_geno.tunedModel_top10Iterations_Summary.csv ├── test_discrete_geno.umap_clustering.joblib ├── test_discrete_geno.umap_data_reduction.csv ├── test_discrete_geno.umap_plot.png ├── test_discrete_geno.variants_and_alleles.tab ├── validation_test_discrete_geno.dataForML.h5 ├── validation_test_discrete_geno.finalHarmonizedCols_toKeep.txt ├── validation_test_discrete_geno.list_features.txt ├── validation_test_discrete_geno.refColsHarmonize_toKeep.txt ├── 
validation_test_discrete_geno.refSNPs_andAlleles.bed ├── validation_test_discrete_geno.refSNPs_andAlleles.bim ├── validation_test_discrete_geno.refSNPs_andAlleles.fam ├── validation_test_discrete_geno.testedModel_allSample_ROC.png ├── validation_test_discrete_geno.testedModel_allSample_predictions.csv ├── validation_test_discrete_geno.testedModel_allSample_probabilities.png ├── validation_test_discrete_geno.testedModel_allSamples_performanceMetrics.csv └── validation_test_discrete_geno.variants_and_alleles.tab ├── requirements.txt ├── setup.cfg └── setup.py /.github/ISSUE_TEMPLATE/bug-performance-issue.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug/Performance Issue 3 | about: 'Use this template for reporting a bug or a performance issue. ' 4 | title: '' 5 | labels: type:bug/performance 6 | assignees: '' 7 | 8 | --- 9 | 10 | Please make sure that this is a bug. 11 | 12 | **System information:** 13 | - OS Platform and Distribution (e.g., Linux Ubuntu 16.04): 14 | - GenoML Installed from (source or binary): 15 | - GenoML Version: 16 | - Python Version: 17 | 18 | **Describe the current behavior:** 19 | 20 | **Describe the expected behavior:** 21 | 22 | **Code to reproduce the issue:** 23 | Provide a reproducible test case that is the bare minimum necessary to generate the problem. 24 | 25 | **Other Information / Logs** 26 | Include any logs or source code that would be helpful to diagnose the problem. If including tracebacks, please include the full traceback. Large logs and files should be attached. 27 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation-issue.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Documentation Issue 3 | about: 'Use this template for documentation related issues. 
' 4 | title: '' 5 | labels: type:docs 6 | assignees: '' 7 | 8 | --- 9 | 10 | Please make sure that this is a documentation issue. 11 | 12 | 13 | **System information** 14 | - GenoML Version: 15 | - Doc Link: 16 | 17 | 18 | **Describe the documentation Issue** 19 | 20 | **We welcome contributions by users. Will you be able to fix the Doc issue?** 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature Request 3 | about: Use this template for raising a feature request. 4 | title: '' 5 | labels: type:feature 6 | assignees: '' 7 | 8 | --- 9 | 10 | Please make sure that this is a feature request. 11 | 12 | **System information:** 13 | - GenoML Version: 14 | - GenoML Installed from (source or binary): 15 | - Are you willing to contribute to this request? (Yes/No): 16 | 17 | **Describe Current Behavior/State and Recommended Feature Request:** 18 | 19 | **Will this change the current API? How?** 20 | 21 | **Who Will Benefit from this Feature?** 22 | 23 | **Any Additional Information?** 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | outputs/.DS_Store 3 | .DS_Store 4 | genoml-git.code-workspace 5 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" # current default Python on Travis CI 4 | - "3.7" 5 | - "3.8" 6 | - "3.8-dev" # 3.8 development branch 7 | - "nightly" # nightly build 8 | install: 9 | - pip install . 
10 | # command to run tests 11 | script: 12 | - echo "TODO" -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt -------------------------------------------------------------------------------- /build/lib/genoml/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The GenoML Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | from genoml import preprocessing 17 | from genoml import discrete 18 | from genoml import continuous 19 | from genoml import cli 20 | -------------------------------------------------------------------------------- /build/lib/genoml/cli/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The GenoML Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | -------------------------------------------------------------------------------- /build/lib/genoml/cli/continuous_supervised_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The GenoML Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
def main(prefix, test_prefix, refModel_prefix):
    """Apply a saved continuous model to a test dataset and export the results.

    Args:
        prefix: Prefix handed to ``supervised.test`` as the run prefix.
        test_prefix: Prefix of the test dataset; ``.dataForML.h5`` is appended
            and the ``dataForML`` key is read from that file.
        refModel_prefix: Prefix of the saved model; ``.joblib`` is appended
            and the file is loaded with ``joblib.load``.
    """
    # Everything shown to the user before any file is touched, in order.
    intro_lines = (
        "",
        "Here is some basic info on the command you are about to run.",
        "Python version info...",
        sys.version,
        "CLI argument info...",
        f"You are importing this test dataset: {test_prefix}.",
        f"You are applying the model saved here: {refModel_prefix}.",
        f"The results of this test application of your model will be saved in files with the given prefix: {prefix}.",
        "As a note, in all exported probabilities and other graphics, case status is treated as a 0 or 1, with 1 representing a positive case.",
        "",
    )
    for message in intro_lines:
        print(message)

    # Load the munged test data and the previously exported model.
    test_data = pd.read_hdf(test_prefix + ".dataForML.h5", key="dataForML")
    reference_model = joblib.load(refModel_prefix + ".joblib")

    tester = supervised.test(test_data, reference_model, prefix)

    tester.prep_df()                   # prep and show the dataframe
    tester.performance_metrics()       # output the performance metrics
    tester.export_pheno_predictions()  # export predictions on withheld data
    tester.regression_summary()        # export regression plot + summary

    # Thank the user
    for message in (
        "",
        "Let's shut everything down, thanks for testing your model with GenoML!",
        "",
    ):
        print(message)
# ---- build/lib/genoml/cli/continuous_supervised_train.py ----

# TODO(mary): use or remove export_predictions
@utils.DescriptionLoader.function_description("cli/continuous_supervised_train")
def main(run_prefix, export_predictions, matching_columns_path):
    """Train the continuous supervised models for ``run_prefix``.

    Args:
        run_prefix: Prefix of the run; ``.dataForML.h5`` is appended to find
            the input and the prefix is forwarded to the trainer.
        export_predictions: Currently not read by this function (see TODO).
        matching_columns_path: Optional path to a text file with one column
            name per line. When truthy, the dataframe is restricted to the
            columns that appear both in the data and in that file.
    """
    utils.DescriptionLoader.print(
        "cli/continuous_supervised_train/info",
        python_version=sys.version,
        prefix=run_prefix,
    )

    h5_path = run_prefix + ".dataForML.h5"
    with utils.DescriptionLoader.context(
            "cli/continuous_supervised_train/input", path=h5_path):
        frame = pd.read_hdf(h5_path, key="dataForML")

    if matching_columns_path:
        with utils.DescriptionLoader.context(
                "cli/continuous_supervised_train/matching_columns_path",
                matching_columns_path=matching_columns_path):
            with open(matching_columns_path, 'r') as columns_file:
                wanted_columns = columns_file.read().splitlines()

            # Keep only the columns present in both the data and the file.
            shared_columns = np.intersect1d(frame.columns, wanted_columns)
            frame = frame[shared_columns]

    trainer = supervised.train(frame, run_prefix)
    trainer.summary()
    trainer.compete()
    trainer.export_model()
    trainer.export_predictions()
    trainer.save_algorithm_results(run_prefix)
    trainer.save_best_algorithm(run_prefix)


# ---- build/lib/genoml/cli/continuous_supervised_tune.py ----

def main(run_prefix, max_iter, cv_count):
    """Tune the best continuous algorithm found during training.

    Args:
        run_prefix: Prefix of the run; ``.dataForML.h5`` and
            ``.best_algorithm.txt`` are appended to locate the inputs.
        max_iter: Maximum number of tuning iterations, forwarded to
            ``supervised.tune``.
        cv_count: Number of cross-validation rounds, forwarded to
            ``supervised.tune``.
    """
    # TUNING
    # Create a dialogue with the user
    intro_lines = (
        "Here is some basic info on the command you are about to run.",
        "CLI argument info...",
        f"Working with the dataset and best model corresponding to prefix {run_prefix} the timestamp from the merge is the prefix in most cases.",
        f"Your maximum number of tuning iterations is {max_iter} and if you are concerned about runtime, make this number smaller.",
        f"You are running {cv_count} rounds of cross-validation, and again... if you are concerned about runtime, make this number smaller.",
        "Give credit where credit is due, for this stage of analysis we use code from the great contributors to python packages: argparse, xgboost, sklearn, pandas, numpy, time, matplotlib and seaborn.",
        "As a note, in all exported probabilities and other graphics, case status is treated as a 0 or 1, with 1 representing a positive case.",
        "",
    )
    for message in intro_lines:
        print(message)

    frame = pd.read_hdf(run_prefix + ".dataForML.h5", key="dataForML")

    # Split off the PHENO and ID columns. These locals are not used again in
    # this function; the tuner below receives the full dataframe.
    phenotype = frame.PHENO
    features = frame.drop(columns=['PHENO'])
    sample_ids = features.ID
    features = features.drop(columns=['ID'])

    # The training stage wrote the winning algorithm's name to this file.
    best_algo_table = pd.read_csv(
        run_prefix + '.best_algorithm.txt', header=None, index_col=False)
    best_algo = str(best_algo_table.iloc[0, 0])

    # Communicate to the user the best identified algorithm
    print(f"From previous analyses in the training phase, we've determined that the best algorithm for this application is {best_algo}... so let's tune it up and see what gains we can make!")

    # Tuning
    ## This calls on the functions made in the tune class (tuning.py) at the genoml.continuous.supervised
    tuner = supervised.tune(frame, run_prefix, max_iter, cv_count)
    tuner.select_tuning_parameters()  # Returns algo, hyperparameters, and scoring_metric
    tuner.apply_tuning_parameters()   # Randomized search with CV to tune
    tuner.report_tune()               # Summary of the top 10 iterations of the hyperparameter tune
    tuner.summarize_tune()            # Summary of the cross-validation
    tuner.compare_performance()       # Compares tuned performance to baseline
    tuner.export_tuned_data()         # Export the newly tuned predictions
    tuner.export_tune_regression()    # Export the tuned and fitted regression model

    for message in ("", "End of tuning stage with GenoML.", ""):
        print(message)
def main(prefix, test_prefix, refModel_prefix):
    """Apply a saved discrete model to a test dataset and export the results.

    Args:
        prefix: Prefix handed to ``supervised.test`` as the run prefix.
        test_prefix: Prefix of the test dataset; ``.dataForML.h5`` is appended
            and the ``dataForML`` key is read from that file.
        refModel_prefix: Prefix of the saved model; ``.joblib`` is appended
            and the file is loaded with ``joblib.load``.
    """
    # Intro plus the chosen CLI arguments, printed in this order.
    opening = [
        "",
        "Here is some basic info on the command you are about to run.",
        "Python version info...",
        sys.version,
        "CLI argument info...",
        f"You are importing this test dataset: {test_prefix}.",
        f"You are applying the model saved here: {refModel_prefix}.",
        f"The results of this test application of your model will be saved in files with the given prefix: {prefix}.",
        "As a note, in all exported probabilities and other graphics, case status is treated as a 0 or 1, with 1 representing a positive case.",
        "",
    ]
    for text in opening:
        print(text)

    # Inputs: the munged test dataframe and the reference model.
    h5_path = test_prefix + ".dataForML.h5"
    model_path = refModel_prefix + ".joblib"
    test_frame = pd.read_hdf(h5_path, key="dataForML")
    fitted_model = joblib.load(model_path)

    runner = supervised.test(test_frame, fitted_model, prefix)

    runner.prep_df()              # prep and show the dataframe
    runner.export_ROC()           # export the ROC
    runner.export_tested_data()   # export the tested data
    runner.export_histograms()    # export the histograms
    runner.additional_sumstats()  # export the additional summary stats

    # Thank the user
    closing = [
        "",
        "Let's shut everything down, thanks for testing your model with GenoML!",
        "",
    ]
    for text in closing:
        print(text)
def main(prefix, metric_max, prob_hist, auc, matchingCols):
    """Train and compete the discrete supervised algorithms for ``prefix``.

    Args:
        prefix: Run prefix; ``.dataForML.h5`` is appended to find the input
            and the prefix is forwarded to the trainer and to ``save_results``.
        metric_max: Metric name passed to ``model.results`` to pick the
            winning algorithm.
        prob_hist: Not read by this function; kept for interface compatibility.
        auc: Not read by this function; kept for interface compatibility.
        matchingCols: Optional path to a text file with one column name per
            line. When not ``None``, the dataframe is restricted to the
            columns found both in the data and in that file.
    """
    print("")
    print("Here is some basic info on the command you are about to run.")
    print("Python Version info...")
    print(sys.version)

    # Print out chosen CLI arguments
    print("CLI argument info...")
    print(f"Working with dataset {prefix} from previous data munging efforts.")
    print(f"You have chosen to compete the algorithms based on {metric_max}.")
    print("Give credit where credit is due, for this stage of analysis we use code from the great contributors to Python packages: argparse, xgboost, sklearn, pandas, numpy, time, matplotlib and seaborn.")
    print("As a note, in all exported probabilities and other graphics, case status is treated as a 0 or 1, with 1 representing a positive case.")
    print("")

    # Specify prefix and dataframe variables to be passed into class
    run_prefix = prefix
    infile_h5 = run_prefix + ".dataForML.h5"
    df = pd.read_hdf(infile_h5, key="dataForML")

    # Identity comparison is the correct test for the None singleton (PEP 8).
    if matchingCols is not None:
        print(f"Looks like you are retraining your reference file. We are using the harmonized columns you provided here: {matchingCols}")
        print("Note that you might have different/less features than before, given this was harmonized between training and test dataset, and might mean your model now performs worse...")

        with open(matchingCols, 'r') as matchingCols_file:
            matching_column_names_list = matchingCols_file.read().splitlines()

        # Keep only the columns found in the file
        # (np.intersect1d returns the sorted, unique common names).
        df = df[np.intersect1d(df.columns, matching_column_names_list)]

    model = supervised.train(df, run_prefix)
    model.summary()

    # Give user context prior to competing algorithms
    # Explains to users how we are splitting their data 70:30
    print("")
    print("Now let's compete these algorithms!")
    print("We'll update you as each algorithm runs, then summarize at the end.")
    print("Here we test each algorithm under default settings using the same training and test datasets derived from a 70% training and 30% testing split of your data.")
    print("For each algorithm, we will output the following metrics...")
    print("Algorithm name, hoping that's pretty self-explanatory. Plenty of resources on these common ML algorithms at https://scikit-learn.org and https://xgboost.readthedocs.io/.")
    print("AUC_percent, this is the area under the curve from receiver operating characteristic analyses. This is the most common metric of classifier performance in biomedical literature, we express this as a percent. We calculate AUC based on the predicted probability of being a case.")
    print("Accuracy_percent, this is the simple accuracy of the classifier, how many predictions were correct from best classification cutoff (python default).")
    print("Balanced_Accuracy_Percent, consider this as the accuracy resampled to a 1:1 mix of cases and controls. Imbalanced datasets can give funny results for simple accuracy.")
    print("Log_Loss, this is essentially the inverse of the likelihood function for a correct prediction, you want to minimize this.")
    print("Sensitivity, proportion of cases correctly identified.")
    print("Specificity, proportion of controls correctly identified.")
    print("PPV, this is the positive predictive value, the probability that subjects with a positive result actually have the disease.")
    print("NPV, this is the negative predictive value, the probability that subjects with a negative result don't have the disease.")
    print("We also log the runtimes per algorithm.")
    print("")
    print("Algorithm summaries incoming...")
    print("")

    # Compete the algorithms
    model.compete()

    # Output the results of the log
    model.results(metric_max)

    # Export the results
    model.export_model()

    # Export the AUC
    model.AUC(save=True)

    # Export the probability histograms
    model.export_prob_hist()

    # Save out the proper algorithm
    model.save_results(prefix, algorithmResults=True, bestAlgorithm=True)

    print("Thank you for training with GenoML!")
def main(run_prefix, metric_tune, max_iter, cv_count, matchingCols):
    """Tune the best discrete algorithm found during training.

    Args:
        run_prefix: Prefix of the run; ``.dataForML.h5`` and
            ``.best_algorithm.txt`` are appended to locate the inputs.
        metric_tune: Metric name passed to ``select_tuning_parameters``.
        max_iter: Maximum number of tuning iterations, forwarded to
            ``supervised.tune``.
        cv_count: Number of cross-validation rounds, forwarded to
            ``supervised.tune``.
        matchingCols: Optional path to a text file with one column name per
            line. When not ``None``, the dataframe is restricted to the
            columns found both in the data and in that file.
    """
    # TUNING
    # Create a dialogue with the user
    print("Here is some basic info on the command you are about to run.")
    print("CLI argument info...")
    print(f"Working with the dataset and best model corresponding to prefix {run_prefix} the timestamp from the merge is the prefix in most cases.")
    print(f"You have chosen to tune the algorithms based on {metric_tune}.")
    print(f"Your maximum number of tuning iterations is {max_iter} and if you are concerned about runtime, make this number smaller.")
    print(f"You are running {cv_count} rounds of cross-validation, and again... if you are concerned about runtime, make this number smaller.")
    print("Give credit where credit is due, for this stage of analysis we use code from the great contributors to python packages: argparse, xgboost, sklearn, pandas, numpy, time, matplotlib and seaborn.")
    print("As a note, in all exported probabilities and other graphics, case status is treated as a 0 or 1, with 1 representing a positive case.")

    print("")

    infile_h5 = run_prefix + ".dataForML.h5"
    df = pd.read_hdf(infile_h5, key="dataForML")

    # Addressing issue #12:
    # Identity comparison is the correct test for the None singleton (PEP 8).
    if matchingCols is not None:
        print(f"We are using the harmonized columns you provided here: {matchingCols}")
        print("Note that you might have different/less features than before, given this was column list was harmonized between your reference and test dataset...")

        with open(matchingCols, 'r') as matchingCols_file:
            matching_column_names_list = matchingCols_file.read().splitlines()

        # Keep only the columns found in the file
        # (np.intersect1d returns the sorted, unique common names).
        df = df[np.intersect1d(df.columns, matching_column_names_list)]

    # The training stage wrote the winning algorithm's name to this file.
    best_algo_name_in = run_prefix + '.best_algorithm.txt'
    best_algo_df = pd.read_csv(best_algo_name_in, header=None, index_col=False)
    best_algo = str(best_algo_df.iloc[0, 0])

    # Communicate to the user the best identified algorithm
    print(f"From previous analyses in the training phase, we've determined that the best algorithm for this application is {best_algo}... so let's tune it up and see what gains we can make!")

    # Tuning
    ## This calls on the functions made in the tune class (tuning.py) at the genoml.discrete.supervised
    model_tune = supervised.tune(df, run_prefix, max_iter, cv_count)

    # Returns algo, hyperparameters, and scoring_metric
    model_tune.select_tuning_parameters(metric_tune)

    # Randomized search with CV to tune
    model_tune.apply_tuning_parameters()

    # Summary of the top 10 iterations of the hyperparameter tune
    model_tune.report_tune()

    # Summary of the cross-validation
    model_tune.summarize_tune()

    # Compares tuned performance to baseline
    model_tune.compare_performance()

    # Export the ROC curve
    # model_tune.ROC()

    # Export the newly tuned predictions
    model_tune.export_tuned_data()

    # Export the probabilities
    model_tune.export_tune_hist_prob()

    print("")
    print("End of tuning stage with GenoML.")
    print("")
def main(test_geno_prefix, test_prefix, ref_model_prefix,
         training_snps_alleles):
    """Harmonize a test genotype dataset against a reference dataset.

    Args:
        test_geno_prefix: Prefix of the test dataset, forwarded to
            ``preprocessing.harmonizing`` as ``test_geno_prefix``.
        test_prefix: Output tag, forwarded as ``test_out_prefix``.
        ref_model_prefix: Prefix of the reference model, forwarded as
            ``ref_model_prefix``.
        training_snps_alleles: SNP and allele information used for the
            comparison, forwarded as ``training_SNPs``.
    """
    # Print configurations
    configuration_lines = (
        "Here is some basic info on the command you are about to run.",
        "Python version info...",
        sys.version,
        "CLI argument info...",
        f"You are importing test dataset {test_geno_prefix}.",
        f"Applying the model saved from your reference dataset in {ref_model_prefix}.",
        f"Reading in the SNP and allele information we will use to compare from {training_snps_alleles}.",
        f"The results of this test application of your model will be saved in files tagged {test_prefix}.",
        "As a note, in all exported probabilities and other graphics, case status is treated as a 0 or 1, with 1 representing a positive case.",
    )
    for entry in configuration_lines:
        print(entry)

    # Run the harmonize script in genoml.preprocessing
    harmonizer_options = {
        "test_geno_prefix": test_geno_prefix,
        "test_out_prefix": test_prefix,
        "ref_model_prefix": ref_model_prefix,
        "training_SNPs": training_snps_alleles,
    }
    harmonizer = preprocessing.harmonizing(**harmonizer_options)

    # Generate new binaries from the test dataset using the reference dataset SNPs
    harmonizer.generate_new_PLINK()

    # Read in PLINK binaries
    # harmonizer.read_PLINK()

    # Generate reference columns to keep for munging
    harmonizer.prep_refCols_file()

    # Thank the user
    print("Thank you for harmonizing with GenoML!")
on the command you are about to run.") 48 | print("Python version info...") 49 | print(sys.version) 50 | print("CLI argument info...") 51 | print( 52 | f"The output prefix for this run is {run_prefix} and will be appended to later runs of GenoML.") 53 | print(f"Working with genotype data? {geno_path}") 54 | print(f"Do you want GenoML to prune your SNPs for you? {prune_choice}") 55 | print(f"The pruning threshold you've chosen is {r2_cutoff}") 56 | print(f"Working with additional predictors? {addit_path}") 57 | print(f"Where is your phenotype file? {pheno_path}") 58 | print(f"Any use for an external set of GWAS summary stats? {gwas_path}") 59 | print( 60 | f"If you plan on using external GWAS summary stats for SNP filtering, we'll only keep SNPs at what P value? {p_gwas}") 61 | print(f"How strong is your VIF filter? {vif_thresh}") 62 | print(f"How many iterations of VIF filtering are you doing? {vif_iter}") 63 | print( 64 | f"The imputation method you picked is using the column {impute_type} to fill in any remaining NAs.") 65 | print(f"Will you be adjusting additional features using UMAP dimensionality reduction? {umap_reduce}") 66 | print( 67 | "Give credit where credit is due, for this stage of analysis we use code from the great contributors to python packages: os, sys, argparse, numpy, pandas, joblib, math and time. 
We also use PLINK v1.9 from https://www.cog-genomics.org/plink/1.9/.") 68 | print("") 69 | 70 | # Run the munging script in genoml.preprocessing 71 | munger = preprocessing.munging(pheno_path=pheno_path, run_prefix=run_prefix, impute_type=impute_type, skip_prune=prune_choice, 72 | p_gwas=p_gwas, addit_path=addit_path, gwas_path=gwas_path, geno_path=geno_path, refColsHarmonize=refColsHarmonize, r2_cutoff=r2_cutoff) 73 | 74 | # Process the PLINK inputs (for pruning) 75 | df = munger.plink_inputs() 76 | 77 | # Run the UMAP dimension reduction/ adjuster 78 | if (adjust_data == "yes" or umap_reduce == "yes"): 79 | adjuster = preprocessing.adjuster(run_prefix, df, target_features, confounders, adjust_data, adjust_normalize, umap_reduce) 80 | reduced_df = adjuster.umap_reducer() 81 | if (adjust_data == "yes"): 82 | print(f"\n You have chosen to adjust your data! \n") 83 | if (adjust_normalize == "yes"): 84 | print(f"\n You have also chosen to normalize your adjusted data \n") 85 | else: 86 | print(f"\n You have also chosen NOT to normalize your adjusted data \n") 87 | df = adjuster.normalize(reduced_df) 88 | 89 | # Run the feature selection using extraTrees 90 | if n_est > 0: 91 | featureSelection_df = preprocessing.featureselection(run_prefix, df, data_type, n_est) 92 | df = featureSelection_df.rank() 93 | featureSelection_df.export_data() 94 | 95 | # Run the VIF calculation 96 | if vif_iter > 0: 97 | vif_calc = preprocessing.vif(vif_iter, vif_thresh, df, 100, run_prefix) 98 | vif_calc.vif_calculations() 99 | 100 | # Thank the user 101 | print("Thank you for munging with GenoML!") 102 | -------------------------------------------------------------------------------- /build/lib/genoml/continuous/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The GenoML Authors. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | from genoml.continuous import supervised 17 | -------------------------------------------------------------------------------- /build/lib/genoml/continuous/supervised/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The GenoML Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | from genoml.continuous.supervised.training import train 17 | from genoml.continuous.supervised.tuning import tune 18 | from genoml.continuous.supervised.testing import test 19 | -------------------------------------------------------------------------------- /build/lib/genoml/continuous/supervised/testing.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The GenoML Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
class test:
    """Evaluate an already-trained model on a continuous-phenotype dataset.

    Intended call order: ``prep_df()`` -> ``performance_metrics()`` ->
    ``export_pheno_predictions()`` -> ``regression_summary()``. Later steps
    read attributes (``X_test``, ``y_test``, ``IDs_test``, ``test_out``) that
    earlier steps set.
    """

    def __init__(self, df, loaded_model, run_prefix):
        """Store the inputs.

        Args:
            df: DataFrame containing ``PHENO`` and ``ID`` columns plus the
                feature columns.
            loaded_model: Fitted estimator exposing ``predict``.
            run_prefix: Prefix prepended to every output file name.
        """
        self.df = df
        self.run_prefix = run_prefix
        self.loaded_model = loaded_model

    def prep_df(self):
        """Split ``self.df`` into features, phenotype and sample IDs.

        Sets ``self.y_test``, ``self.X_test`` and ``self.IDs_test``.

        Returns:
            The feature frame (``self.df`` without ``PHENO`` and ``ID``).
        """
        print("")
        print("Your data looks like this (showing the first few lines of the left-most and right-most columns)...")
        print("#"*70)
        print(self.df.describe())
        print("#"*70)
        print("")

        # Save out and drop the PHENO and sample ID columns
        y_test = self.df.PHENO
        features_with_id = self.df.drop(columns=['PHENO'])
        IDs_test = features_with_id.ID
        X_test = features_with_id.drop(columns=['ID'])

        # Keep the pieces on the instance for the later steps
        self.y_test = y_test
        self.X_test = X_test
        self.IDs_test = IDs_test

        return X_test

    def performance_metrics(self):
        """Score the loaded model on the prepared data and log the metrics.

        Requires ``prep_df()`` to have been called. The model is used as
        loaded: it is NOT refitted here, because fitting on the samples being
        scored would leak the test data into the model and invalidate the
        reported metrics.

        Writes ``<run_prefix>.testedModel_allSamples_performanceMetrics.csv``.

        Returns:
            A one-row DataFrame with the four regression metrics.
        """
        log_cols = ["Explained_variance_score", "Mean_squared_error", "Median_absolute_error", "R2_score"]

        print("")
        print("#"*70)

        # One prediction pass is enough; all four metrics score the same output.
        test_predictions = self.loaded_model.predict(self.X_test)

        evs = explained_variance_score(self.y_test, test_predictions)
        print("Explained variance score: {:.4}".format(evs))

        mse = mean_squared_error(self.y_test, test_predictions)
        print("Mean squared error: {:.4}".format(mse))

        mae = median_absolute_error(self.y_test, test_predictions)
        print("Median absolute error: {:.4}".format(mae))

        r2s = r2_score(self.y_test, test_predictions)
        print("R^2 score: {:.4}".format(r2s))

        # Built directly: DataFrame.append no longer exists in pandas >= 2.0.
        log_table = pd.DataFrame([[evs, mse, mae, r2s]], columns=log_cols)

        print("#"*70)

        print("")

        log_outfile = self.run_prefix + '.testedModel_allSamples_performanceMetrics.csv'

        print("")
        print(f"This table below is also logged as {log_outfile} and is in your current working directory...")
        print("#"*70)
        print(log_table)
        print("#"*70)
        print("")

        log_table.to_csv(log_outfile, index=False)

        self.log_table = log_table
        return log_table

    def export_pheno_predictions(self):
        """Export reported vs. predicted phenotype per sample.

        Requires ``prep_df()`` to have been called. Writes
        ``<run_prefix>.testedModel_allSample_predictions.csv`` and sets
        ``self.test_out``.

        Returns:
            DataFrame with columns ``ID``, ``PHENO_REPORTED`` and
            ``PHENO_PREDICTED``.
        """
        test_predicted_values = self.loaded_model.predict(self.X_test)
        test_predicted_values_df = pd.DataFrame(test_predicted_values)
        y_test_df = pd.DataFrame(self.y_test)
        IDs_test_df = pd.DataFrame(self.IDs_test)

        # reset_index() on the IDs keeps the old index as an extra first
        # column, which is named INDEX and then discarded.
        test_out = pd.concat(
            [
                IDs_test_df.reset_index(),
                y_test_df.reset_index(drop=True),
                test_predicted_values_df.reset_index(drop=True),
            ],
            axis=1,
            ignore_index=True,
        )
        test_out.columns = ["INDEX", "ID", "PHENO_REPORTED", "PHENO_PREDICTED"]
        test_out = test_out.drop(columns=["INDEX"])

        test_outfile = self.run_prefix + '.testedModel_allSample_predictions.csv'
        test_out.to_csv(test_outfile, index=False)

        print("")
        print(f"Preview of the exported predictions exported as {test_outfile}, these are pretty straight forward.")
        print("They generally include the sample ID, the previously reported phenotype, and the predicted phenotype from that algorithm.")
        print("#"*70)
        print(test_out.head())
        print("#"*70)

        self.test_out = test_out
        return test_out

    def regression_summary(self):
        """Plot and summarise PHENO_REPORTED ~ PHENO_PREDICTED.

        Requires ``export_pheno_predictions()`` to have been called, since it
        reads ``self.test_out``. Writes
        ``<run_prefix>.testedModel_allSamples_regressionPlot.png`` and
        ``<run_prefix>.testedModel_allSamples_regressionSummary.csv``.
        """
        sns_plot = sns.regplot(data=self.test_out, y="PHENO_REPORTED", x="PHENO_PREDICTED", scatter_kws={"color": "cyan"}, line_kws={"color": "purple"})

        plot_out = self.run_prefix + '.testedModel_allSamples_regressionPlot.png'
        sns_plot.figure.savefig(plot_out, dpi=600)

        print("")
        print(f"We are also exporting a regression plot for you here {plot_out}, this is a graphical representation of the difference between the reported and predicted phenotypes in the withheld test data for the best performing algorithm.")

        print("")
        print("Here is a quick summary of the regression comparing PHENO_REPORTED ~ PHENO_PREDICTED in the withheld test data...")
        print("")

        reg_model = sm.ols(formula='PHENO_REPORTED ~ PHENO_PREDICTED', data=self.test_out)
        fitted = reg_model.fit()
        print(fitted.summary())

        # The leading '.' was missing, which glued the suffix onto the run
        # prefix, unlike every other output file of this class.
        fitted_out = self.run_prefix + '.testedModel_allSamples_regressionSummary.csv'

        with open(fitted_out, 'w') as fh:
            fh.write(fitted.summary().as_csv())

        print(f"We are exporting this summary here: {fitted_out}")

        print("")
        print("...always good to see the P value for the predictor.")
def __get_executable_folder():
    """Return the directory that holds the external executables.

    Uses the ``GENOML_DEP_DIR`` environment variable (made absolute) when it
    is set, otherwise ``~/.genoml/misc/executables``.
    """
    key = "GENOML_DEP_DIR"
    if key in os.environ:
        return os.path.abspath(os.environ.get(key))
    return os.path.join(str(pathlib.Path.home()), ".genoml", "misc",
                        "executables")


__executable_folder = __get_executable_folder()


def __check_exec(exec_path, *args, absolute_path=False):
    """Return True when the executable exists and can be launched.

    Args:
        exec_path: Binary name relative to the executable folder, or a full
            path when ``absolute_path`` is true.
        *args: Arguments passed to the binary for the probe run.
        absolute_path: Treat ``exec_path`` as a complete path.

    The exit status of the probe run is deliberately not inspected; only a
    failure to launch (for example a missing execute permission) counts as
    "not usable", so the caller can reinstall instead of crashing.
    """
    if not absolute_path:
        binary_path = os.path.join(__executable_folder, exec_path)
    else:
        binary_path = exec_path
    if not os.path.exists(binary_path):
        return False

    try:
        subprocess.run([binary_path, *args], stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL)
    except OSError:
        return False
    return True


def __install_exec(url, exec_path):
    """Download a zip archive from ``url`` and unpack it into the executable
    folder, then mark ``exec_path`` inside that folder as executable.

    Raises:
        requests.HTTPError: when the server answers with an error status.
    """
    # NOTE(review): the configured URLs are plain http, so the download is not
    # protected in transit; consider https plus a checksum.
    r = requests.get(url, timeout=60)
    # Fail here rather than handing an error page to zipfile.
    r.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(r.content), "r") as fp:
        fp.extractall(__executable_folder)

    binary_path = os.path.join(__executable_folder, exec_path)
    # Add the owner-execute bit to the existing mode instead of replacing the
    # whole mode, which would strip the read/write permissions.
    current_mode = os.stat(binary_path).st_mode
    os.chmod(binary_path, current_mode | stat.S_IXUSR)


def __check_package(name):
    """Return the path of the binary for ``name``, installing it if needed.

    Raises:
        EnvironmentError: for an unknown package, an unsupported operating
            system, or when the binary still cannot be run after installation.
    """
    platform_system = platform.system()

    if name not in __DEPENDENCIES:
        raise EnvironmentError("Unknown package: {}".format(name))

    if platform_system not in __DEPENDENCIES[name]:
        raise EnvironmentError(
            "Unsupported OS: {}".format(platform_system))

    entry = __DEPENDENCIES[name][platform_system]

    binary_name = entry["binary"]
    args = entry["version_args"]
    url = entry["url"]
    installed_path = os.path.join(__executable_folder, binary_name)

    if __check_exec(binary_name, *args):
        logging.debug("{} is found".format(name))
        return installed_path

    logging.warning("Installing {}".format(name))
    __install_exec(url, binary_name)
    if not __check_exec(binary_name, *args):
        logging.warning("Failed to run {} after installation".format(name))
        raise EnvironmentError("Can not install {}".format(name))
    return installed_path


@utils.DescriptionLoader.function_description("check_dependencies")
def check_dependencies():
    """Run every registered dependency checker.

    Returns:
        dict mapping package name to the value returned by its checker.
    """
    ret = {}
    for package, data in __DEPENDENCIES.items():
        if "checker" in data:
            with utils.DescriptionLoader.context(
                    "check_dependencies_{}".format(package)):
                ret[package] = data["checker"]()

    return ret


def check_plink():
    """Ensure PLINK is available and return the path of its binary."""
    return __check_package("Plink")


# Registry of external tools: per package a "checker" callable plus one entry
# per platform.system() value giving the binary name, the arguments used for
# the probe run and the download URL.
__DEPENDENCIES = {
    "Plink": {
        "checker": check_plink,
        "Darwin": {
            "binary": "plink",
            "version_args": ["--version"],
            "url": "http://s3.amazonaws.com/plink1-assets/plink_mac_20200219.zip"
        },
        "Linux": {
            "binary": "plink",
            "version_args": ["--version"],
            "url": "http://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20200219.zip"
        }
    },
}
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | from genoml.discrete import supervised -------------------------------------------------------------------------------- /build/lib/genoml/discrete/supervised/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The GenoML Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | from genoml.discrete.supervised.training import train 17 | from genoml.discrete.supervised.tuning import tune 18 | from genoml.discrete.supervised.testing import test -------------------------------------------------------------------------------- /build/lib/genoml/misc/descriptions.json: -------------------------------------------------------------------------------- 1 | { 2 | "check_dependencies_Plink": { 3 | "title": "Checking plink", 4 | "description": "", 5 | "error": "" 6 | }, 7 | "check_dependencies": { 8 | "title": "Dependency Check", 9 | "description": "", 10 | "end": true, 11 | "error": "" 12 | }, 13 | "cli/continuous_supervised_train": { 14 | "title": "GenoML", 15 | "description": "Continuous Supervised Train", 16 | "end": true, 17 | "error": "" 18 | }, 19 | "cli/continuous_supervised_train/info": { 20 | "title": "Basic Info", 21 | "description": "Here is some basic info on the command you are about to run.\nPython version info:\n{python_version}\n\nWorking with dataset from previous data munging efforts at:\n\t{prefix}", 22 | "error": "" 23 | }, 24 | "cli/continuous_supervised_train/input": { 25 | "title": "Reading Input File: {path}", 26 | "description": "", 27 | "error": "" 28 | }, 29 | "cli/continuous_supervised_train/matching_columns_path": { 30 | "title": "", 31 | "description": "Looks like you are retraining your reference file. 
We are using the harmonized columns you provided here: {matching_columns_path}\nNote that you might have different/less features than before, given this was harmonized between training and test dataset, and might mean your model now performs worse...", 32 | "error": "" 33 | }, 34 | "continuous/supervised/training/Train/summary": { 35 | "title": "Input Data Summary", 36 | "description": "Your data looks like this (showing the first few lines of the left-most and right-most columns)...\n\n{data}", 37 | "error": "" 38 | }, 39 | "continuous/supervised/training/Train/compete": { 40 | "title": "Compete the algorithms", 41 | "description": "Now let's compete these algorithms!\nWe'll update you as each algorithm runs, then summarize at the end.\nHere we test each algorithm under default settings using the same training and test datasets derived from a 70% training and 30% testing split of your data.\nFor each algorithm, we will output the following metrics...\nAlgorithm name, hoping that's pretty self-explanatory. 
Plenty of resources on these common ML algorithms at https://scikit-learn.org and https://xgboost.readthedocs.io/.\nexplained_variance_score, this is the variance explained by the model per algorithm (scale from 0 to 1 with 1 being completely explained).\nmean_squared_error, this is the mean squared error from regression loss.\nmedian_absolute_error, median absolute error from regression loss.\nr2_score, standard r2 metric from linear regression (coefficient of determination), remember, this can be negative if your model is really bad.\nWe also log the runtimes per algorithm.\n\nAlgorithm summaries incoming...", 42 | "end": true, 43 | "error": "" 44 | }, 45 | "continuous/supervised/training/Train/compete/algorithm": { 46 | "title": "{name}", 47 | "description": "", 48 | "error": "" 49 | }, 50 | "continuous/supervised/training/Train/compete/algorithm/results": { 51 | "title": "{name} Results", 52 | "description": "{results}", 53 | "error": "" 54 | }, 55 | "continuous/supervised/training/Train/compete/algorithm/best": { 56 | "title": "Best Algorithm: {algorithm}", 57 | "description": "There are occasionally slight fluctuations in model performance on the same withheld samples.\n{metrics}", 58 | "error": "" 59 | }, 60 | "continuous/supervised/training/Train/export_model": { 61 | "title": "Exporting Model: {output_path}", 62 | "description": "this model has been saved as {output_path} for later use and can be found in your working directory.", 63 | "end": true, 64 | "error": "" 65 | }, 66 | "continuous/supervised/training/Train/save_algorithm_results": { 67 | "title": "Saving Algorithm Results: {output_path}", 68 | "description": "This table below is also logged as {output_path} and is in your current working directory...\n\n{data}", 69 | "end": true, 70 | "error": "" 71 | }, 72 | "continuous/supervised/training/Train/save_best_algorithm": { 73 | "title": "Saving Best Algorithm: {output_path}", 74 | "description": "Based on your withheld samples, the algorithm with the 
highest explained variance score is the {best_algorithm}... let's save that model name for you on {output_path}.", 75 | "end": true, 76 | "error": "" 77 | }, 78 | "continuous/supervised/training/Train/export_predictions/test_data": { 79 | "title": "Saving Prediction on Test Data: {output_path}", 80 | "description": "Preview of the exported predictions for the withheld test data that has been exported as {output_path} these are pretty straight forward.\nThey generally include the sample ID, the previously reported phenotype and the predicted phenotype from that algorithm,\n\n{data}", 81 | "end": true, 82 | "error": "" 83 | }, 84 | "continuous/supervised/training/Train/export_predictions/train_data": { 85 | "title": "Saving Prediction on Train Data: {output_path}", 86 | "description": "Preview of the exported predictions for the training samples which is naturally overfit and exported as {output_path} in the similar format as in the withheld test dataset that was just exported.\n\n{data}", 87 | "end": true, 88 | "error": "" 89 | }, 90 | "continuous/supervised/training/Train/export_predictions/plot": { 91 | "title": "Saving Regression Plot: {output_path}", 92 | "description": "Here is a quick summary of the regression comparing PHENO_REPORTED ~ PHENO_PREDICTED in the withheld test data...\n{data}\n...always good to see the P for the predictor.\n\nWe are also exporting a regression plot for you here {output_path} this is a graphical representation of the difference between the reported and predicted phenotypes in the withheld test data for the best performing algorithm.", 93 | "end": true, 94 | "error": "" 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /build/lib/genoml/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The GenoML Authors. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | from genoml.preprocessing.munging import munging 17 | from genoml.preprocessing.vif import vif 18 | from genoml.preprocessing.featureselection import featureselection 19 | from genoml.preprocessing.harmonizing import harmonizing 20 | from genoml.preprocessing.adjuster import adjuster 21 | 22 | __all__ = [] 23 | -------------------------------------------------------------------------------- /build/lib/genoml/preprocessing/adjuster.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The GenoML Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
class adjuster:
    """Adjust munged target features for confounders, optionally after
    reducing the confounders to two UMAP dimensions."""

    def __init__(self, run_prefix, df, target_features, confounders, adjust_data, adjust_normalize, umap_reduce):
        """Load the munged data, the target list and the confounders.

        Args:
            run_prefix: Prefix of the run; ``<run_prefix>.dataForML.h5`` is
                read (key ``dataForML``) and later overwritten.
            df: Accepted for interface compatibility; the munged data is
                re-read from disk rather than taken from this argument.
            target_features: Path of a file listing one target column name
                per line (read without a header).
            confounders: Path of a CSV holding the confounders; it must
                contain an ``ID`` column.
            adjust_data: Stored as ``self.adjust_data``.
            adjust_normalize: ``"yes"`` standardises the residuals.
            umap_reduce: ``"yes"`` enables the UMAP reduction.
        """
        self.run_prefix = run_prefix
        self.umap_reduce = umap_reduce
        self.target_columns = target_features
        self.confounders = confounders
        self.adjust_data = adjust_data
        self.normalize_switch = adjust_normalize

        self.munged_data = self.run_prefix + ".dataForML.h5"

        self.target_data_df = pd.read_hdf(self.munged_data, 'dataForML')
        self.target_column_df = pd.read_csv(self.target_columns, names=['TARGETS'])

        self.confounders_df = pd.read_csv(self.confounders)

        # Keep only intersecting feature names left in munged set (removed either because --gwas or std dev of 0 etc.)
        # File order is kept (duplicates dropped) so runs are reproducible;
        # a plain set intersection yields an arbitrary order.
        available_columns = set(self.target_data_df.columns)
        requested_targets = self.target_column_df['TARGETS'].tolist()
        intersecting_list = list(dict.fromkeys(
            name for name in requested_targets if name in available_columns))
        self.target_column_df = pd.DataFrame(intersecting_list, columns=['TARGETS'])

    def umap_reducer(self):
        """Optionally replace the confounders by a 2-D UMAP embedding.

        When ``self.umap_reduce == "yes"`` this writes
        ``<run_prefix>.umap_plot.png``, ``<run_prefix>.umap_data_reduction.csv``
        and ``<run_prefix>.umap_clustering.joblib`` and replaces
        ``self.confounders_df`` by the ID plus the two embedding columns.

        Returns:
            ``self.confounders_df`` (reduced or untouched).
        """
        if self.umap_reduce == "yes":
            IDs_df = pd.DataFrame(self.confounders_df['ID'])
            to_umap = self.confounders_df.drop(columns=['ID'])

            reducer = umap.UMAP(random_state=153)
            embedding = reducer.fit_transform(to_umap)

            embedding1 = pd.DataFrame(embedding[:, 0])
            embedding2 = pd.DataFrame(embedding[:, 1])

            # reset_index() keeps the old index as a first column, which is
            # named INDEX and then discarded.
            out_data = pd.concat([IDs_df.reset_index(), embedding1.reset_index(drop=True), embedding2.reset_index(drop=True)], axis=1, ignore_index=True)
            out_data.columns = ['INDEX', 'ID', 'UMAP_embedding1', "UMAP_embedding2"]
            out_data = out_data.drop(columns=['INDEX'])

            # Plot
            print(f"Exporting UMAP plot...")
            fig, ax = plt.subplots(figsize=(12, 10))
            # No cmap: without per-point colour values it has no effect.
            ax.scatter(embedding[:, 0], embedding[:, 1])
            ax.set_title("Data Reduction to 2 Dimensions by UMAP", fontsize=18)
            plot_out = self.run_prefix + '.umap_plot.png'
            fig.savefig(plot_out, dpi=600)
            # Release the figure so repeated runs do not accumulate figures.
            plt.close(fig)

            print(f"The UMAP plot has been exported and can be found here: {plot_out}")

            out_file = self.run_prefix + '.umap_data_reduction.csv'
            out_data.to_csv(out_file, index=False)

            print(f"The reduced UMAP 2 dimensions per sample .csv file can be found here: {out_file}")

            # fit_transform already fitted the reducer; fitting again on the
            # same data with the same seed only repeats the computation.
            algo_out = self.run_prefix + '.umap_clustering.joblib'
            dump(reducer, algo_out)
            # Attribute kept for backward compatibility with the original code.
            self.runplot_out = algo_out

            self.confounders_df = out_data

            print(f"The UMAP .joblib file can be found here: {algo_out}")

        return self.confounders_df

    def normalize(self, confounders_df):
        """Replace each target feature by its residual after regressing it on
        the confounders, optionally standardised, and save the result.

        Args:
            confounders_df: DataFrame whose first column is skipped and whose
                remaining columns are used as regressors; merged on ``ID``.

        Returns:
            The adjusted DataFrame, restricted to the columns of the munged
            data. It is also written to ``<run_prefix>.dataForML.h5``.
        """
        target_list = list(self.target_column_df['TARGETS'])
        confounder_list = list(confounders_df.columns[1:])
        columns_to_keep_list = list(self.target_data_df.columns)

        adjustments_df = self.target_data_df.merge(confounders_df, how='inner', on='ID', suffixes=['', '_y'])

        formula_for_confounders = ' + '.join(confounder_list)

        for target in target_list:
            current_target = str(target)
            print(f"Looking at the following feature: {current_target}")

            current_formula = current_target + " ~ " + formula_for_confounders
            print(current_formula)

            target_model = smf.ols(formula=current_formula, data=adjustments_df).fit()

            # Work on a local series instead of a scratch 'temp' column, which
            # would overwrite (and then drop) a real column of that name.
            residuals = pd.to_numeric(target_model.resid)
            if self.normalize_switch == 'yes':
                adjustments_df[current_target] = (residuals - residuals.mean()) / residuals.std()
            else:
                adjustments_df[current_target] = residuals

        adjusted_df = adjustments_df[columns_to_keep_list]

        outfile_h5 = self.run_prefix + ".dataForML.h5"
        adjusted_df.to_hdf(outfile_h5, key='dataForML', mode='w')

        if self.normalize_switch == 'yes':
            print(f"\n The adjusted dataframe following normalization can be found here: {outfile_h5}, your updated .dataForML file \n")
        else:
            print(f"\n The adjusted dataframe without normalization can be found here: {outfile_h5}, your updated .dataForML file \n")

        return adjusted_df
class featureselection:
    """Approximate feature importance with extra-trees and keep the useful features.

    The input frame must contain an ``ID`` column, a ``PHENO`` column and
    one column per candidate feature.
    """

    def __init__(self, run_prefix, df, data_type, n_est):
        """Split ``df`` into IDs, phenotype and feature matrix.

        Args:
            run_prefix: Prefix used for every output file path.
            df: Input DataFrame. Columns containing any NA are dropped
                from it *in place*, as in the original implementation.
            data_type: ``"d"`` for a discrete phenotype (classifier) or
                ``"c"`` for a continuous one (regressor).
            n_est: Number of trees for the extra-trees estimator.
        """
        self.run_prefix = run_prefix
        self.featureRanks = None
        self.n_est = n_est
        self.data_type = data_type
        # Populated by rank(); export_data() checks them.
        self.df_selecta = None
        self.featureScores_outfile = None

        # Double check there are no NAs in the dataset before proceeding
        remove_cols = df.columns[df.isna().any()].tolist()
        df.drop(remove_cols, axis=1, inplace=True)

        self.y = df['PHENO']
        self.IDs = df['ID']
        self.X = df.drop(columns=['PHENO', 'ID'])

    def rank(self):
        """Fit extra-trees, export ranked importances and return the reduced data.

        Writes ``<run_prefix>.approx_feature_importance.txt`` (tab
        separated, highest score first) and keeps only the features
        selected by ``SelectFromModel`` with its default threshold.

        Returns:
            DataFrame with ``ID``, ``PHENO`` and the selected features.

        Raises:
            ValueError: If ``data_type`` is neither ``"d"`` nor ``"c"``.
        """
        # Fail early with a clear message; previously an unknown data_type
        # surfaced later as a NameError on the unbound estimator.
        if self.data_type not in ("d", "c"):
            raise ValueError(
                f"data_type must be 'd' (discrete) or 'c' (continuous), got {self.data_type!r}")

        print(f"""
        Beginning featureSelection using {self.n_est} estimators...""")

        if self.data_type == "d":
            print(f"""
            using extraTrees Classifier for your discrete dataset
            """)
            clf = ensemble.ExtraTreesClassifier(n_estimators=self.n_est)
        else:
            print(f"""
            using extraTrees Regressor for your continuous dataset
            """)
            clf = ensemble.ExtraTreesRegressor(n_estimators=self.n_est)

        clf.fit(self.X, self.y)
        self.featureRanks = clf.feature_importances_

        # Select the features above the default importance threshold. The
        # boolean support mask is applied to the DataFrame directly so the
        # column names survive and no throw-away array is built.
        model = feature_selection.SelectFromModel(clf, prefit=True)
        features_selected = model.get_support()
        X_reduced = self.X.iloc[:, features_selected]

        print("""
        Printing feature name that corresponds to the dataframe column name, then printing the relative importance as we go...
        """)

        for col, score in zip(self.X.columns, clf.feature_importances_):
            print(col, score)

        df_featureScores = pd.DataFrame({
            "Feature_Name": self.X.columns,
            "Score": clf.feature_importances_,
        })
        df_featureScores = df_featureScores.sort_values(by=['Score'], ascending=False)
        featureScores_outfile = self.run_prefix + ".approx_feature_importance.txt"
        df_featureScores.to_csv(featureScores_outfile, index=False, sep="\t")

        print(f"""
        You have reduced your dataset to {X_reduced.shape[0]} samples at {X_reduced.shape[1]} features, not including ID and PHENO.
        """)

        # Add ID and PHENO back in front of the retained features; indexes
        # are reset so the three pieces line up positionally.
        df_selecta = pd.concat(
            [
                pd.DataFrame(self.IDs).reset_index(drop=True),
                self.y.reset_index(drop=True),
                X_reduced.reset_index(drop=True),
            ],
            axis=1,
            ignore_index=False,
        )

        self.df_selecta = df_selecta
        self.featureScores_outfile = featureScores_outfile

        return df_selecta

    def export_data(self):
        """Write the reduced dataset and the list of its column names.

        Outputs ``<run_prefix>.dataForML.h5`` (key ``dataForML``) and
        ``<run_prefix>.list_features.txt`` (one column name per line).

        Raises:
            RuntimeError: If called before :meth:`rank`.
        """
        if self.df_selecta is None:
            raise RuntimeError("export_data() requires rank() to be run first")

        ## Export reduced data
        outfile_h5 = self.run_prefix + ".dataForML.h5"
        self.df_selecta.to_hdf(outfile_h5, key='dataForML')

        features_list = self.df_selecta.columns.values.tolist()

        features_listpath = self.run_prefix + ".list_features.txt"
        with open(features_listpath, 'w') as f:
            f.writelines("%s\n" % feature for feature in features_list)

        print(f"""Exporting a new {outfile_h5} file that has a reduced feature set based on your importance approximations.
        This is a good dataset for general ML applications for the chosen PHENO as it includes only features that are likely to impact the model.

        An updated list of {len(features_list)} features, including ID and PHENO, that is in your munged dataForML.h5 file can be found here {features_listpath}

        A file with all your features, ranked from largest contributors at the top to smallest contributors at the bottom, can be found at {self.featureScores_outfile}.
        """)
class harmonizing:
    """Harmonise a test genotype dataset against a reference (training) dataset."""

    def __init__(self, test_geno_prefix, test_out_prefix, ref_model_prefix, training_SNPs):
        """Store the run configuration and load the reference dataset.

        Args:
            test_geno_prefix: Prefix of the PLINK binaries of the test data.
            test_out_prefix: Prefix for every file this class writes.
            ref_model_prefix: Prefix of the reference run;
                ``<prefix>.dataForML.h5`` (key ``dataForML``) is read.
            training_SNPs: Path to the SNP + allele file of the reference
                run.
        """
        # Initializing the variables we will use
        self.test_geno_prefix = test_geno_prefix
        self.test_out_prefix = test_out_prefix
        self.ref_model_prefix = ref_model_prefix
        self.training_SNPs = training_SNPs

        infile_h5 = ref_model_prefix + ".dataForML.h5"
        self.df = pd.read_hdf(infile_h5, key="dataForML")

    def generate_new_PLINK(self):
        """Create PLINK binaries of the test data restricted to the reference SNPs.

        The first column of ``self.training_SNPs`` is written to a temporary
        SNP list, PLINK is run with ``--extract`` / ``--reference-allele``
        to produce ``<test_out_prefix>.refSNPs_andAlleles.*``, and the PLINK
        ``.log`` file and the temporary SNP list are removed afterwards.

        Raises:
            subprocess.CalledProcessError: If PLINK exits with a non-zero
                status (previously the failure was ignored and success was
                reported regardless).
        """
        import os  # local import: not part of this module's import block

        # Show a summary of the reference dataframe
        print("")
        print("Your data looks like this (showing the first few lines of the left-most and right-most columns)...")
        print("#" * 70)
        print(self.df.describe())
        print("#" * 70)
        print("")

        # Save out the PHENO and sample ID columns and keep them, together
        # with the remaining feature columns, on the instance
        self.y_test = self.df.PHENO
        self.IDs_test = self.df.ID
        self.X_test = self.df.drop(columns=['PHENO', 'ID'])

        # Read in the column of SNPs from the SNP+Allele file read in
        # NOTE(review): the file is parsed with the default comma separator;
        # confirm that matches how the reference run writes it.
        snps_alleles_df = pd.read_csv(self.training_SNPs, header=None)
        snps_only = snps_alleles_df.iloc[:, 0]
        snps_temp = self.test_out_prefix + '.SNPS_only_toKeep_temp.txt'
        snps_only.to_csv(snps_temp, header=False, index=False)

        print(f"A temporary file of SNPs from your reference dataset to keep in your testing dataset has been exported here: {snps_temp}")

        plink_exec = genoml.dependencies.check_plink()

        # Creating outfile with SNPs
        # Force the allele designations based on the reference dataset
        plink_outfile = self.test_out_prefix + ".refSNPs_andAlleles"

        print("")
        print(f"Now we will create PLINK binaries where the reference SNPS and alleles will be based off of your file here: {self.training_SNPs}")
        print("")

        # Argument list instead of a shell string: paths containing spaces
        # or shell metacharacters are passed through verbatim.
        plink_cmd = [
            str(plink_exec),
            "--bfile", self.test_geno_prefix,
            "--extract", snps_temp,
            "--reference-allele", self.training_SNPs,
            "--make-bed",
            "--out", plink_outfile,
        ]

        try:
            subprocess.run(plink_cmd, check=True)
        finally:
            # Best-effort cleanup of the PLINK log and the temporary SNP
            # list, whether or not PLINK succeeded.
            for leftover in (plink_outfile + ".log", snps_temp):
                try:
                    os.remove(leftover)
                except OSError as err:
                    print(f"Could not remove {leftover}: {err}")

        self.plink_outfile = plink_outfile

        print("")
        print(f"A new set of PLINK binaries generated from your test dataset with the SNPs you decided to keep from the reference dataset have been made here: {plink_outfile}")
        print("")

    def prep_refCols_file(self):
        """Write the reference dataset's column names, one per line.

        The file ``<test_out_prefix>.refColsHarmonize_toKeep.txt`` is used
        by the later munging step to keep the same columns in the test
        dataset.

        Returns:
            The list of column names of the reference dataset.
        """
        # Make a list of the column names from the reference dataset
        ref_columns_list = self.df.columns.values.tolist()

        # Write out the columns to a text file we will use in munge later
        ref_cols_outfile = self.test_out_prefix + ".refColsHarmonize_toKeep.txt"

        with open(ref_cols_outfile, 'w') as filehandle:
            filehandle.writelines('%s\n' % col for col in ref_columns_list)

        print("")
        print(f"A file with the columns in the reference file, to later use in the munging step and keep these same columns for the test dataset, has been generated here: {ref_cols_outfile}")
        print("")

        return ref_columns_list
import functools
import json
import os
import textwrap
import time
import traceback

__author__ = 'Sayed Hadi Hashemi'


class ColoredBox:
    """ANSI foreground colour helper, usable as a context manager.

    Inside the ``with`` block everything printed to the terminal uses the
    chosen colour; attributes are reset on exit.
    """

    BLACK = 30
    RED = 31
    GREEN = 32
    YELLOW = 33
    BLUE = 34
    MAGENTA = 35
    CYAN = 36
    WHITE = 37
    RESET = 39

    def __init__(self, color=None):
        """Use ``color`` (one of the class constants); defaults to GREEN."""
        if color is None:
            color = self.GREEN
        self.__color = color

    def __enter__(self):
        print('\033[{}m'.format(self.__color), end="")
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        print("\x1b[0m", end="")

    @classmethod
    def wrap(cls, text, color):
        """Return ``text`` wrapped in the escape codes for ``color``."""
        return '\033[{}m'.format(color) + text + "\x1b[0m"


class ContextScope:
    """Context manager that prints a titled, indented progress scope.

    On entry the title is printed (when ``start`` is true). On a clean exit
    a ``[Done]`` line is printed (when ``end`` is true). If the body raises,
    a ``[Failed]`` line, the error text and the traceback are printed and
    the process exits with status 1.
    """

    indent = 0          # current nesting depth, shared by all scopes
    _verbose = False    # when true, descriptions are printed on entry

    def __init__(self, title, description, error, start=True, end=False,
                 **kwargs):
        """Format ``title``, ``description`` and ``error`` with ``kwargs``."""
        self._title = title.format(**kwargs)
        self._description = description.format(**kwargs)
        self._error = error.format(**kwargs)
        self._start = start
        self._end = end

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_type is None and exc_val is None and exc_tb is None:
            if self._end:
                print(
                    "{}{}: {}".format(
                        self.get_prefix(ColoredBox.GREEN),
                        ColoredBox.wrap(self._title, ColoredBox.GREEN),
                        ColoredBox.wrap('[Done]', ColoredBox.GREEN)))
            self.remove_indent()
        else:
            print("{}{}: {}".format(
                self.get_prefix(ColoredBox.RED), self._title,
                ColoredBox.wrap('[Failed]', ColoredBox.RED)))
            print("{}".format(self.indent_text(self._error)))
            self.remove_indent()
            traceback.print_exception(exc_type, exc_val, exc_tb)
            # Raise SystemExit directly: the `exit()` builtin is injected by
            # the `site` module and is missing under `python -S` or in
            # frozen applications.
            raise SystemExit(1)

    def __enter__(self):
        self.add_indent()
        if self._start:
            print()
            print("{}{}".format(self.get_prefix(ColoredBox.BLUE),
                                ColoredBox.wrap(self._title, ColoredBox.BLUE)))
        if self._verbose and self._description:
            print("{}".format(self._description))
        return self

    @classmethod
    def add_indent(cls):
        """Increase the shared nesting depth by one."""
        cls.indent += 1

    @classmethod
    def remove_indent(cls):
        """Decrease the shared nesting depth by one."""
        cls.indent -= 1

    @classmethod
    def get_prefix(cls, color=None):
        """Return one ``"---> "`` arrow per nesting level, optionally coloured."""
        text = "---> " * cls.indent
        if color:
            text = ColoredBox.wrap(text, color)
        return text

    @classmethod
    def indent_text(cls, text):
        """Wrap ``text`` to 70 columns overall, indented under the prefix."""
        WIDTH = 70
        indent = max(0, len(cls.get_prefix()) - 2)
        width = WIDTH - indent
        ret = textwrap.fill(text, width)
        ret = textwrap.indent(ret, " " * indent)
        return ret

    @classmethod
    def set_verbose(cls, verbose):
        """Enable or disable printing of descriptions for all scopes."""
        cls._verbose = verbose


def function_description(**dkwargs):
    """Decorator factory: run the decorated function inside a ContextScope.

    ``dkwargs`` are passed to :class:`ContextScope` unchanged.
    """
    def wrap(func):
        # Preserve the wrapped function's name, docstring and signature
        # metadata for introspection and debugging.
        @functools.wraps(func)
        def func_wrapper(*args, **kwargs):
            with ContextScope(**dkwargs):
                return func(*args, **kwargs)

        return func_wrapper

    return wrap


class DescriptionLoader:
    """Lazy access to the scope descriptions in ``misc/descriptions.json``."""

    _descriptions = None  # loaded on first use by get()

    @classmethod
    def _load(cls):
        """Read the JSON file that sits in ``misc/`` next to this module."""
        description_file = os.path.join(os.path.dirname(__file__),
                                        "misc", "descriptions.json")
        # JSON is UTF-8 by specification; do not rely on the locale default.
        with open(description_file, encoding="utf-8") as fp:
            cls._descriptions = json.load(fp)

    @classmethod
    def function_description(cls, key, **kwargs):
        """Return a decorator built from the description stored under ``key``."""
        dkwargs = cls.get(key)
        return function_description(**dkwargs, **kwargs)

    @classmethod
    def get(cls, key):
        """Return the description entry for ``key``, loading the file if needed."""
        if cls._descriptions is None:
            cls._load()
        return cls._descriptions[key]

    @classmethod
    def context(cls, key, **kwargs):
        """Return a :class:`ContextScope` built from the entry for ``key``."""
        dkwargs = cls.get(key)
        return ContextScope(**dkwargs, **kwargs)

    @classmethod
    def print(cls, key, **kwargs):
        """Print the scope for ``key`` by entering and leaving it immediately."""
        dkwargs = cls.get(key)
        with ContextScope(**dkwargs, **kwargs):
            pass


class Timer:
    """Wall-clock stopwatch, usable directly or as a context manager."""

    def __init__(self):
        self.start = None
        self.end = None

    def start_timer(self):
        """Record the start time and discard any previous stop time."""
        self.start = time.time()
        # A stale `end` from an earlier run would make elapsed() negative.
        self.end = None

    def __enter__(self):
        self.start_timer()
        return self

    def __exit__(self, *args):
        self.stop_timer()

    def stop_timer(self):
        """Record the stop time."""
        self.end = time.time()

    def elapsed(self):
        """Return the elapsed seconds.

        While the timer is still running the time since the start is
        returned.

        Raises:
            RuntimeError: If the timer was never started.
        """
        if self.start is None:
            raise RuntimeError("Timer was never started")
        end = time.time() if self.end is None else self.end
        return end - self.start
2 | ├── docs 3 | │ ├── current_file_structure.txt 4 | │ └── GettingStarted.sh 5 | ├── examples 6 | │ ├── continuous 7 | │ │ ├── example_GWAS.csv 8 | │ │ ├── training_addit.csv 9 | │ │ ├── training_pheno.csv 10 | │ │ ├── training.bed 11 | │ │ ├── training.bim 12 | │ │ ├── training.fam 13 | │ │ ├── validation_addit.csv 14 | │ │ ├── validation_pheno.csv 15 | │ │ ├── validation.bed 16 | │ │ ├── validation.bim 17 | │ │ └── validation.fam 18 | │ └── discrete 19 | │ ├── example_GWAS.csv 20 | │ ├── training_addit.csv 21 | │ ├── training_pheno.csv 22 | │ ├── training.bed 23 | │ ├── training.bim 24 | │ ├── training.fam 25 | │ ├── validation_addit.csv 26 | │ ├── validation_pheno.csv 27 | │ ├── validation.bed 28 | │ ├── validation.bim 29 | │ └── validation.fam 30 | ├── genoml 31 | │ ├── cli 32 | │ │ ├── __init__.py 33 | │ │ ├── continuous_supervised_test.py 34 | │ │ ├── continuous_supervised_train.py 35 | │ │ ├── continuous_supervised_tune.py 36 | │ │ ├── discrete_supervised_test.py 37 | │ │ ├── discrete_supervised_train.py 38 | │ │ └── discrete_supervised_tune.py 39 | │ ├── continuous 40 | │ │ ├── supervised 41 | │ │ │ ├── __init__.py 42 | │ │ │ ├── testing.py 43 | │ │ │ ├── training.py 44 | │ │ │ └── tuning.py 45 | │ │ └── __init__.py 46 | │ ├── discrete 47 | │ │ ├── supervised 48 | │ │ │ ├── __init__.py 49 | │ │ │ ├── testing.py 50 | │ │ │ ├── training.py 51 | │ │ │ └── tuning.py 52 | │ │ └── __init__.py 53 | │ ├── preprocessing 54 | │ │ ├── __init__.py 55 | │ │ ├── featureselection.py 56 | │ │ ├── harmonizing.py 57 | │ │ ├── munging.py 58 | │ │ └── vif.py 59 | │ ├── __init__.py 60 | │ ├── dependencies.py 61 | │ ├── GenoML.py 62 | │ ├── GenoMLHarmonizing.py 63 | │ ├── GenoMLMunging.py 64 | │ └── utils.py 65 | ├── outputs 66 | │ ├── test_continuous_geno_approx_feature_importance.txt 67 | │ ├── test_continuous_geno.best_algorithm.txt 68 | │ ├── test_continuous_geno.dataForML.h5 69 | │ ├── test_continuous_geno.p_threshold_variants.tab 70 | │ ├── 
test_continuous_geno.trainedModel_trainingSample_Predictions.csv 71 | │ ├── test_continuous_geno.trainedModel_withheldSample_Predictions.csv 72 | │ ├── test_continuous_geno.trainedModel_withheldSample_regression.png 73 | │ ├── test_continuous_geno.trainedModel.joblib 74 | │ ├── test_continuous_geno.training_withheldSamples_performanceMetrics.csv 75 | │ ├── test_continuous_geno.tunedModel_allSample_Predictions.csv 76 | │ ├── test_continuous_geno.tunedModel_allSample_regression.png 77 | │ ├── test_continuous_geno.tunedModel_CV_Summary.csv 78 | │ ├── test_continuous_geno.tunedModel_top10Iterations_Summary.csv 79 | │ ├── test_continuous_geno.tunedModel.joblib 80 | │ ├── test_continuous_geno.variants_and_alleles.tab 81 | │ ├── test_discrete_geno_approx_feature_importance.txt 82 | │ ├── test_discrete_geno.best_algorithm.txt 83 | │ ├── test_discrete_geno.dataForML.h5 84 | │ ├── test_discrete_geno.p_threshold_variants.tab 85 | │ ├── test_discrete_geno.trainedModel_trainingSample_featureImportance.csv 86 | │ ├── test_discrete_geno.trainedModel_trainingSample_Predictions.csv 87 | │ ├── test_discrete_geno.trainedModel_withheldSample_Predictions.csv 88 | │ ├── test_discrete_geno.trainedModel_withheldSample_probabilities.png 89 | │ ├── test_discrete_geno.trainedModel_withheldSample_ROC.png 90 | │ ├── test_discrete_geno.trainedModel.joblib 91 | │ ├── test_discrete_geno.training_withheldSamples_performanceMetrics.csv 92 | │ ├── test_discrete_geno.tunedModel_allSample_Predictions.csv 93 | │ ├── test_discrete_geno.tunedModel_allSample_probabilities.png 94 | │ ├── test_discrete_geno.tunedModel_allSample_ROC.png 95 | │ ├── test_discrete_geno.tunedModel_CV_Summary.csv 96 | │ ├── test_discrete_geno.tunedModel_top10Iterations_Summary.csv 97 | │ ├── test_discrete_geno.tunedModel.joblib 98 | │ ├── test_discrete_geno.variants_and_alleles.tab 99 | │ ├── test.csv 100 | │ ├── validation_test_continuous_geno_finalHarmonizedCols_toKeep.txt 101 | │ ├── 
validation_test_continuous_geno_refColsHarmonize_toKeep.txt 102 | │ ├── validation_test_continuous_geno_refSNPs_andAlleles.bed 103 | │ ├── validation_test_continuous_geno_refSNPs_andAlleles.bim 104 | │ ├── validation_test_continuous_geno_refSNPs_andAlleles.fam 105 | │ ├── validation_test_continuous_geno.best_algorithm.txt 106 | │ ├── validation_test_continuous_geno.dataForML.h5 107 | │ ├── validation_test_continuous_geno.testedModel_allSample_predictions.csv 108 | │ ├── validation_test_continuous_geno.testedModel_allSamples_performanceMetrics.csv 109 | │ ├── validation_test_continuous_geno.testedModel_allSamples_regressionPlot.png 110 | │ ├── validation_test_continuous_geno.trainedModel_trainingSample_Predictions.csv 111 | │ ├── validation_test_continuous_geno.trainedModel_withheldSample_Predictions.csv 112 | │ ├── validation_test_continuous_geno.trainedModel_withheldSample_regression.png 113 | │ ├── validation_test_continuous_geno.trainedModel.joblib 114 | │ ├── validation_test_continuous_geno.training_withheldSamples_performanceMetrics.csv 115 | │ ├── validation_test_continuous_geno.variants_and_alleles.tab 116 | │ ├── validation_test_continuous_genotestedModel_allSamples_regressionSummary.csv 117 | │ ├── validation_test_discrete_geno_finalHarmonizedCols_toKeep.txt 118 | │ ├── validation_test_discrete_geno_refColsHarmonize_toKeep.txt 119 | │ ├── validation_test_discrete_geno_refSNPs_andAlleles.bed 120 | │ ├── validation_test_discrete_geno_refSNPs_andAlleles.bim 121 | │ ├── validation_test_discrete_geno_refSNPs_andAlleles.fam 122 | │ ├── validation_test_discrete_geno.best_algorithm.txt 123 | │ ├── validation_test_discrete_geno.dataForML.h5 124 | │ ├── validation_test_discrete_geno.testedModel_allSample_predictions.csv 125 | │ ├── validation_test_discrete_geno.testedModel_allSample_probabilities.png 126 | │ ├── validation_test_discrete_geno.testedModel_allSample_ROC.png 127 | │ ├── validation_test_discrete_geno.testedModel_allSamples_performanceMetrics.csv 128 | │ 
├── validation_test_discrete_geno.trainedModel_trainingSample_Predictions.csv 129 | │ ├── validation_test_discrete_geno.trainedModel_withheldSample_Predictions.csv 130 | │ ├── validation_test_discrete_geno.trainedModel_withheldSample_probabilities.png 131 | │ ├── validation_test_discrete_geno.trainedModel_withheldSample_ROC.png 132 | │ ├── validation_test_discrete_geno.trainedModel.joblib 133 | │ ├── validation_test_discrete_geno.training_withheldSamples_performanceMetrics.csv 134 | │ └── validation_test_discrete_geno.variants_and_alleles.tab 135 | ├── GettingStarted.sh 136 | ├── LICENSE 137 | ├── logo.png 138 | ├── README.md 139 | ├── requirements.txt 140 | └── setup.py -------------------------------------------------------------------------------- /examples/continuous/to_adjust.txt: -------------------------------------------------------------------------------- 1 | snp410 2 | snp403 3 | snp164 -------------------------------------------------------------------------------- /examples/continuous/training.bed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/examples/continuous/training.bed -------------------------------------------------------------------------------- /examples/continuous/validation.bed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/examples/continuous/validation.bed -------------------------------------------------------------------------------- /examples/continuous/validation.fam: -------------------------------------------------------------------------------- 1 | valdiation34 valdiation34 0 0 2 2 2 | valdiation75 valdiation75 0 0 1 2 3 | valdiation65 valdiation65 0 0 2 2 4 | valdiation15 valdiation15 0 0 2 1 5 | valdiation5 valdiation5 0 0 1 2 6 | valdiation24 valdiation24 0 0 1 1 7 | valdiation14 
valdiation14 0 0 2 2 8 | valdiation58 valdiation58 0 0 1 2 9 | valdiation10 valdiation10 0 0 1 2 10 | valdiation89 valdiation89 0 0 1 2 11 | valdiation20 valdiation20 0 0 2 1 12 | valdiation59 valdiation59 0 0 1 1 13 | valdiation19 valdiation19 0 0 2 2 14 | valdiation13 valdiation13 0 0 2 2 15 | valdiation21 valdiation21 0 0 2 2 16 | valdiation35 valdiation35 0 0 1 2 17 | valdiation1 valdiation1 0 0 2 1 18 | valdiation92 valdiation92 0 0 1 1 19 | valdiation74 valdiation74 0 0 2 2 20 | valdiation94 valdiation94 0 0 1 2 21 | valdiation2 valdiation2 0 0 2 2 22 | valdiation37 valdiation37 0 0 1 2 23 | valdiation44 valdiation44 0 0 2 1 24 | valdiation48 valdiation48 0 0 1 1 25 | valdiation49 valdiation49 0 0 1 2 26 | valdiation17 valdiation17 0 0 1 2 27 | valdiation18 valdiation18 0 0 1 2 28 | valdiation83 valdiation83 0 0 2 2 29 | valdiation68 valdiation68 0 0 1 2 30 | valdiation50 valdiation50 0 0 1 2 31 | valdiation22 valdiation22 0 0 1 2 32 | valdiation33 valdiation33 0 0 2 1 33 | valdiation43 valdiation43 0 0 2 1 34 | valdiation60 valdiation60 0 0 2 2 35 | valdiation70 valdiation70 0 0 1 2 36 | valdiation99 valdiation99 0 0 1 2 37 | valdiation36 valdiation36 0 0 2 1 38 | valdiation51 valdiation51 0 0 1 2 39 | valdiation76 valdiation76 0 0 2 2 40 | valdiation64 valdiation64 0 0 1 2 41 | valdiation69 valdiation69 0 0 2 1 42 | valdiation32 valdiation32 0 0 1 2 43 | valdiation88 valdiation88 0 0 1 2 44 | valdiation11 valdiation11 0 0 1 1 45 | valdiation3 valdiation3 0 0 1 2 46 | valdiation46 valdiation46 0 0 1 2 47 | valdiation27 valdiation27 0 0 1 2 48 | valdiation63 valdiation63 0 0 1 2 49 | valdiation4 valdiation4 0 0 1 2 50 | valdiation85 valdiation85 0 0 1 2 51 | valdiation23 valdiation23 0 0 1 1 52 | valdiation84 valdiation84 0 0 1 2 53 | valdiation71 valdiation71 0 0 1 2 54 | valdiation54 valdiation54 0 0 1 1 55 | valdiation55 valdiation55 0 0 2 1 56 | valdiation26 valdiation26 0 0 1 2 57 | valdiation56 valdiation56 0 0 1 2 58 | valdiation72 valdiation72 0 0 1 2 
59 | valdiation93 valdiation93 0 0 1 1 60 | valdiation8 valdiation8 0 0 1 2 61 | valdiation30 valdiation30 0 0 1 2 62 | valdiation39 valdiation39 0 0 1 1 63 | valdiation81 valdiation81 0 0 1 1 64 | valdiation80 valdiation80 0 0 1 2 65 | valdiation100 valdiation100 0 0 2 2 66 | valdiation9 valdiation9 0 0 1 2 67 | valdiation96 valdiation96 0 0 1 2 68 | valdiation12 valdiation12 0 0 2 2 69 | valdiation6 valdiation6 0 0 1 2 70 | valdiation31 valdiation31 0 0 2 2 71 | valdiation45 valdiation45 0 0 2 2 72 | valdiation87 valdiation87 0 0 1 2 73 | valdiation53 valdiation53 0 0 1 2 74 | valdiation86 valdiation86 0 0 1 1 75 | valdiation91 valdiation91 0 0 1 2 76 | valdiation25 valdiation25 0 0 1 1 77 | valdiation95 valdiation95 0 0 1 2 78 | valdiation62 valdiation62 0 0 2 2 79 | valdiation42 valdiation42 0 0 2 2 80 | valdiation98 valdiation98 0 0 1 2 81 | valdiation16 valdiation16 0 0 2 2 82 | valdiation38 valdiation38 0 0 2 1 83 | valdiation52 valdiation52 0 0 1 2 84 | valdiation57 valdiation57 0 0 1 1 85 | valdiation47 valdiation47 0 0 1 2 86 | valdiation28 valdiation28 0 0 1 2 87 | valdiation78 valdiation78 0 0 2 1 88 | valdiation29 valdiation29 0 0 1 1 89 | valdiation97 valdiation97 0 0 2 2 90 | valdiation40 valdiation40 0 0 1 1 91 | valdiation66 valdiation66 0 0 1 2 92 | valdiation61 valdiation61 0 0 2 2 93 | valdiation77 valdiation77 0 0 2 2 94 | valdiation90 valdiation90 0 0 1 2 95 | valdiation79 valdiation79 0 0 1 2 96 | valdiation41 valdiation41 0 0 1 2 97 | valdiation82 valdiation82 0 0 1 1 98 | valdiation7 valdiation7 0 0 2 2 99 | valdiation67 valdiation67 0 0 1 2 100 | valdiation73 valdiation73 0 0 1 2 101 | -------------------------------------------------------------------------------- /examples/continuous/validation_addit.csv: -------------------------------------------------------------------------------- 1 | ID,SEX_COV,UPSIT,FAMILY_HISTORY 2 | valdiation27,0,30,0 3 | valdiation10,0,30,0 4 | valdiation1,1,36,0 5 | valdiation77,1,21,1 6 | valdiation20,1,38,0 
7 | valdiation70,0,12,0 8 | valdiation54,0,15,0 9 | valdiation56,0,12,0 10 | valdiation8,0,31,0 11 | valdiation23,0,33,0 12 | valdiation24,0,34,0 13 | valdiation38,1,30,0 14 | valdiation9,0,37,0 15 | valdiation60,1,20,1 16 | valdiation65,1,11,0 17 | valdiation50,0,22,0 18 | valdiation37,0,20,0 19 | valdiation19,1,36,1 20 | valdiation79,0,18,0 21 | valdiation36,1,27,0 22 | valdiation92,0,33,0 23 | valdiation33,1,36,0 24 | valdiation80,0,15,0 25 | valdiation90,0,38,0 26 | valdiation28,0,18,1 27 | valdiation3,0,25,1 28 | valdiation45,1,25,1 29 | valdiation64,0,15,0 30 | valdiation87,0,31,0 31 | valdiation49,0,17,0 32 | valdiation76,1,24,0 33 | valdiation30,0,33,0 34 | valdiation84,0,9,0 35 | valdiation88,0,13,0 36 | valdiation58,0,25,0 37 | valdiation100,1,17,1 38 | valdiation42,1,23,0 39 | valdiation67,0,17,0 40 | valdiation16,1,19,0 41 | valdiation43,1,30,0 42 | valdiation48,0,40,0 43 | valdiation18,0,15,0 44 | valdiation17,0,11,0 45 | valdiation2,1,28,1 46 | valdiation12,1,16,0 47 | valdiation34,1,26,1 48 | valdiation5,0,36,0 49 | valdiation63,0,32,0 50 | valdiation81,0,21,0 51 | valdiation22,0,15,1 52 | valdiation47,0,11,0 53 | valdiation75,0,28,1 54 | valdiation35,0,29,0 55 | valdiation21,1,34,0 56 | valdiation14,1,30,0 57 | valdiation31,1,20,0 58 | valdiation39,0,32,0 59 | valdiation83,1,38,0 60 | valdiation29,0,35,0 61 | valdiation4,0,5,0 62 | valdiation99,0,29,0 63 | valdiation73,0,27,1 64 | valdiation13,1,18,1 65 | valdiation91,0,15,1 66 | valdiation41,0,27,1 67 | valdiation78,1,35,0 68 | valdiation52,0,13,1 69 | valdiation6,0,19,0 70 | valdiation15,1,35,0 71 | valdiation55,1,37,0 72 | valdiation97,1,33,1 73 | valdiation26,0,8,0 74 | valdiation96,0,17,1 75 | valdiation85,0,22,1 76 | valdiation46,0,10,1 77 | valdiation95,0,22,1 78 | valdiation57,0,22,0 79 | valdiation61,1,20,0 80 | valdiation68,0,11,0 81 | valdiation25,0,28,0 82 | valdiation93,0,30,0 83 | valdiation44,1,33,0 84 | valdiation11,0,36,0 85 | valdiation62,1,19,0 86 | valdiation51,0,34,0 87 | 
valdiation94,0,10,0 88 | valdiation69,1,27,0 89 | valdiation32,0,9,0 90 | valdiation74,1,34,0 91 | valdiation86,0,35,0 92 | valdiation59,0,35,0 93 | valdiation53,0,35,0 94 | valdiation82,0,36,0 95 | valdiation71,0,15,1 96 | valdiation7,1,36,0 97 | valdiation66,0,25,0 98 | valdiation72,0,19,1 99 | valdiation98,0,22,0 100 | valdiation89,0,13,0 101 | valdiation40,0,38,0 102 | -------------------------------------------------------------------------------- /examples/continuous/validation_pheno.csv: -------------------------------------------------------------------------------- 1 | ID,PHENO 2 | valdiation27,60 3 | valdiation10,59 4 | valdiation1,73 5 | valdiation77,75 6 | valdiation20,62 7 | valdiation70,52 8 | valdiation54,79 9 | valdiation56,76 10 | valdiation8,67 11 | valdiation23,55 12 | valdiation24,56 13 | valdiation38,67 14 | valdiation9,76 15 | valdiation60,42 16 | valdiation65,70 17 | valdiation50,65 18 | valdiation37,55 19 | valdiation19,65 20 | valdiation79,64 21 | valdiation36,56 22 | valdiation92,60 23 | valdiation33,59 24 | valdiation80,72 25 | valdiation90,68 26 | valdiation28,66 27 | valdiation3,57 28 | valdiation45,83 29 | valdiation64,50 30 | valdiation87,73 31 | valdiation49,72 32 | valdiation76,72 33 | valdiation30,62 34 | valdiation84,55 35 | valdiation88,69 36 | valdiation58,71 37 | valdiation100,68 38 | valdiation42,72 39 | valdiation67,72 40 | valdiation16,77 41 | valdiation43,45 42 | valdiation48,61 43 | valdiation18,59 44 | valdiation17,66 45 | valdiation2,51 46 | valdiation12,56 47 | valdiation34,63 48 | valdiation5,72 49 | valdiation63,50 50 | valdiation81,79 51 | valdiation22,55 52 | valdiation47,79 53 | valdiation75,75 54 | valdiation35,76 55 | valdiation21,57 56 | valdiation14,52 57 | valdiation31,71 58 | valdiation39,67 59 | valdiation83,48 60 | valdiation29,63 61 | valdiation4,78 62 | valdiation99,64 63 | valdiation73,65 64 | valdiation13,63 65 | valdiation91,74 66 | valdiation41,47 67 | valdiation78,57 68 | valdiation52,65 69 | 
valdiation6,55 70 | valdiation15,72 71 | valdiation55,74 72 | valdiation97,43 73 | valdiation26,65 74 | valdiation96,61 75 | valdiation85,59 76 | valdiation46,54 77 | valdiation95,71 78 | valdiation57,76 79 | valdiation61,57 80 | valdiation68,69 81 | valdiation25,74 82 | valdiation93,58 83 | valdiation44,66 84 | valdiation11,78 85 | valdiation62,77 86 | valdiation51,63 87 | valdiation94,72 88 | valdiation69,60 89 | valdiation32,73 90 | valdiation74,53 91 | valdiation86,77 92 | valdiation59,72 93 | valdiation53,73 94 | valdiation82,59 95 | valdiation71,60 96 | valdiation7,51 97 | valdiation66,75 98 | valdiation72,57 99 | valdiation98,74 100 | valdiation89,58 101 | valdiation40,61 102 | -------------------------------------------------------------------------------- /examples/discrete/to_adjust.txt: -------------------------------------------------------------------------------- 1 | snp410 2 | snp403 3 | snp164 -------------------------------------------------------------------------------- /examples/discrete/training.bed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/examples/discrete/training.bed -------------------------------------------------------------------------------- /examples/discrete/validation.bed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/examples/discrete/validation.bed -------------------------------------------------------------------------------- /examples/discrete/validation.fam: -------------------------------------------------------------------------------- 1 | valdiation34 valdiation34 0 0 2 2 2 | valdiation75 valdiation75 0 0 1 2 3 | valdiation65 valdiation65 0 0 2 2 4 | valdiation15 valdiation15 0 0 2 1 5 | valdiation5 valdiation5 0 0 1 2 6 | valdiation24 valdiation24 0 0 1 1 7 | valdiation14 
valdiation14 0 0 2 2 8 | valdiation58 valdiation58 0 0 1 2 9 | valdiation10 valdiation10 0 0 1 2 10 | valdiation89 valdiation89 0 0 1 2 11 | valdiation20 valdiation20 0 0 2 1 12 | valdiation59 valdiation59 0 0 1 1 13 | valdiation19 valdiation19 0 0 2 2 14 | valdiation13 valdiation13 0 0 2 2 15 | valdiation21 valdiation21 0 0 2 2 16 | valdiation35 valdiation35 0 0 1 2 17 | valdiation1 valdiation1 0 0 2 1 18 | valdiation92 valdiation92 0 0 1 1 19 | valdiation74 valdiation74 0 0 2 2 20 | valdiation94 valdiation94 0 0 1 2 21 | valdiation2 valdiation2 0 0 2 2 22 | valdiation37 valdiation37 0 0 1 2 23 | valdiation44 valdiation44 0 0 2 1 24 | valdiation48 valdiation48 0 0 1 1 25 | valdiation49 valdiation49 0 0 1 2 26 | valdiation17 valdiation17 0 0 1 2 27 | valdiation18 valdiation18 0 0 1 2 28 | valdiation83 valdiation83 0 0 2 2 29 | valdiation68 valdiation68 0 0 1 2 30 | valdiation50 valdiation50 0 0 1 2 31 | valdiation22 valdiation22 0 0 1 2 32 | valdiation33 valdiation33 0 0 2 1 33 | valdiation43 valdiation43 0 0 2 1 34 | valdiation60 valdiation60 0 0 2 2 35 | valdiation70 valdiation70 0 0 1 2 36 | valdiation99 valdiation99 0 0 1 2 37 | valdiation36 valdiation36 0 0 2 1 38 | valdiation51 valdiation51 0 0 1 2 39 | valdiation76 valdiation76 0 0 2 2 40 | valdiation64 valdiation64 0 0 1 2 41 | valdiation69 valdiation69 0 0 2 1 42 | valdiation32 valdiation32 0 0 1 2 43 | valdiation88 valdiation88 0 0 1 2 44 | valdiation11 valdiation11 0 0 1 1 45 | valdiation3 valdiation3 0 0 1 2 46 | valdiation46 valdiation46 0 0 1 2 47 | valdiation27 valdiation27 0 0 1 2 48 | valdiation63 valdiation63 0 0 1 2 49 | valdiation4 valdiation4 0 0 1 2 50 | valdiation85 valdiation85 0 0 1 2 51 | valdiation23 valdiation23 0 0 1 1 52 | valdiation84 valdiation84 0 0 1 2 53 | valdiation71 valdiation71 0 0 1 2 54 | valdiation54 valdiation54 0 0 1 1 55 | valdiation55 valdiation55 0 0 2 1 56 | valdiation26 valdiation26 0 0 1 2 57 | valdiation56 valdiation56 0 0 1 2 58 | valdiation72 valdiation72 0 0 1 2 
59 | valdiation93 valdiation93 0 0 1 1 60 | valdiation8 valdiation8 0 0 1 2 61 | valdiation30 valdiation30 0 0 1 2 62 | valdiation39 valdiation39 0 0 1 1 63 | valdiation81 valdiation81 0 0 1 1 64 | valdiation80 valdiation80 0 0 1 2 65 | valdiation100 valdiation100 0 0 2 2 66 | valdiation9 valdiation9 0 0 1 2 67 | valdiation96 valdiation96 0 0 1 2 68 | valdiation12 valdiation12 0 0 2 2 69 | valdiation6 valdiation6 0 0 1 2 70 | valdiation31 valdiation31 0 0 2 2 71 | valdiation45 valdiation45 0 0 2 2 72 | valdiation87 valdiation87 0 0 1 2 73 | valdiation53 valdiation53 0 0 1 2 74 | valdiation86 valdiation86 0 0 1 1 75 | valdiation91 valdiation91 0 0 1 2 76 | valdiation25 valdiation25 0 0 1 1 77 | valdiation95 valdiation95 0 0 1 2 78 | valdiation62 valdiation62 0 0 2 2 79 | valdiation42 valdiation42 0 0 2 2 80 | valdiation98 valdiation98 0 0 1 2 81 | valdiation16 valdiation16 0 0 2 2 82 | valdiation38 valdiation38 0 0 2 1 83 | valdiation52 valdiation52 0 0 1 2 84 | valdiation57 valdiation57 0 0 1 1 85 | valdiation47 valdiation47 0 0 1 2 86 | valdiation28 valdiation28 0 0 1 2 87 | valdiation78 valdiation78 0 0 2 1 88 | valdiation29 valdiation29 0 0 1 1 89 | valdiation97 valdiation97 0 0 2 2 90 | valdiation40 valdiation40 0 0 1 1 91 | valdiation66 valdiation66 0 0 1 2 92 | valdiation61 valdiation61 0 0 2 2 93 | valdiation77 valdiation77 0 0 2 2 94 | valdiation90 valdiation90 0 0 1 2 95 | valdiation79 valdiation79 0 0 1 2 96 | valdiation41 valdiation41 0 0 1 2 97 | valdiation82 valdiation82 0 0 1 1 98 | valdiation7 valdiation7 0 0 2 2 99 | valdiation67 valdiation67 0 0 1 2 100 | valdiation73 valdiation73 0 0 1 2 101 | -------------------------------------------------------------------------------- /examples/discrete/validation_addit.csv: -------------------------------------------------------------------------------- 1 | ID,SEX_COV,AGE,UPSIT,FAMILY_HISTORY 2 | valdiation27,0,60,30,0 3 | valdiation10,0,59,30,0 4 | valdiation1,1,73,36,0 5 | valdiation77,1,75,21,1 6 | 
valdiation20,1,62,38,0 7 | valdiation70,0,52,12,0 8 | valdiation54,0,79,15,0 9 | valdiation56,0,76,12,0 10 | valdiation8,0,67,31,0 11 | valdiation23,0,55,33,0 12 | valdiation24,0,56,34,0 13 | valdiation38,1,67,30,0 14 | valdiation9,0,76,37,0 15 | valdiation60,1,42,20,1 16 | valdiation65,1,70,11,0 17 | valdiation50,0,65,22,0 18 | valdiation37,0,55,20,0 19 | valdiation19,1,65,36,1 20 | valdiation79,0,64,18,0 21 | valdiation36,1,56,27,0 22 | valdiation92,0,60,33,0 23 | valdiation33,1,59,36,0 24 | valdiation80,0,72,15,0 25 | valdiation90,0,68,38,0 26 | valdiation28,0,66,18,1 27 | valdiation3,0,57,25,1 28 | valdiation45,1,83,25,1 29 | valdiation64,0,50,15,0 30 | valdiation87,0,73,31,0 31 | valdiation49,0,72,17,0 32 | valdiation76,1,72,24,0 33 | valdiation30,0,62,33,0 34 | valdiation84,0,55,9,0 35 | valdiation88,0,69,13,0 36 | valdiation58,0,71,25,0 37 | valdiation100,1,68,17,1 38 | valdiation42,1,72,23,0 39 | valdiation67,0,72,17,0 40 | valdiation16,1,77,19,0 41 | valdiation43,1,45,30,0 42 | valdiation48,0,61,40,0 43 | valdiation18,0,59,15,0 44 | valdiation17,0,66,11,0 45 | valdiation2,1,51,28,1 46 | valdiation12,1,56,16,0 47 | valdiation34,1,63,26,1 48 | valdiation5,0,72,36,0 49 | valdiation63,0,50,32,0 50 | valdiation81,0,79,21,0 51 | valdiation22,0,55,15,1 52 | valdiation47,0,79,11,0 53 | valdiation75,0,75,28,1 54 | valdiation35,0,76,29,0 55 | valdiation21,1,57,34,0 56 | valdiation14,1,52,30,0 57 | valdiation31,1,71,20,0 58 | valdiation39,0,67,32,0 59 | valdiation83,1,48,38,0 60 | valdiation29,0,63,35,0 61 | valdiation4,0,78,5,0 62 | valdiation99,0,64,29,0 63 | valdiation73,0,65,27,1 64 | valdiation13,1,63,18,1 65 | valdiation91,0,74,15,1 66 | valdiation41,0,47,27,1 67 | valdiation78,1,57,35,0 68 | valdiation52,0,65,13,1 69 | valdiation6,0,55,19,0 70 | valdiation15,1,72,35,0 71 | valdiation55,1,74,37,0 72 | valdiation97,1,43,33,1 73 | valdiation26,0,65,8,0 74 | valdiation96,0,61,17,1 75 | valdiation85,0,59,22,1 76 | valdiation46,0,54,10,1 77 | valdiation95,0,71,22,1 
78 | valdiation57,0,76,22,0 79 | valdiation61,1,57,20,0 80 | valdiation68,0,69,11,0 81 | valdiation25,0,74,28,0 82 | valdiation93,0,58,30,0 83 | valdiation44,1,66,33,0 84 | valdiation11,0,78,36,0 85 | valdiation62,1,77,19,0 86 | valdiation51,0,63,34,0 87 | valdiation94,0,72,10,0 88 | valdiation69,1,60,27,0 89 | valdiation32,0,73,9,0 90 | valdiation74,1,53,34,0 91 | valdiation86,0,77,35,0 92 | valdiation59,0,72,35,0 93 | valdiation53,0,73,35,0 94 | valdiation82,0,59,36,0 95 | valdiation71,0,60,15,1 96 | valdiation7,1,51,36,0 97 | valdiation66,0,75,25,0 98 | valdiation72,0,57,19,1 99 | valdiation98,0,74,22,0 100 | valdiation89,0,58,13,0 101 | valdiation40,0,61,38,0 102 | -------------------------------------------------------------------------------- /examples/discrete/validation_pheno.csv: -------------------------------------------------------------------------------- 1 | ID,PHENO 2 | valdiation1,0 3 | valdiation2,1 4 | valdiation3,1 5 | valdiation4,1 6 | valdiation5,1 7 | valdiation6,1 8 | valdiation7,1 9 | valdiation8,1 10 | valdiation9,1 11 | valdiation10,1 12 | valdiation11,0 13 | valdiation12,1 14 | valdiation13,1 15 | valdiation14,1 16 | valdiation15,0 17 | valdiation16,1 18 | valdiation17,1 19 | valdiation18,1 20 | valdiation19,1 21 | valdiation20,0 22 | valdiation21,1 23 | valdiation22,1 24 | valdiation23,0 25 | valdiation24,0 26 | valdiation25,0 27 | valdiation26,1 28 | valdiation27,1 29 | valdiation28,1 30 | valdiation29,0 31 | valdiation30,1 32 | valdiation31,1 33 | valdiation32,1 34 | valdiation33,0 35 | valdiation34,1 36 | valdiation35,1 37 | valdiation36,0 38 | valdiation37,1 39 | valdiation38,0 40 | valdiation39,0 41 | valdiation40,0 42 | valdiation41,1 43 | valdiation42,1 44 | valdiation43,0 45 | valdiation44,0 46 | valdiation45,1 47 | valdiation46,1 48 | valdiation47,1 49 | valdiation48,0 50 | valdiation49,1 51 | valdiation50,1 52 | valdiation51,1 53 | valdiation52,1 54 | valdiation53,1 55 | valdiation54,0 56 | valdiation55,0 57 | valdiation56,1 58 
| valdiation57,0 59 | valdiation58,1 60 | valdiation59,0 61 | valdiation60,1 62 | valdiation61,1 63 | valdiation62,1 64 | valdiation63,1 65 | valdiation64,1 66 | valdiation65,1 67 | valdiation66,1 68 | valdiation67,1 69 | valdiation68,1 70 | valdiation69,0 71 | valdiation70,1 72 | valdiation71,1 73 | valdiation72,1 74 | valdiation73,1 75 | valdiation74,1 76 | valdiation75,1 77 | valdiation76,1 78 | valdiation77,1 79 | valdiation78,0 80 | valdiation79,1 81 | valdiation80,1 82 | valdiation81,0 83 | valdiation82,0 84 | valdiation83,1 85 | valdiation84,1 86 | valdiation85,1 87 | valdiation86,0 88 | valdiation87,1 89 | valdiation88,1 90 | valdiation89,1 91 | valdiation90,1 92 | valdiation91,1 93 | valdiation92,0 94 | valdiation93,0 95 | valdiation94,1 96 | valdiation95,1 97 | valdiation96,1 98 | valdiation97,1 99 | valdiation98,1 100 | valdiation99,1 101 | valdiation100,1 102 | -------------------------------------------------------------------------------- /genoml/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The GenoML Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | from genoml import preprocessing 17 | from genoml import discrete 18 | from genoml import continuous 19 | from genoml import cli 20 | -------------------------------------------------------------------------------- /genoml/cli/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The GenoML Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | -------------------------------------------------------------------------------- /genoml/cli/continuous_supervised_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The GenoML Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
def main(prefix, test_prefix, refModel_prefix):
    """Apply a tuned continuous model to a munged dataset and export results.

    Args:
        prefix: Run directory. The dataset is read from
            ``<prefix>/Munge/dataForML.h5`` and the model from
            ``<prefix>/Tune/tunedModel.joblib``; the same value is handed to
            ``supervised.test`` as its output prefix.
        test_prefix: Only echoed in the start-up banner; not used to locate
            any file in this function.
        refModel_prefix: Only echoed in the start-up banner; not used to
            locate any file in this function.
    """
    banner = (
        "",
        "Here is some basic info on the command you are about to run.",
        "Python version info...",
        sys.version,
        # Echo the chosen CLI arguments back to the user.
        "CLI argument info...",
        f"You are importing this test dataset: {test_prefix}.",
        f"You are applying the model saved here: {refModel_prefix}.",
        f"The results of this test application of your model will be saved in files with the given prefix: {prefix}.",
        "As a note, in all exported probabilities and other graphics, case status is treated as a 0 or 1, with 1 "
        "representing a positive case.",
        "",
    )
    for line in banner:
        print(line)

    run_dir = Path(prefix)

    # Munged dataset first, then the tuned model (same order as before, so
    # a missing dataset is still reported before a missing model).
    data_path = run_dir / "Munge" / "dataForML.h5"
    df = pd.read_hdf(data_path, key="dataForML")

    # NOTE(review): joblib.load unpickles the file - only point this at
    # models you produced yourself.
    model_path = run_dir / "Tune" / "tunedModel.joblib"
    loaded_model = joblib.load(model_path)

    test = supervised.test(df, loaded_model, prefix)

    # Pipeline steps, in order: prepare the dataframe, report performance
    # metrics, export phenotype predictions, export the regression summary.
    test.prep_df()
    test.performance_metrics()
    test.export_pheno_predictions()
    test.regression_summary()

    # Thank the user
    for line in ("",
                 "Let's shut everything down, thanks for testing your model with GenoML!",
                 ""):
        print(line)
# TODO(mary): use or remove export_predictions
@utils.DescriptionLoader.function_description("cli/continuous_supervised_train")
def main(run_prefix, export_predictions, matching_columns_path):
    """Train and compete continuous supervised models for one run directory.

    Args:
        run_prefix: Run directory. The dataset is read from
            ``<run_prefix>/Munge/dataForML.h5`` and the value is forwarded
            to ``supervised.train`` and to the result-saving calls.
        export_predictions: Currently unused by this function (see the TODO
            above the decorator).
        matching_columns_path: Optional path to a text file with one column
            name per line. When truthy, the dataset is restricted to the
            columns that appear both in the file and in the dataset.
    """
    utils.DescriptionLoader.print(
        "cli/continuous_supervised_train/info",
        python_version=sys.version,
        prefix=run_prefix,
    )

    dataset_path = Path(run_prefix) / "Munge" / "dataForML.h5"
    with utils.DescriptionLoader.context(
            "cli/continuous_supervised_train/input", path=dataset_path):
        df = pd.read_hdf(dataset_path, key="dataForML")

    if matching_columns_path:
        with utils.DescriptionLoader.context(
                "cli/continuous_supervised_train/matching_columns_path",
                matching_columns_path=matching_columns_path):
            with open(matching_columns_path, 'r') as columns_file:
                wanted_columns = columns_file.read().splitlines()

            # np.intersect1d yields the sorted, de-duplicated names common
            # to both sides, so the surviving columns come back in sorted
            # order rather than in their original order.
            shared_columns = np.intersect1d(df.columns, wanted_columns)
            df = df[shared_columns]

    model = supervised.train(df, run_prefix)
    model.summary()
    model.compete()
    model.export_model()
    model.export_predictions()
    model.save_algorithm_results(run_prefix)
    model.save_best_algorithm(run_prefix)
def main(run_prefix, max_iter, cv_count, matchingCols):
    """Tune the best continuous algorithm found during training.

    Reads ``<run_prefix>/Munge/dataForML.h5`` and
    ``<run_prefix>/Train/best_algorithm.txt``, then drives
    ``supervised.tune`` through its tuning, reporting and export steps.

    Args:
        run_prefix: Run directory holding the ``Munge`` and ``Train``
            outputs; also forwarded to ``supervised.tune``.
        max_iter: Maximum number of tuning iterations, forwarded to
            ``supervised.tune``.
        cv_count: Number of cross-validation rounds, forwarded to
            ``supervised.tune``.
        matchingCols: Optional path to a text file with one column name per
            line. When not ``None``, the dataset is restricted to the
            columns present both in the file and in the dataset.
    """
    # TUNING
    # Create a dialogue with the user
    print("Here is some basic info on the command you are about to run.")
    print("CLI argument info...")
    print(f"Working with the dataset and best model corresponding to prefix {run_prefix} the timestamp from the merge "
          f"is the prefix in most cases.")
    print(f"Your maximum number of tuning iterations is {max_iter} and if you are concerned about runtime, make this "
          f"number smaller.")
    print(f"You are running {cv_count} rounds of cross-validation, and again... if you are concerned about runtime, "
          f"make this number smaller.")
    print("Give credit where credit is due, for this stage of analysis we use code from the great contributors to "
          "python packages: argparse, xgboost, sklearn, pandas, numpy, time, matplotlib and seaborn.")
    print("As a note, in all exported probabilities and other graphics, case status is treated as a 0 or 1, with 1 "
          "representing a positive case.")
    print("")

    infile_h5 = Path(run_prefix).joinpath("Munge").joinpath("dataForML.h5")
    df = pd.read_hdf(infile_h5, key="dataForML")

    # Addressing issue #12:
    if matchingCols is not None:
        print(f"We are using the harmonized columns you provided here: {matchingCols}")
        print("Note that you might have different/less features than before, given this was column list was harmonized between your reference and test dataset...")

        with open(matchingCols, 'r') as matchingCols_file:
            matching_column_names_list = matchingCols_file.read().splitlines()

        # Keep only the columns found in the file. np.intersect1d returns
        # the sorted unique names common to both, so column order changes
        # to sorted order.
        df = df[np.intersect1d(df.columns, matching_column_names_list)]

    # The previous PHENO/ID split into y_tune, X_tune and IDs_tune was never
    # used here (the whole frame is handed to supervised.tune), so it has
    # been removed rather than building throwaway copies of the data.

    best_algo_name_in = Path(run_prefix).joinpath("Train").joinpath('best_algorithm.txt')
    best_algo_df = pd.read_csv(best_algo_name_in, header=None, index_col=False)
    # The algorithm name is the first cell of the header-less file; it is
    # only used for the message below.
    best_algo = str(best_algo_df.iloc[0, 0])

    # Communicate to the user the best identified algorithm
    print(f"From previous analyses in the training phase, we've determined that the best algorithm for this "
          f"application is {best_algo}... so let's tune it up and see what gains we can make!")

    # Tuning
    # This calls on the functions made in the tune class (tuning.py) at
    # genoml.continuous.supervised
    model_tune = supervised.tune(df, run_prefix, max_iter, cv_count)
    model_tune.select_tuning_parameters()  # Returns algo, hyperparameters, and scoring_metric
    model_tune.apply_tuning_parameters()  # Randomized search with CV to tune
    model_tune.report_tune()  # Summary of the top 10 iterations of the hyperparameter tune
    model_tune.summarize_tune()  # Summary of the cross-validation
    model_tune.compare_performance()  # Compares tuned performance to baseline
    model_tune.export_tuned_data()  # Export the newly tuned predictions
    model_tune.export_tune_regression()  # Export the tuned and fitted regression model

    print("")
    print("End of tuning stage with GenoML.")
    print("")
def main(prefix, test_prefix, refModel_prefix):
    """Apply a tuned discrete model to a munged dataset and export results.

    Args:
        prefix: Run directory. The model is loaded from
            ``<prefix>/Tune/tunedModel.joblib`` and the dataset from
            ``<prefix>/Munge/dataForML.h5``; the same value is handed to
            ``supervised.test`` as its output prefix.
        test_prefix: Only echoed in the start-up banner; not used to locate
            any file in this function.
        refModel_prefix: Only echoed in the start-up banner; not used to
            locate any file in this function.
    """
    banner = (
        "",
        "Here is some basic info on the command you are about to run.",
        "Python version info...",
        sys.version,
        # Echo the chosen CLI arguments back to the user.
        "CLI argument info...",
        f"You are importing this test dataset: {test_prefix}.",
        f"You are applying the model saved here: {refModel_prefix}.",
        f"The results of this test application of your model will be saved in files with the given prefix: {prefix}.",
        "As a note, in all exported probabilities and other graphics, case status is treated as a 0 or 1, with 1 representing a positive case.",
        "",
    )
    for line in banner:
        print(line)

    run_dir = Path(prefix)
    data_path = run_dir / "Munge" / "dataForML.h5"
    model_path = run_dir / "Tune" / "tunedModel.joblib"

    # The model is loaded before the dataset, as in the original flow.
    # NOTE(review): joblib.load unpickles the file - only point this at
    # models you produced yourself.
    loaded_model = joblib.load(model_path)
    df = pd.read_hdf(data_path, key="dataForML")

    test = supervised.test(df, loaded_model, prefix)

    # Prep and show the dataframe
    test.prep_df()

    # Export the ROC and precision-recall plots
    test.plot_results(save=True)

    # Export the probability histograms and data tables.
    test.export_prediction_data()

    # Thank the user
    for line in ("",
                 "Let's shut everything down, thanks for testing your model with GenoML!",
                 ""):
        print(line)
def main(prefix, metric_max, prob_hist, auc, matchingCols):
    """Train and compete discrete supervised models for one run directory.

    Args:
        prefix: Run directory. The dataset is read from
            ``<prefix>/Munge/dataForML.h5`` and the value is forwarded to
            ``supervised.train``.
        metric_max: Metric used to pick the winning algorithm; forwarded to
            ``model.results``.
        prob_hist: Accepted for interface compatibility; not used in this
            function.
        auc: Accepted for interface compatibility; not used in this
            function.
        matchingCols: Optional path to a text file with one column name per
            line. When not ``None``, the dataset is restricted to the
            columns present both in the file and in the dataset.
    """
    print("")
    print("Here is some basic info on the command you are about to run.")
    print("Python Version info...")
    print(sys.version)

    # Print out chosen CLI arguments
    print("CLI argument info...")
    print(f"Working with dataset {prefix} from previous data munging efforts.")
    print(f"You have chosen to compete the algorithms based on {metric_max}.")
    print("Give credit where credit is due, for this stage of analysis we use code from the great contributors to Python packages: argparse, xgboost, sklearn, pandas, numpy, time, matplotlib and seaborn.")
    print("As a note, in all exported probabilities and other graphics, case status is treated as a 0 or 1, with 1 representing a positive case.")
    print("")

    # Load the munged dataset for this run
    infile_h5 = Path(prefix).joinpath("Munge").joinpath("dataForML.h5")
    df = pd.read_hdf(infile_h5, key="dataForML")

    if matchingCols is not None:
        print(f"Looks like you are retraining your reference file. We are using the harmonized columns you provided here: {matchingCols}")
        print("Note that you might have different/less features than before, given this was harmonized between training and test dataset, and might mean your model now performs worse...")

        with open(matchingCols, 'r') as matchingCols_file:
            matching_column_names_list = matchingCols_file.read().splitlines()

        # Keep only the columns found in the file. np.intersect1d returns
        # the sorted unique names common to both, so column order changes
        # to sorted order.
        df = df[np.intersect1d(df.columns, matching_column_names_list)]

    model = supervised.train(df, prefix)
    model.summary()

    # Give user context prior to competing algorithms
    # Explains to users how we are splitting their data 70:30
    metric_guide = (
        "",
        "Now let's compete these algorithms!",
        "We'll update you as each algorithm runs, then summarize at the end.",
        "Here we test each algorithm under default settings using the same training and test datasets derived from a 70% training and 30% testing split of your data.",
        "For each algorithm, we will output the following metrics...",
        "Algorithm name, hoping that's pretty self-explanatory. Plenty of resources on these common ML algorithms at https://scikit-learn.org and https://xgboost.readthedocs.io/.",
        "AUC_percent, this is the area under the curve from receiver operating characteristic analyses. This is the most common metric of classifier performance in biomedical literature, we express this as a percent. We calculate AUC based on the predicted probability of being a case.",
        "Accuracy_percent, this is the simple accuracy of the classifier, how many predictions were correct from best classification cutoff (python default).",
        "Balanced_Accuracy_Percent, consider this as the accuracy resampled to a 1:1 mix of cases and controls. Imbalanced datasets can give funny results for simple accuracy.",
        "Log_Loss, this is essentially the inverse of the likelihood function for a correct prediction, you want to minimize this.",
        "Sensitivity, proportion of cases correctly identified.",
        "Specificity, proportion of controls correctly identified.",
        "PPV, this is the positive predictive value, the probability that subjects with a positive result actually have the disease.",
        "NPV, this is the negative predictive value, the probability that subjects with a negative result don't have the disease.",
        "We also log the runtimes per algorithm.",
        "",
        "Algorithm summaries incoming...",
        "",
    )
    for line in metric_guide:
        print(line)

    # Compete the algorithms
    model.compete()

    # Output the results of the log
    model.results(metric_max)

    # Export the results
    model.export_model()

    # Export the ROC and precision-recall plots
    model.plot_results(save=True)

    # Export the probability histograms and data tables.
    model.export_prediction_data()

    # Save out the proper algorithm
    model.save_results(algorithm_results=True, best_algorithm=True)

    print("Thank you for training with GenoML!")
def main(run_prefix, metric_tune, max_iter, cv_count, matchingCols):
    """Tune the best algorithm from the training stage on a discrete phenotype.

    Reads the munged dataset from ``<run_prefix>/Munge/dataForML.h5`` and the
    name of the winning algorithm from
    ``<run_prefix>/Train/best_algorithm.txt``, then drives ``supervised.tune``
    through parameter selection, tuning, reporting, plotting and export.

    Args:
        run_prefix: Directory prefix of the run (holds ``Munge`` and ``Train``).
        metric_tune: Metric name handed to ``select_tuning_parameters``.
        max_iter: Maximum number of tuning iterations, handed to
            ``supervised.tune``.
        cv_count: Number of cross-validation rounds, handed to
            ``supervised.tune``.
        matchingCols: Optional path to a text file with one column name per
            line. When given, the dataset is restricted to the columns that
            appear both in the dataset and in that file.
    """
    # TUNING
    # Create a dialogue with the user
    print("Here is some basic info on the command you are about to run.")
    print("CLI argument info...")
    print(f"Working with the dataset and best model corresponding to prefix {run_prefix} the timestamp from the merge is the prefix in most cases.")
    print(f"You have chosen to tune the algorithms based on {metric_tune}.")
    print(f"Your maximum number of tuning iterations is {max_iter} and if you are concerned about runtime, make this number smaller.")
    print(f"You are running {cv_count} rounds of cross-validation, and again... if you are concerned about runtime, make this number smaller.")
    print("Give credit where credit is due, for this stage of analysis we use code from the great contributors to python packages: argparse, xgboost, sklearn, pandas, numpy, time, matplotlib and seaborn.")
    print("As a note, in all exported probabilities and other graphics, case status is treated as a 0 or 1, with 1 representing a positive case.")

    print("")

    infile_h5 = Path(run_prefix) / "Munge" / "dataForML.h5"
    df = pd.read_hdf(infile_h5, key="dataForML")

    # Addressing issue #12:
    if matchingCols is not None:
        print(f"We are using the harmonized columns you provided here: {matchingCols}")
        print("Note that you might have different/less features than before, given this was column list was harmonized between your reference and test dataset...")

        with open(matchingCols, 'r') as matchingCols_file:
            matching_column_names_list = matchingCols_file.read().splitlines()

        # Keep only the columns found in the file.
        # np.intersect1d returns the common names sorted and de-duplicated,
        # so the column order of df changes here.
        df = df[np.intersect1d(df.columns, matching_column_names_list)]

    best_algo_name_in = Path(run_prefix) / "Train" / "best_algorithm.txt"
    best_algo_df = pd.read_csv(best_algo_name_in, header=None, index_col=False)
    best_algo = str(best_algo_df.iloc[0, 0])

    # Communicate to the user the best identified algorithm
    print(f"From previous analyses in the training phase, we've determined that the best algorithm for this application is {best_algo}... so let's tune it up and see what gains we can make!")

    # Tuning
    model_tune = supervised.tune(df, run_prefix, max_iter, cv_count)
    model_tune.select_tuning_parameters(metric_tune)
    model_tune.apply_tuning_parameters()
    model_tune.report_tune()
    model_tune.summarize_tune()
    model_tune.compare_performance()
    model_tune.plot_results(save=True)
    model_tune.export_prediction_data()

    print("")
    print("End of tuning stage with GenoML.")
    print("")
def main(test_geno_prefix, test_prefix, ref_model_prefix,
         training_snps_alleles):
    """Harmonize a test genotype dataset against a reference model.

    Echoes the run configuration, then asks ``preprocessing.harmonizing`` to
    regenerate PLINK binaries for the test dataset and to write the reference
    column list used later by munging.

    Args:
        test_geno_prefix: Prefix of the test genotype dataset.
        test_prefix: Prefix used to tag the output files.
        ref_model_prefix: Prefix of the saved reference model.
        training_snps_alleles: Path to the SNP and allele information that
            the comparison is based on.
    """
    # Run configuration, echoed one line per entry in this exact order.
    banner = (
        "Here is some basic info on the command you are about to run.",
        "Python version info...",
        sys.version,
        "CLI argument info...",
        f"You are importing test dataset {test_geno_prefix}.",
        f"Applying the model saved from your reference dataset in {ref_model_prefix}.",
        f"Reading in the SNP and allele information we will use to compare from {training_snps_alleles}.",
        f"The results of this test application of your model will be saved in files tagged {test_prefix}.",
        "As a note, in all exported probabilities and other graphics, case status is treated as a 0 or 1, with 1 representing a positive case.",
    )
    for message in banner:
        print(message)

    # Hand the work over to the harmonizing step in genoml.preprocessing.
    harmonize_job = preprocessing.harmonizing(
        test_geno_prefix=test_geno_prefix,
        test_out_prefix=test_prefix,
        ref_model_prefix=ref_model_prefix,
        training_SNPs=training_snps_alleles,
    )

    # New binaries for the test dataset, restricted to the reference SNPs.
    harmonize_job.generate_new_PLINK()

    # Reading the PLINK binaries back in (read_PLINK) is intentionally
    # not part of this flow.

    # Reference columns to keep for munging.
    harmonize_job.prep_refCols_file()

    # Thank the user
    print("Thank you for harmonizing with GenoML!")
def main(prefix, impute, geno, skip_prune, r2_cutoff, pheno, addit, feature_selection, gwas, p, vif, iter, ref_cols_harmonize, umap_reduce, adjust_data, adjust_normalize, target_features, confounders, data_type):
    """Run the munging stage: PLINK inputs, optional adjustment, feature selection, VIF.

    Args:
        prefix: Output prefix of the run.
        impute: Imputation choice, passed on as ``impute_type``.
        geno: Genotype data path (may be unset).
        skip_prune: Pruning choice, passed on as ``skip_prune``.
        r2_cutoff: Pruning threshold.
        pheno: Phenotype file path.
        addit: Additional predictors path (may be unset).
        feature_selection: Number of estimators for feature selection; the
            step runs only when this is greater than zero.
        gwas: External GWAS summary statistics path (may be unset).
        p: P value threshold for GWAS-based SNP filtering.
        vif: VIF threshold.
        iter: Number of VIF iterations; the step runs only when this is
            greater than zero. (The name shadows the builtin but is part of
            the public signature, so it is kept.)
        ref_cols_harmonize: Reference columns file produced by harmonizing.
        umap_reduce: ``"yes"`` to run the UMAP reduction.
        adjust_data: ``"yes"`` to adjust the data.
        adjust_normalize: ``"yes"`` to normalize the adjusted data.
        target_features: Passed to ``preprocessing.adjuster``.
        confounders: Passed to ``preprocessing.adjuster``.
        data_type: Passed to ``preprocessing.featureselection``.
    """
    genoml.dependencies.check_dependencies()

    # Print configurations
    print("")
    print("Here is some basic info on the command you are about to run.")
    print("Python version info...")
    print(sys.version)
    print("CLI argument info...")
    print(
        f"The output prefix for this run is {prefix} and will be appended to later runs of GenoML.")
    print(f"Working with genotype data? {geno}")
    print(f"Do you want GenoML to prune your SNPs for you? {skip_prune}")
    print(f"The pruning threshold you've chosen is {r2_cutoff}")
    print(f"Working with additional predictors? {addit}")
    print(f"Where is your phenotype file? {pheno}")
    print(f"Any use for an external set of GWAS summary stats? {gwas}")
    print(
        f"If you plan on using external GWAS summary stats for SNP filtering, we'll only keep SNPs at what P value? {p}")
    print(f"How strong is your VIF filter? {vif}")
    print(f"How many iterations of VIF filtering are you doing? {iter}")
    print(
        f"The imputation method you picked is using the column {impute} to fill in any remaining NAs.")
    print(f"Will you be adjusting additional features using UMAP dimensionality reduction? {umap_reduce}")
    print(
        "Give credit where credit is due, for this stage of analysis we use code from the great contributors to python packages: os, sys, argparse, numpy, pandas, joblib, math and time. We also use PLINK v1.9 from https://www.cog-genomics.org/plink/1.9/.")
    print("")

    # Run the munging script in genoml.preprocessing
    munger = preprocessing.munging(pheno_path=pheno, run_prefix=prefix, impute_type=impute, skip_prune=skip_prune,
                                   p_gwas=p, addit_path=addit, gwas_path=gwas, geno_path=geno, refColsHarmonize=ref_cols_harmonize, r2_cutoff=r2_cutoff)

    # Process the PLINK inputs (for pruning)
    df = munger.plink_inputs()

    # Run the UMAP dimension reduction/ adjuster
    if adjust_data == "yes" or umap_reduce == "yes":
        adjuster = preprocessing.adjuster(prefix, df, target_features, confounders, adjust_data, adjust_normalize, umap_reduce)
        reduced_df = adjuster.umap_reducer()
        if adjust_data == "yes":
            print("\n You have chosen to adjust your data! \n")
            if adjust_normalize == "yes":
                print("\n You have also chosen to normalize your adjusted data \n")
            else:
                print("\n You have also chosen NOT to normalize your adjusted data \n")
        # NOTE(review): the nesting level of this call was reconstructed; it
        # is assumed to run whenever the adjuster was created - confirm.
        df = adjuster.normalize(reduced_df)

    # Run the feature selection using extraTrees.
    # An unset (None) count skips the step instead of raising TypeError.
    if feature_selection is not None and feature_selection > 0:
        featureSelection_df = preprocessing.featureselection(prefix, df, data_type, feature_selection)
        df = featureSelection_df.rank()
        featureSelection_df.export_data()

    # Run the VIF calculation (same None-safe guard as above)
    if iter is not None and iter > 0:
        vif_calc = preprocessing.vif(iter, vif, df, 100, prefix)
        vif_calc.vif_calculations()

    # Thank the user
    print("Thank you for munging with GenoML!")
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | from genoml.continuous.supervised.training import train 17 | from genoml.continuous.supervised.tuning import tune 18 | from genoml.continuous.supervised.testing import test 19 | -------------------------------------------------------------------------------- /genoml/continuous/supervised/testing.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The GenoML Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
class test:
    """Apply a previously trained continuous-phenotype model to a test dataset.

    Typical call order: ``prep_df`` first (it sets ``X_test``, ``y_test`` and
    ``IDs_test``), then ``performance_metrics`` and
    ``export_pheno_predictions``; ``regression_summary`` reads the table that
    ``export_pheno_predictions`` stores in ``self.test_out``.
    """

    def __init__(self, df, loaded_model, run_prefix):
        """Store the inputs and make sure ``<run_prefix>/Test`` exists.

        Args:
            df: DataFrame holding ``ID``, ``PHENO`` and the feature columns.
            loaded_model: Estimator exposing ``fit`` and ``predict``.
            run_prefix: Directory prefix of the run; outputs are written to
                its ``Test`` sub-directory.
        """
        self.df = df
        path = Path(run_prefix).joinpath("Test")
        # parents/exist_ok: works when the prefix directory is missing and
        # avoids the check-then-create race of is_dir() followed by mkdir().
        path.mkdir(parents=True, exist_ok=True)
        self.run_prefix = path
        self.loaded_model = loaded_model

    def prep_df(self):
        """Split ``self.df`` into features, phenotype and sample IDs.

        Returns:
            The feature DataFrame (``self.df`` without ``PHENO`` and ``ID``).
        """
        print("")
        print("Your data looks like this (showing the first few lines of the left-most and right-most columns)...")
        print("#"*70)
        print(self.df.describe())
        print("#"*70)
        print("")

        # Save out and drop the PHENO and sample ID columns
        y_test = self.df.PHENO
        X_test = self.df.drop(columns=['PHENO'])
        IDs_test = X_test.ID
        X_test = X_test.drop(columns=['ID'])

        # Save variables to use globally within the class
        self.y_test = y_test
        self.X_test = X_test
        self.IDs_test = IDs_test

        return X_test

    def performance_metrics(self):
        """Score the model on the test data, print and save the metrics.

        Returns:
            One-row DataFrame with explained variance, mean squared error,
            median absolute error and R^2; also written to
            ``testedModel_allSamples_performanceMetrics.csv``.
        """
        log_cols = ["Explained_variance_score", "Mean_squared_error", "Median_absolute_error", "R2_score"]

        # NOTE(review): this refits the loaded model on the test data before
        # scoring it, so the metrics below are not out-of-sample. Kept as-is
        # because it may be deliberate - confirm with the authors.
        self.loaded_model.fit(self.X_test, self.y_test)

        print("")
        print("#"*70)

        # One prediction pass feeds all four metrics.
        test_predictions = self.loaded_model.predict(self.X_test)

        evs = explained_variance_score(self.y_test, test_predictions)
        print("Explained variance score: {:.4}".format(evs))

        mse = mean_squared_error(self.y_test, test_predictions)
        print("Mean squared error: {:.4}".format(mse))

        mae = median_absolute_error(self.y_test, test_predictions)
        print("Median absolute error: {:.4}".format(mae))

        r2s = r2_score(self.y_test, test_predictions)
        print("R^2 score: {:.4}".format(r2s))

        # Build the one-row table directly instead of appending to an empty
        # frame through the private DataFrame._append.
        log_table = pd.DataFrame([[evs, mse, mae, r2s]], columns=log_cols)

        print("#"*70)

        print("")

        log_outfile = self.run_prefix.joinpath('testedModel_allSamples_performanceMetrics.csv')

        print("")
        print(f"This table below is also logged as {log_outfile} and is in your current working directory...")
        print("#"*70)
        print(log_table)
        print("#"*70)
        print("")

        log_table.to_csv(log_outfile, index=False)

        self.log_table = log_table
        return log_table

    def export_pheno_predictions(self):
        """Write reported vs. predicted phenotypes per sample to CSV.

        Returns:
            DataFrame with columns ``ID``, ``PHENO_REPORTED`` and
            ``PHENO_PREDICTED``; also stored in ``self.test_out`` and written
            to ``testedModel_allSample_predictions.csv``.
        """
        test_predicted_values = self.loaded_model.predict(self.X_test)

        # Align the three pieces by position; the original row index is
        # dropped rather than carried along as a throw-away column.
        pieces = [
            pd.DataFrame(self.IDs_test).reset_index(drop=True),
            pd.DataFrame(self.y_test).reset_index(drop=True),
            pd.DataFrame(test_predicted_values).reset_index(drop=True),
        ]
        test_out = pd.concat(pieces, axis=1, ignore_index=True)
        test_out.columns = ["ID", "PHENO_REPORTED", "PHENO_PREDICTED"]

        test_outfile = self.run_prefix.joinpath('testedModel_allSample_predictions.csv')
        test_out.to_csv(test_outfile, index=False)

        print("")
        print(f"Preview of the exported predictions exported as {test_outfile}, these are pretty straight forward.")
        print("They generally include the sample ID, the previously reported phenotype, and the predicted phenotype from that algorithm.")
        print("#"*70)
        print(test_out.head())
        print("#"*70)

        self.test_out = test_out
        return test_out

    def regression_summary(self):
        """Plot and summarize the regression PHENO_REPORTED ~ PHENO_PREDICTED.

        Reads ``self.test_out``, so ``export_pheno_predictions`` must have
        been called first. Saves a regression plot (PNG) and the OLS summary
        (CSV) under the ``Test`` directory.
        """
        sns_plot = sns.regplot(data=self.test_out, y="PHENO_REPORTED", x="PHENO_PREDICTED", scatter_kws={"color": "cyan"}, line_kws={"color": "purple"})

        plot_out = self.run_prefix.joinpath('testedModel_allSamples_regressionPlot.png')
        sns_plot.figure.savefig(plot_out, dpi=600)

        print("")
        print(f"We are also exporting a regression plot for you here {plot_out}, this is a graphical representation of the difference between the reported and predicted phenotypes in the withheld test data for the best performing algorithm.")

        print("")
        print("Here is a quick summary of the regression comparing PHENO_REPORTED ~ PHENO_PREDICTED in the withheld test data...")
        print("")

        reg_model = sm.ols(formula='PHENO_REPORTED ~ PHENO_PREDICTED', data=self.test_out)
        fitted = reg_model.fit()
        print(fitted.summary())

        fitted_out = self.run_prefix.joinpath('testedModel_allSamples_regressionSummary.csv')

        with open(fitted_out, 'w') as fh:
            fh.write(fitted.summary().as_csv())

        print(f"We are exporting this summary here: {fitted_out}")

        print("")
        print("...always good to see the P value for the predictor.")
def __get_executable_folder():
    """Return the folder holding downloaded executables.

    The ``GENOML_DEP_DIR`` environment variable wins when set; otherwise the
    folder is ``~/.genoml/misc/executables``.
    """
    key = "GENOML_DEP_DIR"
    if key in os.environ:
        return os.path.abspath(os.environ.get(key))
    return os.path.join(str(pathlib.Path.home()), ".genoml", "misc",
                        "executables")


__executable_folder = __get_executable_folder()


def __check_exec(exec_path, *args, absolute_path=False):
    """Return True when the binary exists and can be launched.

    Args:
        exec_path: Binary name inside the executable folder, or a full path
            when ``absolute_path`` is true.
        *args: Arguments passed to the binary for the trial run.
        absolute_path: Treat ``exec_path`` as a full path.

    The exit status of the trial run is deliberately ignored; only the
    ability to start the process counts.
    """
    if not absolute_path:
        binary_path = os.path.join(__executable_folder, exec_path)
    else:
        binary_path = exec_path
    if not os.path.exists(binary_path):
        return False

    try:
        subprocess.run([binary_path, *args], stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL)
    except OSError:
        # Present but not runnable (no execute permission, wrong binary
        # format, ...): report it as unavailable so the caller reinstalls
        # instead of crashing.
        return False
    return True


def __install_exec(url, exec_path):
    """Download the zip archive at ``url`` and unpack it into the executable folder.

    Args:
        url: Location of the zip archive.
        exec_path: Name of the binary inside the archive; it is made
            executable for its owner after extraction.

    Raises:
        requests.HTTPError: The server answered with an error status.
    """
    # A timeout keeps a stalled download from hanging the CLI forever, and
    # raise_for_status stops an error page from being parsed as a zip file.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(response.content), "r") as fp:
        fp.extractall(__executable_folder)

    binary_path = os.path.join(__executable_folder, exec_path)
    # Add the owner-execute bit to the existing mode; passing stat.S_IEXEC
    # alone would wipe every other permission bit, including read access.
    os.chmod(binary_path, os.stat(binary_path).st_mode | stat.S_IEXEC)


def __check_package(name):
    """Return the path of the binary for ``name``, installing it when missing.

    Raises:
        EnvironmentError: Unknown package, unsupported operating system, or
            the binary still cannot be run after installation.
    """
    platform_system = platform.system()

    if name not in __DEPENDENCIES:
        raise EnvironmentError("Unknown package: {}".format(name))

    if platform_system not in __DEPENDENCIES[name]:
        raise EnvironmentError(
            "Unsupported OS: {}".format(platform_system))

    entry = __DEPENDENCIES[name][platform_system]

    binary_name = entry["binary"]
    args = entry["version_args"]
    url = entry["url"]

    if __check_exec(binary_name, *args):
        logging.debug("{} is found".format(name))
        return os.path.join(__executable_folder, binary_name)

    logging.warning("Installing {}".format(name))
    __install_exec(url, binary_name)
    if not __check_exec(binary_name, *args):
        logging.warning("Failed to run {} after installation".format(name))
        raise EnvironmentError("Can not install {}".format(name))
    return os.path.join(__executable_folder, binary_name)


@utils.DescriptionLoader.function_description("check_dependencies")
def check_dependencies():
    """Run the checker of every registered dependency.

    Returns:
        Dict mapping each package name to the value returned by its checker.
    """
    ret = {}
    for package, data in __DEPENDENCIES.items():
        if "checker" in data:
            with utils.DescriptionLoader.context(
                    "check_dependencies_{}".format(package)):
                ret[package] = data["checker"]()

    return ret


def check_plink():
    """Return the path of the PLINK binary, installing it when missing."""
    return __check_package("Plink")


# Registry of external tools: a checker callable plus, per platform.system()
# value, the binary name, the arguments for the trial run and the download URL.
__DEPENDENCIES = {
    "Plink": {
        "checker": check_plink,
        "Darwin": {
            "binary": "plink",
            "version_args": ["--version"],
            "url": "http://s3.amazonaws.com/plink1-assets/plink_mac_20200219.zip"
        },
        "Linux": {
            "binary": "plink",
            "version_args": ["--version"],
            "url": "http://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20200219.zip"
        }
    },
}
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | from genoml.discrete import supervised -------------------------------------------------------------------------------- /genoml/discrete/supervised/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The GenoML Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | from genoml.discrete.supervised.training import train 17 | from genoml.discrete.supervised.tuning import tune 18 | from genoml.discrete.supervised.testing import test -------------------------------------------------------------------------------- /genoml/discrete/supervised/testing.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The GenoML Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
class test:
    """Apply a previously trained discrete-phenotype model to a test dataset.

    ``prep_df`` must run first: it sets ``X_test``, ``y_test`` and
    ``IDs_test``, which every other method reads.
    """

    def __init__(self, df, loaded_model, run_prefix):
        """Store the inputs and make sure ``<run_prefix>/Test`` exists.

        Args:
            df: DataFrame holding ``ID``, ``PHENO`` and the feature columns.
            loaded_model: Estimator exposing ``fit`` and ``predict``.
            run_prefix: Directory prefix of the run; outputs are written to
                its ``Test`` sub-directory.
        """
        self.df = df
        path = Path(run_prefix).joinpath("Test")
        # parents/exist_ok: works when the prefix directory is missing and
        # avoids the check-then-create race of is_dir() followed by mkdir().
        path.mkdir(parents=True, exist_ok=True)
        self.run_prefix = path
        self.algo = loaded_model

    def prep_df(self):
        """Split ``self.df`` into features, phenotype and sample IDs."""
        print("")
        print("Your data looks like this (showing the first few lines of the left-most and right-most columns)...")
        print("#"*70)
        print(self.df.describe())
        print("#"*70)
        print("")

        # Save out and drop the PHENO and sample ID columns
        y_test = self.df.PHENO
        X_test = self.df.drop(columns=['PHENO'])
        IDs_test = X_test.ID
        X_test = X_test.drop(columns=['ID'])

        # Save variables to use globally within the class
        self.y_test = y_test
        self.X_test = X_test
        self.IDs_test = IDs_test

    def plot_results(self, save=False):
        """Draw the ROC and precision-recall plots for the test data.

        Args:
            save: Forwarded to the plotting helpers in ``discrete_utils``.
        """
        # Issue #24: RandomForestClassifier is finicky - can't recalculate moving forward like the other
        # NOTE(review): this refits the model on the test data before it is
        # evaluated, so the plots are not out-of-sample - confirm intent.
        self.algo.fit(self.X_test, self.y_test)
        plot_path = self.run_prefix.joinpath('testModel_withheldSample_ROC.png')
        ground_truth = self.y_test.values
        predictions = self.algo.predict(self.X_test)
        discrete_utils.ROC(save, plot_path, ground_truth, predictions)
        # NOTE(review): both helpers receive the same ROC-named path;
        # presumably precision_recall_plot derives its own file name - verify.
        discrete_utils.precision_recall_plot(save, plot_path, ground_truth, predictions)

    def export_prediction_data(self):
        """Export the prediction table and the probability histogram."""
        test_out = discrete_utils.export_prediction_tables(
            self.algo,
            self.y_test,
            self.X_test,
            self.IDs_test,
            self.run_prefix.joinpath('tunedModel_withheldSample_testingPredictions.csv'),
        )

        discrete_utils.export_prob_hist(
            test_out,
            self.run_prefix.joinpath('tunedModel_withheldSample_testingProbabilities'),
        )

    def additional_sumstats(self):
        """Compute, save and print summary statistics for the test data.

        The table comes from ``discrete_utils.summary_stats`` and is written
        to ``tunedModel_validationCohort_allCasesControls_performanceMetrics.csv``.
        """
        print("")
        print("#"*70)
        print("Some additional summary stats logging from your application of your model to the test dataset.")
        print("")

        log_outfile = self.run_prefix.joinpath('tunedModel_validationCohort_allCasesControls_performanceMetrics.csv')
        log_table = discrete_utils.summary_stats(
            self.algo,
            self.y_test,
            self.X_test,
        )
        log_table.to_csv(log_outfile, index=False)

        print("")
        print("#"*70)
        print("")
        print(f"This table below is also logged as {log_outfile} and is in your current working directory...")
        print("#"*70)
        print(log_table)
        print("#"*70)
        print("")
We are using the harmonized columns you provided here: {matching_columns_path}\nNote that you might have different/less features than before, given this was harmonized between training and test dataset, and might mean your model now performs worse...", 32 | "error": "" 33 | }, 34 | "continuous/supervised/training/Train/summary": { 35 | "title": "Input Data Summary", 36 | "description": "Your data looks like this (showing the first few lines of the left-most and right-most columns)...\n\n{data}", 37 | "error": "" 38 | }, 39 | "continuous/supervised/training/Train/compete": { 40 | "title": "Compete the algorithms", 41 | "description": "Now let's compete these algorithms!\nWe'll update you as each algorithm runs, then summarize at the end.\nHere we test each algorithm under default settings using the same training and test datasets derived from a 70% training and 30% testing split of your data.\nFor each algorithm, we will output the following metrics...\nAlgorithm name, hoping that's pretty self-explanatory. 
Plenty of resources on these common ML algorithms at https://scikit-learn.org and https://xgboost.readthedocs.io/.\nexplained_variance_score, this is the variance explained by the model per algorithm (scale from 0 to 1 with 1 being completely explained).\nmean_squared_error, this is the mean squared error from regression loss.\nmedian_absolute_error, median absolute error from regression loss.\nr2_score, standard r2 metric from linear regression (coefficient of determination), remember, this can be negative if your model is really bad.\nWe also log the runtimes per algorithm.\n\nAlgorithm summaries incoming...", 42 | "end": true, 43 | "error": "" 44 | }, 45 | "continuous/supervised/training/Train/compete/algorithm": { 46 | "title": "{name}", 47 | "description": "", 48 | "error": "" 49 | }, 50 | "continuous/supervised/training/Train/compete/algorithm/results": { 51 | "title": "{name} Results", 52 | "description": "{results}", 53 | "error": "" 54 | }, 55 | "continuous/supervised/training/Train/compete/algorithm/best": { 56 | "title": "Best Algorithm: {algorithm}", 57 | "description": "There are occasionally slight fluctuations in model performance on the same withheld samples.\n{metrics}", 58 | "error": "" 59 | }, 60 | "continuous/supervised/training/Train/export_model": { 61 | "title": "Exporting Model: {output_path}", 62 | "description": "this model has been saved as {output_path} for later use and can be found in your working directory.", 63 | "end": true, 64 | "error": "" 65 | }, 66 | "continuous/supervised/training/Train/save_algorithm_results": { 67 | "title": "Saving Algorithm Results: {output_path}", 68 | "description": "This table below is also logged as {output_path} and is in your current working directory...\n\n{data}", 69 | "end": true, 70 | "error": "" 71 | }, 72 | "continuous/supervised/training/Train/save_best_algorithm": { 73 | "title": "Saving Best Algorithm: {output_path}", 74 | "description": "Based on your withheld samples, the algorithm with the 
highest explained variance score is the {best_algorithm}... let's save that model name for you on {output_path}.", 75 | "end": true, 76 | "error": "" 77 | }, 78 | "continuous/supervised/training/Train/export_predictions/test_data": { 79 | "title": "Saving Prediction on Test Data: {output_path}", 80 | "description": "Preview of the exported predictions for the withheld test data that has been exported as {output_path} these are pretty straight forward.\nThey generally include the sample ID, the previously reported phenotype and the predicted phenotype from that algorithm,\n\n{data}", 81 | "end": true, 82 | "error": "" 83 | }, 84 | "continuous/supervised/training/Train/export_predictions/train_data": { 85 | "title": "Saving Prediction on Train Data: {output_path}", 86 | "description": "Preview of the exported predictions for the training samples which is naturally overfit and exported as {output_path} in the similar format as in the withheld test dataset that was just exported.\n\n{data}", 87 | "end": true, 88 | "error": "" 89 | }, 90 | "continuous/supervised/training/Train/export_predictions/plot": { 91 | "title": "Saving Regression Plot: {output_path}", 92 | "description": "Here is a quick summary of the regression comparing PHENO_REPORTED ~ PHENO_PREDICTED in the withheld test data...\n{data}\n...always good to see the P for the predictor.\n\nWe are also exporting a regression plot for you here {output_path} this is a graphical representation of the difference between the reported and predicted phenotypes in the withheld test data for the best performing algorithm.", 93 | "end": true, 94 | "error": "" 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /genoml/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The GenoML Authors. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | from genoml.preprocessing.munging import munging 17 | from genoml.preprocessing.vif import vif 18 | from genoml.preprocessing.featureselection import featureselection 19 | from genoml.preprocessing.harmonizing import harmonizing 20 | from genoml.preprocessing.adjuster import adjuster 21 | 22 | __all__ = [] 23 | -------------------------------------------------------------------------------- /genoml/preprocessing/adjuster.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The GenoML Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
class adjuster:
    """Adjust munged features for confounders, optionally after a UMAP reduction.

    The munged dataset is read from ``<run_prefix>.dataForML.h5`` (key
    ``dataForML``), the list of features to adjust from the headerless
    ``target_features`` file, and the confounders from the ``confounders``
    CSV, which must contain an ``ID`` column.
    """

    def __init__(self, run_prefix, df, target_features, confounders, adjust_data, adjust_normalize, umap_reduce):
        """Load the munged data, the target feature list and the confounders.

        Args:
            run_prefix: Prefix shared by all input/output files of this run.
            df: Accepted for interface compatibility only; the munged data is
                always re-read from ``<run_prefix>.dataForML.h5``.
            target_features: Path to a headerless file, one feature name per row.
            confounders: Path to a CSV of confounders with an ``ID`` column.
            adjust_data: Stored as ``self.adjust_data``; not used by this class.
            adjust_normalize: ``'yes'`` to z-score the residuals in ``normalize``.
            umap_reduce: ``"yes"`` to reduce the confounders with UMAP.
        """
        self.run_prefix = run_prefix
        self.umap_reduce = umap_reduce
        self.target_columns = target_features
        self.confounders = confounders
        self.adjust_data = adjust_data
        self.normalize_switch = adjust_normalize

        self.munged_data = self.run_prefix + ".dataForML.h5"

        self.target_data_df = pd.read_hdf(self.munged_data, 'dataForML')
        self.target_column_df = pd.read_csv(self.target_columns, names=['TARGETS'])

        self.confounders_df = pd.read_csv(self.confounders)

        # Keep only the requested features that survived munging (others were
        # removed because of --gwas, a standard deviation of 0, etc.).  The
        # order of the targets file is preserved and duplicates are dropped so
        # that runs are reproducible (a plain set intersection is unordered).
        available_columns = set(self.target_data_df.columns)
        requested_targets = self.target_column_df['TARGETS'].tolist()
        intersecting_list = [
            name for name in dict.fromkeys(requested_targets)
            if name in available_columns
        ]
        self.target_column_df = pd.DataFrame(intersecting_list, columns=['TARGETS'])

    def umap_reducer(self):
        """Optionally replace the confounders with a 2-D UMAP embedding.

        When ``self.umap_reduce == "yes"`` the confounders (minus ``ID``) are
        embedded with UMAP, and a scatter plot, the per-sample embedding CSV
        and the fitted reducer are exported next to the other run outputs.

        Returns:
            The confounder frame to adjust by: the ``ID`` + two embedding
            columns when UMAP was requested, the original confounders otherwise.
        """
        if self.umap_reduce == "yes":
            to_umap = self.confounders_df.drop(columns=['ID'])

            reducer = umap.UMAP(random_state=153)
            embedding = reducer.fit_transform(to_umap)

            out_data = pd.DataFrame({
                'ID': self.confounders_df['ID'].to_numpy(),
                'UMAP_embedding1': embedding[:, 0],
                'UMAP_embedding2': embedding[:, 1],
            })

            # Plot
            print("Exporting UMAP plot...")
            fig, ax = plt.subplots(figsize=(12, 10))
            try:
                # No ``cmap``: without ``c`` matplotlib ignores it and warns.
                ax.scatter(embedding[:, 0], embedding[:, 1])
                ax.set_title("Data Reduction to 2 Dimensions by UMAP", fontsize=18)
                plot_out = self.run_prefix + '.umap_plot.png'
                fig.savefig(plot_out, dpi=600)
            finally:
                # Release the figure; pyplot keeps it alive otherwise.
                plt.close(fig)

            print(f"The UMAP plot has been exported and can be found here: {plot_out}")

            out_file = self.run_prefix + '.umap_data_reduction.csv'
            out_data.to_csv(out_file, index=False)

            print(f"The reduced UMAP 2 dimensions per sample .csv file can be found here: {out_file}")

            # ``fit_transform`` already fitted the reducer; export it as is
            # instead of fitting the same data a second time.
            algo_out = self.run_prefix + '.umap_clustering.joblib'
            dump(reducer, algo_out)
            # Attribute set by earlier versions; kept for backward compatibility.
            self.runplot_out = algo_out

            self.confounders_df = out_data

            print(f"The UMAP .joblib file can be found here: {algo_out}")

        return self.confounders_df

    def normalize(self, confounders_df):
        """Replace every target feature by its residual after regressing out confounders.

        Each target is fitted with ``target ~ confounder_1 + ... + confounder_n``
        (ordinary least squares) on the inner join of the munged data and
        ``confounders_df`` on ``ID``.  When ``self.normalize_switch == 'yes'``
        the residuals are additionally z-scored.  The result overwrites
        ``<run_prefix>.dataForML.h5``.

        Args:
            confounders_df: Frame with an ``ID`` column; every other column is
                used as a confounder.

        Returns:
            The adjusted frame, restricted to the columns of the munged data.
        """
        target_list = list(self.target_column_df['TARGETS'])
        # Every column except the join key is a confounder, wherever ``ID`` sits.
        confounder_list = [column for column in confounders_df.columns if column != 'ID']
        columns_to_keep_list = list(self.target_data_df.columns)

        adjustments_df = self.target_data_df.merge(confounders_df, how='inner', on='ID', suffixes=['', '_y'])

        formula_for_confounders = ' + '.join(confounder_list)
        standardize = self.normalize_switch == 'yes'

        for target in target_list:
            current_target = str(target)
            print(f"Looking at the following feature: {current_target}")

            current_formula = current_target + " ~ " + formula_for_confounders
            print(current_formula)

            target_model = smf.ols(formula=current_formula, data=adjustments_df).fit()

            # Work on the residual series directly instead of a scratch 'temp'
            # column, which would clobber a real feature of that name.
            residuals = pd.to_numeric(target_model.resid)
            if standardize:
                residuals = (residuals - residuals.mean()) / residuals.std()
            adjustments_df[current_target] = residuals

        adjusted_df = adjustments_df[columns_to_keep_list]

        outfile_h5 = self.run_prefix + ".dataForML.h5"
        adjusted_df.to_hdf(outfile_h5, key='dataForML', mode='w')

        if standardize:
            print(f"\n The adjusted dataframe following normalization can be found here: {outfile_h5}, your updated .dataForML file \n")
        else:
            print(f"\n The adjusted dataframe without normalization can be found here: {outfile_h5}, your updated .dataForML file \n")

        return adjusted_df
class featureselection:
    """Rank features with extra-trees and keep those above the importance threshold."""

    def __init__(self, run_prefix, df, data_type, n_est):
        """Split the munged frame into IDs, phenotype and feature matrix.

        Args:
            run_prefix: Prefix used for every exported file.
            df: Munged data with ``ID`` and ``PHENO`` columns; it is not modified.
            data_type: ``"d"`` for a discrete phenotype, ``"c"`` for a continuous one.
            n_est: Number of estimators for the extra-trees model.
        """
        self.run_prefix = run_prefix
        self.featureRanks = None
        self.n_est = n_est
        self.data_type = data_type
        # Populated by rank(); export_data() needs both.
        self.df_selecta = None
        self.featureScores_outfile = None

        # Double check there are no NAs in the dataset before proceeding.
        # dropna returns a new frame, so the caller's DataFrame stays intact.
        df = df.dropna(axis=1, how='any')

        self.y = df['PHENO']
        self.IDs = df['ID']
        self.X = df.drop(columns=['PHENO', 'ID'])

    def rank(self):
        """Fit the extra-trees model, export the importances and reduce the data.

        Writes ``<run_prefix>.approx_feature_importance.txt`` (tab separated,
        sorted by decreasing score).

        Returns:
            A frame with ``ID``, ``PHENO`` and the selected feature columns.

        Raises:
            ValueError: If ``data_type`` is neither ``"d"`` nor ``"c"``.
        """
        if self.data_type not in ("d", "c"):
            raise ValueError(
                f"Unsupported data_type {self.data_type!r}: expected 'd' (discrete) or 'c' (continuous)."
            )

        print(f"""
        Beginning featureSelection using {self.n_est} estimators...""")

        if self.data_type == "d":
            print(f"""
            using extraTrees Classifier for your discrete dataset
            """)
            clf = ensemble.ExtraTreesClassifier(n_estimators=self.n_est)
        else:
            print(f"""
            using extraTrees Regressor for your continuous dataset
            """)
            clf = ensemble.ExtraTreesRegressor(n_estimators=self.n_est)

        clf.fit(self.X, self.y)
        self.featureRanks = clf.feature_importances_

        # Select the features above threshold; the mask is reused below so the
        # data only has to be sliced once.
        model = feature_selection.SelectFromModel(clf, prefit=True)
        features_selected = model.get_support()
        X_reduced = self.X.iloc[:, features_selected]

        print("""
        Printing feature name that corresponds to the dataframe column name, then printing the relative importance as we go...
        """)

        list_featureScores = []
        for col, score in zip(self.X.columns, clf.feature_importances_):
            print(col, score)
            list_featureScores.append([col, score])

        df_featureScores = pd.DataFrame(list_featureScores, columns=["Feature_Name", "Score"])
        df_featureScores = df_featureScores.sort_values(by=['Score'], ascending=False)
        featureScores_outfile = self.run_prefix + ".approx_feature_importance.txt"
        df_featureScores.to_csv(featureScores_outfile, index=False, sep="\t")

        print(f"""
        You have reduced your dataset to {X_reduced.shape[0]} samples at {X_reduced.shape[1]} features, not including ID and PHENO.
        """)

        # Add ID and PHENO back in front of the retained features.
        ID_df = pd.DataFrame(self.IDs)
        df_selecta = pd.concat(
            [ID_df.reset_index(drop=True), self.y.reset_index(drop=True), X_reduced.reset_index(drop=True)],
            axis=1,
            ignore_index=False,
        )

        self.df_selecta = df_selecta
        self.featureScores_outfile = featureScores_outfile

        return df_selecta

    def export_data(self):
        """Write the reduced dataset and the list of retained columns to disk.

        Raises:
            RuntimeError: If ``rank()`` has not been called yet.
        """
        if self.df_selecta is None:
            raise RuntimeError("rank() must be called before export_data().")

        ## Export reduced data
        outfile_h5 = self.run_prefix + ".dataForML.h5"
        self.df_selecta.to_hdf(outfile_h5, key='dataForML')

        features_list = self.df_selecta.columns.values.tolist()

        features_listpath = self.run_prefix + ".list_features.txt"
        with open(features_listpath, 'w', encoding='utf-8') as f:
            f.writelines("%s\n" % feature for feature in features_list)

        print(f"""Exporting a new {outfile_h5} file that has a reduced feature set based on your importance approximations.
        This is a good dataset for general ML applications for the chosen PHENO as it includes only features that are likely to impact the model.

        An updated list of {len(features_list)} features, including ID and PHENO, that is in your munged dataForML.h5 file can be found here {features_listpath}

        A file with all your features, ranked from largest contributors at the top to smallest contributors at the bottom, can be found at {self.featureScores_outfile}.
        """)
import functools
import json
import os
import sys
import textwrap
import time
import traceback

__author__ = 'Sayed Hadi Hashemi'


class ColoredBox:
    """Context manager and helper for ANSI-colored terminal output."""

    BLACK = 30
    RED = 31
    GREEN = 32
    YELLOW = 33
    BLUE = 34
    MAGENTA = 35
    CYAN = 36
    WHITE = 37
    RESET = 39

    def __init__(self, color=None):
        """Remember the color code to switch to (green when omitted)."""
        if color is None:
            color = self.GREEN
        self.__color = color

    def __enter__(self):
        """Switch the terminal to the configured color."""
        print('\033[{}m'.format(self.__color), end="")
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Reset all terminal attributes; exceptions are not suppressed."""
        print("\x1b[0m", end="")

    @classmethod
    def wrap(cls, text, color):
        """Return ``text`` surrounded by the escape codes for ``color`` and reset."""
        return '\033[{}m'.format(color) + text + "\x1b[0m"


class ContextScope:
    """Print a titled, indented progress scope around a block of work.

    ``title``, ``description`` and ``error`` are ``str.format`` templates
    filled with the extra keyword arguments.  A failure inside the scope is
    reported (title, error text, traceback) and terminates the process with
    exit status 1.
    """

    indent = 0
    _verbose = False

    def __init__(self, title, description, error, start=True, end=False,
                 **kwargs):
        self._title = title.format(**kwargs)
        self._description = description.format(**kwargs)
        self._error = error.format(**kwargs)
        self._start = start
        self._end = end

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_type is None:
            if self._end:
                print(
                    "{}{}: {}".format(
                        self.get_prefix(ColoredBox.GREEN),
                        ColoredBox.wrap(self._title, ColoredBox.GREEN),
                        ColoredBox.wrap('[Done]', ColoredBox.GREEN)))
            self.remove_indent()
            return False

        if issubclass(exc_type, SystemExit):
            # An exit is already in progress (typically requested by an inner
            # scope that reported its own failure): unwind without reporting
            # the same failure again and without altering the exit status.
            self.remove_indent()
            return False

        print("{}{}: {}".format(
            self.get_prefix(ColoredBox.RED), self._title,
            ColoredBox.wrap('[Failed]', ColoredBox.RED)))
        print("{}".format(self.indent_text(self._error)))
        self.remove_indent()
        traceback.print_exception(exc_type, exc_val, exc_tb)
        # sys.exit, not the ``exit`` helper that only exists when the ``site``
        # module has been loaded.
        sys.exit(1)

    def __enter__(self):
        self.add_indent()
        if self._start:
            print()
            print("{}{}".format(self.get_prefix(ColoredBox.BLUE),
                                ColoredBox.wrap(self._title, ColoredBox.BLUE)))
        if self._verbose and self._description:
            print("{}".format(self._description))
        return self

    @classmethod
    def add_indent(cls):
        """Enter one nesting level (shared by all scopes)."""
        cls.indent += 1

    @classmethod
    def remove_indent(cls):
        """Leave one nesting level."""
        cls.indent -= 1

    @classmethod
    def get_prefix(cls, color=None):
        """Return one ``"---> "`` per nesting level, optionally colored."""
        text = "---> " * cls.indent
        if color:
            text = ColoredBox.wrap(text, color)
        return text

    @classmethod
    def indent_text(cls, text):
        """Wrap ``text`` to 70 columns overall, indented under the current prefix."""
        WIDTH = 70
        MIN_TEXT_WIDTH = 20  # keep textwrap usable at very deep nesting levels
        indent = max(0, len(cls.get_prefix()) - 2)
        width = max(WIDTH - indent, MIN_TEXT_WIDTH)
        ret = textwrap.fill(text, width)
        ret = textwrap.indent(ret, " " * indent)
        return ret

    @classmethod
    def set_verbose(cls, verbose):
        """Enable or disable printing of scope descriptions."""
        cls._verbose = verbose


def function_description(**dkwargs):
    """Decorator factory: run the decorated function inside a ``ContextScope``.

    ``dkwargs`` are forwarded to ``ContextScope`` on every call.
    """
    def wrap(func):
        @functools.wraps(func)  # keep the wrapped function's name and docstring
        def func_wrapper(*args, **kwargs):
            with ContextScope(**dkwargs):
                return func(*args, **kwargs)

        return func_wrapper

    return wrap


class DescriptionLoader:
    """Lazy access to the scope descriptions in ``misc/descriptions.json``."""

    _descriptions = None

    @classmethod
    def _load(cls):
        """Read the descriptions file located next to this module."""
        description_file = os.path.join(os.path.dirname(__file__),
                                        "misc", "descriptions.json")
        with open(description_file, encoding="utf-8") as fp:
            cls._descriptions = json.load(fp)

    @classmethod
    def function_description(cls, key, **kwargs):
        """Return a decorator wrapping calls in the scope described by ``key``."""
        dkwargs = cls.get(key)
        return function_description(**dkwargs, **kwargs)

    @classmethod
    def get(cls, key):
        """Return the description entry for ``key``, loading the file on first use."""
        if cls._descriptions is None:
            cls._load()
        return cls._descriptions[key]

    @classmethod
    def context(cls, key, **kwargs):
        """Return a ``ContextScope`` built from the entry for ``key``."""
        dkwargs = cls.get(key)
        return ContextScope(**dkwargs, **kwargs)

    @classmethod
    def print(cls, key, **kwargs):
        """Print the scope described by ``key`` without running anything in it."""
        dkwargs = cls.get(key)
        with ContextScope(**dkwargs, **kwargs):
            pass


class Timer:
    """Wall-clock stopwatch, usable directly or as a context manager."""

    def __init__(self):
        self.start = None
        self.end = None

    def start_timer(self):
        """Start (or restart) the timer, discarding any previous stop time."""
        self.start = time.time()
        self.end = None

    def __enter__(self):
        self.start_timer()
        return self

    def __exit__(self, *args):
        self.stop_timer()

    def stop_timer(self):
        """Record the stop time."""
        self.end = time.time()

    def elapsed(self):
        """Return the measured seconds, or the time so far while still running.

        Raises:
            RuntimeError: If the timer was never started.
        """
        if self.start is None:
            raise RuntimeError("Timer has not been started.")
        end = self.end if self.end is not None else time.time()
        return end - self.start
genoml/discrete/supervised/tuning.py 31 | genoml/misc/descriptions.json 32 | genoml/preprocessing/__init__.py 33 | genoml/preprocessing/adjuster.py 34 | genoml/preprocessing/featureselection.py 35 | genoml/preprocessing/harmonizing.py 36 | genoml/preprocessing/munging.py 37 | genoml/preprocessing/vif.py 38 | genoml2.egg-info/PKG-INFO 39 | genoml2.egg-info/SOURCES.txt 40 | genoml2.egg-info/dependency_links.txt 41 | genoml2.egg-info/entry_points.txt 42 | genoml2.egg-info/requires.txt 43 | genoml2.egg-info/top_level.txt -------------------------------------------------------------------------------- /genoml2.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /genoml2.egg-info/entry_points.txt: -------------------------------------------------------------------------------- 1 | [console_scripts] 2 | genoml = genoml.__main__:handle_main 3 | -------------------------------------------------------------------------------- /genoml2.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | joblib 2 | matplotlib 3 | numpy 4 | tables 5 | pandas 6 | pandas_plink 7 | requests 8 | scikit-learn 9 | scipy 10 | seaborn 11 | statsmodels 12 | xgboost==2.0.3 13 | umap-learn 14 | -------------------------------------------------------------------------------- /genoml2.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | genoml 2 | -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/logo.png -------------------------------------------------------------------------------- /outputs/test_discrete_geno.best_algorithm.txt: 
-------------------------------------------------------------------------------- 1 | SGDClassifier -------------------------------------------------------------------------------- /outputs/test_discrete_geno.dataForML.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/outputs/test_discrete_geno.dataForML.h5 -------------------------------------------------------------------------------- /outputs/test_discrete_geno.list_features.txt: -------------------------------------------------------------------------------- 1 | ID 2 | PHENO 3 | snp410 4 | snp403 5 | snp164 6 | snp363 7 | snp439 8 | snp370 9 | snp389 10 | snp475 11 | snp399 12 | snp87 13 | snp308 14 | snp223 15 | snp131 16 | snp94 17 | snp79 18 | snp420 19 | snp344 20 | snp281 21 | snp139 22 | snp379 23 | snp77 24 | snp431 25 | snp445 26 | snp360 27 | snp137 28 | snp27 29 | snp432 30 | snp28 31 | snp29 32 | snp208 33 | snp319 34 | snp30 35 | snp85 36 | snp433 37 | snp380 38 | snp31 39 | snp499 40 | snp443 41 | snp488 42 | snp32 43 | snp33 44 | snp113 45 | snp242 46 | snp369 47 | snp34 48 | snp278 49 | snp35 50 | snp160 51 | snp429 52 | snp159 53 | snp396 54 | snp374 55 | snp437 56 | snp486 57 | snp376 58 | snp268 59 | snp255 60 | snp423 61 | snp250 62 | snp411 63 | snp194 64 | snp267 65 | snp424 66 | snp408 67 | snp260 68 | snp130 69 | snp110 70 | snp372 71 | snp409 72 | snp138 73 | snp37 74 | snp343 75 | snp248 76 | snp352 77 | snp38 78 | snp262 79 | snp441 80 | snp39 81 | snp307 82 | snp393 83 | snp133 84 | snp84 85 | snp158 86 | snp292 87 | snp413 88 | snp336 89 | snp204 90 | snp465 91 | snp213 92 | snp350 93 | snp43 94 | snp44 95 | snp150 96 | snp434 97 | snp483 98 | snp143 99 | snp270 100 | snp98 101 | snp341 102 | snp243 103 | snp495 104 | snp145 105 | snp45 106 | snp155 107 | snp490 108 | snp192 109 | snp383 110 | snp163 111 | snp303 112 | snp148 113 | snp115 114 | snp325 115 | 
snp455 116 | snp418 117 | snp195 118 | snp210 119 | snp199 120 | snp109 121 | snp263 122 | snp442 123 | snp498 124 | snp168 125 | snp232 126 | snp238 127 | snp482 128 | snp41 129 | snp419 130 | snp42 131 | snp428 132 | snp272 133 | snp114 134 | snp293 135 | snp167 136 | snp55 137 | snp56 138 | snp406 139 | snp47 140 | snp201 141 | snp313 142 | snp449 143 | snp49 144 | snp290 145 | snp458 146 | snp473 147 | snp385 148 | snp276 149 | snp333 150 | snp462 151 | snp479 152 | snp50 153 | snp51 154 | snp52 155 | snp183 156 | snp318 157 | snp166 158 | snp241 159 | snp315 160 | snp53 161 | snp91 162 | snp284 163 | snp280 164 | snp354 165 | snp46 166 | snp464 167 | snp83 168 | snp351 169 | snp353 170 | snp497 171 | snp48 172 | snp100 173 | snp88 174 | snp282 175 | snp178 176 | snp461 177 | snp329 178 | snp259 179 | snp211 180 | snp422 181 | snp346 182 | snp59 183 | snp170 184 | snp73 185 | snp417 186 | snp436 187 | snp57 188 | snp86 189 | snp312 190 | snp99 191 | snp112 192 | snp217 193 | snp427 194 | snp338 195 | snp58 196 | snp275 197 | snp182 198 | snp357 199 | snp89 200 | snp181 201 | snp203 202 | snp302 203 | snp324 204 | snp310 205 | snp298 206 | snp185 207 | snp61 208 | snp392 209 | snp452 210 | snp189 211 | snp62 212 | snp334 213 | snp162 214 | snp348 215 | snp96 216 | snp247 217 | snp253 218 | snp228 219 | snp416 220 | snp172 221 | snp468 222 | snp121 223 | snp400 224 | snp258 225 | snp492 226 | snp477 227 | snp337 228 | snp144 229 | snp63 230 | snp80 231 | snp161 232 | snp330 233 | snp316 234 | snp226 235 | snp246 236 | snp60 237 | snp124 238 | snp218 239 | snp92 240 | snp200 241 | snp322 242 | snp126 243 | snp496 244 | snp361 245 | snp234 246 | snp300 247 | snp64 248 | snp256 249 | snp493 250 | snp141 251 | snp151 252 | snp489 253 | snp212 254 | snp471 255 | snp180 256 | snp412 257 | snp135 258 | snp463 259 | snp65 260 | snp440 261 | snp66 262 | snp111 263 | snp67 264 | snp187 265 | snp221 266 | snp149 267 | snp386 268 | snp240 269 | snp474 270 | snp332 271 | 
snp407 272 | snp101 273 | snp500 274 | snp75 275 | snp68 276 | snp69 277 | snp106 278 | snp273 279 | snp481 280 | snp70 281 | snp230 282 | snp387 283 | snp390 284 | snp207 285 | snp485 286 | snp430 287 | snp342 288 | snp116 289 | snp494 290 | snp3 291 | snp82 292 | snp469 293 | snp398 294 | snp265 295 | snp266 296 | snp456 297 | snp478 298 | snp251 299 | snp402 300 | snp328 301 | snp184 302 | snp323 303 | snp205 304 | snp447 305 | snp125 306 | snp157 307 | snp146 308 | snp305 309 | snp1 310 | snp373 311 | snp156 312 | snp295 313 | snp2 314 | snp103 315 | snp397 316 | snp71 317 | snp404 318 | snp384 319 | snp4 320 | snp206 321 | snp169 322 | snp134 323 | snp236 324 | snp136 325 | snp237 326 | snp467 327 | snp448 328 | snp271 329 | snp286 330 | snp320 331 | snp6 332 | snp426 333 | snp277 334 | snp105 335 | snp127 336 | snp231 337 | snp142 338 | snp484 339 | snp74 340 | snp365 341 | snp152 342 | snp5 343 | snp239 344 | snp288 345 | snp299 346 | snp401 347 | snp291 348 | snp176 349 | snp487 350 | snp321 351 | snp171 352 | snp301 353 | snp9 354 | snp10 355 | snp173 356 | snp11 357 | snp220 358 | snp274 359 | snp261 360 | snp296 361 | snp287 362 | snp314 363 | snp193 364 | snp108 365 | snp7 366 | snp8 367 | snp326 368 | snp375 369 | snp12 370 | snp371 371 | snp491 372 | snp13 373 | snp229 374 | snp175 375 | snp215 376 | snp191 377 | snp249 378 | snp425 379 | snp283 380 | snp222 381 | snp209 382 | snp14 383 | snp198 384 | snp233 385 | snp444 386 | snp335 387 | snp476 388 | snp219 389 | snp81 390 | snp15 391 | snp76 392 | snp147 393 | snp381 394 | snp190 395 | snp16 396 | snp225 397 | snp245 398 | snp264 399 | snp129 400 | snp285 401 | snp17 402 | snp118 403 | snp18 404 | snp19 405 | snp20 406 | snp93 407 | snp358 408 | snp254 409 | snp188 410 | snp438 411 | snp317 412 | snp154 413 | snp480 414 | snp309 415 | snp347 416 | snp421 417 | snp72 418 | snp23 419 | snp227 420 | snp235 421 | snp78 422 | snp470 423 | snp128 424 | snp331 425 | snp327 426 | snp104 427 | snp457 428 | 
snp359 429 | snp140 430 | snp340 431 | snp21 432 | snp22 433 | snp177 434 | snp388 435 | snp378 436 | snp202 437 | snp24 438 | snp117 439 | snp102 440 | snp214 441 | snp252 442 | snp25 443 | snp216 444 | snp26 445 | snp289 446 | snp132 447 | snp446 448 | snp311 449 | snp355 450 | snp414 451 | snp364 452 | snp356 453 | snp394 454 | snp454 455 | snp119 456 | snp257 457 | snp122 458 | snp349 459 | snp186 460 | snp345 461 | snp415 462 | snp466 463 | snp36 464 | snp120 465 | snp459 466 | snp297 467 | snp269 468 | snp197 469 | snp460 470 | snp123 471 | snp294 472 | snp366 473 | snp472 474 | snp395 475 | snp244 476 | snp339 477 | snp451 478 | snp196 479 | snp174 480 | snp90 481 | snp95 482 | snp377 483 | snp97 484 | snp382 485 | snp450 486 | snp107 487 | snp224 488 | snp368 489 | snp304 490 | snp306 491 | -------------------------------------------------------------------------------- /outputs/test_discrete_geno.trainedModel.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/outputs/test_discrete_geno.trainedModel.joblib -------------------------------------------------------------------------------- /outputs/test_discrete_geno.trainedModel_withheldSample_Predictions.csv: -------------------------------------------------------------------------------- 1 | ID,CASE_REPORTED,CASE_PROBABILITY,CASE_PREDICTED 2 | sample362,1,1.0,1 3 | sample74,0,1.0,1 4 | sample375,0,0.0,0 5 | sample156,1,1.0,1 6 | sample105,1,1.0,1 7 | sample395,1,0.0,0 8 | sample378,1,1.0,1 9 | sample125,0,1.0,1 10 | sample69,1,0.0,0 11 | sample451,1,1.0,1 12 | sample10,1,0.0,0 13 | sample195,1,1.0,1 14 | sample407,0,0.0,0 15 | sample85,1,0.0,0 16 | sample372,1,0.0,0 17 | sample389,0,0.0,0 18 | sample496,1,1.0,1 19 | sample31,1,1.0,1 20 | sample317,1,1.0,1 21 | sample409,0,1.0,1 22 | sample491,1,1.0,1 23 | sample492,1,1.0,1 24 | sample281,0,0.0,0 25 | sample357,1,1.0,1 26 | 
sample77,1,0.0,0 27 | sample462,1,0.0,0 28 | sample498,1,1.0,1 29 | sample212,0,1.0,1 30 | sample102,1,1.0,1 31 | sample335,0,1.0,1 32 | sample476,1,0.0,0 33 | sample337,1,0.0,0 34 | sample441,1,0.0,0 35 | sample174,1,0.0,0 36 | sample3,1,1.0,1 37 | sample334,0,1.0,1 38 | sample410,1,1.0,1 39 | sample71,1,0.0,0 40 | sample210,1,1.0,1 41 | sample64,1,0.0,0 42 | sample385,1,1.0,1 43 | sample94,0,0.0,0 44 | sample486,1,1.0,1 45 | sample186,1,1.0,1 46 | sample34,0,0.0,0 47 | sample78,1,1.0,1 48 | sample1,1,1.0,1 49 | sample12,1,1.0,1 50 | sample416,0,0.0,0 51 | sample23,1,1.0,1 52 | sample73,0,1.0,1 53 | sample183,1,1.0,1 54 | sample132,1,1.0,1 55 | sample411,1,1.0,1 56 | sample194,1,1.0,1 57 | sample56,1,1.0,1 58 | sample149,1,0.0,0 59 | sample19,0,1.0,1 60 | sample205,1,0.0,0 61 | sample79,1,0.0,0 62 | sample495,1,0.0,0 63 | sample263,1,1.0,1 64 | sample324,0,1.0,1 65 | sample484,1,1.0,1 66 | sample80,0,1.0,1 67 | sample40,1,1.0,1 68 | sample452,0,1.0,1 69 | sample47,0,0.0,0 70 | sample239,1,1.0,1 71 | sample392,0,1.0,1 72 | sample353,1,1.0,1 73 | sample342,1,1.0,1 74 | sample278,1,1.0,1 75 | sample291,1,0.0,0 76 | sample318,1,0.0,0 77 | sample305,1,1.0,1 78 | sample269,1,0.0,0 79 | sample70,1,0.0,0 80 | sample456,1,1.0,1 81 | sample466,1,1.0,1 82 | sample155,1,1.0,1 83 | sample83,1,0.0,0 84 | sample478,1,0.0,0 85 | sample173,1,1.0,1 86 | sample322,1,1.0,1 87 | sample91,1,1.0,1 88 | sample181,1,1.0,1 89 | sample415,1,0.0,0 90 | sample313,1,1.0,1 91 | sample279,1,1.0,1 92 | sample382,1,1.0,1 93 | sample473,1,0.0,0 94 | sample363,1,0.0,0 95 | sample325,1,1.0,1 96 | sample432,1,0.0,0 97 | sample348,0,1.0,1 98 | sample87,1,1.0,1 99 | sample76,1,1.0,1 100 | sample439,1,1.0,1 101 | sample16,1,1.0,1 102 | sample250,0,1.0,1 103 | sample434,1,1.0,1 104 | sample20,0,0.0,0 105 | sample323,1,1.0,1 106 | sample333,1,0.0,0 107 | sample57,0,1.0,1 108 | sample302,1,0.0,0 109 | sample230,1,1.0,1 110 | sample332,1,1.0,1 111 | sample133,1,0.0,0 112 | sample138,0,0.0,0 113 | 
sample424,1,1.0,1 114 | sample336,1,1.0,1 115 | sample26,0,1.0,1 116 | sample465,0,1.0,1 117 | sample282,1,0.0,0 118 | sample248,0,0.0,0 119 | sample238,0,1.0,1 120 | sample118,1,1.0,1 121 | sample43,1,0.0,0 122 | sample221,1,0.0,0 123 | sample177,0,1.0,1 124 | sample321,0,1.0,1 125 | sample154,0,1.0,1 126 | sample232,1,1.0,1 127 | sample228,1,1.0,1 128 | sample418,1,0.0,0 129 | sample204,1,0.0,0 130 | sample127,0,1.0,1 131 | sample330,1,0.0,0 132 | sample32,1,1.0,1 133 | sample114,0,1.0,1 134 | sample471,1,1.0,1 135 | sample272,1,1.0,1 136 | sample141,0,1.0,1 137 | sample58,1,1.0,1 138 | sample193,1,1.0,1 139 | sample25,1,1.0,1 140 | sample18,1,1.0,1 141 | sample266,1,0.0,0 142 | sample67,1,1.0,1 143 | sample209,1,1.0,1 144 | sample480,1,1.0,1 145 | sample95,1,0.0,0 146 | sample254,1,0.0,0 147 | sample267,0,1.0,1 148 | sample24,1,1.0,1 149 | sample223,0,1.0,1 150 | sample262,1,1.0,1 151 | sample427,1,1.0,1 152 | -------------------------------------------------------------------------------- /outputs/test_discrete_geno.trainedModel_withheldSample_ROC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/outputs/test_discrete_geno.trainedModel_withheldSample_ROC.png -------------------------------------------------------------------------------- /outputs/test_discrete_geno.trainedModel_withheldSample_probabilities.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/outputs/test_discrete_geno.trainedModel_withheldSample_probabilities.png -------------------------------------------------------------------------------- /outputs/test_discrete_geno.training_withheldSamples_performanceMetrics.csv: -------------------------------------------------------------------------------- 1 | 
Algorithm,AUC_Percent,Accuracy_Percent,Balanced_Accuracy_Percent,Log_Loss,Sensitivity,Specificity,PPV,NPV,Runtime_Seconds 2 | LogisticRegression,49.701028462090406,59.333333333333336,50.2870126763932,1.3083665335134453,0.6814159292035398,0.32432432432432434,0.7549019607843137,0.25,0.17054200172424316 3 | RandomForestClassifier,50.334848122458745,74.0,49.11504424778761,0.5929014943106782,0.9823008849557522,0.0,0.75,0.0,0.189835786819458 4 | AdaBoostClassifier,35.20688830423344,54.666666666666664,42.64530016742406,0.6912231659615806,0.6637168141592921,0.1891891891891892,0.7142857142857143,0.15555555555555556,0.17150497436523438 5 | GradientBoostingClassifier,50.92083233676155,62.66666666666667,49.772781631188714,0.6887642407734422,0.7522123893805309,0.24324324324324326,0.7522123893805309,0.24324324324324326,0.4369809627532959 6 | SGDClassifier,54.3171490074145,62.66666666666667,54.317149007414486,12.894476520766654,0.7079646017699115,0.3783783783783784,0.7766990291262136,0.2978723404255319,0.026642799377441406 7 | SVC,47.40492705094475,75.33333333333333,50.0,0.5824498890799489,1.0,0.0,0.7533333333333333,,0.40736913681030273 8 | MLPClassifier,49.17483855536953,66.0,54.71179143745515,1.2453128289862851,0.7699115044247787,0.32432432432432434,0.7767857142857143,0.3157894736842105,0.4303250312805176 9 | KNeighborsClassifier,53.34848122458742,62.66666666666667,52.49940205692418,1.9597754698972316,0.7256637168141593,0.32432432432432434,0.7663551401869159,0.27906976744186046,0.12562084197998047 10 | LinearDiscriminantAnalysis,44.59459459459459,53.333333333333336,49.94020569241808,6.016274091996749,0.5663716814159292,0.43243243243243246,0.7529411764705882,0.24615384615384617,0.10006976127624512 11 | QuadraticDiscriminantAnalysis,51.80578808897393,50.66666666666667,51.805788088973934,17.039129688155935,0.49557522123893805,0.5405405405405406,0.7671232876712328,0.2597402597402597,0.06195402145385742 12 | 
BaggingClassifier,47.27337957426453,62.0,50.23917723032767,0.8853380657190238,0.7345132743362832,0.2702702702702703,0.7545454545454545,0.25,0.13388800621032715 13 | XGBClassifier,47.38100932791198,69.33333333333334,56.015307342740975,0.7044975252573689,0.8230088495575221,0.2972972972972973,0.7815126050420168,0.3548387096774194,0.8208780288696289 14 | -------------------------------------------------------------------------------- /outputs/test_discrete_geno.tunedModel.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/outputs/test_discrete_geno.tunedModel.joblib -------------------------------------------------------------------------------- /outputs/test_discrete_geno.tunedModel_CV_Summary.csv: -------------------------------------------------------------------------------- 1 | Mean_CV_Score_Baseline,Standard_Dev_CV_Score_Baseline,Min_CV_Score_Baseline,Max_CV_Score_Baseline,Mean_CV_Score_BestTuned,Standard_Dev_CV_Score_BestTuned,Min_CV_Score_BestTuned,Max_CV_Score_BestTuned 2 | 0.8752857471880757,0.012154420363495264,0.8515624999999999,0.886029411764706,0.8770257112311965,0.005740413855556065,0.8662683823529411,0.8828125 3 | -------------------------------------------------------------------------------- /outputs/test_discrete_geno.tunedModel_allSample_probabilities.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/outputs/test_discrete_geno.tunedModel_allSample_probabilities.png -------------------------------------------------------------------------------- /outputs/test_discrete_geno.tunedModel_top10Iterations_Summary.csv: -------------------------------------------------------------------------------- 1 | Model_Rank,Mean_Validation_Score,Mean_Standard_Deviation,Parameters 2 | 
1,0.8777594154772709,0.00614681385285427,{'n_estimators': 113} 3 | 2,0.8774203861067569,0.01466490966393507,{'n_estimators': 456} 4 | 3,0.8767881328548249,0.01328590033923356,{'n_estimators': 428} 5 | 4,0.8766358059703545,0.01336681019692871,{'n_estimators': 96} 6 | 5,0.8764338493111129,0.016218470814769632,{'n_estimators': 23} 7 | 6,0.8763807390066827,0.005794217414978005,{'n_estimators': 114} 8 | 7,0.876344472747023,0.01900285729384202,{'n_estimators': 790} 9 | 8,0.8762983664714131,0.007999675002041077,{'n_estimators': 201} 10 | 9,0.876111363198856,0.015236886652742407,{'n_estimators': 218} 11 | 10,0.8759275396694441,0.00940246779427247,{'n_estimators': 209} 12 | -------------------------------------------------------------------------------- /outputs/test_discrete_geno.umap_clustering.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/outputs/test_discrete_geno.umap_clustering.joblib -------------------------------------------------------------------------------- /outputs/test_discrete_geno.umap_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/outputs/test_discrete_geno.umap_plot.png -------------------------------------------------------------------------------- /outputs/validation_test_discrete_geno.dataForML.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/outputs/validation_test_discrete_geno.dataForML.h5 -------------------------------------------------------------------------------- /outputs/validation_test_discrete_geno.finalHarmonizedCols_toKeep.txt: -------------------------------------------------------------------------------- 1 | AGE 2 | FAMILY_HISTORY 3 | ID 4 | PHENO 5 
| SEX_COV 6 | UPSIT 7 | snp1 8 | snp10 9 | snp100 10 | snp101 11 | snp102 12 | snp103 13 | snp104 14 | snp105 15 | snp106 16 | snp107 17 | snp108 18 | snp109 19 | snp11 20 | snp110 21 | snp111 22 | snp112 23 | snp113 24 | snp114 25 | snp115 26 | snp116 27 | snp117 28 | snp118 29 | snp119 30 | snp12 31 | snp120 32 | snp121 33 | snp122 34 | snp123 35 | snp124 36 | snp125 37 | snp126 38 | snp127 39 | snp128 40 | snp13 41 | snp130 42 | snp131 43 | snp132 44 | snp133 45 | snp134 46 | snp135 47 | snp136 48 | snp137 49 | snp138 50 | snp139 51 | snp14 52 | snp140 53 | snp141 54 | snp142 55 | snp143 56 | snp144 57 | snp145 58 | snp146 59 | snp147 60 | snp148 61 | snp149 62 | snp15 63 | snp150 64 | snp151 65 | snp152 66 | snp154 67 | snp155 68 | snp156 69 | snp157 70 | snp158 71 | snp159 72 | snp16 73 | snp160 74 | snp161 75 | snp162 76 | snp163 77 | snp164 78 | snp166 79 | snp167 80 | snp168 81 | snp169 82 | snp17 83 | snp170 84 | snp171 85 | snp172 86 | snp173 87 | snp174 88 | snp175 89 | snp176 90 | snp177 91 | snp178 92 | snp18 93 | snp180 94 | snp181 95 | snp182 96 | snp183 97 | snp184 98 | snp185 99 | snp186 100 | snp187 101 | snp188 102 | snp189 103 | snp19 104 | snp190 105 | snp191 106 | snp192 107 | snp193 108 | snp194 109 | snp195 110 | snp196 111 | snp197 112 | snp198 113 | snp199 114 | snp2 115 | snp20 116 | snp200 117 | snp201 118 | snp202 119 | snp203 120 | snp204 121 | snp205 122 | snp206 123 | snp207 124 | snp208 125 | snp209 126 | snp21 127 | snp210 128 | snp211 129 | snp212 130 | snp213 131 | snp214 132 | snp215 133 | snp216 134 | snp217 135 | snp218 136 | snp219 137 | snp22 138 | snp220 139 | snp221 140 | snp222 141 | snp223 142 | snp224 143 | snp225 144 | snp226 145 | snp227 146 | snp228 147 | snp229 148 | snp23 149 | snp230 150 | snp231 151 | snp232 152 | snp233 153 | snp234 154 | snp235 155 | snp236 156 | snp237 157 | snp238 158 | snp239 159 | snp24 160 | snp240 161 | snp241 162 | snp242 163 | snp243 164 | snp244 165 | snp245 166 | snp246 167 | snp247 
168 | snp248 169 | snp249 170 | snp25 171 | snp250 172 | snp251 173 | snp252 174 | snp253 175 | snp254 176 | snp255 177 | snp256 178 | snp257 179 | snp258 180 | snp259 181 | snp26 182 | snp260 183 | snp261 184 | snp262 185 | snp263 186 | snp264 187 | snp265 188 | snp266 189 | snp267 190 | snp268 191 | snp269 192 | snp27 193 | snp270 194 | snp271 195 | snp272 196 | snp273 197 | snp274 198 | snp275 199 | snp276 200 | snp277 201 | snp278 202 | snp28 203 | snp280 204 | snp281 205 | snp282 206 | snp283 207 | snp284 208 | snp285 209 | snp286 210 | snp287 211 | snp288 212 | snp289 213 | snp29 214 | snp290 215 | snp291 216 | snp292 217 | snp293 218 | snp294 219 | snp295 220 | snp296 221 | snp297 222 | snp298 223 | snp299 224 | snp3 225 | snp30 226 | snp300 227 | snp301 228 | snp302 229 | snp303 230 | snp304 231 | snp305 232 | snp306 233 | snp307 234 | snp308 235 | snp309 236 | snp31 237 | snp310 238 | snp311 239 | snp312 240 | snp313 241 | snp314 242 | snp315 243 | snp316 244 | snp317 245 | snp318 246 | snp319 247 | snp32 248 | snp320 249 | snp321 250 | snp322 251 | snp323 252 | snp324 253 | snp325 254 | snp326 255 | snp327 256 | snp328 257 | snp329 258 | snp33 259 | snp330 260 | snp331 261 | snp332 262 | snp333 263 | snp334 264 | snp335 265 | snp336 266 | snp337 267 | snp338 268 | snp339 269 | snp34 270 | snp340 271 | snp341 272 | snp342 273 | snp343 274 | snp344 275 | snp345 276 | snp346 277 | snp347 278 | snp348 279 | snp349 280 | snp35 281 | snp350 282 | snp351 283 | snp352 284 | snp353 285 | snp354 286 | snp355 287 | snp356 288 | snp357 289 | snp358 290 | snp359 291 | snp36 292 | snp360 293 | snp361 294 | snp363 295 | snp364 296 | snp365 297 | snp366 298 | snp368 299 | snp369 300 | snp37 301 | snp370 302 | snp371 303 | snp372 304 | snp373 305 | snp375 306 | snp376 307 | snp377 308 | snp378 309 | snp379 310 | snp38 311 | snp380 312 | snp381 313 | snp382 314 | snp383 315 | snp384 316 | snp385 317 | snp386 318 | snp387 319 | snp388 320 | snp389 321 | snp39 322 | snp390 
323 | snp392 324 | snp393 325 | snp394 326 | snp395 327 | snp396 328 | snp397 329 | snp398 330 | snp399 331 | snp4 332 | snp400 333 | snp401 334 | snp402 335 | snp403 336 | snp404 337 | snp406 338 | snp407 339 | snp408 340 | snp409 341 | snp41 342 | snp410 343 | snp411 344 | snp412 345 | snp413 346 | snp414 347 | snp415 348 | snp416 349 | snp417 350 | snp418 351 | snp419 352 | snp42 353 | snp420 354 | snp421 355 | snp422 356 | snp423 357 | snp424 358 | snp425 359 | snp426 360 | snp427 361 | snp428 362 | snp429 363 | snp43 364 | snp430 365 | snp431 366 | snp432 367 | snp433 368 | snp434 369 | snp436 370 | snp437 371 | snp438 372 | snp439 373 | snp44 374 | snp440 375 | snp441 376 | snp442 377 | snp443 378 | snp444 379 | snp445 380 | snp446 381 | snp447 382 | snp448 383 | snp449 384 | snp45 385 | snp450 386 | snp451 387 | snp452 388 | snp454 389 | snp455 390 | snp456 391 | snp457 392 | snp458 393 | snp459 394 | snp46 395 | snp460 396 | snp461 397 | snp462 398 | snp463 399 | snp464 400 | snp465 401 | snp466 402 | snp467 403 | snp468 404 | snp469 405 | snp47 406 | snp470 407 | snp471 408 | snp472 409 | snp473 410 | snp474 411 | snp475 412 | snp476 413 | snp477 414 | snp478 415 | snp479 416 | snp48 417 | snp480 418 | snp481 419 | snp482 420 | snp483 421 | snp484 422 | snp485 423 | snp486 424 | snp487 425 | snp488 426 | snp489 427 | snp49 428 | snp490 429 | snp491 430 | snp492 431 | snp493 432 | snp494 433 | snp495 434 | snp496 435 | snp497 436 | snp498 437 | snp499 438 | snp5 439 | snp50 440 | snp500 441 | snp51 442 | snp52 443 | snp53 444 | snp55 445 | snp56 446 | snp57 447 | snp58 448 | snp59 449 | snp6 450 | snp60 451 | snp61 452 | snp62 453 | snp63 454 | snp64 455 | snp65 456 | snp66 457 | snp67 458 | snp68 459 | snp69 460 | snp7 461 | snp70 462 | snp71 463 | snp72 464 | snp73 465 | snp74 466 | snp75 467 | snp76 468 | snp77 469 | snp78 470 | snp79 471 | snp8 472 | snp80 473 | snp81 474 | snp82 475 | snp83 476 | snp84 477 | snp85 478 | snp86 479 | snp87 480 | snp88 
481 | snp89 482 | snp9 483 | snp90 484 | snp91 485 | snp92 486 | snp93 487 | snp94 488 | snp95 489 | snp96 490 | snp97 491 | snp98 492 | snp99 493 | -------------------------------------------------------------------------------- /outputs/validation_test_discrete_geno.list_features.txt: -------------------------------------------------------------------------------- 1 | AGE 2 | FAMILY_HISTORY 3 | ID 4 | PHENO 5 | SEX_COV 6 | UPSIT 7 | snp1 8 | snp10 9 | snp100 10 | snp101 11 | snp102 12 | snp103 13 | snp104 14 | snp105 15 | snp106 16 | snp107 17 | snp108 18 | snp109 19 | snp11 20 | snp110 21 | snp111 22 | snp112 23 | snp113 24 | snp114 25 | snp115 26 | snp116 27 | snp117 28 | snp118 29 | snp119 30 | snp12 31 | snp120 32 | snp121 33 | snp122 34 | snp123 35 | snp124 36 | snp125 37 | snp126 38 | snp127 39 | snp128 40 | snp13 41 | snp130 42 | snp131 43 | snp132 44 | snp133 45 | snp134 46 | snp135 47 | snp136 48 | snp137 49 | snp138 50 | snp139 51 | snp14 52 | snp140 53 | snp141 54 | snp142 55 | snp143 56 | snp144 57 | snp145 58 | snp146 59 | snp147 60 | snp148 61 | snp149 62 | snp15 63 | snp150 64 | snp151 65 | snp152 66 | snp154 67 | snp155 68 | snp156 69 | snp157 70 | snp158 71 | snp159 72 | snp16 73 | snp160 74 | snp161 75 | snp162 76 | snp163 77 | snp164 78 | snp166 79 | snp167 80 | snp168 81 | snp169 82 | snp17 83 | snp170 84 | snp171 85 | snp172 86 | snp173 87 | snp174 88 | snp175 89 | snp176 90 | snp177 91 | snp178 92 | snp18 93 | snp180 94 | snp181 95 | snp182 96 | snp183 97 | snp184 98 | snp185 99 | snp186 100 | snp187 101 | snp188 102 | snp189 103 | snp19 104 | snp190 105 | snp191 106 | snp192 107 | snp193 108 | snp194 109 | snp195 110 | snp196 111 | snp197 112 | snp198 113 | snp199 114 | snp2 115 | snp20 116 | snp200 117 | snp201 118 | snp202 119 | snp203 120 | snp204 121 | snp205 122 | snp206 123 | snp207 124 | snp208 125 | snp209 126 | snp21 127 | snp210 128 | snp211 129 | snp212 130 | snp213 131 | snp214 132 | snp215 133 | snp216 134 | snp217 135 | snp218 
136 | snp219 137 | snp22 138 | snp220 139 | snp221 140 | snp222 141 | snp223 142 | snp224 143 | snp225 144 | snp226 145 | snp227 146 | snp228 147 | snp229 148 | snp23 149 | snp230 150 | snp231 151 | snp232 152 | snp233 153 | snp234 154 | snp235 155 | snp236 156 | snp237 157 | snp238 158 | snp239 159 | snp24 160 | snp240 161 | snp241 162 | snp242 163 | snp243 164 | snp244 165 | snp245 166 | snp246 167 | snp247 168 | snp248 169 | snp249 170 | snp25 171 | snp250 172 | snp251 173 | snp252 174 | snp253 175 | snp254 176 | snp255 177 | snp256 178 | snp257 179 | snp258 180 | snp259 181 | snp26 182 | snp260 183 | snp261 184 | snp262 185 | snp263 186 | snp264 187 | snp265 188 | snp266 189 | snp267 190 | snp268 191 | snp269 192 | snp27 193 | snp270 194 | snp271 195 | snp272 196 | snp273 197 | snp274 198 | snp275 199 | snp276 200 | snp277 201 | snp278 202 | snp28 203 | snp280 204 | snp281 205 | snp282 206 | snp283 207 | snp284 208 | snp285 209 | snp286 210 | snp287 211 | snp288 212 | snp289 213 | snp29 214 | snp290 215 | snp291 216 | snp292 217 | snp293 218 | snp294 219 | snp295 220 | snp296 221 | snp297 222 | snp298 223 | snp299 224 | snp3 225 | snp30 226 | snp300 227 | snp301 228 | snp302 229 | snp303 230 | snp304 231 | snp305 232 | snp306 233 | snp307 234 | snp308 235 | snp309 236 | snp31 237 | snp310 238 | snp311 239 | snp312 240 | snp313 241 | snp314 242 | snp315 243 | snp316 244 | snp317 245 | snp318 246 | snp319 247 | snp32 248 | snp320 249 | snp321 250 | snp322 251 | snp323 252 | snp324 253 | snp325 254 | snp326 255 | snp327 256 | snp328 257 | snp329 258 | snp33 259 | snp330 260 | snp331 261 | snp332 262 | snp333 263 | snp334 264 | snp335 265 | snp336 266 | snp337 267 | snp338 268 | snp339 269 | snp34 270 | snp340 271 | snp341 272 | snp342 273 | snp343 274 | snp344 275 | snp345 276 | snp346 277 | snp347 278 | snp348 279 | snp349 280 | snp35 281 | snp350 282 | snp351 283 | snp352 284 | snp353 285 | snp354 286 | snp355 287 | snp356 288 | snp357 289 | snp358 290 | snp359 
291 | snp36 292 | snp360 293 | snp361 294 | snp363 295 | snp364 296 | snp365 297 | snp366 298 | snp368 299 | snp369 300 | snp37 301 | snp370 302 | snp371 303 | snp372 304 | snp373 305 | snp375 306 | snp376 307 | snp377 308 | snp378 309 | snp379 310 | snp38 311 | snp380 312 | snp381 313 | snp382 314 | snp383 315 | snp384 316 | snp385 317 | snp386 318 | snp387 319 | snp388 320 | snp389 321 | snp39 322 | snp390 323 | snp392 324 | snp393 325 | snp394 326 | snp395 327 | snp396 328 | snp397 329 | snp398 330 | snp399 331 | snp4 332 | snp400 333 | snp401 334 | snp402 335 | snp403 336 | snp404 337 | snp406 338 | snp407 339 | snp408 340 | snp409 341 | snp41 342 | snp410 343 | snp411 344 | snp412 345 | snp413 346 | snp414 347 | snp415 348 | snp416 349 | snp417 350 | snp418 351 | snp419 352 | snp42 353 | snp420 354 | snp421 355 | snp422 356 | snp423 357 | snp424 358 | snp425 359 | snp426 360 | snp427 361 | snp428 362 | snp429 363 | snp43 364 | snp430 365 | snp431 366 | snp432 367 | snp433 368 | snp434 369 | snp436 370 | snp437 371 | snp438 372 | snp439 373 | snp44 374 | snp440 375 | snp441 376 | snp442 377 | snp443 378 | snp444 379 | snp445 380 | snp446 381 | snp447 382 | snp448 383 | snp449 384 | snp45 385 | snp450 386 | snp451 387 | snp452 388 | snp454 389 | snp455 390 | snp456 391 | snp457 392 | snp458 393 | snp459 394 | snp46 395 | snp460 396 | snp461 397 | snp462 398 | snp463 399 | snp464 400 | snp465 401 | snp466 402 | snp467 403 | snp468 404 | snp469 405 | snp47 406 | snp470 407 | snp471 408 | snp472 409 | snp473 410 | snp474 411 | snp475 412 | snp476 413 | snp477 414 | snp478 415 | snp479 416 | snp48 417 | snp480 418 | snp481 419 | snp482 420 | snp483 421 | snp484 422 | snp485 423 | snp486 424 | snp487 425 | snp488 426 | snp489 427 | snp49 428 | snp490 429 | snp491 430 | snp492 431 | snp493 432 | snp494 433 | snp495 434 | snp496 435 | snp497 436 | snp498 437 | snp499 438 | snp5 439 | snp50 440 | snp500 441 | snp51 442 | snp52 443 | snp53 444 | snp55 445 | snp56 446 | 
snp57 447 | snp58 448 | snp59 449 | snp6 450 | snp60 451 | snp61 452 | snp62 453 | snp63 454 | snp64 455 | snp65 456 | snp66 457 | snp67 458 | snp68 459 | snp69 460 | snp7 461 | snp70 462 | snp71 463 | snp72 464 | snp73 465 | snp74 466 | snp75 467 | snp76 468 | snp77 469 | snp78 470 | snp79 471 | snp8 472 | snp80 473 | snp81 474 | snp82 475 | snp83 476 | snp84 477 | snp85 478 | snp86 479 | snp87 480 | snp88 481 | snp89 482 | snp9 483 | snp90 484 | snp91 485 | snp92 486 | snp93 487 | snp94 488 | snp95 489 | snp96 490 | snp97 491 | snp98 492 | snp99 493 | -------------------------------------------------------------------------------- /outputs/validation_test_discrete_geno.refColsHarmonize_toKeep.txt: -------------------------------------------------------------------------------- 1 | ID 2 | PHENO 3 | SEX_COV 4 | AGE 5 | UPSIT 6 | FAMILY_HISTORY 7 | snp410 8 | snp403 9 | snp164 10 | snp363 11 | snp439 12 | snp370 13 | snp389 14 | snp475 15 | snp399 16 | snp87 17 | snp308 18 | snp223 19 | snp131 20 | snp94 21 | snp79 22 | snp420 23 | snp344 24 | snp281 25 | snp139 26 | snp379 27 | snp77 28 | snp431 29 | snp445 30 | snp360 31 | snp137 32 | snp27 33 | snp432 34 | snp28 35 | snp29 36 | snp208 37 | snp319 38 | snp30 39 | snp85 40 | snp433 41 | snp380 42 | snp31 43 | snp499 44 | snp443 45 | snp488 46 | snp32 47 | snp33 48 | snp113 49 | snp242 50 | snp369 51 | snp34 52 | snp278 53 | snp35 54 | snp160 55 | snp429 56 | snp159 57 | snp396 58 | snp374 59 | snp437 60 | snp486 61 | snp376 62 | snp268 63 | snp255 64 | snp423 65 | snp250 66 | snp411 67 | snp194 68 | snp267 69 | snp424 70 | snp408 71 | snp260 72 | snp130 73 | snp110 74 | snp372 75 | snp409 76 | snp138 77 | snp37 78 | snp343 79 | snp248 80 | snp352 81 | snp38 82 | snp262 83 | snp441 84 | snp39 85 | snp307 86 | snp393 87 | snp133 88 | snp84 89 | snp158 90 | snp292 91 | snp413 92 | snp336 93 | snp204 94 | snp465 95 | snp213 96 | snp350 97 | snp43 98 | snp44 99 | snp150 100 | snp434 101 | snp483 102 | snp143 103 | 
snp270 104 | snp98 105 | snp341 106 | snp243 107 | snp495 108 | snp145 109 | snp45 110 | snp155 111 | snp490 112 | snp192 113 | snp383 114 | snp163 115 | snp303 116 | snp148 117 | snp115 118 | snp325 119 | snp455 120 | snp418 121 | snp195 122 | snp210 123 | snp199 124 | snp109 125 | snp263 126 | snp442 127 | snp498 128 | snp168 129 | snp232 130 | snp238 131 | snp482 132 | snp41 133 | snp419 134 | snp42 135 | snp428 136 | snp272 137 | snp114 138 | snp293 139 | snp167 140 | snp55 141 | snp56 142 | snp406 143 | snp47 144 | snp201 145 | snp313 146 | snp449 147 | snp49 148 | snp290 149 | snp458 150 | snp473 151 | snp385 152 | snp276 153 | snp333 154 | snp462 155 | snp479 156 | snp50 157 | snp51 158 | snp52 159 | snp183 160 | snp318 161 | snp166 162 | snp241 163 | snp315 164 | snp53 165 | snp91 166 | snp284 167 | snp280 168 | snp354 169 | snp46 170 | snp464 171 | snp83 172 | snp351 173 | snp353 174 | snp497 175 | snp48 176 | snp100 177 | snp88 178 | snp282 179 | snp178 180 | snp461 181 | snp329 182 | snp259 183 | snp211 184 | snp422 185 | snp346 186 | snp59 187 | snp170 188 | snp73 189 | snp417 190 | snp436 191 | snp57 192 | snp86 193 | snp312 194 | snp99 195 | snp112 196 | snp217 197 | snp427 198 | snp338 199 | snp58 200 | snp275 201 | snp182 202 | snp357 203 | snp89 204 | snp181 205 | snp203 206 | snp302 207 | snp324 208 | snp310 209 | snp298 210 | snp185 211 | snp61 212 | snp392 213 | snp452 214 | snp189 215 | snp62 216 | snp334 217 | snp162 218 | snp348 219 | snp96 220 | snp247 221 | snp253 222 | snp228 223 | snp416 224 | snp172 225 | snp468 226 | snp121 227 | snp400 228 | snp258 229 | snp492 230 | snp477 231 | snp337 232 | snp144 233 | snp63 234 | snp80 235 | snp161 236 | snp330 237 | snp316 238 | snp226 239 | snp246 240 | snp60 241 | snp124 242 | snp218 243 | snp92 244 | snp200 245 | snp322 246 | snp126 247 | snp496 248 | snp361 249 | snp234 250 | snp300 251 | snp64 252 | snp256 253 | snp493 254 | snp141 255 | snp151 256 | snp489 257 | snp212 258 | snp471 259 | 
snp180 260 | snp412 261 | snp135 262 | snp463 263 | snp65 264 | snp440 265 | snp66 266 | snp111 267 | snp67 268 | snp187 269 | snp221 270 | snp149 271 | snp386 272 | snp240 273 | snp474 274 | snp332 275 | snp407 276 | snp101 277 | snp500 278 | snp75 279 | snp68 280 | snp69 281 | snp106 282 | snp273 283 | snp481 284 | snp70 285 | snp230 286 | snp387 287 | snp390 288 | snp207 289 | snp485 290 | snp430 291 | snp342 292 | snp116 293 | snp494 294 | snp3 295 | snp82 296 | snp469 297 | snp398 298 | snp265 299 | snp266 300 | snp456 301 | snp478 302 | snp251 303 | snp402 304 | snp328 305 | snp184 306 | snp323 307 | snp205 308 | snp447 309 | snp125 310 | snp157 311 | snp146 312 | snp305 313 | snp1 314 | snp373 315 | snp156 316 | snp295 317 | snp2 318 | snp103 319 | snp397 320 | snp71 321 | snp404 322 | snp384 323 | snp4 324 | snp206 325 | snp169 326 | snp134 327 | snp236 328 | snp136 329 | snp237 330 | snp467 331 | snp448 332 | snp271 333 | snp286 334 | snp320 335 | snp6 336 | snp426 337 | snp277 338 | snp105 339 | snp127 340 | snp231 341 | snp142 342 | snp484 343 | snp74 344 | snp365 345 | snp152 346 | snp5 347 | snp239 348 | snp288 349 | snp299 350 | snp401 351 | snp291 352 | snp176 353 | snp487 354 | snp321 355 | snp171 356 | snp301 357 | snp9 358 | snp10 359 | snp173 360 | snp11 361 | snp220 362 | snp274 363 | snp261 364 | snp296 365 | snp287 366 | snp314 367 | snp193 368 | snp108 369 | snp7 370 | snp8 371 | snp326 372 | snp375 373 | snp12 374 | snp371 375 | snp491 376 | snp13 377 | snp229 378 | snp175 379 | snp215 380 | snp191 381 | snp249 382 | snp425 383 | snp283 384 | snp222 385 | snp209 386 | snp14 387 | snp198 388 | snp233 389 | snp444 390 | snp335 391 | snp476 392 | snp219 393 | snp81 394 | snp15 395 | snp76 396 | snp147 397 | snp381 398 | snp190 399 | snp16 400 | snp225 401 | snp245 402 | snp264 403 | snp129 404 | snp285 405 | snp17 406 | snp118 407 | snp18 408 | snp19 409 | snp20 410 | snp93 411 | snp358 412 | snp254 413 | snp188 414 | snp438 415 | snp317 416 | 
snp154 417 | snp480 418 | snp309 419 | snp347 420 | snp421 421 | snp72 422 | snp23 423 | snp227 424 | snp235 425 | snp78 426 | snp470 427 | snp128 428 | snp331 429 | snp327 430 | snp104 431 | snp457 432 | snp359 433 | snp140 434 | snp340 435 | snp21 436 | snp22 437 | snp177 438 | snp388 439 | snp378 440 | snp202 441 | snp24 442 | snp117 443 | snp102 444 | snp214 445 | snp252 446 | snp25 447 | snp216 448 | snp26 449 | snp289 450 | snp132 451 | snp446 452 | snp311 453 | snp355 454 | snp414 455 | snp364 456 | snp356 457 | snp394 458 | snp454 459 | snp119 460 | snp257 461 | snp122 462 | snp349 463 | snp186 464 | snp345 465 | snp415 466 | snp466 467 | snp36 468 | snp120 469 | snp459 470 | snp297 471 | snp269 472 | snp197 473 | snp460 474 | snp123 475 | snp294 476 | snp366 477 | snp472 478 | snp395 479 | snp244 480 | snp339 481 | snp451 482 | snp196 483 | snp174 484 | snp90 485 | snp95 486 | snp377 487 | snp97 488 | snp382 489 | snp450 490 | snp107 491 | snp224 492 | snp368 493 | snp304 494 | snp306 495 | -------------------------------------------------------------------------------- /outputs/validation_test_discrete_geno.refSNPs_andAlleles.bed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/outputs/validation_test_discrete_geno.refSNPs_andAlleles.bed -------------------------------------------------------------------------------- /outputs/validation_test_discrete_geno.refSNPs_andAlleles.fam: -------------------------------------------------------------------------------- 1 | valdiation34 valdiation34 0 0 2 2 2 | valdiation75 valdiation75 0 0 1 2 3 | valdiation65 valdiation65 0 0 2 2 4 | valdiation15 valdiation15 0 0 2 1 5 | valdiation5 valdiation5 0 0 1 2 6 | valdiation24 valdiation24 0 0 1 1 7 | valdiation14 valdiation14 0 0 2 2 8 | valdiation58 valdiation58 0 0 1 2 9 | valdiation10 valdiation10 0 0 1 2 10 | valdiation89 valdiation89 0 0 1 2 11 | 
valdiation20 valdiation20 0 0 2 1 12 | valdiation59 valdiation59 0 0 1 1 13 | valdiation19 valdiation19 0 0 2 2 14 | valdiation13 valdiation13 0 0 2 2 15 | valdiation21 valdiation21 0 0 2 2 16 | valdiation35 valdiation35 0 0 1 2 17 | valdiation1 valdiation1 0 0 2 1 18 | valdiation92 valdiation92 0 0 1 1 19 | valdiation74 valdiation74 0 0 2 2 20 | valdiation94 valdiation94 0 0 1 2 21 | valdiation2 valdiation2 0 0 2 2 22 | valdiation37 valdiation37 0 0 1 2 23 | valdiation44 valdiation44 0 0 2 1 24 | valdiation48 valdiation48 0 0 1 1 25 | valdiation49 valdiation49 0 0 1 2 26 | valdiation17 valdiation17 0 0 1 2 27 | valdiation18 valdiation18 0 0 1 2 28 | valdiation83 valdiation83 0 0 2 2 29 | valdiation68 valdiation68 0 0 1 2 30 | valdiation50 valdiation50 0 0 1 2 31 | valdiation22 valdiation22 0 0 1 2 32 | valdiation33 valdiation33 0 0 2 1 33 | valdiation43 valdiation43 0 0 2 1 34 | valdiation60 valdiation60 0 0 2 2 35 | valdiation70 valdiation70 0 0 1 2 36 | valdiation99 valdiation99 0 0 1 2 37 | valdiation36 valdiation36 0 0 2 1 38 | valdiation51 valdiation51 0 0 1 2 39 | valdiation76 valdiation76 0 0 2 2 40 | valdiation64 valdiation64 0 0 1 2 41 | valdiation69 valdiation69 0 0 2 1 42 | valdiation32 valdiation32 0 0 1 2 43 | valdiation88 valdiation88 0 0 1 2 44 | valdiation11 valdiation11 0 0 1 1 45 | valdiation3 valdiation3 0 0 1 2 46 | valdiation46 valdiation46 0 0 1 2 47 | valdiation27 valdiation27 0 0 1 2 48 | valdiation63 valdiation63 0 0 1 2 49 | valdiation4 valdiation4 0 0 1 2 50 | valdiation85 valdiation85 0 0 1 2 51 | valdiation23 valdiation23 0 0 1 1 52 | valdiation84 valdiation84 0 0 1 2 53 | valdiation71 valdiation71 0 0 1 2 54 | valdiation54 valdiation54 0 0 1 1 55 | valdiation55 valdiation55 0 0 2 1 56 | valdiation26 valdiation26 0 0 1 2 57 | valdiation56 valdiation56 0 0 1 2 58 | valdiation72 valdiation72 0 0 1 2 59 | valdiation93 valdiation93 0 0 1 1 60 | valdiation8 valdiation8 0 0 1 2 61 | valdiation30 valdiation30 0 0 1 2 62 | valdiation39 
valdiation39 0 0 1 1 63 | valdiation81 valdiation81 0 0 1 1 64 | valdiation80 valdiation80 0 0 1 2 65 | valdiation100 valdiation100 0 0 2 2 66 | valdiation9 valdiation9 0 0 1 2 67 | valdiation96 valdiation96 0 0 1 2 68 | valdiation12 valdiation12 0 0 2 2 69 | valdiation6 valdiation6 0 0 1 2 70 | valdiation31 valdiation31 0 0 2 2 71 | valdiation45 valdiation45 0 0 2 2 72 | valdiation87 valdiation87 0 0 1 2 73 | valdiation53 valdiation53 0 0 1 2 74 | valdiation86 valdiation86 0 0 1 1 75 | valdiation91 valdiation91 0 0 1 2 76 | valdiation25 valdiation25 0 0 1 1 77 | valdiation95 valdiation95 0 0 1 2 78 | valdiation62 valdiation62 0 0 2 2 79 | valdiation42 valdiation42 0 0 2 2 80 | valdiation98 valdiation98 0 0 1 2 81 | valdiation16 valdiation16 0 0 2 2 82 | valdiation38 valdiation38 0 0 2 1 83 | valdiation52 valdiation52 0 0 1 2 84 | valdiation57 valdiation57 0 0 1 1 85 | valdiation47 valdiation47 0 0 1 2 86 | valdiation28 valdiation28 0 0 1 2 87 | valdiation78 valdiation78 0 0 2 1 88 | valdiation29 valdiation29 0 0 1 1 89 | valdiation97 valdiation97 0 0 2 2 90 | valdiation40 valdiation40 0 0 1 1 91 | valdiation66 valdiation66 0 0 1 2 92 | valdiation61 valdiation61 0 0 2 2 93 | valdiation77 valdiation77 0 0 2 2 94 | valdiation90 valdiation90 0 0 1 2 95 | valdiation79 valdiation79 0 0 1 2 96 | valdiation41 valdiation41 0 0 1 2 97 | valdiation82 valdiation82 0 0 1 1 98 | valdiation7 valdiation7 0 0 2 2 99 | valdiation67 valdiation67 0 0 1 2 100 | valdiation73 valdiation73 0 0 1 2 101 | -------------------------------------------------------------------------------- /outputs/validation_test_discrete_geno.testedModel_allSample_ROC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/outputs/validation_test_discrete_geno.testedModel_allSample_ROC.png -------------------------------------------------------------------------------- 
/outputs/validation_test_discrete_geno.testedModel_allSample_predictions.csv: -------------------------------------------------------------------------------- 1 | ID,CASE_REPORTED,CASE_PROBABILITY,CASE_PREDICTED 2 | valdiation1,0,0.03148861333566866,0 3 | valdiation2,1,0.9849483263923975,1 4 | valdiation3,1,0.9394837298774558,1 5 | valdiation4,1,0.961536162754122,1 6 | valdiation5,1,0.7495487657093041,1 7 | valdiation6,1,0.9770457209562207,1 8 | valdiation7,1,0.8482988366528482,1 9 | valdiation8,1,0.7736793567653053,1 10 | valdiation9,1,0.6662222480606722,1 11 | valdiation10,1,0.8777924765950229,1 12 | valdiation11,0,0.06308968944624066,0 13 | valdiation12,1,0.9841752543221782,1 14 | valdiation13,1,0.9715629013535524,1 15 | valdiation14,1,0.8865205600711621,1 16 | valdiation15,0,0.09400063147306198,0 17 | valdiation16,1,0.9827536027621778,1 18 | valdiation17,1,0.9852498838419805,1 19 | valdiation18,1,0.9779392459770798,1 20 | valdiation19,1,0.8215149005406969,1 21 | valdiation20,0,0.025766961939865437,0 22 | valdiation21,1,0.369723015891961,0 23 | valdiation22,1,0.992850603956108,1 24 | valdiation23,0,0.22210115032526745,0 25 | valdiation24,0,0.10090341403815763,0 26 | valdiation25,0,0.20967163765538469,0 27 | valdiation26,1,0.9954998695495226,1 28 | valdiation27,1,0.9622843947181122,1 29 | valdiation28,1,0.9926081702605717,1 30 | valdiation29,0,0.05603870368131757,0 31 | valdiation30,1,0.688307955072017,1 32 | valdiation31,1,0.9946113192557112,1 33 | valdiation32,1,0.9766874201362098,1 34 | valdiation33,0,0.06878102918507902,0 35 | valdiation34,1,0.9894603825649909,1 36 | valdiation35,1,0.9437301028345729,1 37 | valdiation36,0,0.21216235062749125,0 38 | valdiation37,1,0.9900018821933396,1 39 | valdiation38,0,0.16901206233073723,0 40 | valdiation39,0,0.13817595449303555,0 41 | valdiation40,0,0.04568187764569424,0 42 | valdiation41,1,0.9846117972454992,1 43 | valdiation42,1,0.9662683691473066,1 44 | valdiation43,0,0.19761096748372325,0 45 | 
valdiation44,0,0.09155973982361744,0 46 | valdiation45,1,0.8777371456685015,1 47 | valdiation46,1,0.9911946720102857,1 48 | valdiation47,1,0.950193585755543,1 49 | valdiation48,0,0.021267349785308882,0 50 | valdiation49,1,0.9947173924809013,1 51 | valdiation50,1,0.9798199701239221,1 52 | valdiation51,1,0.7785928326337268,1 53 | valdiation52,1,0.9877527076932267,1 54 | valdiation53,1,0.6619314886523919,1 55 | valdiation54,0,0.24334927888057628,0 56 | valdiation55,0,0.023175285267309574,0 57 | valdiation56,1,0.9936299851165581,1 58 | valdiation57,0,0.292972066817925,0 59 | valdiation58,1,0.9453775699409687,1 60 | valdiation59,0,0.08359695641646878,0 61 | valdiation60,1,0.9826793556869285,1 62 | valdiation61,1,0.9804351180970784,1 63 | valdiation62,1,0.9689776045129581,1 64 | valdiation63,1,0.43015270870091094,0 65 | valdiation64,1,0.9842623094116923,1 66 | valdiation65,1,0.9822136540202853,1 67 | valdiation66,1,0.8657350197834053,1 68 | valdiation67,1,0.9896966555486313,1 69 | valdiation68,1,0.9840282172936693,1 70 | valdiation69,0,0.27260734300721334,0 71 | valdiation70,1,0.9840772920766343,1 72 | valdiation71,1,0.9750412686846891,1 73 | valdiation72,1,0.9916573991337486,1 74 | valdiation73,1,0.962258027120737,1 75 | valdiation74,1,0.827550733382712,1 76 | valdiation75,1,0.9716689143266516,1 77 | valdiation76,1,0.9601409855961331,1 78 | valdiation77,1,0.9726736058066694,1 79 | valdiation78,0,0.09561175299797499,0 80 | valdiation79,1,0.9711145567192419,1 81 | valdiation80,1,0.9805958147654943,1 82 | valdiation81,0,0.30162483312327265,0 83 | valdiation82,0,0.02553170681822126,0 84 | valdiation83,1,0.7234706286257632,1 85 | valdiation84,1,0.9923825256192076,1 86 | valdiation85,1,0.9866697978848589,1 87 | valdiation86,0,0.06355608737123379,0 88 | valdiation87,1,0.5943631474810299,1 89 | valdiation88,1,0.9888374151591128,1 90 | valdiation89,1,0.9885651828332006,1 91 | valdiation90,1,0.7357631184903739,1 92 | valdiation91,1,0.9722710446370482,1 93 | 
valdiation92,0,0.17406641415140675,0 94 | valdiation93,0,0.06171541281506878,0 95 | valdiation94,1,0.9892974833143502,1 96 | valdiation95,1,0.9847112377880253,1 97 | valdiation96,1,0.9920365721699342,1 98 | valdiation97,1,0.8508495360265258,1 99 | valdiation98,1,0.9892594847751349,1 100 | valdiation99,1,0.9175627336719306,1 101 | valdiation100,1,0.9971595269910182,1 102 | -------------------------------------------------------------------------------- /outputs/validation_test_discrete_geno.testedModel_allSample_probabilities.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/outputs/validation_test_discrete_geno.testedModel_allSample_probabilities.png -------------------------------------------------------------------------------- /outputs/validation_test_discrete_geno.testedModel_allSamples_performanceMetrics.csv: -------------------------------------------------------------------------------- 1 | AUC_Percent,Accuracy_Percent,Balanced_Accuracy_Percent,Log_Loss,Sensitivity,Specificity,PPV,NPV 2 | 100.0,100.0,100.0,0.002694198189173816,1.0,1.0,1.0,1.0 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | joblib 2 | matplotlib 3 | numpy 4 | tables 5 | pandas 6 | pandas_plink 7 | requests 8 | scikit-learn 9 | scipy 10 | seaborn 11 | statsmodels 12 | xgboost==2.0.3 13 | umap-learn -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The GenoML Authors. 
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Packaging script for the ``genoml2`` distribution.

Reads the dependency list from ``requirements.txt`` and the long description
from ``README.md`` (both resolved relative to the current working directory),
then hands the package metadata to ``setuptools.setup``.
"""

import setuptools

# Build ``install_requires`` from requirements.txt. Each line is stripped
# first, then blank lines and comment lines are dropped, so neither an empty
# string nor an indented "# comment" can end up as a requirement entry.
with open('requirements.txt', encoding='utf-8') as file:
    requires = [
        requirement
        for requirement in (line.strip() for line in file)
        if requirement and not requirement.startswith('#')
    ]

# Decode the README explicitly as UTF-8 so the build does not depend on the
# platform's default locale encoding.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name="genoml2",
    version="1.0.1",
    maintainer="The GenoML Development Team",
    maintainer_email="genoml@googlegroups.com",
    description="GenoML is an automated machine learning tool that optimizes"
                " basic machine learning pipelines for genomic data.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://genoml.github.io/",
    download_url="https://github.com/GenoML/genoml2/archive/v1.0.1.tar.gz",
    # Installs a ``genoml`` console command that dispatches to
    # ``genoml.__main__.handle_main``.
    entry_points={
        'console_scripts':
            ['genoml=genoml.__main__:handle_main'],
    },
    packages=setuptools.find_packages(),
    install_requires=requires,
    classifiers=[
        "Development Status :: 4 - Beta",
        "Programming Language :: Python :: 3.6",
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
    ],
    # NOTE(review): requirements.txt pins xgboost==2.0.3; confirm that release
    # still supports the Python 3.6 floor declared here before publishing.
    python_requires='>=3.6',
    # Ship the non-Python files under genoml/misc/ with the package.
    package_data={'genoml': ['misc/*']},
)