├── .github
└── ISSUE_TEMPLATE
│ ├── bug-performance-issue.md
│ ├── documentation-issue.md
│ └── feature-request.md
├── .gitignore
├── .travis.yml
├── LICENSE
├── MANIFEST.in
├── README.md
├── build
└── lib
│ └── genoml
│ ├── __init__.py
│ ├── __main__.py
│ ├── cli
│ ├── __init__.py
│ ├── continuous_supervised_test.py
│ ├── continuous_supervised_train.py
│ ├── continuous_supervised_tune.py
│ ├── discrete_supervised_test.py
│ ├── discrete_supervised_train.py
│ ├── discrete_supervised_tune.py
│ ├── harmonizing.py
│ └── munging.py
│ ├── continuous
│ ├── __init__.py
│ └── supervised
│ │ ├── __init__.py
│ │ ├── testing.py
│ │ ├── training.py
│ │ └── tuning.py
│ ├── dependencies.py
│ ├── discrete
│ ├── __init__.py
│ └── supervised
│ │ ├── __init__.py
│ │ ├── testing.py
│ │ ├── training.py
│ │ └── tuning.py
│ ├── misc
│ └── descriptions.json
│ ├── preprocessing
│ ├── __init__.py
│ ├── adjuster.py
│ ├── featureselection.py
│ ├── harmonizing.py
│ ├── munging.py
│ └── vif.py
│ └── utils.py
├── dist
└── genoml2-1.0.1.tar.gz
├── docs
├── GettingStarted.sh
└── current_file_structure.txt
├── examples
├── continuous
│ ├── example_GWAS.csv
│ ├── to_adjust.txt
│ ├── training.bed
│ ├── training.bim
│ ├── training.fam
│ ├── training_addit.csv
│ ├── training_addit_confounder_example.csv
│ ├── training_pheno.csv
│ ├── validation.bed
│ ├── validation.bim
│ ├── validation.fam
│ ├── validation_addit.csv
│ └── validation_pheno.csv
└── discrete
│ ├── example_GWAS.csv
│ ├── to_adjust.txt
│ ├── training.bed
│ ├── training.bim
│ ├── training.fam
│ ├── training_addit.csv
│ ├── training_addit_confounder_example.csv
│ ├── training_pheno.csv
│ ├── validation.bed
│ ├── validation.bim
│ ├── validation.fam
│ ├── validation_addit.csv
│ └── validation_pheno.csv
├── genoml
├── __init__.py
├── __main__.py
├── cli
│ ├── __init__.py
│ ├── continuous_supervised_test.py
│ ├── continuous_supervised_train.py
│ ├── continuous_supervised_tune.py
│ ├── discrete_supervised_test.py
│ ├── discrete_supervised_train.py
│ ├── discrete_supervised_tune.py
│ ├── harmonizing.py
│ └── munging.py
├── continuous
│ ├── __init__.py
│ └── supervised
│ │ ├── __init__.py
│ │ ├── testing.py
│ │ ├── training.py
│ │ └── tuning.py
├── dependencies.py
├── discrete
│ ├── __init__.py
│ ├── supervised
│ │ ├── __init__.py
│ │ ├── testing.py
│ │ ├── training.py
│ │ └── tuning.py
│ └── utils.py
├── misc
│ └── descriptions.json
├── preprocessing
│ ├── __init__.py
│ ├── adjuster.py
│ ├── featureselection.py
│ ├── harmonizing.py
│ ├── munging.py
│ └── vif.py
└── utils.py
├── genoml2.egg-info
├── PKG-INFO
├── SOURCES.txt
├── dependency_links.txt
├── entry_points.txt
├── requires.txt
└── top_level.txt
├── logo.png
├── outputs
├── test_discrete_geno.approx_feature_importance.txt
├── test_discrete_geno.best_algorithm.txt
├── test_discrete_geno.dataForML.h5
├── test_discrete_geno.list_features.txt
├── test_discrete_geno.trainedModel.joblib
├── test_discrete_geno.trainedModel_trainingSample_Predictions.csv
├── test_discrete_geno.trainedModel_withheldSample_Predictions.csv
├── test_discrete_geno.trainedModel_withheldSample_ROC.png
├── test_discrete_geno.trainedModel_withheldSample_probabilities.png
├── test_discrete_geno.training_withheldSamples_performanceMetrics.csv
├── test_discrete_geno.tunedModel.joblib
├── test_discrete_geno.tunedModel_CV_Summary.csv
├── test_discrete_geno.tunedModel_allSample_Predictions.csv
├── test_discrete_geno.tunedModel_allSample_probabilities.png
├── test_discrete_geno.tunedModel_top10Iterations_Summary.csv
├── test_discrete_geno.umap_clustering.joblib
├── test_discrete_geno.umap_data_reduction.csv
├── test_discrete_geno.umap_plot.png
├── test_discrete_geno.variants_and_alleles.tab
├── validation_test_discrete_geno.dataForML.h5
├── validation_test_discrete_geno.finalHarmonizedCols_toKeep.txt
├── validation_test_discrete_geno.list_features.txt
├── validation_test_discrete_geno.refColsHarmonize_toKeep.txt
├── validation_test_discrete_geno.refSNPs_andAlleles.bed
├── validation_test_discrete_geno.refSNPs_andAlleles.bim
├── validation_test_discrete_geno.refSNPs_andAlleles.fam
├── validation_test_discrete_geno.testedModel_allSample_ROC.png
├── validation_test_discrete_geno.testedModel_allSample_predictions.csv
├── validation_test_discrete_geno.testedModel_allSample_probabilities.png
├── validation_test_discrete_geno.testedModel_allSamples_performanceMetrics.csv
└── validation_test_discrete_geno.variants_and_alleles.tab
├── requirements.txt
├── setup.cfg
└── setup.py
/.github/ISSUE_TEMPLATE/bug-performance-issue.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug/Performance Issue
3 | about: 'Use this template for reporting a bug or a performance issue. '
4 | title: ''
5 | labels: type:bug/performance
6 | assignees: ''
7 |
8 | ---
9 |
10 | Please make sure that this is a bug.
11 |
12 | **System information:**
13 | - OS Platform and Distribution (e.g., Linux Ubuntu 16.04):
14 | - GenoML Installed from (source or binary):
15 | - GenoML Version:
16 | - Python Version:
17 |
18 | **Describe the current behavior:**
19 |
20 | **Describe the expected behavior:**
21 |
22 | **Code to reproduce the issue:**
23 | Provide a reproducible test case that is the bare minimum necessary to generate the problem.
24 |
25 | **Other Information / Logs**
26 | Include any logs or source code that would be helpful to diagnose the problem. If including tracebacks, please include the full traceback. Large logs and files should be attached.
27 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/documentation-issue.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Documentation Issue
3 | about: 'Use this template for documentation related issues. '
4 | title: ''
5 | labels: type:docs
6 | assignees: ''
7 |
8 | ---
9 |
10 | Please make sure that this is a documentation issue.
11 |
12 |
13 | **System information**
14 | - GenoML Version:
15 | - Doc Link:
16 |
17 |
18 | **Describe the documentation issue**
19 |
20 | **We welcome contributions by users. Will you be able to fix the Doc issue?**
21 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature-request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature Request
3 | about: Use this template for raising a feature request.
4 | title: ''
5 | labels: type:feature
6 | assignees: ''
7 |
8 | ---
9 |
10 | Please make sure that this is a feature request.
11 |
12 | **System information:**
13 | - GenoML Version:
14 | - GenoML Installed from (source or binary):
15 | - Are you willing to contribute to this request? (Yes/No):
16 |
17 | **Describe Current Behavior/State and Recommended Feature Request:**
18 |
19 | **Will this change the current API? How?**
20 |
21 | **Who Will Benefit from this Feature?**
22 |
23 | **Any Additional Information?**
24 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | outputs/.DS_Store
3 | .DS_Store
4 | genoml-git.code-workspace
5 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - "3.6" # current default Python on Travis CI
4 | - "3.7"
5 | - "3.8"
6 | - "3.8-dev" # 3.8 development branch
7 | - "nightly" # nightly build
8 | install:
9 | - pip install .
10 | # command to run tests
11 | script:
12 | - echo "TODO"
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
--------------------------------------------------------------------------------
/build/lib/genoml/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | from genoml import preprocessing
17 | from genoml import discrete
18 | from genoml import continuous
19 | from genoml import cli
20 |
--------------------------------------------------------------------------------
/build/lib/genoml/cli/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
--------------------------------------------------------------------------------
/build/lib/genoml/cli/continuous_supervised_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import sys
17 | import joblib
18 | import pandas as pd
19 |
20 | from genoml.continuous import supervised
21 |
22 |
def main(prefix, test_prefix, refModel_prefix):
    """Apply a previously saved continuous model to a munged test dataset.

    Args:
        prefix: Run prefix handed to ``supervised.test``; the startup message
            tells the user that results are saved under this prefix.
        test_prefix: Prefix of the test dataset. The file
            ``<test_prefix>.dataForML.h5`` is read with the HDF key
            ``dataForML``.
        refModel_prefix: Prefix of the saved model. The file
            ``<refModel_prefix>.joblib`` is loaded with ``joblib.load``.
    """
    # Everything the user is told before any file is touched, in print order.
    startup_lines = (
        "",
        "Here is some basic info on the command you are about to run.",
        "Python version info...",
        sys.version,
        "CLI argument info...",
        f"You are importing this test dataset: {test_prefix}.",
        f"You are applying the model saved here: {refModel_prefix}.",
        f"The results of this test application of your model will be saved in files with the given prefix: {prefix}.",
        "As a note, in all exported probabilities and other graphics, case status is treated as a 0 or 1, with 1 representing a positive case.",
        "",
    )
    for line in startup_lines:
        print(line)

    # Load the munged test data and the fitted model from disk.
    test_data = pd.read_hdf(test_prefix + ".dataForML.h5", key="dataForML")
    fitted_model = joblib.load(refModel_prefix + ".joblib")

    tester = supervised.test(test_data, fitted_model, prefix)

    # Same step order as before: prepare, score, export predictions, plot.
    tester.prep_df()
    tester.performance_metrics()
    tester.export_pheno_predictions()
    tester.regression_summary()

    # Thank the user
    for line in (
        "",
        "Let's shut everything down, thanks for testing your model with GenoML!",
        "",
    ):
        print(line)
68 |
--------------------------------------------------------------------------------
/build/lib/genoml/cli/continuous_supervised_train.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import sys
17 |
18 | import numpy as np
19 | import pandas as pd
20 |
21 | from genoml import utils
22 | from genoml.continuous import supervised
23 |
24 |
25 | # TODO(mary): use or remove export_predictions
@utils.DescriptionLoader.function_description("cli/continuous_supervised_train")
def main(run_prefix, export_predictions, matching_columns_path):
    """Train the continuous supervised models on a munged dataset.

    Args:
        run_prefix: Dataset prefix. ``<run_prefix>.dataForML.h5`` is read with
            the HDF key ``dataForML``, and the same prefix is passed to
            ``supervised.train`` and to the two ``save_*`` calls.
        export_predictions: Not used inside this function.
        matching_columns_path: Optional path to a text file with one column
            name per line. When truthy, the dataframe is restricted to the
            columns it shares with that list before training.
    """
    describe = utils.DescriptionLoader
    describe.print("cli/continuous_supervised_train/info",
                   python_version=sys.version, prefix=run_prefix)

    h5_path = run_prefix + ".dataForML.h5"
    with describe.context("cli/continuous_supervised_train/input",
                          path=h5_path):
        df = pd.read_hdf(h5_path, key="dataForML")

    if matching_columns_path:
        with describe.context(
                "cli/continuous_supervised_train/matching_columns_path",
                matching_columns_path=matching_columns_path):
            with open(matching_columns_path, 'r') as columns_file:
                wanted_columns = columns_file.read().splitlines()

            # Keep only the columns present in both the data and the list.
            shared_columns = np.intersect1d(df.columns, wanted_columns)
            df = df[shared_columns]

    trainer = supervised.train(df, run_prefix)
    trainer.summary()
    trainer.compete()
    trainer.export_model()
    trainer.export_predictions()
    trainer.save_algorithm_results(run_prefix)
    trainer.save_best_algorithm(run_prefix)
52 |
--------------------------------------------------------------------------------
/build/lib/genoml/cli/continuous_supervised_tune.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import pandas as pd
17 |
18 | from genoml.continuous import supervised
19 |
20 |
def main(run_prefix, max_iter, cv_count):
    """Tune the best continuous algorithm found during training.

    Args:
        run_prefix: Dataset prefix. ``<run_prefix>.dataForML.h5`` is read with
            the HDF key ``dataForML`` and ``<run_prefix>.best_algorithm.txt``
            supplies the name of the algorithm reported to the user.
        max_iter: Maximum number of tuning iterations, forwarded to
            ``supervised.tune``.
        cv_count: Number of cross-validation rounds, forwarded to
            ``supervised.tune``.
    """
    # TUNING
    # Create a dialogue with the user
    print("Here is some basic info on the command you are about to run.")
    print("CLI argument info...")
    print(f"Working with the dataset and best model corresponding to prefix {run_prefix} the timestamp from the merge is the prefix in most cases.")
    print(f"Your maximum number of tuning iterations is {max_iter} and if you are concerned about runtime, make this number smaller.")
    print(f"You are running {cv_count} rounds of cross-validation, and again... if you are concerned about runtime, make this number smaller.")
    print("Give credit where credit is due, for this stage of analysis we use code from the great contributors to python packages: argparse, xgboost, sklearn, pandas, numpy, time, matplotlib and seaborn.")
    print("As a note, in all exported probabilities and other graphics, case status is treated as a 0 or 1, with 1 representing a positive case.")

    print("")

    infile_h5 = run_prefix + ".dataForML.h5"
    df = pd.read_hdf(infile_h5, key="dataForML")

    # The whole dataframe is handed to supervised.tune below, so this function
    # no longer builds PHENO/ID-stripped copies that were never used.

    # The first cell of the file holds the name of the best algorithm.
    best_algo_name_in = run_prefix + '.best_algorithm.txt'
    best_algo_df = pd.read_csv(best_algo_name_in, header=None, index_col=False)
    best_algo = str(best_algo_df.iloc[0, 0])

    # Communicate to the user the best identified algorithm
    print(f"From previous analyses in the training phase, we've determined that the best algorithm for this application is {best_algo}... so let's tune it up and see what gains we can make!")

    # Tuning
    # This calls on the functions made in the tune class (tuning.py) at the genoml.continuous.supervised
    model_tune = supervised.tune(df, run_prefix, max_iter, cv_count)
    model_tune.select_tuning_parameters()  # Returns algo, hyperparameters, and scoring_metric
    model_tune.apply_tuning_parameters()  # Randomized search with CV to tune
    model_tune.report_tune()  # Summary of the top 10 iterations of the hyperparameter tune
    model_tune.summarize_tune()  # Summary of the cross-validation
    model_tune.compare_performance()  # Compares tuned performance to baseline
    model_tune.export_tuned_data()  # Export the newly tuned predictions
    model_tune.export_tune_regression()  # Export the tuned and fitted regression model

    print("")
    print("End of tuning stage with GenoML.")
    print("")
65 |
--------------------------------------------------------------------------------
/build/lib/genoml/cli/discrete_supervised_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import sys
17 | import joblib
18 | import pandas as pd
19 |
20 | from genoml.discrete import supervised
21 |
def main(prefix, test_prefix, refModel_prefix):
    """Apply a previously saved discrete model to a munged test dataset.

    Args:
        prefix: Run prefix handed to ``supervised.test``; the startup message
            tells the user that results are saved under this prefix.
        test_prefix: Prefix of the test dataset. The file
            ``<test_prefix>.dataForML.h5`` is read with the HDF key
            ``dataForML``.
        refModel_prefix: Prefix of the saved model. The file
            ``<refModel_prefix>.joblib`` is loaded with ``joblib.load``.
    """
    # Everything the user is told before any file is touched, in print order.
    startup_lines = (
        "",
        "Here is some basic info on the command you are about to run.",
        "Python version info...",
        sys.version,
        "CLI argument info...",
        f"You are importing this test dataset: {test_prefix}.",
        f"You are applying the model saved here: {refModel_prefix}.",
        f"The results of this test application of your model will be saved in files with the given prefix: {prefix}.",
        "As a note, in all exported probabilities and other graphics, case status is treated as a 0 or 1, with 1 representing a positive case.",
        "",
    )
    for line in startup_lines:
        print(line)

    # Load the munged test data and the fitted model from disk.
    test_data = pd.read_hdf(test_prefix + ".dataForML.h5", key="dataForML")
    fitted_model = joblib.load(refModel_prefix + ".joblib")

    tester = supervised.test(test_data, fitted_model, prefix)

    # Same step order as before: prepare, ROC, tested data, histograms,
    # additional summary statistics.
    tester.prep_df()
    tester.export_ROC()
    tester.export_tested_data()
    tester.export_histograms()
    tester.additional_sumstats()

    # Thank the user
    for line in (
        "",
        "Let's shut everything down, thanks for testing your model with GenoML!",
        "",
    ):
        print(line)
67 |
68 |
69 |
--------------------------------------------------------------------------------
/build/lib/genoml/cli/discrete_supervised_train.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import sys
17 | import numpy as np
18 | import pandas as pd
19 |
20 | from genoml.discrete import supervised
21 |
22 |
def main(prefix, metric_max, prob_hist, auc, matchingCols):
    """Train and compete the discrete supervised algorithms on a munged dataset.

    Args:
        prefix: Dataset prefix. ``<prefix>.dataForML.h5`` is read with the HDF
            key ``dataForML``; the prefix is also passed to
            ``supervised.train`` and ``model.save_results``.
        metric_max: Metric the algorithms are competed on, forwarded to
            ``model.results``.
        prob_hist: Not used inside this function.
        auc: Not used inside this function.
        matchingCols: Optional path to a text file with one column name per
            line, or ``None``. When given, the dataframe is restricted to the
            columns it shares with that list before training.
    """
    print("")
    print("Here is some basic info on the command you are about to run.")
    print("Python Version info...")
    print(sys.version)

    # Print out chosen CLI arguments
    print("CLI argument info...")
    print(f"Working with dataset {prefix} from previous data munging efforts.")
    print(f"You have chosen to compete the algorithms based on {metric_max}.")
    print("Give credit where credit is due, for this stage of analysis we use code from the great contributors to Python packages: argparse, xgboost, sklearn, pandas, numpy, time, matplotlib and seaborn.")
    print("As a note, in all exported probabilities and other graphics, case status is treated as a 0 or 1, with 1 representing a positive case.")
    print("")

    # Specify prefix and dataframe variables to be passed into class
    run_prefix = prefix
    infile_h5 = run_prefix + ".dataForML.h5"
    df = pd.read_hdf(infile_h5, key="dataForML")

    # Identity comparison is the correct test for None (PEP 8).
    if matchingCols is not None:
        print(f"Looks like you are retraining your reference file. We are using the harmonized columns you provided here: {matchingCols}")
        print("Note that you might have different/less features than before, given this was harmonized between training and test dataset, and might mean your model now performs worse...")

        with open(matchingCols, 'r') as matchingCols_file:
            matching_column_names_list = matchingCols_file.read().splitlines()

        # Keep only the columns found in the file
        df = df[np.intersect1d(df.columns, matching_column_names_list)]

    model = supervised.train(df, run_prefix)
    model.summary()

    # Give user context prior to competing algorithms
    # Explains to users how we are splitting their data 70:30
    competition_intro = (
        "",
        "Now let's compete these algorithms!",
        "We'll update you as each algorithm runs, then summarize at the end.",
        "Here we test each algorithm under default settings using the same training and test datasets derived from a 70% training and 30% testing split of your data.",
        "For each algorithm, we will output the following metrics...",
        "Algorithm name, hoping that's pretty self-explanatory. Plenty of resources on these common ML algorithms at https://scikit-learn.org and https://xgboost.readthedocs.io/.",
        "AUC_percent, this is the area under the curve from receiver operating characteristic analyses. This is the most common metric of classifier performance in biomedical literature, we express this as a percent. We calculate AUC based on the predicted probability of being a case.",
        "Accuracy_percent, this is the simple accuracy of the classifier, how many predictions were correct from best classification cutoff (python default).",
        "Balanced_Accuracy_Percent, consider this as the accuracy resampled to a 1:1 mix of cases and controls. Imbalanced datasets can give funny results for simple accuracy.",
        "Log_Loss, this is essentially the inverse of the likelihood function for a correct prediction, you want to minimize this.",
        "Sensitivity, proportion of cases correctly identified.",
        "Specificity, proportion of controls correctly identified.",
        "PPV, this is the positive predictive value, the probability that subjects with a positive result actually have the disease.",
        "NPV, this is the negative predictive value, the probability that subjects with a negative result don't have the disease.",
        "We also log the runtimes per algorithm.",
        "",
        "Algorithm summaries incoming...",
        "",
    )
    for line in competition_intro:
        print(line)

    # Compete the algorithms
    model.compete()

    # Output the results of the log
    model.results(metric_max)

    # Export the results
    model.export_model()

    # Export the AUC
    model.AUC(save=True)

    # Export the probability histograms
    model.export_prob_hist()

    # Save out the proper algorithm
    model.save_results(prefix, algorithmResults=True, bestAlgorithm=True)

    print("Thank you for training with GenoML!")
--------------------------------------------------------------------------------
/build/lib/genoml/cli/discrete_supervised_tune.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import pandas as pd
17 | import numpy as np
18 |
19 | from genoml.discrete import supervised
20 |
21 |
def main(run_prefix, metric_tune, max_iter, cv_count, matchingCols):
    """Tune the best discrete algorithm found during training.

    Args:
        run_prefix: Dataset prefix. ``<run_prefix>.dataForML.h5`` is read with
            the HDF key ``dataForML`` and ``<run_prefix>.best_algorithm.txt``
            supplies the name of the algorithm reported to the user.
        metric_tune: Metric to tune on, forwarded to
            ``select_tuning_parameters``.
        max_iter: Maximum number of tuning iterations, forwarded to
            ``supervised.tune``.
        cv_count: Number of cross-validation rounds, forwarded to
            ``supervised.tune``.
        matchingCols: Optional path to a text file with one column name per
            line, or ``None``. When given, the dataframe is restricted to the
            columns it shares with that list before tuning.
    """
    # TUNING
    # Create a dialogue with the user
    print("Here is some basic info on the command you are about to run.")
    print("CLI argument info...")
    print(f"Working with the dataset and best model corresponding to prefix {run_prefix} the timestamp from the merge is the prefix in most cases.")
    print(f"You have chosen to tune the algorithms based on {metric_tune}.")
    print(f"Your maximum number of tuning iterations is {max_iter} and if you are concerned about runtime, make this number smaller.")
    print(f"You are running {cv_count} rounds of cross-validation, and again... if you are concerned about runtime, make this number smaller.")
    print("Give credit where credit is due, for this stage of analysis we use code from the great contributors to python packages: argparse, xgboost, sklearn, pandas, numpy, time, matplotlib and seaborn.")
    print("As a note, in all exported probabilities and other graphics, case status is treated as a 0 or 1, with 1 representing a positive case.")

    print("")

    infile_h5 = run_prefix + ".dataForML.h5"
    df = pd.read_hdf(infile_h5, key="dataForML")

    # Addressing issue #12:
    # Identity comparison is the correct test for None (PEP 8).
    if matchingCols is not None:
        print(f"We are using the harmonized columns you provided here: {matchingCols}")
        print("Note that you might have different/less features than before, given this was column list was harmonized between your reference and test dataset...")

        with open(matchingCols, 'r') as matchingCols_file:
            matching_column_names_list = matchingCols_file.read().splitlines()

        # Keep only the columns found in the file
        df = df[np.intersect1d(df.columns, matching_column_names_list)]

    # The first cell of the file holds the name of the best algorithm.
    best_algo_name_in = run_prefix + '.best_algorithm.txt'
    best_algo_df = pd.read_csv(best_algo_name_in, header=None, index_col=False)
    best_algo = str(best_algo_df.iloc[0, 0])

    # Communicate to the user the best identified algorithm
    print(f"From previous analyses in the training phase, we've determined that the best algorithm for this application is {best_algo}... so let's tune it up and see what gains we can make!")

    # Tuning
    # This calls on the functions made in the tune class (tuning.py) at the genoml.discrete.supervised
    model_tune = supervised.tune(df, run_prefix, max_iter, cv_count)

    # Returns algo, hyperparameters, and scoring_metric
    model_tune.select_tuning_parameters(metric_tune)

    # Randomized search with CV to tune
    model_tune.apply_tuning_parameters()

    # Summary of the top 10 iterations of the hyperparameter tune
    model_tune.report_tune()

    # Summary of the cross-validation
    model_tune.summarize_tune()

    # Compares tuned performance to baseline
    model_tune.compare_performance()

    # NOTE: the ROC curve export is currently disabled.
    # model_tune.ROC()

    # Export the newly tuned predictions
    model_tune.export_tuned_data()

    # Export the probabilities
    model_tune.export_tune_hist_prob()

    print("")
    print("End of tuning stage with GenoML.")
    print("")
89 |
--------------------------------------------------------------------------------
/build/lib/genoml/cli/harmonizing.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import sys
17 | from genoml import preprocessing
18 |
19 |
def main(test_geno_prefix, test_prefix, ref_model_prefix,
         training_snps_alleles):
    """Harmonize a test genotype dataset against a reference dataset.

    Args:
        test_geno_prefix: Prefix of the test genotype dataset, passed to
            ``preprocessing.harmonizing`` as ``test_geno_prefix``.
        test_prefix: Output prefix, passed as ``test_out_prefix``.
        ref_model_prefix: Prefix of the reference model, passed as
            ``ref_model_prefix``.
        training_snps_alleles: SNP and allele information from training,
            passed as ``training_SNPs``.
    """
    # Print configurations
    for line in (
        "Here is some basic info on the command you are about to run.",
        "Python version info...",
        sys.version,
        "CLI argument info...",
        f"You are importing test dataset {test_geno_prefix}.",
        f"Applying the model saved from your reference dataset in {ref_model_prefix}.",
        f"Reading in the SNP and allele information we will use to compare from {training_snps_alleles}.",
        f"The results of this test application of your model will be saved in files tagged {test_prefix}.",
        "As a note, in all exported probabilities and other graphics, case status is treated as a 0 or 1, with 1 representing a positive case.",
    ):
        print(line)

    # Run the harmonize script in genoml.preprocessing
    job = preprocessing.harmonizing(
        test_geno_prefix=test_geno_prefix,
        test_out_prefix=test_prefix,
        ref_model_prefix=ref_model_prefix,
        training_SNPs=training_snps_alleles,
    )

    # Generate new binaries from the test dataset using the reference dataset SNPs
    job.generate_new_PLINK()

    # NOTE: reading the PLINK binaries back in (read_PLINK) is currently disabled.

    # Generate reference columns to keep for munging
    job.prep_refCols_file()

    # Thank the user
    print("Thank you for harmonizing with GenoML!")
54 |
--------------------------------------------------------------------------------
/build/lib/genoml/cli/munging.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import argparse
17 | import sys
18 |
19 | import genoml.dependencies
20 | from genoml import preprocessing
21 |
22 |
def main(prefix, impute, geno, skip_prune, r2_cutoff, pheno, addit,
         feature_selection, gwas, p, vif, iter, ref_cols_harmonize,
         umap_reduce, adjust_data, adjust_normalize, target_features,
         confounders, data_type):
    """Run the munging stage: PLINK munging, adjustment, feature selection, VIF.

    Args:
        prefix: Output prefix for this run.
        impute: Imputation method used to fill in remaining NAs.
        geno: Genotype data location.
        skip_prune: Whether SNP pruning is skipped.
        r2_cutoff: Pruning threshold.
        pheno: Phenotype file location.
        addit: Additional predictors file location.
        feature_selection: Number of estimators handed to the feature
            selection step; the step only runs when this is greater than 0.
        gwas: External GWAS summary statistics location.
        p: P value cutoff applied to the GWAS summary statistics.
        vif: VIF threshold.
        iter: Number of VIF filtering iterations; the VIF step only runs
            when this is greater than 0.
        ref_cols_harmonize: Reference columns from a harmonizing run.
        umap_reduce: ``"yes"`` to reduce the confounders with UMAP.
        adjust_data: ``"yes"`` to adjust the data by the confounders.
        adjust_normalize: ``"yes"`` to normalize the adjusted data.
        target_features: Passed through to ``preprocessing.adjuster``.
        confounders: Passed through to ``preprocessing.adjuster``.
        data_type: Passed through to ``preprocessing.featureselection``.
    """
    genoml.dependencies.check_dependencies()

    # Print configurations
    configuration = (
        "",
        "Here is some basic info on the command you are about to run.",
        "Python version info...",
        sys.version,
        "CLI argument info...",
        f"The output prefix for this run is {prefix} and will be appended to later runs of GenoML.",
        f"Working with genotype data? {geno}",
        f"Do you want GenoML to prune your SNPs for you? {skip_prune}",
        f"The pruning threshold you've chosen is {r2_cutoff}",
        f"Working with additional predictors? {addit}",
        f"Where is your phenotype file? {pheno}",
        f"Any use for an external set of GWAS summary stats? {gwas}",
        f"If you plan on using external GWAS summary stats for SNP filtering, we'll only keep SNPs at what P value? {p}",
        f"How strong is your VIF filter? {vif}",
        f"How many iterations of VIF filtering are you doing? {iter}",
        f"The imputation method you picked is using the column {impute} to fill in any remaining NAs.",
        f"Will you be adjusting additional features using UMAP dimensionality reduction? {umap_reduce}",
        "Give credit where credit is due, for this stage of analysis we use code from the great contributors to python packages: os, sys, argparse, numpy, pandas, joblib, math and time. We also use PLINK v1.9 from https://www.cog-genomics.org/plink/1.9/.",
        "",
    )
    for message in configuration:
        print(message)

    # Run the munging script in genoml.preprocessing
    munger = preprocessing.munging(
        pheno_path=pheno,
        run_prefix=prefix,
        impute_type=impute,
        skip_prune=skip_prune,
        p_gwas=p,
        addit_path=addit,
        gwas_path=gwas,
        geno_path=geno,
        refColsHarmonize=ref_cols_harmonize,
        r2_cutoff=r2_cutoff,
    )

    # Process the PLINK inputs (for pruning)
    df = munger.plink_inputs()

    # Run the UMAP dimension reduction/ adjuster
    wants_adjustment = adjust_data == "yes"
    if wants_adjustment or umap_reduce == "yes":
        adjuster = preprocessing.adjuster(
            prefix, df, target_features, confounders,
            adjust_data, adjust_normalize, umap_reduce)
        reduced_df = adjuster.umap_reducer()
        if wants_adjustment:
            print("\n You have chosen to adjust your data! \n")
            if adjust_normalize == "yes":
                print("\n You have also chosen to normalize your adjusted data \n")
            else:
                print("\n You have also chosen NOT to normalize your adjusted data \n")
        # NOTE(review): normalize() runs whenever the adjuster is built, i.e.
        # also when only umap_reduce is "yes" - confirm this is intended.
        df = adjuster.normalize(reduced_df)

    # Run the feature selection using extraTrees
    if feature_selection > 0:
        selector = preprocessing.featureselection(
            prefix, df, data_type, feature_selection)
        df = selector.rank()
        selector.export_data()

    # Run the VIF calculation
    if iter > 0:
        vif_calc = preprocessing.vif(iter, vif, df, 100, prefix)
        vif_calc.vif_calculations()

    # Thank the user
    print("Thank you for munging with GenoML!")
102 |
--------------------------------------------------------------------------------
/build/lib/genoml/continuous/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | from genoml.continuous import supervised
17 |
--------------------------------------------------------------------------------
/build/lib/genoml/continuous/supervised/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | from genoml.continuous.supervised.training import train
17 | from genoml.continuous.supervised.tuning import tune
18 | from genoml.continuous.supervised.testing import test
19 |
--------------------------------------------------------------------------------
/build/lib/genoml/continuous/supervised/testing.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | # Import the necessary packages
17 | import joblib
18 | import matplotlib.pyplot as plt
19 | import pandas as pd
20 | import seaborn as sns
21 | import sklearn
22 | import sys
23 | import xgboost
24 | import numpy as np
25 | from time import time
26 | import statsmodels.formula.api as sm
27 | from sklearn.metrics import explained_variance_score, mean_squared_error, median_absolute_error, r2_score
28 |
class test:
    """Apply a previously trained continuous-phenotype model to a test dataset.

    Typical call order: ``prep_df`` -> ``performance_metrics`` ->
    ``export_pheno_predictions`` -> ``regression_summary``. Each step stores
    its result on the instance for the following steps.

    Args:
        df: DataFrame holding an ``ID`` column, a ``PHENO`` column and the
            feature columns.
        loaded_model: Fitted estimator exposing ``predict``.
        run_prefix: Prefix prepended to every exported file name.
    """

    def __init__(self, df, loaded_model, run_prefix):
        self.df = df
        self.run_prefix = run_prefix
        self.loaded_model = loaded_model

    def prep_df(self):
        """Split ``self.df`` into features, phenotype and sample IDs.

        Stores ``self.y_test``, ``self.X_test`` and ``self.IDs_test``.

        Returns:
            The feature matrix (``self.df`` without ``PHENO`` and ``ID``).
        """
        print("")
        print("Your data looks like this (showing the first few lines of the left-most and right-most columns)...")
        print("#"*70)
        print(self.df.describe())
        print("#"*70)
        print("")

        # Save out and drop the PHENO and sample ID columns
        self.y_test = self.df.PHENO
        features = self.df.drop(columns=['PHENO'])
        self.IDs_test = features.ID
        self.X_test = features.drop(columns=['ID'])

        return self.X_test

    def performance_metrics(self):
        """Score the loaded model on the test data and export the metrics.

        The model is used exactly as loaded: it is NOT refitted on the test
        samples, because metrics computed on data the model was fitted to
        would not measure performance on unseen samples.

        Writes ``<run_prefix>.testedModel_allSamples_performanceMetrics.csv``
        and stores the table in ``self.log_table``.

        Returns:
            A one-row DataFrame with the explained variance score, mean
            squared error, median absolute error and R2 score.
        """
        log_cols = ["Explained_variance_score", "Mean_squared_error", "Median_absolute_error", "R2_score"]

        # One prediction pass is enough; every metric scores the same output.
        test_predictions = self.loaded_model.predict(self.X_test)

        print("")
        print("#"*70)

        evs = explained_variance_score(self.y_test, test_predictions)
        print("Explained variance score: {:.4}".format(evs))

        mse = mean_squared_error(self.y_test, test_predictions)
        print("Mean squared error: {:.4}".format(mse))

        mae = median_absolute_error(self.y_test, test_predictions)
        print("Median absolute error: {:.4}".format(mae))

        r2s = r2_score(self.y_test, test_predictions)
        print("R^2 score: {:.4}".format(r2s))

        # Built directly instead of via DataFrame.append, which pandas 2.0 removed.
        log_table = pd.DataFrame([[evs, mse, mae, r2s]], columns=log_cols)

        print("#"*70)

        print("")

        log_outfile = self.run_prefix + '.testedModel_allSamples_performanceMetrics.csv'

        print("")
        print(f"This table below is also logged as {log_outfile} and is in your current working directory...")
        print("#"*70)
        print(log_table)
        print("#"*70)
        print("")

        log_table.to_csv(log_outfile, index=False)

        self.log_table = log_table
        return log_table

    def export_pheno_predictions(self):
        """Export reported versus predicted phenotypes for every test sample.

        Requires ``prep_df`` to have been called. Writes
        ``<run_prefix>.testedModel_allSample_predictions.csv`` and stores the
        table in ``self.test_out``.

        Returns:
            DataFrame with columns ``ID``, ``PHENO_REPORTED`` and
            ``PHENO_PREDICTED``.
        """
        test_predicted_values = self.loaded_model.predict(self.X_test)

        # Positional assembly: the original row index of the inputs is dropped.
        test_out = pd.DataFrame({
            "ID": np.asarray(self.IDs_test),
            "PHENO_REPORTED": np.asarray(self.y_test),
            "PHENO_PREDICTED": np.asarray(test_predicted_values).ravel(),
        })

        test_outfile = self.run_prefix + '.testedModel_allSample_predictions.csv'
        test_out.to_csv(test_outfile, index=False)

        print("")
        print(f"Preview of the exported predictions exported as {test_outfile}, these are pretty straight forward.")
        print("They generally include the sample ID, the previously reported phenotype, and the predicted phenotype from that algorithm.")
        print("#"*70)
        print(test_out.head())
        print("#"*70)

        self.test_out = test_out
        return test_out

    def regression_summary(self):
        """Plot and summarize ``PHENO_REPORTED ~ PHENO_PREDICTED``.

        Requires ``export_pheno_predictions`` to have been called. Writes
        ``<run_prefix>.testedModel_allSamples_regressionPlot.png`` and
        ``<run_prefix>.testedModel_allSamples_regressionSummary.csv``.
        """
        scatter_color, line_color = "cyan", "purple"

        # Draw on a dedicated figure and close it afterwards so repeated
        # calls neither overlay earlier plots nor leak figures.
        fig, ax = plt.subplots()
        sns.regplot(data=self.test_out, y="PHENO_REPORTED", x="PHENO_PREDICTED",
                    scatter_kws={"color": scatter_color},
                    line_kws={"color": line_color}, ax=ax)

        plot_out = self.run_prefix + '.testedModel_allSamples_regressionPlot.png'
        fig.savefig(plot_out, dpi=600)
        plt.close(fig)

        print("")
        print(f"We are also exporting a regression plot for you here {plot_out}, this is a graphical representation of the difference between the reported and predicted phenotypes in the withheld test data for the best performing algorithm.")

        print("")
        print("Here is a quick summary of the regression comparing PHENO_REPORTED ~ PHENO_PREDICTED in the withheld test data...")
        print("")

        reg_model = sm.ols(formula='PHENO_REPORTED ~ PHENO_PREDICTED', data=self.test_out)
        fitted = reg_model.fit()
        print(fitted.summary())

        # Leading '.' added so the name follows the same
        # "<run_prefix>.<suffix>" pattern as every other exported file.
        fitted_out = self.run_prefix + '.testedModel_allSamples_regressionSummary.csv'

        with open(fitted_out, 'w') as fh:
            fh.write(fitted.summary().as_csv())

        print(f"We are exporting this summary here: {fitted_out}")

        print("")
        print("...always good to see the P value for the predictor.")
161 |
--------------------------------------------------------------------------------
/build/lib/genoml/dependencies.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import io
17 | import logging
18 | import os
19 | import pathlib
20 | import platform
21 | import requests
22 | import stat
23 | import subprocess
24 | import zipfile
25 |
26 | from genoml import utils
27 |
28 |
def __get_executable_folder():
    """Return the directory that holds GenoML's third-party executables.

    The ``GENOML_DEP_DIR`` environment variable, when set, wins and is
    returned as an absolute path; otherwise the folder is
    ``~/.genoml/misc/executables``.
    """
    override = os.environ.get("GENOML_DEP_DIR")
    if override is not None:
        return os.path.abspath(override)
    return os.path.join(
        str(pathlib.Path.home()), ".genoml", "misc", "executables")


# Resolved once at import time and shared by the helpers in this module.
__executable_folder = __get_executable_folder()
39 |
40 |
def __check_exec(exec_path, *args, absolute_path=False):
    """Report whether an executable is present and can be launched.

    Args:
        exec_path: Path of the binary; relative to the executables folder
            unless ``absolute_path`` is true.
        *args: Arguments passed to the binary for the trial run.
        absolute_path: Treat ``exec_path`` as a complete path.

    Returns:
        ``False`` when the file does not exist or the operating system
        refuses to start it (missing execute permission, wrong binary
        format, ...); ``True`` otherwise. The exit status of the trial run
        is deliberately not inspected.
    """
    if absolute_path:
        binary_path = exec_path
    else:
        binary_path = os.path.join(__executable_folder, exec_path)

    if not os.path.exists(binary_path):
        return False

    try:
        subprocess.run([binary_path, *args], stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL)
    except OSError:
        # A file that exists but cannot be executed counts as "not
        # available" so the caller can fall back to (re)installing it.
        return False
    return True
52 |
53 |
def __install_exec(url, exec_path):
    """Download a zip archive and unpack it into the executables folder.

    Args:
        url: Location of the zip archive containing the binary.
        exec_path: Path of the binary inside the executables folder once
            the archive has been extracted.

    Raises:
        requests.RequestException: The download failed, timed out, or the
            server answered with an HTTP error status.
        zipfile.BadZipFile: The downloaded payload is not a zip archive.
    """
    # Certificate verification is left at the requests default (enabled):
    # the payload is an executable that will be run on this machine.
    response = requests.get(url, timeout=(10, 60))
    # Fail on HTTP errors here rather than feeding an error page to zipfile.
    response.raise_for_status()

    with zipfile.ZipFile(io.BytesIO(response.content), "r") as archive:
        archive.extractall(__executable_folder)

    binary_path = os.path.join(__executable_folder, exec_path)
    # Add the owner-execute bit to the existing permissions; passing only
    # S_IEXEC to chmod would wipe the read/write bits.
    current_mode = stat.S_IMODE(os.stat(binary_path).st_mode)
    os.chmod(binary_path, current_mode | stat.S_IXUSR)
64 |
65 |
def __check_package(name):
    """Locate a registered dependency, installing it when it is missing.

    Args:
        name: Key of the package in ``__DEPENDENCIES``.

    Returns:
        Path of the package binary inside the executables folder.

    Raises:
        EnvironmentError: The package is not registered, the current
            operating system has no entry for it, or the binary still
            cannot be run after installation.
    """
    platform_system = platform.system()

    if name not in __DEPENDENCIES:
        raise EnvironmentError("Unknown package: {}".format(name))

    if platform_system not in __DEPENDENCIES[name]:
        raise EnvironmentError(
            "Unsupported OS: {}".format(platform_system))

    entry = __DEPENDENCIES[name][platform_system]

    binary_name = entry["binary"]
    args = entry["version_args"]
    url = entry["url"]
    binary_path = os.path.join(__executable_folder, binary_name)

    if __check_exec(binary_name, *args):
        logging.debug("{} is found".format(name))
        return binary_path

    logging.warning("Installing {}".format(name))
    __install_exec(url, binary_name)
    if not __check_exec(binary_name, *args):
        logging.warning("Failed to run {} after installation".format(name))
        raise EnvironmentError("Can not install {}".format(name))
    return binary_path
93 |
94 |
@utils.DescriptionLoader.function_description("check_dependencies")
def check_dependencies():
    """Run the checker of every registered dependency.

    Entries of ``__DEPENDENCIES`` without a ``"checker"`` key are skipped.
    Each checker runs inside its own ``check_dependencies_<package>``
    description context.

    Returns:
        Dict mapping each package name to the value returned by its checker.
    """
    results = {}
    for package, data in __DEPENDENCIES.items():
        if "checker" not in data:
            continue
        context_name = "check_dependencies_{}".format(package)
        with utils.DescriptionLoader.context(context_name):
            results[package] = data["checker"]()
    return results
106 |
107 |
def check_plink():
    """Check the registered ``"Plink"`` package and return its binary path."""
    plink_path = __check_package("Plink")
    return plink_path
110 |
111 |
# Registry of external tools. Each package maps to:
#   "checker"        - callable run by check_dependencies()
#   <platform name>  - per-OS entry keyed by the value of platform.system(),
#                      holding the binary name, the arguments used for the
#                      trial run, and the URL of the zip archive to install.
# NOTE(review): the archives are fetched over plain HTTP; consider switching
# to HTTPS after confirming the host serves the same paths over TLS.
__DEPENDENCIES = {
    "Plink": {
        "checker": check_plink,
        "Darwin": {
            "binary": "plink",
            "version_args": ["--version"],
            "url": "http://s3.amazonaws.com/plink1-assets/plink_mac_20200219.zip"
        },
        "Linux": {
            "binary": "plink",
            "version_args": ["--version"],
            "url": "http://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20200219.zip"
        }
    },
}
127 |
--------------------------------------------------------------------------------
/build/lib/genoml/discrete/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | from genoml.discrete import supervised
--------------------------------------------------------------------------------
/build/lib/genoml/discrete/supervised/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | from genoml.discrete.supervised.training import train
17 | from genoml.discrete.supervised.tuning import tune
18 | from genoml.discrete.supervised.testing import test
--------------------------------------------------------------------------------
/build/lib/genoml/misc/descriptions.json:
--------------------------------------------------------------------------------
1 | {
2 | "check_dependencies_Plink": {
3 | "title": "Checking plink",
4 | "description": "",
5 | "error": ""
6 | },
7 | "check_dependencies": {
8 | "title": "Dependency Check",
9 | "description": "",
10 | "end": true,
11 | "error": ""
12 | },
13 | "cli/continuous_supervised_train": {
14 | "title": "GenoML",
15 | "description": "Continuous Supervised Train",
16 | "end": true,
17 | "error": ""
18 | },
19 | "cli/continuous_supervised_train/info": {
20 | "title": "Basic Info",
21 | "description": "Here is some basic info on the command you are about to run.\nPython version info:\n{python_version}\n\nWorking with dataset from previous data munging efforts at:\n\t{prefix}",
22 | "error": ""
23 | },
24 | "cli/continuous_supervised_train/input": {
25 | "title": "Reading Input File: {path}",
26 | "description": "",
27 | "error": ""
28 | },
29 | "cli/continuous_supervised_train/matching_columns_path": {
30 | "title": "",
31 | "description": "Looks like you are retraining your reference file. We are using the harmonized columns you provided here: {matching_columns_path}\nNote that you might have different/less features than before, given this was harmonized between training and test dataset, and might mean your model now performs worse...",
32 | "error": ""
33 | },
34 | "continuous/supervised/training/Train/summary": {
35 | "title": "Input Data Summary",
36 | "description": "Your data looks like this (showing the first few lines of the left-most and right-most columns)...\n\n{data}",
37 | "error": ""
38 | },
39 | "continuous/supervised/training/Train/compete": {
40 | "title": "Compete the algorithms",
41 | "description": "Now let's compete these algorithms!\nWe'll update you as each algorithm runs, then summarize at the end.\nHere we test each algorithm under default settings using the same training and test datasets derived from a 70% training and 30% testing split of your data.\nFor each algorithm, we will output the following metrics...\nAlgorithm name, hoping that's pretty self-explanatory. Plenty of resources on these common ML algorithms at https://scikit-learn.org and https://xgboost.readthedocs.io/.\nexplained_variance_score, this is the variance explained by the model per algorithm (scale from 0 to 1 with 1 being completely explained).\nmean_squared_error, this is the mean squared error from regression loss.\nmedian_absolute_error, median absolute error from regression loss.\nr2_score, standard r2 metric from linear regression (coefficient of determination), remember, this can be negative if your model is really bad.\nWe also log the runtimes per algorithm.\n\nAlgorithm summaries incoming...",
42 | "end": true,
43 | "error": ""
44 | },
45 | "continuous/supervised/training/Train/compete/algorithm": {
46 | "title": "{name}",
47 | "description": "",
48 | "error": ""
49 | },
50 | "continuous/supervised/training/Train/compete/algorithm/results": {
51 | "title": "{name} Results",
52 | "description": "{results}",
53 | "error": ""
54 | },
55 | "continuous/supervised/training/Train/compete/algorithm/best": {
56 | "title": "Best Algorithm: {algorithm}",
57 | "description": "There are occasionally slight fluctuations in model performance on the same withheld samples.\n{metrics}",
58 | "error": ""
59 | },
60 | "continuous/supervised/training/Train/export_model": {
61 | "title": "Exporting Model: {output_path}",
62 | "description": "This model has been saved as {output_path} for later use and can be found in your working directory.",
63 | "end": true,
64 | "error": ""
65 | },
66 | "continuous/supervised/training/Train/save_algorithm_results": {
67 | "title": "Saving Algorithm Results: {output_path}",
68 | "description": "This table below is also logged as {output_path} and is in your current working directory...\n\n{data}",
69 | "end": true,
70 | "error": ""
71 | },
72 | "continuous/supervised/training/Train/save_best_algorithm": {
73 | "title": "Saving Best Algorithm: {output_path}",
74 | "description": "Based on your withheld samples, the algorithm with the highest explained variance score is the {best_algorithm}... let's save that model name for you on {output_path}.",
75 | "end": true,
76 | "error": ""
77 | },
78 | "continuous/supervised/training/Train/export_predictions/test_data": {
79 | "title": "Saving Prediction on Test Data: {output_path}",
80 | "description": "Preview of the exported predictions for the withheld test data that has been exported as {output_path} these are pretty straight forward.\nThey generally include the sample ID, the previously reported phenotype and the predicted phenotype from that algorithm,\n\n{data}",
81 | "end": true,
82 | "error": ""
83 | },
84 | "continuous/supervised/training/Train/export_predictions/train_data": {
85 | "title": "Saving Prediction on Train Data: {output_path}",
86 | "description": "Preview of the exported predictions for the training samples which is naturally overfit and exported as {output_path} in the similar format as in the withheld test dataset that was just exported.\n\n{data}",
87 | "end": true,
88 | "error": ""
89 | },
90 | "continuous/supervised/training/Train/export_predictions/plot": {
91 | "title": "Saving Regression Plot: {output_path}",
92 | "description": "Here is a quick summary of the regression comparing PHENO_REPORTED ~ PHENO_PREDICTED in the withheld test data...\n{data}\n...always good to see the P for the predictor.\n\nWe are also exporting a regression plot for you here {output_path} this is a graphical representation of the difference between the reported and predicted phenotypes in the withheld test data for the best performing algorithm.",
93 | "end": true,
94 | "error": ""
95 | }
96 | }
97 |
--------------------------------------------------------------------------------
/build/lib/genoml/preprocessing/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | from genoml.preprocessing.munging import munging
17 | from genoml.preprocessing.vif import vif
18 | from genoml.preprocessing.featureselection import featureselection
19 | from genoml.preprocessing.harmonizing import harmonizing
20 | from genoml.preprocessing.adjuster import adjuster
21 |
# With an empty __all__, `from genoml.preprocessing import *` binds no names;
# the classes imported above stay reachable as attributes of this package.
# NOTE(review): confirm the empty list is intentional.
__all__ = []
23 |
--------------------------------------------------------------------------------
/build/lib/genoml/preprocessing/adjuster.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import pandas as pd
17 | import numpy as np
18 | import statsmodels.api as sm
19 | import statsmodels.formula.api as smf
20 | import statistics
21 | import umap.umap_ as umap
22 | from joblib import dump, load
23 | import matplotlib.pyplot as plt
24 | from matplotlib import style
25 | import seaborn as sns
26 |
class adjuster:
    """Adjust munged features by confounders, optionally reduced with UMAP.

    Args:
        run_prefix: Prefix of the run; ``<run_prefix>.dataForML.h5`` is read
            on construction and overwritten by ``normalize``.
        df: Accepted for interface compatibility only; the munged data is
            always re-read from ``<run_prefix>.dataForML.h5``.
        target_features: Header-less file listing one target feature name
            per line.
        confounders: CSV file of confounders containing an ``ID`` column.
        adjust_data: Stored as ``self.adjust_data``; not read by this class.
        adjust_normalize: ``"yes"`` to z-score the residuals in ``normalize``.
        umap_reduce: ``"yes"`` to replace the confounders with a
            two-dimensional UMAP embedding in ``umap_reducer``.
    """

    def __init__(self, run_prefix, df, target_features, confounders, adjust_data, adjust_normalize, umap_reduce):
        self.run_prefix = run_prefix
        self.umap_reduce = umap_reduce
        self.target_columns = target_features
        self.confounders = confounders
        self.adjust_data = adjust_data
        self.normalize_switch = adjust_normalize

        self.munged_data = self.run_prefix + ".dataForML.h5"

        self.target_data_df = pd.read_hdf(self.munged_data, 'dataForML')
        self.target_column_df = pd.read_csv(self.target_columns, names=['TARGETS'])

        self.confounders_df = pd.read_csv(self.confounders)

        # Keep only intersecting feature names left in munged set (removed either because --gwas or std dev of 0 etc.)
        # The order of the targets file is preserved (duplicates dropped) so
        # that runs process the targets in a reproducible order.
        available_columns = set(self.target_data_df.columns)
        requested_targets = self.target_column_df['TARGETS'].tolist()
        intersecting_list = [
            name for name in dict.fromkeys(requested_targets)
            if name in available_columns
        ]
        self.target_column_df = pd.DataFrame(intersecting_list, columns=['TARGETS'])

    def umap_reducer(self):
        """Optionally reduce the confounders to two UMAP dimensions.

        When ``self.umap_reduce`` is ``"yes"`` this fits UMAP on every
        confounder column except ``ID``, writes the plot, the embedding CSV
        and the fitted reducer (``.joblib``) next to the run prefix, and
        replaces ``self.confounders_df`` with the embedding table.

        Returns:
            ``self.confounders_df`` - untouched when no reduction was
            requested, otherwise a frame with columns ``ID``,
            ``UMAP_embedding1`` and ``UMAP_embedding2``.
        """
        if self.umap_reduce != "yes":
            return self.confounders_df

        ids = self.confounders_df['ID']
        to_umap = self.confounders_df.drop(columns=['ID'])

        reducer = umap.UMAP(random_state=153)
        embedding = reducer.fit_transform(to_umap)

        out_data = pd.DataFrame({
            'ID': np.asarray(ids),
            'UMAP_embedding1': embedding[:, 0],
            'UMAP_embedding2': embedding[:, 1],
        })

        # Plot
        print("Exporting UMAP plot...")
        fig, ax = plt.subplots(figsize=(12, 10))
        # No `cmap`: without per-point colour values matplotlib ignores it.
        ax.scatter(embedding[:, 0], embedding[:, 1])
        ax.set_title("Data Reduction to 2 Dimensions by UMAP", fontsize=18)
        plot_out = self.run_prefix + '.umap_plot.png'
        fig.savefig(plot_out, dpi=600)
        plt.close(fig)  # release the figure once it is on disk

        print(f"The UMAP plot has been exported and can be found here: {plot_out}")

        out_file = self.run_prefix + '.umap_data_reduction.csv'
        out_data.to_csv(out_file, index=False)

        print(f"The reduced UMAP 2 dimensions per sample .csv file can be found here: {out_file}")

        # fit_transform() above already fitted the reducer, so it is exported
        # as is instead of being fitted a second time on the same data.
        algo_out = self.run_prefix + '.umap_clustering.joblib'
        # Attribute kept for backward compatibility with existing callers.
        self.runplot_out = algo_out
        dump(reducer, algo_out)

        self.confounders_df = out_data

        print(f"The UMAP .joblib file can be found here: {algo_out}")

        return self.confounders_df

    def normalize(self, confounders_df):
        """Replace each target feature by its residuals after confounder fit.

        Every target is regressed (OLS) on all columns of ``confounders_df``
        except the first one, and the target column is overwritten with the
        residuals - z-scored when ``self.normalize_switch`` is ``"yes"``.
        The result is written back to ``<run_prefix>.dataForML.h5``.

        Args:
            confounders_df: Frame with an ``ID`` column used for the inner
                merge; its first column is not used as a regressor
                (assumes that first column is ``ID`` - TODO confirm).

        Returns:
            The adjusted frame, restricted to the columns of the munged data.
        """
        # NOTE(review): names are pasted into a formula string verbatim, so
        # column names that are not valid identifiers may fail to parse.
        target_list = list(self.target_column_df['TARGETS'])
        confounder_list = list(confounders_df.columns[1:])
        columns_to_keep_list = list(self.target_data_df.columns)

        adjustments_df = self.target_data_df.merge(confounders_df, how='inner', on='ID', suffixes=('', '_y'))

        formula_for_confounders = ' + '.join(confounder_list)
        normalize_residuals = self.normalize_switch == 'yes'

        for target in target_list:
            current_target = str(target)
            print(f"Looking at the following feature: {current_target}")

            current_formula = current_target + " ~ " + formula_for_confounders
            print(current_formula)

            target_model = smf.ols(formula=current_formula, data=adjustments_df).fit()

            # Work on a local Series rather than a scratch 'temp' column, so
            # a real feature called 'temp' is never overwritten or dropped.
            residuals = pd.to_numeric(target_model.resid)
            if normalize_residuals:
                residuals = (residuals - residuals.mean()) / residuals.std()
            adjustments_df[current_target] = residuals

        adjusted_df = adjustments_df[columns_to_keep_list]

        outfile_h5 = self.run_prefix + ".dataForML.h5"
        adjusted_df.to_hdf(outfile_h5, key='dataForML', mode='w')

        if normalize_residuals:
            print(f"\n The adjusted dataframe following normalization can be found here: {outfile_h5}, your updated .dataForML file \n")
        else:
            print(f"\n The adjusted dataframe without normalization can be found here: {outfile_h5}, your updated .dataForML file \n")

        return adjusted_df
132 |
--------------------------------------------------------------------------------
/build/lib/genoml/preprocessing/featureselection.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import pandas as pd
17 | from sklearn import ensemble
18 | from sklearn import feature_selection
19 |
class featureselection:
    """Rank features with an extra-trees ensemble and keep the important ones.

    Typical use: construct, call :meth:`rank`, then :meth:`export_data`.
    """

    def __init__(self, run_prefix, df, data_type, n_est):
        """Split ``df`` into IDs, phenotype and feature matrix.

        Args:
            run_prefix: Prefix used for every output file.
            df: DataFrame containing ``ID``, ``PHENO`` and feature columns.
                It is not modified; columns containing any NA are dropped
                from an internal copy.
            data_type: ``"d"`` for a discrete phenotype (classifier) or
                ``"c"`` for a continuous one (regressor).
            n_est: Number of trees passed as ``n_estimators``.
        """
        self.run_prefix = run_prefix
        self.featureRanks = None
        self.n_est = n_est
        self.data_type = data_type

        # Double check there are no NAs in the dataset before proceeding.
        # drop() returns a new frame, so the caller's DataFrame is untouched.
        remove_cols = df.columns[df.isna().any()].tolist()
        df = df.drop(columns=remove_cols)

        self.y = df['PHENO']
        self.IDs = df['ID']
        self.X = df.drop(columns=['PHENO', 'ID'])

    def rank(self):
        """Fit the ensemble, write the importance table and reduce the data.

        Writes ``<run_prefix>.approx_feature_importance.txt`` (tab separated,
        sorted by descending score) and stores the reduced frame on
        ``self.df_selecta``.

        Returns:
            DataFrame with ``ID``, ``PHENO`` and the features retained by
            ``SelectFromModel`` (its default threshold).

        Raises:
            ValueError: if ``self.data_type`` is neither ``"d"`` nor ``"c"``.
        """
        print(f"""
        Beginning featureSelection using {self.n_est} estimators...""")

        if self.data_type == "d":
            print("""
            using extraTrees Classifier for your discrete dataset
            """)
            clf = ensemble.ExtraTreesClassifier(n_estimators=self.n_est)
        elif self.data_type == "c":
            print("""
            using extraTrees Regressor for your continuous dataset
            """)
            clf = ensemble.ExtraTreesRegressor(n_estimators=self.n_est)
        else:
            # Previously this fell through and failed later with an unbound
            # `clf`; fail early with an actionable message instead.
            raise ValueError(
                f"data_type must be 'd' (discrete) or 'c' (continuous), got {self.data_type!r}")

        clf.fit(self.X, self.y)
        self.featureRanks = clf.feature_importances_

        # Select the features above threshold; ID and PHENO are added back below.
        model = feature_selection.SelectFromModel(clf, prefit=True)
        features_selected = model.get_support()
        X_reduced = self.X.iloc[:, features_selected]

        print("""
        Printing feature name that corresponds to the dataframe column name, then printing the relative importance as we go...
        """)

        list_featureScores = []
        for col, score in zip(self.X.columns, clf.feature_importances_):
            print(col, score)
            list_featureScores.append([col, score])

        df_featureScores = pd.DataFrame(list_featureScores, columns=["Feature_Name", "Score"])
        df_featureScores = df_featureScores.sort_values(by=['Score'], ascending=False)
        featureScores_outfile = self.run_prefix + ".approx_feature_importance.txt"
        df_featureScores.to_csv(featureScores_outfile, index=False, sep="\t")

        print(f"""
        You have reduced your dataset to {X_reduced.shape[0]} samples at {X_reduced.shape[1]} features, not including ID and PHENO.
        """)

        # Reset every index so the three pieces line up row by row.
        df_selecta = pd.concat(
            [
                pd.DataFrame(self.IDs).reset_index(drop=True),
                self.y.reset_index(drop=True),
                X_reduced.reset_index(drop=True),
            ],
            axis=1,
            ignore_index=False,
        )

        self.df_selecta = df_selecta
        self.featureScores_outfile = featureScores_outfile

        return df_selecta

    def export_data(self):
        """Write the reduced dataset and the list of its column names.

        Requires :meth:`rank` to have been called first. Writes
        ``<run_prefix>.dataForML.h5`` (key ``dataForML``) and
        ``<run_prefix>.list_features.txt`` (one column name per line).
        """
        ## Export reduced data
        outfile_h5 = self.run_prefix + ".dataForML.h5"
        self.df_selecta.to_hdf(outfile_h5, key='dataForML')

        features_list = self.df_selecta.columns.values.tolist()

        features_listpath = self.run_prefix + ".list_features.txt"
        with open(features_listpath, 'w') as f:
            f.writelines("%s\n" % feature for feature in features_list)

        print(f"""Exporting a new {outfile_h5} file that has a reduced feature set based on your importance approximations.
        This is a good dataset for general ML applications for the chosen PHENO as it includes only features that are likely to impact the model.

        An updated list of {len(features_list)} features, including ID and PHENO, that is in your munged dataForML.h5 file can be found here {features_listpath}

        A file with all your features, ranked from largest contributors at the top to smallest contributors at the bottom, can be found at {self.featureScores_outfile}.
        """)
109 |
--------------------------------------------------------------------------------
/build/lib/genoml/preprocessing/harmonizing.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | # Import the necessary packages
17 | import subprocess
18 | import numpy as np
19 | import sys
20 | import joblib
21 | import pandas as pd
22 | from pandas_plink import read_plink1_bin
23 |
24 | # Define the harmonizing class
25 | import genoml.dependencies
26 |
class harmonizing:
    """Harmonize a test genotype set against a reference (training) dataset."""

    def __init__(self, test_geno_prefix, test_out_prefix, ref_model_prefix, training_SNPs):
        """Store the file prefixes and load the reference ``dataForML`` table.

        Args:
            test_geno_prefix: Prefix of the test PLINK binaries (passed to ``--bfile``).
            test_out_prefix: Prefix for every file this class writes.
            ref_model_prefix: Prefix of the reference run; ``<prefix>.dataForML.h5``
                is read with key ``dataForML``.
            training_SNPs: Path to the reference SNP + allele file.
        """
        # Initializing the variables we will use
        self.test_geno_prefix = test_geno_prefix
        self.test_out_prefix = test_out_prefix
        self.ref_model_prefix = ref_model_prefix
        self.training_SNPs = training_SNPs

        infile_h5 = ref_model_prefix + ".dataForML.h5"
        self.df = pd.read_hdf(infile_h5, key="dataForML")

    def generate_new_PLINK(self):
        """Create PLINK binaries for the test set restricted to the reference SNPs.

        Runs PLINK with ``--extract`` (first column of ``training_SNPs``) and
        ``--reference-allele`` (the ``training_SNPs`` file itself), writing to
        ``<test_out_prefix>.refSNPs_andAlleles``. The temporary SNP list is
        always removed; the PLINK ``.log`` is removed after a successful run
        and kept for diagnosis otherwise.

        Raises:
            subprocess.CalledProcessError: if PLINK exits with a non-zero status.
        """
        # The header of this module does not import os, so import it locally.
        import os

        # Show summary statistics of the reference dataframe
        print("")
        print("Your data looks like this (showing the first few lines of the left-most and right-most columns)...")
        print("#"*70)
        print(self.df.describe())
        print("#"*70)
        print("")

        # Save variables to use globally within the class
        self.y_test = self.df['PHENO']
        self.IDs_test = self.df['ID']
        self.X_test = self.df.drop(columns=['PHENO', 'ID'])

        # Read in the column of SNPs from the SNP+Allele file read in
        snps_alleles_df = pd.read_csv(self.training_SNPs, header=None)
        snps_only = snps_alleles_df.iloc[:, 0]
        snps_temp = self.test_out_prefix + '.SNPS_only_toKeep_temp.txt'
        snps_only.to_csv(snps_temp, header=False, index=False)

        print(f"A temporary file of SNPs from your reference dataset to keep in your testing dataset has been exported here: {snps_temp}")

        plink_exec = genoml.dependencies.check_plink()

        # Creating outfile with SNPs
        # Force the allele designations based on the reference dataset
        plink_outfile = self.test_out_prefix + ".refSNPs_andAlleles"

        print("")
        print(f"Now we will create PLINK binaries where the reference SNPS and alleles will be based off of your file here: {self.training_SNPs}")
        print("")

        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed through verbatim.
        # NOTE(review): assumes check_plink() returns the path of the
        # executable only (no extra arguments) -- confirm.
        plink_cmd = [
            str(plink_exec),
            "--bfile", self.test_geno_prefix,
            "--extract", snps_temp,
            "--reference-allele", self.training_SNPs,
            "--make-bed",
            "--out", plink_outfile,
        ]

        try:
            # check=True: do not announce success below if PLINK failed.
            subprocess.run(plink_cmd, check=True)
        finally:
            # Remove the .SNPS_only_toKeep_temp.txt file
            try:
                os.remove(snps_temp)
            except FileNotFoundError:
                pass

        # Remove the .log file
        try:
            os.remove(plink_outfile + ".log")
        except FileNotFoundError:
            pass

        self.plink_outfile = plink_outfile

        print("")
        print(f"A new set of PLINK binaries generated from your test dataset with the SNPs you decided to keep from the reference dataset have been made here: {plink_outfile}")
        print("")

    # def read_PLINK(self):
    #     # Read in using pandas PLINK (similar to munging)

    #     bed_file = self.plink_outfile + ".bed"
    #     plink_files_py = read_plink1_bin(bed_file)
    #     plink_files = plink_files_py.drop(['fid','father','mother','gender', 'trait', 'chrom', 'cm', 'pos','a1'])

    #     plink_files = plink_files.set_index({'sample':'iid','variant':'snp'})
    #     plink_files.values = plink_files.values.astype('int')

    #     # swap pandas-plink genotype coding to match .raw format...more about that below:

    #     # for example, assuming C in minor allele, alleles are coded in plink .raw labels homozygous for minor allele as 2 and homozygous for major allele as 0:
    #     #A A   ->    0
    #     #A C   ->    1
    #     #C C   ->    2
    #     #0 0   ->   NA

    #     # where as, read_plink1_bin flips these, with homozygous minor allele = 0 and homozygous major allele = 2
    #     #A A   ->    2
    #     #A C   ->    1
    #     #C C   ->    0
    #     #0 0   ->   NA

    #     two_idx = (plink_files.values == 2)
    #     zero_idx = (plink_files.values == 0)

    #     plink_files.values[two_idx] = 0
    #     plink_files.values[zero_idx] = 2

    #     plink_pd = plink_files.to_pandas()
    #     plink_pd.reset_index(inplace=True)
    #     raw_df = plink_pd.rename(columns={'sample': 'ID'})

    #     print("")
    #     print("Your data looks like this (showing the first few lines of the left-most and right-most columns)...")
    #     print("#"*70)
    #     print(raw_df.describe())
    #     print("#"*70)
    #     print("")

    #     self.raw_df = raw_df

    #     return raw_df

    def prep_refCols_file(self):
        """Write the reference dataset's column names, one per line.

        The file ``<test_out_prefix>.refColsHarmonize_toKeep.txt`` is meant to
        be used by the later munging step to keep the same columns in the
        test dataset.

        Returns:
            List of the reference column names, in DataFrame order.
        """
        # Make a list of the column names from the reference dataset
        ref_columns_list = self.df.columns.values.tolist()

        # Write out the columns to a text file we will use in munge later
        ref_cols_outfile = self.test_out_prefix + ".refColsHarmonize_toKeep.txt"

        with open(ref_cols_outfile, 'w') as filehandle:
            filehandle.writelines('%s\n' % col for col in ref_columns_list)

        print("")
        print(f"A file with the columns in the reference file, to later use in the munging step and keep these same columns for the test dataset, has been generated here: {ref_cols_outfile}")
        print("")

        return ref_columns_list
158 |
--------------------------------------------------------------------------------
/build/lib/genoml/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import json
17 | import os
18 | import time
19 | import traceback
20 |
21 | __author__ = 'Sayed Hadi Hashemi'
22 |
23 | import textwrap
24 |
25 |
class ColoredBox:
    """ANSI colour helper: a context manager plus a one-shot text wrapper.

    Entering the context prints the escape sequence selecting the colour;
    leaving it prints the reset sequence. Neither emits a newline.
    """

    # ANSI SGR foreground colour codes.
    BLACK = 30
    RED = 31
    GREEN = 32
    YELLOW = 33
    BLUE = 34
    MAGENTA = 35
    CYAN = 36
    WHITE = 37
    RESET = 39

    def __init__(self, color=None):
        """Remember the colour code to use; GREEN when none is given."""
        self.__color = self.GREEN if color is None else color

    def __enter__(self):
        """Switch the terminal to the configured colour."""
        opening = '\033[{}m'.format(self.__color)
        print(opening, end="")

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Reset terminal attributes; exceptions are not suppressed."""
        print("\x1b[0m", end="")

    @classmethod
    def wrap(cls, text, color):
        """Return ``text`` surrounded by the colour and reset sequences."""
        opening = '\033[{}m'.format(color)
        return "".join([opening, text, "\x1b[0m"])
51 |
52 |
class ContextScope:
    """Context manager that prints an indented, coloured progress banner.

    On entry the shared nesting level is increased and, if ``start`` is set,
    the title is printed. On a clean exit a ``[Done]`` line is printed when
    ``end`` is set. If the body raised, a ``[Failed]`` line, the error text
    and the traceback are printed and the process exits with status 1.
    """

    # Current nesting depth, shared by every scope.
    indent = 0
    # When true, the description is printed on entry.
    _verbose = False

    def __init__(self, title, description, error, start=True, end=False,
                 **kwargs):
        """Format the three message templates with ``kwargs``.

        Args:
            title: Template for the banner title.
            description: Template printed on entry in verbose mode.
            error: Template printed when the body raises.
            start: Print the title on entry.
            end: Print a ``[Done]`` line on successful exit.
            **kwargs: Values substituted into the templates via ``str.format``.
        """
        self._title = title.format(**kwargs)
        self._description = description.format(**kwargs)
        self._error = error.format(**kwargs)
        self._start = start
        self._end = end

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Report success or failure; terminate the process on failure."""
        if exc_type is None and exc_val is None and exc_tb is None:
            if self._end:
                print(
                    "{}{}: {}".format(
                        self.get_prefix(ColoredBox.GREEN),
                        ColoredBox.wrap(self._title, ColoredBox.GREEN),
                        ColoredBox.wrap('[Done]', ColoredBox.GREEN)))
            self.remove_indent()
            return

        print("{}{}: {}".format(
            self.get_prefix(ColoredBox.RED), self._title,
            ColoredBox.wrap('[Failed]', ColoredBox.RED)))
        print("{}".format(self.indent_text(self._error)))
        self.remove_indent()
        traceback.print_exception(exc_type, exc_val, exc_tb)
        # `exit()` is injected by the `site` module and is missing under
        # `python -S` or in frozen/embedded interpreters; raising SystemExit
        # gives the same exit status without that dependency.
        raise SystemExit(1)

    def __enter__(self):
        """Increase the nesting level and print the opening banner."""
        self.add_indent()
        if self._start:
            print()
            print("{}{}".format(self.get_prefix(ColoredBox.BLUE),
                                ColoredBox.wrap(self._title, ColoredBox.BLUE)))
        if self._verbose and self._description:
            print("{}".format(self._description))

    @classmethod
    def add_indent(cls):
        """Go one nesting level deeper."""
        cls.indent += 1

    @classmethod
    def remove_indent(cls):
        """Go one nesting level up."""
        cls.indent -= 1

    @classmethod
    def get_prefix(cls, color=None):
        """Return one ``"---> "`` arrow per nesting level, optionally coloured."""
        text = "---> " * cls.indent
        if color:
            text = ColoredBox.wrap(text, color)
        return text

    @classmethod
    def indent_text(cls, text):
        """Wrap ``text`` to a 70-column layout, indented to the current level."""
        WIDTH = 70
        # Indent slightly less than the arrow prefix; never negative at depth 0.
        indent = max(0, len(cls.get_prefix()) - 2)
        width = WIDTH - indent
        ret = textwrap.fill(text, width)
        ret = textwrap.indent(ret, " " * indent)
        return ret

    @classmethod
    def set_verbose(cls, verbose):
        """Enable or disable printing of descriptions for every scope."""
        cls._verbose = verbose
121 |
122 |
def function_description(**dkwargs):
    """Build a decorator that runs the wrapped function inside a ContextScope.

    Args:
        **dkwargs: Keyword arguments forwarded unchanged to ``ContextScope``
            on every call of the decorated function.

    Returns:
        A decorator. The decorated function returns whatever the original
        returns and keeps its ``__name__``, ``__doc__`` and other metadata.
    """
    # functools is not imported at the top of this module; import it locally.
    import functools

    def wrap(func):
        @functools.wraps(func)
        def func_wrapper(*args, **kwargs):
            with ContextScope(**dkwargs):
                return func(*args, **kwargs)

        return func_wrapper

    return wrap
132 |
133 |
class DescriptionLoader:
    """Lazy, class-level access to the entries of ``misc/descriptions.json``.

    The file is read once, on first lookup, and cached on the class.
    """

    # Parsed JSON content; None until the first lookup.
    _descriptions = None

    @classmethod
    def _load(cls):
        """Read ``misc/descriptions.json`` located next to this module."""
        description_file = os.path.join(os.path.dirname(__file__),
                                        "misc", "descriptions.json")
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(description_file, encoding="utf-8") as fp:
            cls._descriptions = json.load(fp)

    @classmethod
    def function_description(cls, key, **kwargs):
        """Return a decorator configured from entry ``key`` plus ``kwargs``."""
        dkwargs = cls.get(key)
        return function_description(**dkwargs, **kwargs)

    @classmethod
    def get(cls, key):
        """Return the entry stored under ``key``, loading the file if needed.

        Raises:
            KeyError: if ``key`` is not present in the loaded descriptions.
        """
        if cls._descriptions is None:
            cls._load()
        return cls._descriptions[key]

    @classmethod
    def context(cls, key, **kwargs):
        """Return a ``ContextScope`` configured from entry ``key`` plus ``kwargs``."""
        dkwargs = cls.get(key)
        return ContextScope(**dkwargs, **kwargs)

    @classmethod
    def print(cls, key, **kwargs):
        """Enter and immediately leave the scope for ``key`` to print its banner."""
        dkwargs = cls.get(key)
        with ContextScope(**dkwargs, **kwargs):
            pass
165 |
166 |
class Timer:
    """Wall-clock stopwatch, usable directly or as a context manager.

    ``start`` and ``end`` hold ``time.time()`` timestamps, or None when the
    timer has not been started / is still running.
    """

    def __init__(self):
        self.start = None
        self.end = None

    def start_timer(self):
        """Start (or restart) the timer."""
        self.start = time.time()
        # Forget a previous stop so a restarted timer is not measured
        # against a stale end timestamp.
        self.end = None

    def __enter__(self):
        self.start_timer()
        return self

    def __exit__(self, *args):
        self.stop_timer()

    def stop_timer(self):
        """Record the stop time."""
        self.end = time.time()

    def elapsed(self):
        """Return the measured seconds.

        For a stopped timer this is ``end - start``; for a running timer it
        is the time elapsed so far.

        Raises:
            TypeError: if the timer was never started.
        """
        end = time.time() if self.end is None else self.end
        return end - self.start
187 |
--------------------------------------------------------------------------------
/dist/genoml2-1.0.1.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/dist/genoml2-1.0.1.tar.gz
--------------------------------------------------------------------------------
/docs/current_file_structure.txt:
--------------------------------------------------------------------------------
1 | .
2 | ├── docs
3 | │ ├── current_file_structure.txt
4 | │ └── GettingStarted.sh
5 | ├── examples
6 | │ ├── continuous
7 | │ │ ├── example_GWAS.csv
8 | │ │ ├── training_addit.csv
9 | │ │ ├── training_pheno.csv
10 | │ │ ├── training.bed
11 | │ │ ├── training.bim
12 | │ │ ├── training.fam
13 | │ │ ├── validation_addit.csv
14 | │ │ ├── validation_pheno.csv
15 | │ │ ├── validation.bed
16 | │ │ ├── validation.bim
17 | │ │ └── validation.fam
18 | │ └── discrete
19 | │ ├── example_GWAS.csv
20 | │ ├── training_addit.csv
21 | │ ├── training_pheno.csv
22 | │ ├── training.bed
23 | │ ├── training.bim
24 | │ ├── training.fam
25 | │ ├── validation_addit.csv
26 | │ ├── validation_pheno.csv
27 | │ ├── validation.bed
28 | │ ├── validation.bim
29 | │ └── validation.fam
30 | ├── genoml
31 | │ ├── cli
32 | │ │ ├── __init__.py
33 | │ │ ├── continuous_supervised_test.py
34 | │ │ ├── continuous_supervised_train.py
35 | │ │ ├── continuous_supervised_tune.py
36 | │ │ ├── discrete_supervised_test.py
37 | │ │ ├── discrete_supervised_train.py
38 | │ │ └── discrete_supervised_tune.py
39 | │ ├── continuous
40 | │ │ ├── supervised
41 | │ │ │ ├── __init__.py
42 | │ │ │ ├── testing.py
43 | │ │ │ ├── training.py
44 | │ │ │ └── tuning.py
45 | │ │ └── __init__.py
46 | │ ├── discrete
47 | │ │ ├── supervised
48 | │ │ │ ├── __init__.py
49 | │ │ │ ├── testing.py
50 | │ │ │ ├── training.py
51 | │ │ │ └── tuning.py
52 | │ │ └── __init__.py
53 | │ ├── preprocessing
54 | │ │ ├── __init__.py
55 | │ │ ├── featureselection.py
56 | │ │ ├── harmonizing.py
57 | │ │ ├── munging.py
58 | │ │ └── vif.py
59 | │ ├── __init__.py
60 | │ ├── dependencies.py
61 | │ ├── GenoML.py
62 | │ ├── GenoMLHarmonizing.py
63 | │ ├── GenoMLMunging.py
64 | │ └── utils.py
65 | ├── outputs
66 | │ ├── test_continuous_geno_approx_feature_importance.txt
67 | │ ├── test_continuous_geno.best_algorithm.txt
68 | │ ├── test_continuous_geno.dataForML.h5
69 | │ ├── test_continuous_geno.p_threshold_variants.tab
70 | │ ├── test_continuous_geno.trainedModel_trainingSample_Predictions.csv
71 | │ ├── test_continuous_geno.trainedModel_withheldSample_Predictions.csv
72 | │ ├── test_continuous_geno.trainedModel_withheldSample_regression.png
73 | │ ├── test_continuous_geno.trainedModel.joblib
74 | │ ├── test_continuous_geno.training_withheldSamples_performanceMetrics.csv
75 | │ ├── test_continuous_geno.tunedModel_allSample_Predictions.csv
76 | │ ├── test_continuous_geno.tunedModel_allSample_regression.png
77 | │ ├── test_continuous_geno.tunedModel_CV_Summary.csv
78 | │ ├── test_continuous_geno.tunedModel_top10Iterations_Summary.csv
79 | │ ├── test_continuous_geno.tunedModel.joblib
80 | │ ├── test_continuous_geno.variants_and_alleles.tab
81 | │ ├── test_discrete_geno_approx_feature_importance.txt
82 | │ ├── test_discrete_geno.best_algorithm.txt
83 | │ ├── test_discrete_geno.dataForML.h5
84 | │ ├── test_discrete_geno.p_threshold_variants.tab
85 | │ ├── test_discrete_geno.trainedModel_trainingSample_featureImportance.csv
86 | │ ├── test_discrete_geno.trainedModel_trainingSample_Predictions.csv
87 | │ ├── test_discrete_geno.trainedModel_withheldSample_Predictions.csv
88 | │ ├── test_discrete_geno.trainedModel_withheldSample_probabilities.png
89 | │ ├── test_discrete_geno.trainedModel_withheldSample_ROC.png
90 | │ ├── test_discrete_geno.trainedModel.joblib
91 | │ ├── test_discrete_geno.training_withheldSamples_performanceMetrics.csv
92 | │ ├── test_discrete_geno.tunedModel_allSample_Predictions.csv
93 | │ ├── test_discrete_geno.tunedModel_allSample_probabilities.png
94 | │ ├── test_discrete_geno.tunedModel_allSample_ROC.png
95 | │ ├── test_discrete_geno.tunedModel_CV_Summary.csv
96 | │ ├── test_discrete_geno.tunedModel_top10Iterations_Summary.csv
97 | │ ├── test_discrete_geno.tunedModel.joblib
98 | │ ├── test_discrete_geno.variants_and_alleles.tab
99 | │ ├── test.csv
100 | │ ├── validation_test_continuous_geno_finalHarmonizedCols_toKeep.txt
101 | │ ├── validation_test_continuous_geno_refColsHarmonize_toKeep.txt
102 | │ ├── validation_test_continuous_geno_refSNPs_andAlleles.bed
103 | │ ├── validation_test_continuous_geno_refSNPs_andAlleles.bim
104 | │ ├── validation_test_continuous_geno_refSNPs_andAlleles.fam
105 | │ ├── validation_test_continuous_geno.best_algorithm.txt
106 | │ ├── validation_test_continuous_geno.dataForML.h5
107 | │ ├── validation_test_continuous_geno.testedModel_allSample_predictions.csv
108 | │ ├── validation_test_continuous_geno.testedModel_allSamples_performanceMetrics.csv
109 | │ ├── validation_test_continuous_geno.testedModel_allSamples_regressionPlot.png
110 | │ ├── validation_test_continuous_geno.trainedModel_trainingSample_Predictions.csv
111 | │ ├── validation_test_continuous_geno.trainedModel_withheldSample_Predictions.csv
112 | │ ├── validation_test_continuous_geno.trainedModel_withheldSample_regression.png
113 | │ ├── validation_test_continuous_geno.trainedModel.joblib
114 | │ ├── validation_test_continuous_geno.training_withheldSamples_performanceMetrics.csv
115 | │ ├── validation_test_continuous_geno.variants_and_alleles.tab
116 | │ ├── validation_test_continuous_genotestedModel_allSamples_regressionSummary.csv
117 | │ ├── validation_test_discrete_geno_finalHarmonizedCols_toKeep.txt
118 | │ ├── validation_test_discrete_geno_refColsHarmonize_toKeep.txt
119 | │ ├── validation_test_discrete_geno_refSNPs_andAlleles.bed
120 | │ ├── validation_test_discrete_geno_refSNPs_andAlleles.bim
121 | │ ├── validation_test_discrete_geno_refSNPs_andAlleles.fam
122 | │ ├── validation_test_discrete_geno.best_algorithm.txt
123 | │ ├── validation_test_discrete_geno.dataForML.h5
124 | │ ├── validation_test_discrete_geno.testedModel_allSample_predictions.csv
125 | │ ├── validation_test_discrete_geno.testedModel_allSample_probabilities.png
126 | │ ├── validation_test_discrete_geno.testedModel_allSample_ROC.png
127 | │ ├── validation_test_discrete_geno.testedModel_allSamples_performanceMetrics.csv
128 | │ ├── validation_test_discrete_geno.trainedModel_trainingSample_Predictions.csv
129 | │ ├── validation_test_discrete_geno.trainedModel_withheldSample_Predictions.csv
130 | │ ├── validation_test_discrete_geno.trainedModel_withheldSample_probabilities.png
131 | │ ├── validation_test_discrete_geno.trainedModel_withheldSample_ROC.png
132 | │ ├── validation_test_discrete_geno.trainedModel.joblib
133 | │ ├── validation_test_discrete_geno.training_withheldSamples_performanceMetrics.csv
134 | │ └── validation_test_discrete_geno.variants_and_alleles.tab
135 | ├── GettingStarted.sh
136 | ├── LICENSE
137 | ├── logo.png
138 | ├── README.md
139 | ├── requirements.txt
140 | └── setup.py
--------------------------------------------------------------------------------
/examples/continuous/to_adjust.txt:
--------------------------------------------------------------------------------
1 | snp410
2 | snp403
3 | snp164
--------------------------------------------------------------------------------
/examples/continuous/training.bed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/examples/continuous/training.bed
--------------------------------------------------------------------------------
/examples/continuous/validation.bed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/examples/continuous/validation.bed
--------------------------------------------------------------------------------
/examples/continuous/validation.fam:
--------------------------------------------------------------------------------
1 | valdiation34 valdiation34 0 0 2 2
2 | valdiation75 valdiation75 0 0 1 2
3 | valdiation65 valdiation65 0 0 2 2
4 | valdiation15 valdiation15 0 0 2 1
5 | valdiation5 valdiation5 0 0 1 2
6 | valdiation24 valdiation24 0 0 1 1
7 | valdiation14 valdiation14 0 0 2 2
8 | valdiation58 valdiation58 0 0 1 2
9 | valdiation10 valdiation10 0 0 1 2
10 | valdiation89 valdiation89 0 0 1 2
11 | valdiation20 valdiation20 0 0 2 1
12 | valdiation59 valdiation59 0 0 1 1
13 | valdiation19 valdiation19 0 0 2 2
14 | valdiation13 valdiation13 0 0 2 2
15 | valdiation21 valdiation21 0 0 2 2
16 | valdiation35 valdiation35 0 0 1 2
17 | valdiation1 valdiation1 0 0 2 1
18 | valdiation92 valdiation92 0 0 1 1
19 | valdiation74 valdiation74 0 0 2 2
20 | valdiation94 valdiation94 0 0 1 2
21 | valdiation2 valdiation2 0 0 2 2
22 | valdiation37 valdiation37 0 0 1 2
23 | valdiation44 valdiation44 0 0 2 1
24 | valdiation48 valdiation48 0 0 1 1
25 | valdiation49 valdiation49 0 0 1 2
26 | valdiation17 valdiation17 0 0 1 2
27 | valdiation18 valdiation18 0 0 1 2
28 | valdiation83 valdiation83 0 0 2 2
29 | valdiation68 valdiation68 0 0 1 2
30 | valdiation50 valdiation50 0 0 1 2
31 | valdiation22 valdiation22 0 0 1 2
32 | valdiation33 valdiation33 0 0 2 1
33 | valdiation43 valdiation43 0 0 2 1
34 | valdiation60 valdiation60 0 0 2 2
35 | valdiation70 valdiation70 0 0 1 2
36 | valdiation99 valdiation99 0 0 1 2
37 | valdiation36 valdiation36 0 0 2 1
38 | valdiation51 valdiation51 0 0 1 2
39 | valdiation76 valdiation76 0 0 2 2
40 | valdiation64 valdiation64 0 0 1 2
41 | valdiation69 valdiation69 0 0 2 1
42 | valdiation32 valdiation32 0 0 1 2
43 | valdiation88 valdiation88 0 0 1 2
44 | valdiation11 valdiation11 0 0 1 1
45 | valdiation3 valdiation3 0 0 1 2
46 | valdiation46 valdiation46 0 0 1 2
47 | valdiation27 valdiation27 0 0 1 2
48 | valdiation63 valdiation63 0 0 1 2
49 | valdiation4 valdiation4 0 0 1 2
50 | valdiation85 valdiation85 0 0 1 2
51 | valdiation23 valdiation23 0 0 1 1
52 | valdiation84 valdiation84 0 0 1 2
53 | valdiation71 valdiation71 0 0 1 2
54 | valdiation54 valdiation54 0 0 1 1
55 | valdiation55 valdiation55 0 0 2 1
56 | valdiation26 valdiation26 0 0 1 2
57 | valdiation56 valdiation56 0 0 1 2
58 | valdiation72 valdiation72 0 0 1 2
59 | valdiation93 valdiation93 0 0 1 1
60 | valdiation8 valdiation8 0 0 1 2
61 | valdiation30 valdiation30 0 0 1 2
62 | valdiation39 valdiation39 0 0 1 1
63 | valdiation81 valdiation81 0 0 1 1
64 | valdiation80 valdiation80 0 0 1 2
65 | valdiation100 valdiation100 0 0 2 2
66 | valdiation9 valdiation9 0 0 1 2
67 | valdiation96 valdiation96 0 0 1 2
68 | valdiation12 valdiation12 0 0 2 2
69 | valdiation6 valdiation6 0 0 1 2
70 | valdiation31 valdiation31 0 0 2 2
71 | valdiation45 valdiation45 0 0 2 2
72 | valdiation87 valdiation87 0 0 1 2
73 | valdiation53 valdiation53 0 0 1 2
74 | valdiation86 valdiation86 0 0 1 1
75 | valdiation91 valdiation91 0 0 1 2
76 | valdiation25 valdiation25 0 0 1 1
77 | valdiation95 valdiation95 0 0 1 2
78 | valdiation62 valdiation62 0 0 2 2
79 | valdiation42 valdiation42 0 0 2 2
80 | valdiation98 valdiation98 0 0 1 2
81 | valdiation16 valdiation16 0 0 2 2
82 | valdiation38 valdiation38 0 0 2 1
83 | valdiation52 valdiation52 0 0 1 2
84 | valdiation57 valdiation57 0 0 1 1
85 | valdiation47 valdiation47 0 0 1 2
86 | valdiation28 valdiation28 0 0 1 2
87 | valdiation78 valdiation78 0 0 2 1
88 | valdiation29 valdiation29 0 0 1 1
89 | valdiation97 valdiation97 0 0 2 2
90 | valdiation40 valdiation40 0 0 1 1
91 | valdiation66 valdiation66 0 0 1 2
92 | valdiation61 valdiation61 0 0 2 2
93 | valdiation77 valdiation77 0 0 2 2
94 | valdiation90 valdiation90 0 0 1 2
95 | valdiation79 valdiation79 0 0 1 2
96 | valdiation41 valdiation41 0 0 1 2
97 | valdiation82 valdiation82 0 0 1 1
98 | valdiation7 valdiation7 0 0 2 2
99 | valdiation67 valdiation67 0 0 1 2
100 | valdiation73 valdiation73 0 0 1 2
101 |
--------------------------------------------------------------------------------
/examples/continuous/validation_addit.csv:
--------------------------------------------------------------------------------
1 | ID,SEX_COV,UPSIT,FAMILY_HISTORY
2 | valdiation27,0,30,0
3 | valdiation10,0,30,0
4 | valdiation1,1,36,0
5 | valdiation77,1,21,1
6 | valdiation20,1,38,0
7 | valdiation70,0,12,0
8 | valdiation54,0,15,0
9 | valdiation56,0,12,0
10 | valdiation8,0,31,0
11 | valdiation23,0,33,0
12 | valdiation24,0,34,0
13 | valdiation38,1,30,0
14 | valdiation9,0,37,0
15 | valdiation60,1,20,1
16 | valdiation65,1,11,0
17 | valdiation50,0,22,0
18 | valdiation37,0,20,0
19 | valdiation19,1,36,1
20 | valdiation79,0,18,0
21 | valdiation36,1,27,0
22 | valdiation92,0,33,0
23 | valdiation33,1,36,0
24 | valdiation80,0,15,0
25 | valdiation90,0,38,0
26 | valdiation28,0,18,1
27 | valdiation3,0,25,1
28 | valdiation45,1,25,1
29 | valdiation64,0,15,0
30 | valdiation87,0,31,0
31 | valdiation49,0,17,0
32 | valdiation76,1,24,0
33 | valdiation30,0,33,0
34 | valdiation84,0,9,0
35 | valdiation88,0,13,0
36 | valdiation58,0,25,0
37 | valdiation100,1,17,1
38 | valdiation42,1,23,0
39 | valdiation67,0,17,0
40 | valdiation16,1,19,0
41 | valdiation43,1,30,0
42 | valdiation48,0,40,0
43 | valdiation18,0,15,0
44 | valdiation17,0,11,0
45 | valdiation2,1,28,1
46 | valdiation12,1,16,0
47 | valdiation34,1,26,1
48 | valdiation5,0,36,0
49 | valdiation63,0,32,0
50 | valdiation81,0,21,0
51 | valdiation22,0,15,1
52 | valdiation47,0,11,0
53 | valdiation75,0,28,1
54 | valdiation35,0,29,0
55 | valdiation21,1,34,0
56 | valdiation14,1,30,0
57 | valdiation31,1,20,0
58 | valdiation39,0,32,0
59 | valdiation83,1,38,0
60 | valdiation29,0,35,0
61 | valdiation4,0,5,0
62 | valdiation99,0,29,0
63 | valdiation73,0,27,1
64 | valdiation13,1,18,1
65 | valdiation91,0,15,1
66 | valdiation41,0,27,1
67 | valdiation78,1,35,0
68 | valdiation52,0,13,1
69 | valdiation6,0,19,0
70 | valdiation15,1,35,0
71 | valdiation55,1,37,0
72 | valdiation97,1,33,1
73 | valdiation26,0,8,0
74 | valdiation96,0,17,1
75 | valdiation85,0,22,1
76 | valdiation46,0,10,1
77 | valdiation95,0,22,1
78 | valdiation57,0,22,0
79 | valdiation61,1,20,0
80 | valdiation68,0,11,0
81 | valdiation25,0,28,0
82 | valdiation93,0,30,0
83 | valdiation44,1,33,0
84 | valdiation11,0,36,0
85 | valdiation62,1,19,0
86 | valdiation51,0,34,0
87 | valdiation94,0,10,0
88 | valdiation69,1,27,0
89 | valdiation32,0,9,0
90 | valdiation74,1,34,0
91 | valdiation86,0,35,0
92 | valdiation59,0,35,0
93 | valdiation53,0,35,0
94 | valdiation82,0,36,0
95 | valdiation71,0,15,1
96 | valdiation7,1,36,0
97 | valdiation66,0,25,0
98 | valdiation72,0,19,1
99 | valdiation98,0,22,0
100 | valdiation89,0,13,0
101 | valdiation40,0,38,0
102 |
--------------------------------------------------------------------------------
/examples/continuous/validation_pheno.csv:
--------------------------------------------------------------------------------
1 | ID,PHENO
2 | valdiation27,60
3 | valdiation10,59
4 | valdiation1,73
5 | valdiation77,75
6 | valdiation20,62
7 | valdiation70,52
8 | valdiation54,79
9 | valdiation56,76
10 | valdiation8,67
11 | valdiation23,55
12 | valdiation24,56
13 | valdiation38,67
14 | valdiation9,76
15 | valdiation60,42
16 | valdiation65,70
17 | valdiation50,65
18 | valdiation37,55
19 | valdiation19,65
20 | valdiation79,64
21 | valdiation36,56
22 | valdiation92,60
23 | valdiation33,59
24 | valdiation80,72
25 | valdiation90,68
26 | valdiation28,66
27 | valdiation3,57
28 | valdiation45,83
29 | valdiation64,50
30 | valdiation87,73
31 | valdiation49,72
32 | valdiation76,72
33 | valdiation30,62
34 | valdiation84,55
35 | valdiation88,69
36 | valdiation58,71
37 | valdiation100,68
38 | valdiation42,72
39 | valdiation67,72
40 | valdiation16,77
41 | valdiation43,45
42 | valdiation48,61
43 | valdiation18,59
44 | valdiation17,66
45 | valdiation2,51
46 | valdiation12,56
47 | valdiation34,63
48 | valdiation5,72
49 | valdiation63,50
50 | valdiation81,79
51 | valdiation22,55
52 | valdiation47,79
53 | valdiation75,75
54 | valdiation35,76
55 | valdiation21,57
56 | valdiation14,52
57 | valdiation31,71
58 | valdiation39,67
59 | valdiation83,48
60 | valdiation29,63
61 | valdiation4,78
62 | valdiation99,64
63 | valdiation73,65
64 | valdiation13,63
65 | valdiation91,74
66 | valdiation41,47
67 | valdiation78,57
68 | valdiation52,65
69 | valdiation6,55
70 | valdiation15,72
71 | valdiation55,74
72 | valdiation97,43
73 | valdiation26,65
74 | valdiation96,61
75 | valdiation85,59
76 | valdiation46,54
77 | valdiation95,71
78 | valdiation57,76
79 | valdiation61,57
80 | valdiation68,69
81 | valdiation25,74
82 | valdiation93,58
83 | valdiation44,66
84 | valdiation11,78
85 | valdiation62,77
86 | valdiation51,63
87 | valdiation94,72
88 | valdiation69,60
89 | valdiation32,73
90 | valdiation74,53
91 | valdiation86,77
92 | valdiation59,72
93 | valdiation53,73
94 | valdiation82,59
95 | valdiation71,60
96 | valdiation7,51
97 | valdiation66,75
98 | valdiation72,57
99 | valdiation98,74
100 | valdiation89,58
101 | valdiation40,61
102 |
--------------------------------------------------------------------------------
/examples/discrete/to_adjust.txt:
--------------------------------------------------------------------------------
1 | snp410
2 | snp403
3 | snp164
--------------------------------------------------------------------------------
/examples/discrete/training.bed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/examples/discrete/training.bed
--------------------------------------------------------------------------------
/examples/discrete/validation.bed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/examples/discrete/validation.bed
--------------------------------------------------------------------------------
/examples/discrete/validation.fam:
--------------------------------------------------------------------------------
1 | valdiation34 valdiation34 0 0 2 2
2 | valdiation75 valdiation75 0 0 1 2
3 | valdiation65 valdiation65 0 0 2 2
4 | valdiation15 valdiation15 0 0 2 1
5 | valdiation5 valdiation5 0 0 1 2
6 | valdiation24 valdiation24 0 0 1 1
7 | valdiation14 valdiation14 0 0 2 2
8 | valdiation58 valdiation58 0 0 1 2
9 | valdiation10 valdiation10 0 0 1 2
10 | valdiation89 valdiation89 0 0 1 2
11 | valdiation20 valdiation20 0 0 2 1
12 | valdiation59 valdiation59 0 0 1 1
13 | valdiation19 valdiation19 0 0 2 2
14 | valdiation13 valdiation13 0 0 2 2
15 | valdiation21 valdiation21 0 0 2 2
16 | valdiation35 valdiation35 0 0 1 2
17 | valdiation1 valdiation1 0 0 2 1
18 | valdiation92 valdiation92 0 0 1 1
19 | valdiation74 valdiation74 0 0 2 2
20 | valdiation94 valdiation94 0 0 1 2
21 | valdiation2 valdiation2 0 0 2 2
22 | valdiation37 valdiation37 0 0 1 2
23 | valdiation44 valdiation44 0 0 2 1
24 | valdiation48 valdiation48 0 0 1 1
25 | valdiation49 valdiation49 0 0 1 2
26 | valdiation17 valdiation17 0 0 1 2
27 | valdiation18 valdiation18 0 0 1 2
28 | valdiation83 valdiation83 0 0 2 2
29 | valdiation68 valdiation68 0 0 1 2
30 | valdiation50 valdiation50 0 0 1 2
31 | valdiation22 valdiation22 0 0 1 2
32 | valdiation33 valdiation33 0 0 2 1
33 | valdiation43 valdiation43 0 0 2 1
34 | valdiation60 valdiation60 0 0 2 2
35 | valdiation70 valdiation70 0 0 1 2
36 | valdiation99 valdiation99 0 0 1 2
37 | valdiation36 valdiation36 0 0 2 1
38 | valdiation51 valdiation51 0 0 1 2
39 | valdiation76 valdiation76 0 0 2 2
40 | valdiation64 valdiation64 0 0 1 2
41 | valdiation69 valdiation69 0 0 2 1
42 | valdiation32 valdiation32 0 0 1 2
43 | valdiation88 valdiation88 0 0 1 2
44 | valdiation11 valdiation11 0 0 1 1
45 | valdiation3 valdiation3 0 0 1 2
46 | valdiation46 valdiation46 0 0 1 2
47 | valdiation27 valdiation27 0 0 1 2
48 | valdiation63 valdiation63 0 0 1 2
49 | valdiation4 valdiation4 0 0 1 2
50 | valdiation85 valdiation85 0 0 1 2
51 | valdiation23 valdiation23 0 0 1 1
52 | valdiation84 valdiation84 0 0 1 2
53 | valdiation71 valdiation71 0 0 1 2
54 | valdiation54 valdiation54 0 0 1 1
55 | valdiation55 valdiation55 0 0 2 1
56 | valdiation26 valdiation26 0 0 1 2
57 | valdiation56 valdiation56 0 0 1 2
58 | valdiation72 valdiation72 0 0 1 2
59 | valdiation93 valdiation93 0 0 1 1
60 | valdiation8 valdiation8 0 0 1 2
61 | valdiation30 valdiation30 0 0 1 2
62 | valdiation39 valdiation39 0 0 1 1
63 | valdiation81 valdiation81 0 0 1 1
64 | valdiation80 valdiation80 0 0 1 2
65 | valdiation100 valdiation100 0 0 2 2
66 | valdiation9 valdiation9 0 0 1 2
67 | valdiation96 valdiation96 0 0 1 2
68 | valdiation12 valdiation12 0 0 2 2
69 | valdiation6 valdiation6 0 0 1 2
70 | valdiation31 valdiation31 0 0 2 2
71 | valdiation45 valdiation45 0 0 2 2
72 | valdiation87 valdiation87 0 0 1 2
73 | valdiation53 valdiation53 0 0 1 2
74 | valdiation86 valdiation86 0 0 1 1
75 | valdiation91 valdiation91 0 0 1 2
76 | valdiation25 valdiation25 0 0 1 1
77 | valdiation95 valdiation95 0 0 1 2
78 | valdiation62 valdiation62 0 0 2 2
79 | valdiation42 valdiation42 0 0 2 2
80 | valdiation98 valdiation98 0 0 1 2
81 | valdiation16 valdiation16 0 0 2 2
82 | valdiation38 valdiation38 0 0 2 1
83 | valdiation52 valdiation52 0 0 1 2
84 | valdiation57 valdiation57 0 0 1 1
85 | valdiation47 valdiation47 0 0 1 2
86 | valdiation28 valdiation28 0 0 1 2
87 | valdiation78 valdiation78 0 0 2 1
88 | valdiation29 valdiation29 0 0 1 1
89 | valdiation97 valdiation97 0 0 2 2
90 | valdiation40 valdiation40 0 0 1 1
91 | valdiation66 valdiation66 0 0 1 2
92 | valdiation61 valdiation61 0 0 2 2
93 | valdiation77 valdiation77 0 0 2 2
94 | valdiation90 valdiation90 0 0 1 2
95 | valdiation79 valdiation79 0 0 1 2
96 | valdiation41 valdiation41 0 0 1 2
97 | valdiation82 valdiation82 0 0 1 1
98 | valdiation7 valdiation7 0 0 2 2
99 | valdiation67 valdiation67 0 0 1 2
100 | valdiation73 valdiation73 0 0 1 2
101 |
--------------------------------------------------------------------------------
/examples/discrete/validation_addit.csv:
--------------------------------------------------------------------------------
1 | ID,SEX_COV,AGE,UPSIT,FAMILY_HISTORY
2 | valdiation27,0,60,30,0
3 | valdiation10,0,59,30,0
4 | valdiation1,1,73,36,0
5 | valdiation77,1,75,21,1
6 | valdiation20,1,62,38,0
7 | valdiation70,0,52,12,0
8 | valdiation54,0,79,15,0
9 | valdiation56,0,76,12,0
10 | valdiation8,0,67,31,0
11 | valdiation23,0,55,33,0
12 | valdiation24,0,56,34,0
13 | valdiation38,1,67,30,0
14 | valdiation9,0,76,37,0
15 | valdiation60,1,42,20,1
16 | valdiation65,1,70,11,0
17 | valdiation50,0,65,22,0
18 | valdiation37,0,55,20,0
19 | valdiation19,1,65,36,1
20 | valdiation79,0,64,18,0
21 | valdiation36,1,56,27,0
22 | valdiation92,0,60,33,0
23 | valdiation33,1,59,36,0
24 | valdiation80,0,72,15,0
25 | valdiation90,0,68,38,0
26 | valdiation28,0,66,18,1
27 | valdiation3,0,57,25,1
28 | valdiation45,1,83,25,1
29 | valdiation64,0,50,15,0
30 | valdiation87,0,73,31,0
31 | valdiation49,0,72,17,0
32 | valdiation76,1,72,24,0
33 | valdiation30,0,62,33,0
34 | valdiation84,0,55,9,0
35 | valdiation88,0,69,13,0
36 | valdiation58,0,71,25,0
37 | valdiation100,1,68,17,1
38 | valdiation42,1,72,23,0
39 | valdiation67,0,72,17,0
40 | valdiation16,1,77,19,0
41 | valdiation43,1,45,30,0
42 | valdiation48,0,61,40,0
43 | valdiation18,0,59,15,0
44 | valdiation17,0,66,11,0
45 | valdiation2,1,51,28,1
46 | valdiation12,1,56,16,0
47 | valdiation34,1,63,26,1
48 | valdiation5,0,72,36,0
49 | valdiation63,0,50,32,0
50 | valdiation81,0,79,21,0
51 | valdiation22,0,55,15,1
52 | valdiation47,0,79,11,0
53 | valdiation75,0,75,28,1
54 | valdiation35,0,76,29,0
55 | valdiation21,1,57,34,0
56 | valdiation14,1,52,30,0
57 | valdiation31,1,71,20,0
58 | valdiation39,0,67,32,0
59 | valdiation83,1,48,38,0
60 | valdiation29,0,63,35,0
61 | valdiation4,0,78,5,0
62 | valdiation99,0,64,29,0
63 | valdiation73,0,65,27,1
64 | valdiation13,1,63,18,1
65 | valdiation91,0,74,15,1
66 | valdiation41,0,47,27,1
67 | valdiation78,1,57,35,0
68 | valdiation52,0,65,13,1
69 | valdiation6,0,55,19,0
70 | valdiation15,1,72,35,0
71 | valdiation55,1,74,37,0
72 | valdiation97,1,43,33,1
73 | valdiation26,0,65,8,0
74 | valdiation96,0,61,17,1
75 | valdiation85,0,59,22,1
76 | valdiation46,0,54,10,1
77 | valdiation95,0,71,22,1
78 | valdiation57,0,76,22,0
79 | valdiation61,1,57,20,0
80 | valdiation68,0,69,11,0
81 | valdiation25,0,74,28,0
82 | valdiation93,0,58,30,0
83 | valdiation44,1,66,33,0
84 | valdiation11,0,78,36,0
85 | valdiation62,1,77,19,0
86 | valdiation51,0,63,34,0
87 | valdiation94,0,72,10,0
88 | valdiation69,1,60,27,0
89 | valdiation32,0,73,9,0
90 | valdiation74,1,53,34,0
91 | valdiation86,0,77,35,0
92 | valdiation59,0,72,35,0
93 | valdiation53,0,73,35,0
94 | valdiation82,0,59,36,0
95 | valdiation71,0,60,15,1
96 | valdiation7,1,51,36,0
97 | valdiation66,0,75,25,0
98 | valdiation72,0,57,19,1
99 | valdiation98,0,74,22,0
100 | valdiation89,0,58,13,0
101 | valdiation40,0,61,38,0
102 |
--------------------------------------------------------------------------------
/examples/discrete/validation_pheno.csv:
--------------------------------------------------------------------------------
1 | ID,PHENO
2 | valdiation1,0
3 | valdiation2,1
4 | valdiation3,1
5 | valdiation4,1
6 | valdiation5,1
7 | valdiation6,1
8 | valdiation7,1
9 | valdiation8,1
10 | valdiation9,1
11 | valdiation10,1
12 | valdiation11,0
13 | valdiation12,1
14 | valdiation13,1
15 | valdiation14,1
16 | valdiation15,0
17 | valdiation16,1
18 | valdiation17,1
19 | valdiation18,1
20 | valdiation19,1
21 | valdiation20,0
22 | valdiation21,1
23 | valdiation22,1
24 | valdiation23,0
25 | valdiation24,0
26 | valdiation25,0
27 | valdiation26,1
28 | valdiation27,1
29 | valdiation28,1
30 | valdiation29,0
31 | valdiation30,1
32 | valdiation31,1
33 | valdiation32,1
34 | valdiation33,0
35 | valdiation34,1
36 | valdiation35,1
37 | valdiation36,0
38 | valdiation37,1
39 | valdiation38,0
40 | valdiation39,0
41 | valdiation40,0
42 | valdiation41,1
43 | valdiation42,1
44 | valdiation43,0
45 | valdiation44,0
46 | valdiation45,1
47 | valdiation46,1
48 | valdiation47,1
49 | valdiation48,0
50 | valdiation49,1
51 | valdiation50,1
52 | valdiation51,1
53 | valdiation52,1
54 | valdiation53,1
55 | valdiation54,0
56 | valdiation55,0
57 | valdiation56,1
58 | valdiation57,0
59 | valdiation58,1
60 | valdiation59,0
61 | valdiation60,1
62 | valdiation61,1
63 | valdiation62,1
64 | valdiation63,1
65 | valdiation64,1
66 | valdiation65,1
67 | valdiation66,1
68 | valdiation67,1
69 | valdiation68,1
70 | valdiation69,0
71 | valdiation70,1
72 | valdiation71,1
73 | valdiation72,1
74 | valdiation73,1
75 | valdiation74,1
76 | valdiation75,1
77 | valdiation76,1
78 | valdiation77,1
79 | valdiation78,0
80 | valdiation79,1
81 | valdiation80,1
82 | valdiation81,0
83 | valdiation82,0
84 | valdiation83,1
85 | valdiation84,1
86 | valdiation85,1
87 | valdiation86,0
88 | valdiation87,1
89 | valdiation88,1
90 | valdiation89,1
91 | valdiation90,1
92 | valdiation91,1
93 | valdiation92,0
94 | valdiation93,0
95 | valdiation94,1
96 | valdiation95,1
97 | valdiation96,1
98 | valdiation97,1
99 | valdiation98,1
100 | valdiation99,1
101 | valdiation100,1
102 |
--------------------------------------------------------------------------------
/genoml/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | from genoml import preprocessing
17 | from genoml import discrete
18 | from genoml import continuous
19 | from genoml import cli
20 |
--------------------------------------------------------------------------------
/genoml/cli/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
--------------------------------------------------------------------------------
/genoml/cli/continuous_supervised_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | from genoml.continuous import supervised
17 | import joblib
18 | import pandas as pd
19 | from pathlib import Path
20 | import sys
21 |
22 |
def main(prefix, test_prefix, refModel_prefix):
    """Run the continuous supervised test stage for the run stored under ``prefix``.

    The munged dataset is read from ``<prefix>/Munge/dataForML.h5`` and the
    tuned model from ``<prefix>/Tune/tunedModel.joblib``; both are handed to
    ``supervised.test`` together with ``prefix``.

    Args:
        prefix: Run directory holding the ``Munge`` and ``Tune`` outputs; it
            is also passed on to ``supervised.test``.
        test_prefix: Only echoed in the start-up summary; it is not used to
            locate any file in this function.
        refModel_prefix: Only echoed in the start-up summary; it is not used
            to locate any file in this function.
    """
    # Start-up summary: interpreter version followed by the chosen CLI arguments.
    startup_messages = (
        "",
        "Here is some basic info on the command you are about to run.",
        "Python version info...",
        sys.version,
        "CLI argument info...",
        f"You are importing this test dataset: {test_prefix}.",
        f"You are applying the model saved here: {refModel_prefix}.",
        f"The results of this test application of your model will be saved in files with the given prefix: {prefix}.",
        "As a note, in all exported probabilities and other graphics, case status is treated as a 0 or 1, with 1 "
        "representing a positive case.",
        "",
    )
    for message in startup_messages:
        print(message)

    # Both inputs live under the run directory: data first, then the model.
    dataset = pd.read_hdf(Path(prefix) / "Munge" / "dataForML.h5", key="dataForML")
    tuned_model = joblib.load(Path(prefix) / "Tune" / "tunedModel.joblib")

    tester = supervised.test(dataset, tuned_model, prefix)

    # Prep and show the dataframe
    tester.prep_df()

    # Output the performance metrics
    tester.performance_metrics()

    # Export predictions on the withheld data
    tester.export_pheno_predictions()

    # Export the regression plot and summary
    tester.regression_summary()

    # Thank the user
    for message in (
        "",
        "Let's shut everything down, thanks for testing your model with GenoML!",
        "",
    ):
        print(message)
69 |
--------------------------------------------------------------------------------
/genoml/cli/continuous_supervised_train.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import sys
17 |
18 | import numpy as np
19 | import pandas as pd
20 | from pathlib import Path
21 | from genoml import utils
22 | from genoml.continuous import supervised
23 |
24 |
# TODO(mary): use or remove export_predictions
@utils.DescriptionLoader.function_description("cli/continuous_supervised_train")
def main(run_prefix, export_predictions, matching_columns_path):
    """Run the continuous supervised training stage for ``run_prefix``.

    Reads ``<run_prefix>/Munge/dataForML.h5``, optionally restricts it to the
    columns named in ``matching_columns_path``, then drives
    ``supervised.train`` through its summary, competition and export steps.

    Args:
        run_prefix: Run directory containing the ``Munge`` output; also passed
            to ``supervised.train`` and to the result-saving calls.
        export_predictions: Currently unused in this function (see the TODO
            above).
        matching_columns_path: Optional path to a text file with one column
            name per line. When truthy, only the columns present in both the
            file and the dataset are kept.
    """
    utils.DescriptionLoader.print(
        "cli/continuous_supervised_train/info",
        python_version=sys.version,
        prefix=run_prefix,
    )

    munged_path = Path(run_prefix) / "Munge" / "dataForML.h5"
    with utils.DescriptionLoader.context(
            "cli/continuous_supervised_train/input", path=munged_path):
        frame = pd.read_hdf(munged_path, key="dataForML")

    if matching_columns_path:
        with utils.DescriptionLoader.context(
                "cli/continuous_supervised_train/matching_columns_path",
                matching_columns_path=matching_columns_path):
            with open(matching_columns_path, 'r') as columns_file:
                wanted_columns = columns_file.read().splitlines()

            # np.intersect1d keeps only names present on both sides and
            # returns them sorted and de-duplicated.
            frame = frame[np.intersect1d(frame.columns, wanted_columns)]

    trainer = supervised.train(frame, run_prefix)
    trainer.summary()
    trainer.compete()
    trainer.export_model()
    trainer.export_predictions()
    trainer.save_algorithm_results(run_prefix)
    trainer.save_best_algorithm(run_prefix)
52 |
--------------------------------------------------------------------------------
/genoml/cli/continuous_supervised_tune.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import pandas as pd
17 | from genoml.continuous import supervised
18 | from pathlib import Path
19 | import numpy as np
20 |
21 |
def main(run_prefix, max_iter, cv_count, matchingCols):
    """Run the continuous supervised tuning stage for ``run_prefix``.

    Reads ``<run_prefix>/Munge/dataForML.h5``, optionally restricts it to the
    harmonized columns listed in ``matchingCols``, reports the best algorithm
    recorded in ``<run_prefix>/Train/best_algorithm.txt`` and then drives
    ``supervised.tune`` through its tuning, reporting and export steps.

    Args:
        run_prefix: Run directory containing the ``Munge`` and ``Train``
            outputs; also passed to ``supervised.tune``.
        max_iter: Maximum number of tuning iterations, forwarded to
            ``supervised.tune``.
        cv_count: Number of cross-validation rounds, forwarded to
            ``supervised.tune``.
        matchingCols: Optional path to a text file with one column name per
            line, or ``None`` to keep every column.

    Raises:
        ValueError: If the (optionally filtered) dataset lacks the ``PHENO``
            or ``ID`` column.
    """
    # TUNING
    # Create a dialogue with the user
    print("Here is some basic info on the command you are about to run.")
    print("CLI argument info...")
    print(f"Working with the dataset and best model corresponding to prefix {run_prefix} the timestamp from the merge "
          f"is the prefix in most cases.")
    print(f"Your maximum number of tuning iterations is {max_iter} and if you are concerned about runtime, make this "
          f"number smaller.")
    print(f"You are running {cv_count} rounds of cross-validation, and again... if you are concerned about runtime, "
          f"make this number smaller.")
    print("Give credit where credit is due, for this stage of analysis we use code from the great contributors to "
          "python packages: argparse, xgboost, sklearn, pandas, numpy, time, matplotlib and seaborn.")
    print("As a note, in all exported probabilities and other graphics, case status is treated as a 0 or 1, with 1 "
          "representing a positive case.")
    print("")

    infile_h5 = Path(run_prefix).joinpath("Munge").joinpath("dataForML.h5")
    df = pd.read_hdf(infile_h5, key="dataForML")

    # Addressing issue #12:
    if matchingCols is not None:
        print(f"We are using the harmonized columns you provided here: {matchingCols}")
        print("Note that you might have different/less features than before, given this was column list was harmonized between your reference and test dataset...")

        with open(matchingCols, 'r') as matchingCols_file:
            matching_column_names_list = matchingCols_file.read().splitlines()

        # Keep only the columns found in the file
        df = df[np.intersect1d(df.columns, matching_column_names_list)]

    # Fail fast when the phenotype or ID column is absent (for example after
    # the harmonized-column filter above). This replaces unused PHENO/ID
    # splits that copied the frame twice only to be discarded.
    missing_columns = [name for name in ("PHENO", "ID") if name not in df.columns]
    if missing_columns:
        raise ValueError(
            f"The dataset read from {infile_h5} is missing required column(s): "
            f"{', '.join(missing_columns)}"
        )

    # The training stage stores the winning algorithm's name in the first
    # cell of best_algorithm.txt.
    best_algo_name_in = Path(run_prefix).joinpath("Train").joinpath('best_algorithm.txt')
    best_algo_df = pd.read_csv(best_algo_name_in, header=None, index_col=False)
    best_algo = str(best_algo_df.iloc[0, 0])

    # Communicate to the user the best identified algorithm
    print(f"From previous analyses in the training phase, we've determined that the best algorithm for this "
          f"application is {best_algo}... so let's tune it up and see what gains we can make!")

    # Tuning
    # This calls on the functions made in the tune class (tuning.py) at genoml.continuous.supervised
    model_tune = supervised.tune(df, run_prefix, max_iter, cv_count)
    model_tune.select_tuning_parameters()  # Returns algo, hyperparameters, and scoring_metric
    model_tune.apply_tuning_parameters()  # Randomized search with CV to tune
    model_tune.report_tune()  # Summary of the top 10 iterations of the hyperparameter tune
    model_tune.summarize_tune()  # Summary of the cross-validation
    model_tune.compare_performance()  # Compares tuned performance to baseline
    model_tune.export_tuned_data()  # Export the newly tuned predictions
    model_tune.export_tune_regression()  # Export the tuned and fitted regression model

    print("")
    print("End of tuning stage with GenoML.")
    print("")
81 |
--------------------------------------------------------------------------------
/genoml/cli/discrete_supervised_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | from genoml.discrete import supervised
17 | import joblib
18 | import pandas as pd
19 | from pathlib import Path
20 | import sys
21 |
def main(prefix, test_prefix, refModel_prefix):
    """Run the discrete supervised test stage for the run stored under ``prefix``.

    The tuned model is loaded from ``<prefix>/Tune/tunedModel.joblib`` and the
    munged dataset from ``<prefix>/Munge/dataForML.h5``; both are handed to
    ``supervised.test`` together with ``prefix``.

    Args:
        prefix: Run directory holding the ``Munge`` and ``Tune`` outputs; it
            is also passed on to ``supervised.test``.
        test_prefix: Only echoed in the start-up summary; it is not used to
            locate any file in this function.
        refModel_prefix: Only echoed in the start-up summary; it is not used
            to locate any file in this function.
    """
    # Start-up summary: interpreter version followed by the chosen CLI arguments.
    startup_messages = (
        "",
        "Here is some basic info on the command you are about to run.",
        "Python version info...",
        sys.version,
        "CLI argument info...",
        f"You are importing this test dataset: {test_prefix}.",
        f"You are applying the model saved here: {refModel_prefix}.",
        f"The results of this test application of your model will be saved in files with the given prefix: {prefix}.",
        "As a note, in all exported probabilities and other graphics, case status is treated as a 0 or 1, with 1 representing a positive case.",
        "",
    )
    for message in startup_messages:
        print(message)

    run_dir = Path(prefix)
    data_path = run_dir / "Munge" / "dataForML.h5"
    model_path = run_dir / "Tune" / "tunedModel.joblib"

    # The model is loaded before the dataset is read.
    tuned_model = joblib.load(model_path)
    dataset = pd.read_hdf(data_path, key="dataForML")

    tester = supervised.test(dataset, tuned_model, prefix)

    # Prep and show the dataframe
    tester.prep_df()

    # Export the ROC and precision-recall plots
    tester.plot_results(save=True)

    # Export the probability histograms and data tables.
    tester.export_prediction_data()

    # NOTE: the additional summary-stats export (additional_sumstats) is
    # currently disabled.

    # Thank the user
    for message in (
        "",
        "Let's shut everything down, thanks for testing your model with GenoML!",
        "",
    ):
        print(message)
62 |
63 |
64 |
--------------------------------------------------------------------------------
/genoml/cli/discrete_supervised_train.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import sys
17 | import numpy as np
18 | import pandas as pd
19 | from pathlib import Path
20 | from genoml.discrete import supervised
21 |
22 |
def main(prefix, metric_max, prob_hist, auc, matchingCols):
    """Run the discrete supervised training stage for the run stored under ``prefix``.

    Reads ``<prefix>/Munge/dataForML.h5``, optionally restricts it to the
    harmonized columns listed in ``matchingCols``, then drives
    ``supervised.train`` through its summary, competition, reporting and
    export steps.

    Args:
        prefix: Run directory containing the ``Munge`` output; also passed to
            ``supervised.train``.
        metric_max: Metric used to pick the winning algorithm; echoed to the
            user and forwarded to ``model.results``.
        prob_hist: Not used in this function.
        auc: Not used in this function.
        matchingCols: Optional path to a text file with one column name per
            line, or ``None`` to keep every column.
    """
    print("")
    print("Here is some basic info on the command you are about to run.")
    print("Python Version info...")
    print(sys.version)

    # Print out chosen CLI arguments
    print("CLI argument info...")
    print(f"Working with dataset {prefix} from previous data munging efforts.")
    print(f"You have chosen to compete the algorithms based on {metric_max}.")
    print("Give credit where credit is due, for this stage of analysis we use code from the great contributors to Python packages: argparse, xgboost, sklearn, pandas, numpy, time, matplotlib and seaborn.")
    print("As a note, in all exported probabilities and other graphics, case status is treated as a 0 or 1, with 1 representing a positive case.")
    print("")

    # Load the munged dataset produced by the previous stage
    infile_h5 = Path(prefix).joinpath("Munge").joinpath("dataForML.h5")
    df = pd.read_hdf(infile_h5, key="dataForML")

    if matchingCols is not None:
        print(f"Looks like you are retraining your reference file. We are using the harmonized columns you provided here: {matchingCols}")
        print("Note that you might have different/less features than before, given this was harmonized between training and test dataset, and might mean your model now performs worse...")

        with open(matchingCols, 'r') as matchingCols_file:
            matching_column_names_list = matchingCols_file.read().splitlines()

        # Keep only the columns found in the file
        df = df[np.intersect1d(df.columns, matching_column_names_list)]

    model = supervised.train(df, prefix)
    model.summary()

    # Give user context prior to competing algorithms
    # Explains to users how we are splitting their data 70:30
    print("")
    print("Now let's compete these algorithms!")
    print("We'll update you as each algorithm runs, then summarize at the end.")
    print("Here we test each algorithm under default settings using the same training and test datasets derived from a 70% training and 30% testing split of your data.")
    print("For each algorithm, we will output the following metrics...")
    print("Algorithm name, hoping that's pretty self-explanatory. Plenty of resources on these common ML algorithms at https://scikit-learn.org and https://xgboost.readthedocs.io/.")
    print("AUC_percent, this is the area under the curve from receiver operating characteristic analyses. This is the most common metric of classifier performance in biomedical literature, we express this as a percent. We calculate AUC based on the predicted probability of being a case.")
    print("Accuracy_percent, this is the simple accuracy of the classifier, how many predictions were correct from best classification cutoff (python default).")
    print("Balanced_Accuracy_Percent, consider this as the accuracy resampled to a 1:1 mix of cases and controls. Imbalanced datasets can give funny results for simple accuracy.")
    print("Log_Loss, this is essentially the inverse of the likelihood function for a correct prediction, you want to minimize this.")
    print("Sensitivity, proportion of cases correctly identified.")
    print("Specificity, proportion of controls correctly identified.")
    print("PPV, this is the positive predictive value, the probability that subjects with a positive result actually have the disease.")
    print("NPV, this is the negative predictive value, the probability that subjects with a negative result don't have the disease.")
    print("We also log the runtimes per algorithm.")
    print("")
    print("Algorithm summaries incoming...")
    print("")

    # Compete the algorithms
    model.compete()

    # Output the results of the log
    model.results(metric_max)

    # Export the results
    model.export_model()

    # Export the ROC and precision-recall plots
    model.plot_results(save=True)

    # Export the probability histograms and data tables.
    model.export_prediction_data()

    # Save out the proper algorithm
    model.save_results(algorithm_results=True, best_algorithm=True)

    print("Thank you for training with GenoML!")
--------------------------------------------------------------------------------
/genoml/cli/discrete_supervised_tune.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import pandas as pd
17 | import numpy as np
18 | from pathlib import Path
19 | from genoml.discrete import supervised
20 |
21 |
def main(run_prefix, metric_tune, max_iter, cv_count, matchingCols):
    """Tune the best algorithm picked during the discrete training stage.

    Loads the munged dataset from ``<run_prefix>/Munge/dataForML.h5``,
    optionally restricts it to a list of harmonized columns, reports the
    algorithm recorded in ``<run_prefix>/Train/best_algorithm.txt`` and then
    drives ``supervised.tune`` through its tuning/reporting steps.

    Args:
        run_prefix: Directory prefix of the run (contains ``Munge`` and
            ``Train`` sub-directories).
        metric_tune: Metric handed to ``select_tuning_parameters``.
        max_iter: Maximum number of tuning iterations, passed to
            ``supervised.tune``.
        cv_count: Number of cross-validation rounds, passed to
            ``supervised.tune``.
        matchingCols: Path to a text file with one column name per line, or
            ``None`` to keep every column.
    """
    # TUNING
    # Create a dialogue with the user
    print("Here is some basic info on the command you are about to run.")
    print("CLI argument info...")
    print(f"Working with the dataset and best model corresponding to prefix {run_prefix} the timestamp from the merge is the prefix in most cases.")
    print(f"You have chosen to tune the algorithms based on {metric_tune}.")
    print(f"Your maximum number of tuning iterations is {max_iter} and if you are concerned about runtime, make this number smaller.")
    print(f"You are running {cv_count} rounds of cross-validation, and again... if you are concerned about runtime, make this number smaller.")
    print("Give credit where credit is due, for this stage of analysis we use code from the great contributors to python packages: argparse, xgboost, sklearn, pandas, numpy, time, matplotlib and seaborn.")
    print("As a note, in all exported probabilities and other graphics, case status is treated as a 0 or 1, with 1 representing a positive case.")

    print("")

    run_dir = Path(run_prefix)
    df = pd.read_hdf(run_dir / "Munge" / "dataForML.h5", key="dataForML")

    # Addressing issue #12:
    if matchingCols is not None:
        print(f"We are using the harmonized columns you provided here: {matchingCols}")
        print("Note that you might have different/less features than before, given this was column list was harmonized between your reference and test dataset...")

        with open(matchingCols, 'r') as matchingCols_file:
            matching_column_names_list = matchingCols_file.read().splitlines()

        # Keep only the columns found in the file. np.intersect1d returns the
        # shared names sorted and de-duplicated, so column order may change.
        df = df[np.intersect1d(df.columns, matching_column_names_list)]

    # The training stage stores the winning algorithm's name in the first
    # cell of this headerless file.
    best_algo_df = pd.read_csv(run_dir / "Train" / "best_algorithm.txt", header=None, index_col=False)
    best_algo = str(best_algo_df.iloc[0, 0])

    # Communicate to the user the best identified algorithm
    print(f"From previous analyses in the training phase, we've determined that the best algorithm for this application is {best_algo}... so let's tune it up and see what gains we can make!")

    # Tuning
    model_tune = supervised.tune(df, run_prefix, max_iter, cv_count)
    model_tune.select_tuning_parameters(metric_tune)
    model_tune.apply_tuning_parameters()
    model_tune.report_tune()
    model_tune.summarize_tune()
    model_tune.compare_performance()
    model_tune.plot_results(save=True)
    model_tune.export_prediction_data()

    print("")
    print("End of tuning stage with GenoML.")
    print("")
70 |
--------------------------------------------------------------------------------
/genoml/cli/harmonizing.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import sys
17 | from genoml import preprocessing
18 |
19 |
def main(test_geno_prefix, test_prefix, ref_model_prefix,
         training_snps_alleles):
    """Harmonize a test genotype dataset against a reference model.

    Prints a summary of the arguments, builds a ``preprocessing.harmonizing``
    object from them, regenerates the PLINK binaries for the test dataset and
    writes the reference-columns file used later by munging.

    Args:
        test_geno_prefix: Prefix of the test genotype dataset.
        test_prefix: Prefix under which the harmonized outputs are tagged.
        ref_model_prefix: Prefix of the reference run whose model is applied.
        training_snps_alleles: SNP/allele information from the training data.
    """
    # Print configurations
    banner = (
        "Here is some basic info on the command you are about to run.",
        "Python version info...",
        sys.version,
        "CLI argument info...",
        f"You are importing test dataset {test_geno_prefix}.",
        f"Applying the model saved from your reference dataset in {ref_model_prefix}.",
        f"Reading in the SNP and allele information we will use to compare from {training_snps_alleles}.",
        f"The results of this test application of your model will be saved in files tagged {test_prefix}.",
        "As a note, in all exported probabilities and other graphics, case status is treated as a 0 or 1, with 1 representing a positive case.",
    )
    for message in banner:
        print(message)

    # Run the harmonize script in genoml.preprocessing
    harmonizer = preprocessing.harmonizing(
        test_geno_prefix=test_geno_prefix,
        test_out_prefix=test_prefix,
        ref_model_prefix=ref_model_prefix,
        training_SNPs=training_snps_alleles,
    )

    # Generate new binaries from the test dataset using the reference dataset SNPs
    harmonizer.generate_new_PLINK()

    # Generate reference columns to keep for munging
    harmonizer.prep_refCols_file()

    # Thank the user
    print("Thank you for harmonizing with GenoML!")
54 |
--------------------------------------------------------------------------------
/genoml/cli/munging.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import argparse
17 | import sys
18 |
19 | import genoml.dependencies
20 | from genoml import preprocessing
21 |
22 |
def main(prefix, impute, geno, skip_prune, r2_cutoff, pheno, addit, feature_selection, gwas, p, vif, iter, ref_cols_harmonize, umap_reduce, adjust_data, adjust_normalize, target_features, confounders, data_type):
    """Run the munging stage: dependency check, PLINK processing and filters.

    After checking external dependencies and echoing the configuration, the
    function builds a ``preprocessing.munging`` object, processes the PLINK
    inputs and then optionally applies, in this order: the adjuster/UMAP
    step, extraTrees feature selection and VIF filtering.

    Args:
        prefix: Output prefix for this run.
        impute: Imputation method used to fill remaining NAs.
        geno: Genotype data path (or None).
        skip_prune: Whether SNP pruning is skipped.
        r2_cutoff: Pruning threshold.
        pheno: Phenotype file path.
        addit: Additional predictors file path (or None).
        feature_selection: Number of estimators for feature selection;
            the step only runs when this is greater than zero.
        gwas: External GWAS summary statistics path (or None).
        p: P value threshold for GWAS-based SNP filtering.
        vif: VIF threshold.
        iter: Number of VIF iterations; the step only runs when this is
            greater than zero.
        ref_cols_harmonize: Reference columns file from harmonizing.
        umap_reduce: "yes" to run UMAP dimensionality reduction.
        adjust_data: "yes" to adjust the data.
        adjust_normalize: "yes" to normalize the adjusted data.
        target_features: Features handed to the adjuster.
        confounders: Confounders handed to the adjuster.
        data_type: Data type handed to feature selection.
    """
    genoml.dependencies.check_dependencies()

    # Print configurations
    summary = [
        "",
        "Here is some basic info on the command you are about to run.",
        "Python version info...",
        sys.version,
        "CLI argument info...",
        f"The output prefix for this run is {prefix} and will be appended to later runs of GenoML.",
        f"Working with genotype data? {geno}",
        f"Do you want GenoML to prune your SNPs for you? {skip_prune}",
        f"The pruning threshold you've chosen is {r2_cutoff}",
        f"Working with additional predictors? {addit}",
        f"Where is your phenotype file? {pheno}",
        f"Any use for an external set of GWAS summary stats? {gwas}",
        f"If you plan on using external GWAS summary stats for SNP filtering, we'll only keep SNPs at what P value? {p}",
        f"How strong is your VIF filter? {vif}",
        f"How many iterations of VIF filtering are you doing? {iter}",
        f"The imputation method you picked is using the column {impute} to fill in any remaining NAs.",
        f"Will you be adjusting additional features using UMAP dimensionality reduction? {umap_reduce}",
        "Give credit where credit is due, for this stage of analysis we use code from the great contributors to python packages: os, sys, argparse, numpy, pandas, joblib, math and time. We also use PLINK v1.9 from https://www.cog-genomics.org/plink/1.9/.",
        "",
    ]
    for line in summary:
        print(line)

    # Run the munging script in genoml.preprocessing
    munger = preprocessing.munging(
        pheno_path=pheno,
        run_prefix=prefix,
        impute_type=impute,
        skip_prune=skip_prune,
        p_gwas=p,
        addit_path=addit,
        gwas_path=gwas,
        geno_path=geno,
        refColsHarmonize=ref_cols_harmonize,
        r2_cutoff=r2_cutoff,
    )

    # Process the PLINK inputs (for pruning)
    df = munger.plink_inputs()

    # Run the UMAP dimension reduction/ adjuster
    wants_adjust = adjust_data == "yes"
    if wants_adjust or umap_reduce == "yes":
        adjuster = preprocessing.adjuster(prefix, df, target_features, confounders, adjust_data, adjust_normalize, umap_reduce)
        reduced_df = adjuster.umap_reducer()
        if wants_adjust:
            print("\n You have chosen to adjust your data! \n")
            if adjust_normalize == "yes":
                print("\n You have also chosen to normalize your adjusted data \n")
            else:
                print("\n You have also chosen NOT to normalize your adjusted data \n")
        df = adjuster.normalize(reduced_df)

    # Run the feature selection using extraTrees
    if feature_selection > 0:
        selector = preprocessing.featureselection(prefix, df, data_type, feature_selection)
        df = selector.rank()
        selector.export_data()

    # Run the VIF calculation
    if iter > 0:
        vif_calc = preprocessing.vif(iter, vif, df, 100, prefix)
        vif_calc.vif_calculations()

    # Thank the user
    print("Thank you for munging with GenoML!")
102 |
--------------------------------------------------------------------------------
/genoml/continuous/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | from genoml.continuous import supervised
17 |
--------------------------------------------------------------------------------
/genoml/continuous/supervised/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | from genoml.continuous.supervised.training import train
17 | from genoml.continuous.supervised.tuning import tune
18 | from genoml.continuous.supervised.testing import test
19 |
--------------------------------------------------------------------------------
/genoml/continuous/supervised/testing.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | # Import the necessary packages
17 | import pandas as pd
18 | from pathlib import Path
19 | import seaborn as sns
20 | import statsmodels.formula.api as sm
21 | from sklearn.metrics import explained_variance_score, mean_squared_error, median_absolute_error, r2_score
22 |
class test:
    """Apply a loaded regression model to a test dataset and export results.

    Outputs are written to the ``Test`` sub-directory of ``run_prefix``.
    Call order matters: ``prep_df`` must run before ``performance_metrics``
    or ``export_pheno_predictions``, and ``export_pheno_predictions`` must
    run before ``regression_summary`` (which reads ``self.test_out``).
    """

    def __init__(self, df, loaded_model, run_prefix):
        """Store the inputs and make sure ``<run_prefix>/Test`` exists.

        Args:
            df: DataFrame holding at least the ``PHENO`` and ``ID`` columns.
            loaded_model: Model object exposing ``fit`` and ``predict``.
            run_prefix: Directory prefix of the run.
        """
        self.df = df
        path = Path(run_prefix).joinpath("Test")
        # parents/exist_ok: do not fail when the prefix directory is missing
        # or when the Test directory was already created by an earlier run.
        path.mkdir(parents=True, exist_ok=True)
        self.run_prefix = path
        self.loaded_model = loaded_model

    def prep_df(self):
        """Split ``self.df`` into features, phenotype and sample IDs.

        Sets ``self.y_test``, ``self.X_test`` and ``self.IDs_test``.

        Returns:
            The feature DataFrame (``self.df`` without ``PHENO`` and ``ID``).
        """
        print("")
        print("Your data looks like this (showing the first few lines of the left-most and right-most columns)...")
        print("#"*70)
        print(self.df.describe())
        print("#"*70)
        print("")

        # Save out and drop the PHENO and sample ID columns
        y_test = self.df.PHENO
        X_test = self.df.drop(columns=['PHENO'])
        IDs_test = X_test.ID
        X_test = X_test.drop(columns=['ID'])

        # Save variables to use globally within the class
        self.y_test = y_test
        self.X_test = X_test
        self.IDs_test = IDs_test

        return X_test

    def performance_metrics(self):
        """Compute, print and save regression metrics for the test data.

        Writes ``testedModel_allSamples_performanceMetrics.csv`` and stores
        the one-row table on ``self.log_table``.

        Returns:
            DataFrame with one row of metrics.
        """
        log_cols = ["Explained_variance_score", "Mean_squared_error", "Median_absolute_error", "R2_score"]

        # NOTE(review): the model is refitted on the test data before being
        # scored on that same data, so these metrics are not out-of-sample.
        # Kept as-is because it may be a deliberate workaround - confirm.
        self.loaded_model.fit(self.X_test, self.y_test)

        # One prediction pass feeds all four metrics.
        test_predictions = self.loaded_model.predict(self.X_test)
        evs = explained_variance_score(self.y_test, test_predictions)
        mse = mean_squared_error(self.y_test, test_predictions)
        mae = median_absolute_error(self.y_test, test_predictions)
        r2s = r2_score(self.y_test, test_predictions)

        print("")
        print("#"*70)
        print("Explained variance score: {:.4}".format(evs))
        print("Mean squared error: {:.4}".format(mse))
        print("Median absolute error: {:.4}".format(mae))
        print("R^2 score: {:.4}".format(r2s))

        # Build the single-row table directly instead of appending to an
        # empty frame through the private DataFrame._append API.
        log_table = pd.DataFrame([[evs, mse, mae, r2s]], columns=log_cols)

        print("#"*70)

        print("")

        log_outfile = self.run_prefix.joinpath('testedModel_allSamples_performanceMetrics.csv')

        print("")
        print(f"This table below is also logged as {log_outfile} and is in your current working directory...")
        print("#"*70)
        print(log_table)
        print("#"*70)
        print("")

        log_table.to_csv(log_outfile, index=False)

        self.log_table = log_table
        return log_table

    def export_pheno_predictions(self):
        """Export reported vs. predicted phenotypes per sample.

        Writes ``testedModel_allSample_predictions.csv`` and stores the table
        on ``self.test_out``.

        Returns:
            DataFrame with columns ``ID``, ``PHENO_REPORTED`` and
            ``PHENO_PREDICTED``.
        """
        test_predicted_values = self.loaded_model.predict(self.X_test)

        # Indexes are reset so the three pieces line up by position rather
        # than by whatever index the input frame carried.
        pieces = [
            pd.DataFrame(self.IDs_test).reset_index(drop=True),
            pd.DataFrame(self.y_test).reset_index(drop=True),
            pd.DataFrame(test_predicted_values).reset_index(drop=True),
        ]
        test_out = pd.concat(pieces, axis=1, ignore_index=True)
        test_out.columns = ["ID", "PHENO_REPORTED", "PHENO_PREDICTED"]

        test_outfile = self.run_prefix.joinpath('testedModel_allSample_predictions.csv')
        test_out.to_csv(test_outfile, index=False)

        print("")
        print(f"Preview of the exported predictions exported as {test_outfile}, these are pretty straight forward.")
        print("They generally include the sample ID, the previously reported phenotype, and the predicted phenotype from that algorithm.")
        print("#"*70)
        print(test_out.head())
        print("#"*70)

        self.test_out = test_out
        return test_out

    def regression_summary(self):
        """Plot and summarize ``PHENO_REPORTED ~ PHENO_PREDICTED``.

        Reads ``self.test_out`` (set by ``export_pheno_predictions``), saves
        a regression plot as PNG and the OLS summary as CSV.
        """
        sns_plot = sns.regplot(data=self.test_out, y="PHENO_REPORTED", x="PHENO_PREDICTED", scatter_kws={"color": "cyan"}, line_kws={"color": "purple"})

        plot_out = self.run_prefix.joinpath('testedModel_allSamples_regressionPlot.png')
        sns_plot.figure.savefig(plot_out, dpi=600)

        print("")
        print(f"We are also exporting a regression plot for you here {plot_out}, this is a graphical representation of the difference between the reported and predicted phenotypes in the withheld test data for the best performing algorithm.")

        print("")
        print("Here is a quick summary of the regression comparing PHENO_REPORTED ~ PHENO_PREDICTED in the withheld test data...")
        print("")

        reg_model = sm.ols(formula='PHENO_REPORTED ~ PHENO_PREDICTED', data=self.test_out)
        fitted = reg_model.fit()
        print(fitted.summary())

        fitted_out = self.run_prefix.joinpath('testedModel_allSamples_regressionSummary.csv')

        with open(fitted_out, 'w') as fh:
            fh.write(fitted.summary().as_csv())

        print(f"We are exporting this summary here: {fitted_out}")

        print("")
        print("...always good to see the P value for the predictor.")
158 |
--------------------------------------------------------------------------------
/genoml/dependencies.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import io
17 | import logging
18 | import os
19 | import pathlib
20 | import platform
21 | import requests
22 | import stat
23 | import subprocess
24 | import zipfile
25 |
26 | from genoml import utils
27 |
28 |
def __get_executable_folder():
    """Return the directory where dependency executables are kept.

    The ``GENOML_DEP_DIR`` environment variable, when present, wins and is
    returned as an absolute path; otherwise the folder
    ``~/.genoml/misc/executables`` is used.
    """
    override = os.environ.get("GENOML_DEP_DIR")
    if override is not None:
        return os.path.abspath(override)
    home_dir = str(pathlib.Path.home())
    return os.path.join(home_dir, ".genoml", "misc", "executables")


# Resolved once at import time; later changes to the environment variable
# are not picked up.
__executable_folder = __get_executable_folder()
39 |
40 |
def __check_exec(exec_path, *args, absolute_path=False):
    """Report whether an executable exists and can be launched.

    Args:
        exec_path: Binary name relative to the executables folder, or a full
            path when ``absolute_path`` is true.
        *args: Arguments passed to the binary for the trial run.
        absolute_path: Treat ``exec_path`` as a complete path.

    Returns:
        False when the path does not exist or the operating system refuses
        to start it; True once the process has been started. The exit status
        of the trial run is intentionally not inspected.
    """
    if absolute_path:
        binary_path = exec_path
    else:
        binary_path = os.path.join(__executable_folder, exec_path)

    if not os.path.exists(binary_path):
        return False

    try:
        subprocess.run([binary_path, *args], stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL)
    except OSError:
        # The path exists but cannot be executed (missing execute bit, a
        # directory, wrong binary format, ...): report it as unavailable
        # instead of crashing the dependency check.
        return False
    return True
52 |
53 |
def __install_exec(url, exec_path):
    """Download a zip archive and unpack it into the executables folder.

    Args:
        url: Location of the zip archive to download.
        exec_path: Name of the binary inside the archive, relative to the
            executables folder; it is made executable after extraction.

    Raises:
        requests.HTTPError: The server answered with an error status.
        zipfile.BadZipFile: The downloaded payload is not a zip archive.
    """
    # NOTE(review): verify=False switches off TLS certificate verification.
    # Left unchanged to avoid breaking existing setups - confirm it is needed.
    r = requests.get(url, verify=False, stream=True)
    # Fail on HTTP errors here rather than later with a confusing zip error.
    r.raise_for_status()
    r.raw.decode_content = True

    with zipfile.ZipFile(io.BytesIO(r.content), "r") as fp:
        fp.extractall(__executable_folder)

    binary_path = os.path.join(__executable_folder, exec_path)
    # Add the owner-execute bit to the existing mode; passing stat.S_IEXEC
    # alone to chmod would replace the mode and strip read/write permissions.
    current_mode = stat.S_IMODE(os.stat(binary_path).st_mode)
    os.chmod(binary_path, current_mode | stat.S_IEXEC)
64 |
65 |
def __check_package(name):
    """Locate a registered dependency, installing it when it is missing.

    Args:
        name: Key of the package in ``__DEPENDENCIES``.

    Returns:
        Path of the package's binary inside the executables folder.

    Raises:
        EnvironmentError: The package is unknown, the current operating
            system has no entry for it, or the binary still cannot be run
            after installation.
    """
    platform_system = platform.system()

    if name not in __DEPENDENCIES:
        raise EnvironmentError("Unknown package: {}".format(name))

    if platform_system not in __DEPENDENCIES[name]:
        raise EnvironmentError(
            "Unsupported OS: {}".format(platform_system))

    entry = __DEPENDENCIES[name][platform_system]
    binary_name = entry["binary"]
    args = entry["version_args"]
    url = entry["url"]
    installed_path = os.path.join(__executable_folder, binary_name)

    if __check_exec(binary_name, *args):
        logging.debug("{} is found".format(name))
        return installed_path

    logging.warning("Installing {}".format(name))
    __install_exec(url, binary_name)

    # Re-run the check so a broken download is reported immediately.
    if not __check_exec(binary_name, *args):
        logging.warning("Failed to run {} after installation".format(name))
        raise EnvironmentError("Can not install {}".format(name))
    return installed_path
93 |
94 |
@utils.DescriptionLoader.function_description("check_dependencies")
def check_dependencies():
    """Run the checker of every registered dependency.

    Returns:
        Dict mapping each package name that defines a ``checker`` to the
        value that checker returned.
    """
    results = {}
    for package, data in __DEPENDENCIES.items():
        if "checker" not in data:
            continue
        # Each check runs inside its own description context.
        context_name = "check_dependencies_{}".format(package)
        with utils.DescriptionLoader.context(context_name):
            results[package] = data["checker"]()

    return results
106 |
107 |
def check_plink():
    """Check the "Plink" dependency and return the result of that check."""
    plink_result = __check_package("Plink")
    return plink_result
110 |
111 |
# Registry of external dependencies. Each package maps "checker" to the
# function that verifies it, plus one entry per platform.system() name with
# the binary name, the arguments used for the trial run and the download URL.
__DEPENDENCIES = {
    "Plink": dict(
        checker=check_plink,
        Darwin=dict(
            binary="plink",
            version_args=["--version"],
            url="http://s3.amazonaws.com/plink1-assets/plink_mac_20200219.zip",
        ),
        Linux=dict(
            binary="plink",
            version_args=["--version"],
            url="http://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20200219.zip",
        ),
    ),
}
127 |
--------------------------------------------------------------------------------
/genoml/discrete/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | from genoml.discrete import supervised
--------------------------------------------------------------------------------
/genoml/discrete/supervised/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | from genoml.discrete.supervised.training import train
17 | from genoml.discrete.supervised.tuning import tune
18 | from genoml.discrete.supervised.testing import test
--------------------------------------------------------------------------------
/genoml/discrete/supervised/testing.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | # Import the necessary packages
17 | from pathlib import Path
18 | import genoml.discrete.utils as discrete_utils
19 |
class test:
    """Apply a loaded classifier to a test dataset and export results.

    Outputs are written to the ``Test`` sub-directory of ``run_prefix``.
    ``prep_df`` must be called before any of the other methods, since they
    read ``self.X_test``, ``self.y_test`` and ``self.IDs_test``.
    """

    def __init__(self, df, loaded_model, run_prefix):
        """Store the inputs and make sure ``<run_prefix>/Test`` exists.

        Args:
            df: DataFrame holding at least the ``PHENO`` and ``ID`` columns.
            loaded_model: Model object, stored as ``self.algo``.
            run_prefix: Directory prefix of the run.
        """
        self.df = df
        path = Path(run_prefix).joinpath("Test")
        # parents/exist_ok: do not fail when the prefix directory is missing
        # or when the Test directory was already created by an earlier run.
        path.mkdir(parents=True, exist_ok=True)
        self.run_prefix = path
        self.algo = loaded_model

    def prep_df(self):
        """Split ``self.df`` into features, phenotype and sample IDs.

        Sets ``self.y_test``, ``self.X_test`` and ``self.IDs_test``.
        """
        print("")
        print("Your data looks like this (showing the first few lines of the left-most and right-most columns)...")
        print("#"*70)
        print(self.df.describe())
        print("#"*70)
        print("")

        # Save out and drop the PHENO and sample ID columns
        y_test = self.df.PHENO
        X_test = self.df.drop(columns=['PHENO'])
        IDs_test = X_test.ID
        X_test = X_test.drop(columns=['ID'])

        # Save variables to use globally within the class
        self.y_test = y_test
        self.X_test = X_test
        self.IDs_test = IDs_test

    def plot_results(self, save=False):
        """Draw the ROC and precision-recall plots for the test data.

        Args:
            save: Forwarded to the plotting helpers.
        """
        # Issue #24: RandomForestClassifier is finicky - can't recalculate moving forward like the other
        # NOTE(review): this refits the model on the test data, so every
        # later prediction is in-sample - confirm this is still required.
        self.algo.fit(self.X_test, self.y_test)
        plot_path = self.run_prefix.joinpath('testModel_withheldSample_ROC.png')
        ground_truth = self.y_test.values
        predictions = self.algo.predict(self.X_test)
        discrete_utils.ROC(save, plot_path, ground_truth, predictions)
        # NOTE(review): the same plot_path is handed to both helpers; verify
        # that precision_recall_plot derives its own file name from it.
        discrete_utils.precision_recall_plot(save, plot_path, ground_truth, predictions)

    def export_prediction_data(self):
        """Export the prediction table and the probability histogram."""
        test_out = discrete_utils.export_prediction_tables(
            self.algo,
            self.y_test,
            self.X_test,
            self.IDs_test,
            self.run_prefix.joinpath('tunedModel_withheldSample_testingPredictions.csv'),
        )

        discrete_utils.export_prob_hist(
            test_out,
            self.run_prefix.joinpath('tunedModel_withheldSample_testingProbabilities'),
        )

    def additional_sumstats(self):
        """Compute, save and print summary statistics for the test data.

        Writes ``tunedModel_validationCohort_allCasesControls_performanceMetrics.csv``.
        """
        print("")
        print("#"*70)
        print("Some additional summary stats logging from your application of your model to the test dataset.")
        print("")

        log_outfile = self.run_prefix.joinpath('tunedModel_validationCohort_allCasesControls_performanceMetrics.csv')
        log_table = discrete_utils.summary_stats(
            self.algo,
            self.y_test,
            self.X_test,
        )
        log_table.to_csv(log_outfile, index=False)

        print("")
        print("#"*70)
        print("")
        print(f"This table below is also logged as {log_outfile} and is in your current working directory...")
        print("#"*70)
        print(log_table)
        print("#"*70)
        print("")
93 |
--------------------------------------------------------------------------------
/genoml/misc/descriptions.json:
--------------------------------------------------------------------------------
1 | {
2 | "check_dependencies_Plink": {
3 | "title": "Checking plink",
4 | "description": "",
5 | "error": ""
6 | },
7 | "check_dependencies": {
8 | "title": "Dependency Check",
9 | "description": "",
10 | "end": true,
11 | "error": ""
12 | },
13 | "cli/continuous_supervised_train": {
14 | "title": "GenoML",
15 | "description": "Continuous Supervised Train",
16 | "end": true,
17 | "error": ""
18 | },
19 | "cli/continuous_supervised_train/info": {
20 | "title": "Basic Info",
21 | "description": "Here is some basic info on the command you are about to run.\nPython version info:\n{python_version}\n\nWorking with dataset from previous data munging efforts at:\n\t{prefix}",
22 | "error": ""
23 | },
24 | "cli/continuous_supervised_train/input": {
25 | "title": "Reading Input File: {path}",
26 | "description": "",
27 | "error": ""
28 | },
29 | "cli/continuous_supervised_train/matching_columns_path": {
30 | "title": "",
31 | "description": "Looks like you are retraining your reference file. We are using the harmonized columns you provided here: {matching_columns_path}\nNote that you might have different/less features than before, given this was harmonized between training and test dataset, and might mean your model now performs worse...",
32 | "error": ""
33 | },
34 | "continuous/supervised/training/Train/summary": {
35 | "title": "Input Data Summary",
36 | "description": "Your data looks like this (showing the first few lines of the left-most and right-most columns)...\n\n{data}",
37 | "error": ""
38 | },
39 | "continuous/supervised/training/Train/compete": {
40 | "title": "Compete the algorithms",
41 | "description": "Now let's compete these algorithms!\nWe'll update you as each algorithm runs, then summarize at the end.\nHere we test each algorithm under default settings using the same training and test datasets derived from a 70% training and 30% testing split of your data.\nFor each algorithm, we will output the following metrics...\nAlgorithm name, hoping that's pretty self-explanatory. Plenty of resources on these common ML algorithms at https://scikit-learn.org and https://xgboost.readthedocs.io/.\nexplained_variance_score, this is the variance explained by the model per algorithm (scale from 0 to 1 with 1 being completely explained).\nmean_squared_error, this is the mean squared error from regression loss.\nmedian_absolute_error, median absolute error from regression loss.\nr2_score, standard r2 metric from linear regression (coefficient of determination), remember, this can be negative if your model is really bad.\nWe also log the runtimes per algorithm.\n\nAlgorithm summaries incoming...",
42 | "end": true,
43 | "error": ""
44 | },
45 | "continuous/supervised/training/Train/compete/algorithm": {
46 | "title": "{name}",
47 | "description": "",
48 | "error": ""
49 | },
50 | "continuous/supervised/training/Train/compete/algorithm/results": {
51 | "title": "{name} Results",
52 | "description": "{results}",
53 | "error": ""
54 | },
55 | "continuous/supervised/training/Train/compete/algorithm/best": {
56 | "title": "Best Algorithm: {algorithm}",
57 | "description": "There are occasionally slight fluctuations in model performance on the same withheld samples.\n{metrics}",
58 | "error": ""
59 | },
60 | "continuous/supervised/training/Train/export_model": {
61 | "title": "Exporting Model: {output_path}",
62 | "description": "This model has been saved as {output_path} for later use and can be found in your working directory.",
63 | "end": true,
64 | "error": ""
65 | },
66 | "continuous/supervised/training/Train/save_algorithm_results": {
67 | "title": "Saving Algorithm Results: {output_path}",
68 | "description": "This table below is also logged as {output_path} and is in your current working directory...\n\n{data}",
69 | "end": true,
70 | "error": ""
71 | },
72 | "continuous/supervised/training/Train/save_best_algorithm": {
73 | "title": "Saving Best Algorithm: {output_path}",
74 | "description": "Based on your withheld samples, the algorithm with the highest explained variance score is the {best_algorithm}... let's save that model name for you on {output_path}.",
75 | "end": true,
76 | "error": ""
77 | },
78 | "continuous/supervised/training/Train/export_predictions/test_data": {
79 | "title": "Saving Prediction on Test Data: {output_path}",
80 | "description": "Preview of the exported predictions for the withheld test data that has been exported as {output_path}; these are pretty straightforward.\nThey generally include the sample ID, the previously reported phenotype and the predicted phenotype from that algorithm.\n\n{data}",
81 | "end": true,
82 | "error": ""
83 | },
84 | "continuous/supervised/training/Train/export_predictions/train_data": {
85 | "title": "Saving Prediction on Train Data: {output_path}",
86 | "description": "Preview of the exported predictions for the training samples which is naturally overfit and exported as {output_path} in the similar format as in the withheld test dataset that was just exported.\n\n{data}",
87 | "end": true,
88 | "error": ""
89 | },
90 | "continuous/supervised/training/Train/export_predictions/plot": {
91 | "title": "Saving Regression Plot: {output_path}",
92 | "description": "Here is a quick summary of the regression comparing PHENO_REPORTED ~ PHENO_PREDICTED in the withheld test data...\n{data}\n...always good to see the P for the predictor.\n\nWe are also exporting a regression plot for you here {output_path} this is a graphical representation of the difference between the reported and predicted phenotypes in the withheld test data for the best performing algorithm.",
93 | "end": true,
94 | "error": ""
95 | }
96 | }
97 |
--------------------------------------------------------------------------------
/genoml/preprocessing/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | from genoml.preprocessing.munging import munging
17 | from genoml.preprocessing.vif import vif
18 | from genoml.preprocessing.featureselection import featureselection
19 | from genoml.preprocessing.harmonizing import harmonizing
20 | from genoml.preprocessing.adjuster import adjuster
21 |
# Public API of the preprocessing package: the classes imported above.
# Listing them makes `from genoml.preprocessing import *` export them
# (an empty list exported nothing).
__all__ = ["munging", "vif", "featureselection", "harmonizing", "adjuster"]
23 |
--------------------------------------------------------------------------------
/genoml/preprocessing/adjuster.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import pandas as pd
17 | import numpy as np
18 | import statsmodels.api as sm
19 | import statsmodels.formula.api as smf
20 | import statistics
21 | import umap.umap_ as umap
22 | from joblib import dump, load
23 | import matplotlib.pyplot as plt
24 | from matplotlib import style
25 | import seaborn as sns
26 |
class adjuster:
    """Adjust munged features for confounders, optionally after a UMAP reduction.

    The munged dataset is read from ``<run_prefix>.dataForML.h5`` (key
    ``dataForML``), the features to adjust from a one-column file, and the
    confounders from a CSV that must contain an ``ID`` column.
    """

    def __init__(self, run_prefix, df, target_features, confounders, adjust_data, adjust_normalize, umap_reduce):
        """Load the munged data, the target feature list and the confounders.

        Args:
            run_prefix: Path prefix used for every input/output file.
            df: Accepted for interface compatibility; the data is re-read from
                ``<run_prefix>.dataForML.h5`` and this argument is not used.
            target_features: Path to a file listing one feature name per line.
            confounders: Path to a CSV of confounders with an ``ID`` column.
            adjust_data: Stored on the instance; not read by this class.
            adjust_normalize: ``'yes'`` to z-score the residuals in ``normalize``.
            umap_reduce: ``"yes"`` to reduce the confounders with UMAP.
        """
        self.run_prefix = run_prefix
        self.umap_reduce = umap_reduce
        self.target_columns = target_features
        self.confounders = confounders
        self.adjust_data = adjust_data
        self.normalize_switch = adjust_normalize

        self.munged_data = self.run_prefix + ".dataForML.h5"

        self.target_data_df = pd.read_hdf(self.munged_data, 'dataForML')
        self.target_column_df = pd.read_csv(self.target_columns, names=['TARGETS'])

        self.confounders_df = pd.read_csv(self.confounders)

        # Keep only the requested features that survived munging (some may have
        # been removed because of --gwas, a std dev of 0, etc.). The requested
        # order is preserved and duplicates are dropped so runs are reproducible.
        available = set(self.target_data_df.columns)
        requested = self.target_column_df['TARGETS'].tolist()
        intersecting_list = [name for name in dict.fromkeys(requested) if name in available]
        self.target_column_df = pd.DataFrame(intersecting_list, columns=['TARGETS'])

    def umap_reducer(self):
        """Optionally replace the confounders with a 2-D UMAP embedding.

        When ``umap_reduce`` is ``"yes"`` this writes ``.umap_plot.png``,
        ``.umap_data_reduction.csv`` and ``.umap_clustering.joblib`` next to
        ``run_prefix`` and stores the embedding as the new confounders.

        Returns:
            The (possibly reduced) confounders DataFrame.
        """
        if self.umap_reduce != "yes":
            return self.confounders_df

        to_umap = self.confounders_df.drop(columns=['ID'])

        reducer = umap.UMAP(random_state=153)
        embedding = reducer.fit_transform(to_umap)

        out_data = pd.DataFrame({
            'ID': self.confounders_df['ID'].reset_index(drop=True),
            'UMAP_embedding1': embedding[:, 0],
            'UMAP_embedding2': embedding[:, 1],
        })

        # Plot
        print(f"Exporting UMAP plot...")
        fig, ax = plt.subplots(figsize=(12, 10))
        # No `c=` values are supplied, so a colormap would be ignored anyway.
        ax.scatter(embedding[:, 0], embedding[:, 1])
        ax.set_title("Data Reduction to 2 Dimensions by UMAP", fontsize=18)
        plot_out = self.run_prefix + '.umap_plot.png'
        fig.savefig(plot_out, dpi=600)
        # Release the figure; otherwise it stays registered with pyplot.
        plt.close(fig)

        print(f"The UMAP plot has been exported and can be found here: {plot_out}")

        out_file = self.run_prefix + '.umap_data_reduction.csv'
        out_data.to_csv(out_file, index=False)

        print(f"The reduced UMAP 2 dimensions per sample .csv file can be found here: {out_file}")

        # `fit_transform` already fitted the reducer; export that fitted object
        # instead of fitting a second time on the same data.
        algo_out = self.run_prefix + '.umap_clustering.joblib'
        # Kept for backward compatibility: this attribute previously ended up
        # holding the joblib path.
        self.runplot_out = algo_out
        dump(reducer, algo_out)

        self.confounders_df = out_data

        print(f"The UMAP .joblib file can be found here: {algo_out}")

        return self.confounders_df

    def normalize(self, confounders_df):
        """Replace each target feature by its residuals after regressing on the confounders.

        For every target an OLS model ``target ~ confounder1 + confounder2 ...``
        is fitted; the feature is replaced by the residuals, z-scored when
        ``normalize_switch`` is ``'yes'``. The result overwrites
        ``<run_prefix>.dataForML.h5``.

        Args:
            confounders_df: DataFrame with an ``ID`` column plus one column per
                confounder.

        Returns:
            The adjusted DataFrame restricted to the original munged columns.
        """
        target_list = list(self.target_column_df['TARGETS'])
        # Select confounders by name rather than by position so the ID column
        # does not have to be the first one in the file.
        confounder_list = [column for column in confounders_df.columns if column != 'ID']
        columns_to_keep_list = list(self.target_data_df.columns)

        adjustments_df = self.target_data_df.merge(confounders_df, how='inner', on='ID', suffixes=('', '_y'))

        formula_for_confounders = ' + '.join(confounder_list)
        normalize_residuals = self.normalize_switch == 'yes'

        for target in target_list:
            current_target = str(target)
            print(f"Looking at the following feature: {current_target}")

            current_formula = current_target + " ~ " + formula_for_confounders
            print(current_formula)

            target_model = smf.ols(formula=current_formula, data=adjustments_df).fit()

            # Work on a standalone Series so no scratch column can collide with
            # a real feature name; assignment below aligns on the row index.
            residuals = pd.to_numeric(target_model.resid)
            if normalize_residuals:
                residuals = (residuals - residuals.mean()) / residuals.std()
            adjustments_df[current_target] = residuals

        adjusted_df = adjustments_df[columns_to_keep_list]

        outfile_h5 = self.run_prefix + ".dataForML.h5"
        adjusted_df.to_hdf(outfile_h5, key='dataForML', mode='w')

        if normalize_residuals:
            print(f"\n The adjusted dataframe following normalization can be found here: {outfile_h5}, your updated .dataForML file \n")
        else:
            print(f"\n The adjusted dataframe without normalization can be found here: {outfile_h5}, your updated .dataForML file \n")

        return adjusted_df
132 |
--------------------------------------------------------------------------------
/genoml/preprocessing/featureselection.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import pandas as pd
17 | from sklearn import ensemble
18 | from sklearn import feature_selection
19 |
class featureselection:
    """Rank features with extra-trees and keep the ones above the importance threshold."""

    def __init__(self, run_prefix, df, data_type, n_est):
        """Split the munged data into IDs, phenotype and feature matrix.

        Args:
            run_prefix: Path prefix used for every output file.
            df: DataFrame with ``ID``, ``PHENO`` and feature columns. It is not
                modified; columns containing NAs are dropped from a copy.
            data_type: ``"d"`` for a discrete phenotype, ``"c"`` for continuous.
            n_est: Number of estimators for the extra-trees model.
        """
        self.run_prefix = run_prefix
        self.featureRanks = None
        self.n_est = n_est
        self.data_type = data_type
        # Filled in by rank(); initialised so export_data() can detect misuse.
        self.df_selecta = None
        self.featureScores_outfile = None

        # Double check there are no NAs in the dataset before proceeding.
        # Drop on a copy so the caller's DataFrame is left untouched.
        remove_cols = df.columns[df.isna().any()].tolist()
        df = df.drop(columns=remove_cols)

        self.y = df['PHENO']
        self.IDs = df['ID']
        self.X = df.drop(columns=['PHENO', 'ID'])

    def rank(self):
        """Fit the extra-trees model, write the importance table and reduce the data.

        Returns:
            DataFrame with ``ID``, ``PHENO`` and the selected feature columns.

        Raises:
            ValueError: If ``data_type`` is neither ``"d"`` nor ``"c"``.
        """
        # Fail early with a clear message instead of hitting an unbound `clf`.
        if self.data_type not in ("d", "c"):
            raise ValueError(
                f"data_type must be 'd' (discrete) or 'c' (continuous), got {self.data_type!r}"
            )

        print(f"\nBeginning featureSelection using {self.n_est} estimators...")

        if self.data_type == "d":
            print("\nusing extraTrees Classifier for your discrete dataset\n")
            clf = ensemble.ExtraTreesClassifier(n_estimators=self.n_est)
        else:
            print("\nusing extraTrees Regressor for your continuous dataset\n")
            clf = ensemble.ExtraTreesRegressor(n_estimators=self.n_est)

        clf.fit(self.X, self.y)
        self.featureRanks = clf.feature_importances_

        # Select the features above the default importance threshold; the
        # boolean mask is all that is needed to slice the DataFrame.
        model = feature_selection.SelectFromModel(clf, prefit=True)
        features_selected = model.get_support()
        X_reduced = self.X.iloc[:, features_selected]

        print("\nPrinting feature name that corresponds to the dataframe column name, then printing the relative importance as we go...\n")

        list_featureScores = []
        for col, score in zip(self.X.columns, self.featureRanks):
            print(col, score)
            list_featureScores.append([col, score])

        df_featureScores = pd.DataFrame(list_featureScores, columns=["Feature_Name", "Score"])
        df_featureScores = df_featureScores.sort_values(by=['Score'], ascending=False)
        featureScores_outfile = self.run_prefix + ".approx_feature_importance.txt"
        df_featureScores.to_csv(featureScores_outfile, index=False, sep="\t")

        print(f"\nYou have reduced your dataset to {X_reduced.shape[0]} samples at {X_reduced.shape[1]} features, not including ID and PHENO.\n")

        # Add ID and PHENO back in front of the retained features.
        df_selecta = pd.concat(
            [
                pd.DataFrame(self.IDs).reset_index(drop=True),
                self.y.reset_index(drop=True),
                X_reduced.reset_index(drop=True),
            ],
            axis=1,
            ignore_index=False,
        )

        self.df_selecta = df_selecta
        self.featureScores_outfile = featureScores_outfile

        return df_selecta

    def export_data(self):
        """Write the reduced dataset and the list of its columns to disk.

        Raises:
            RuntimeError: If ``rank()`` has not been run yet.
        """
        if self.df_selecta is None:
            raise RuntimeError("rank() must be called before export_data()")

        ## Export reduced data
        outfile_h5 = self.run_prefix + ".dataForML.h5"
        self.df_selecta.to_hdf(outfile_h5, key='dataForML')

        features_list = self.df_selecta.columns.values.tolist()

        features_listpath = self.run_prefix + ".list_features.txt"
        with open(features_listpath, 'w') as f:
            f.writelines("%s\n" % feature for feature in features_list)

        print(
            f"Exporting a new {outfile_h5} file that has a reduced feature set based on your importance approximations.\n"
            "This is a good dataset for general ML applications for the chosen PHENO as it includes only features that are likely to impact the model.\n"
            "\n"
            f"An updated list of {len(features_list)} features, including ID and PHENO, that is in your munged dataForML.h5 file can be found here {features_listpath}\n"
            "\n"
            f"A file with all your features, ranked from largest contributors at the top to smallest contributors at the bottom, can be found at {self.featureScores_outfile}.\n"
        )
109 |
--------------------------------------------------------------------------------
/genoml/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import json
17 | import os
18 | import time
19 | import traceback
20 |
21 | __author__ = 'Sayed Hadi Hashemi'
22 |
23 | import textwrap
24 |
25 |
class ColoredBox:
    """Context manager that colors everything printed inside it with an ANSI code.

    The class attributes are ANSI foreground color codes; ``wrap`` colors a
    single string without using the context manager.
    """

    BLACK = 30
    RED = 31
    GREEN = 32
    YELLOW = 33
    BLUE = 34
    MAGENTA = 35
    CYAN = 36
    WHITE = 37
    RESET = 39

    def __init__(self, color=None):
        """Remember the color to switch to; defaults to GREEN."""
        self.__color = self.GREEN if color is None else color

    def __enter__(self):
        # Emit the escape sequence that turns the color on (no newline).
        start_sequence = '\033[{}m'.format(self.__color)
        print(start_sequence, end="")

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Reset all terminal attributes when leaving the block.
        print("\x1b[0m", end="")

    @classmethod
    def wrap(cls, text, color):
        """Return ``text`` surrounded by the color-on and reset sequences."""
        pieces = ('\033[{}m'.format(color), text, "\x1b[0m")
        return "".join(pieces)
51 |
52 |
class ContextScope:
    """Context manager that prints an indented, colored title around a step.

    Nesting depth is tracked in the class attribute ``indent`` and is shared
    by all instances. On an exception the scope prints a failure banner and
    the traceback, then terminates the program with exit status 1.
    """

    indent = 0
    _verbose = False

    def __init__(self, title, description, error, start=True, end=False,
                 **kwargs):
        """Format the three templates with ``kwargs``.

        Args:
            title: Template for the heading.
            description: Template printed on entry when verbose mode is on.
            error: Template printed when the scope fails.
            start: Print the heading on entry.
            end: Print a ``[Done]`` line on successful exit.
            **kwargs: Values substituted into the templates via ``str.format``.
        """
        self._title = title.format(**kwargs)
        self._description = description.format(**kwargs)
        self._error = error.format(**kwargs)
        self._start = start
        self._end = end

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_type is None and exc_val is None and exc_tb is None:
            if self._end:
                print(
                    "{}{}: {}".format(
                        self.get_prefix(ColoredBox.GREEN),
                        ColoredBox.wrap(self._title, ColoredBox.GREEN),
                        ColoredBox.wrap('[Done]', ColoredBox.GREEN)))
            self.remove_indent()
        else:
            print("{}{}: {}".format(
                self.get_prefix(ColoredBox.RED), self._title,
                ColoredBox.wrap('[Failed]', ColoredBox.RED)))
            print("{}".format(self.indent_text(self._error)))
            self.remove_indent()
            traceback.print_exception(exc_type, exc_val, exc_tb)
            # Raise SystemExit directly: the `exit` builtin is injected by the
            # `site` module and is not guaranteed to exist in every interpreter.
            raise SystemExit(1)

    def __enter__(self):
        self.add_indent()
        if self._start:
            print()
            print("{}{}".format(self.get_prefix(ColoredBox.BLUE),
                                ColoredBox.wrap(self._title, ColoredBox.BLUE)))
        if self._verbose and self._description:
            print("{}".format(self._description))
        # Returning the scope makes `with ContextScope(...) as scope:` usable.
        return self

    @classmethod
    def add_indent(cls):
        """Increase the shared nesting depth by one."""
        cls.indent += 1

    @classmethod
    def remove_indent(cls):
        """Decrease the shared nesting depth by one."""
        cls.indent -= 1

    @classmethod
    def get_prefix(cls, color=None):
        """Return one ``"---> "`` per nesting level, colored when ``color`` is truthy."""
        text = "---> " * cls.indent
        if color:
            text = ColoredBox.wrap(text, color)
        return text

    @classmethod
    def indent_text(cls, text):
        """Wrap ``text`` to a 70-column layout and indent it under the current prefix."""
        WIDTH = 70
        # The indent is the uncolored prefix length minus two, never negative.
        indent = max(0, len(cls.get_prefix()) - 2)
        width = WIDTH - indent
        ret = textwrap.fill(text, width)
        ret = textwrap.indent(ret, " " * indent)
        return ret

    @classmethod
    def set_verbose(cls, verbose):
        """Turn printing of descriptions on or off for all scopes."""
        cls._verbose = verbose
121 |
122 |
def function_description(**dkwargs):
    """Build a decorator that runs the decorated function inside a ContextScope.

    Args:
        **dkwargs: Keyword arguments forwarded to ``ContextScope`` on every
            call of the decorated function.

    Returns:
        A decorator; the wrapped function returns whatever the original returns.
    """
    import functools

    def wrap(func):
        # Preserve the wrapped function's name, docstring and signature metadata.
        @functools.wraps(func)
        def func_wrapper(*args, **kwargs):
            with ContextScope(**dkwargs):
                return func(*args, **kwargs)

        return func_wrapper

    return wrap
132 |
133 |
class DescriptionLoader:
    """Lazy, class-level cache of the entries in ``misc/descriptions.json``.

    Each entry is a dict of ``ContextScope`` keyword arguments, looked up by key.
    """

    _descriptions = None

    @classmethod
    def _load(cls):
        """Read ``misc/descriptions.json`` (next to this module) into the cache."""
        description_file = os.path.join(os.path.dirname(__file__),
                                        "misc", "descriptions.json")
        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(description_file, encoding="utf-8") as fp:
            cls._descriptions = json.load(fp)

    @classmethod
    def function_description(cls, key, **kwargs):
        """Return a decorator that wraps a function in the scope described by ``key``."""
        dkwargs = cls.get(key)
        return function_description(**dkwargs, **kwargs)

    @classmethod
    def get(cls, key):
        """Return the description entry for ``key``, loading the file on first use.

        Raises:
            KeyError: If ``key`` is not present in the descriptions.
        """
        if cls._descriptions is None:
            cls._load()
        return cls._descriptions[key]

    @classmethod
    def context(cls, key, **kwargs):
        """Return a ``ContextScope`` built from the entry for ``key`` plus ``kwargs``."""
        dkwargs = cls.get(key)
        return ContextScope(**dkwargs, **kwargs)

    @classmethod
    def print(cls, key, **kwargs):
        """Enter and immediately leave the scope for ``key`` so only its text is printed."""
        dkwargs = cls.get(key)
        with ContextScope(**dkwargs, **kwargs):
            pass
165 |
166 |
class Timer:
    """Simple wall-clock stopwatch, usable directly or as a context manager.

    ``start`` and ``end`` hold ``time.time()`` timestamps (or ``None`` when
    not yet set).
    """

    def __init__(self):
        self.start = None
        self.end = None

    def start_timer(self):
        """Record the start time and clear any end time from a previous run."""
        self.start = time.time()
        # Without this reset a restarted timer would report a stale interval.
        self.end = None

    def __enter__(self):
        self.start_timer()
        return self

    def __exit__(self, *args):
        self.stop_timer()

    def stop_timer(self):
        """Record the end time."""
        self.end = time.time()

    def elapsed(self):
        """Return the seconds between start and stop, or since start if still running.

        Raises:
            RuntimeError: If the timer was never started.
        """
        if self.start is None:
            raise RuntimeError("Timer has not been started")
        finish = time.time() if self.end is None else self.end
        return finish - self.start
187 |
--------------------------------------------------------------------------------
/genoml2.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
1 | LICENSE
2 | MANIFEST.in
3 | README.md
4 | requirements.txt
5 | setup.cfg
6 | setup.py
7 | genoml/__init__.py
8 | genoml/__main__.py
9 | genoml/dependencies.py
10 | genoml/utils.py
11 | genoml/cli/__init__.py
12 | genoml/cli/continuous_supervised_test.py
13 | genoml/cli/continuous_supervised_train.py
14 | genoml/cli/continuous_supervised_tune.py
15 | genoml/cli/discrete_supervised_test.py
16 | genoml/cli/discrete_supervised_train.py
17 | genoml/cli/discrete_supervised_tune.py
18 | genoml/cli/harmonizing.py
19 | genoml/cli/munging.py
20 | genoml/continuous/__init__.py
21 | genoml/continuous/supervised/__init__.py
22 | genoml/continuous/supervised/testing.py
23 | genoml/continuous/supervised/training.py
24 | genoml/continuous/supervised/tuning.py
25 | genoml/discrete/__init__.py
26 | genoml/discrete/utils.py
27 | genoml/discrete/supervised/__init__.py
28 | genoml/discrete/supervised/testing.py
29 | genoml/discrete/supervised/training.py
30 | genoml/discrete/supervised/tuning.py
31 | genoml/misc/descriptions.json
32 | genoml/preprocessing/__init__.py
33 | genoml/preprocessing/adjuster.py
34 | genoml/preprocessing/featureselection.py
35 | genoml/preprocessing/harmonizing.py
36 | genoml/preprocessing/munging.py
37 | genoml/preprocessing/vif.py
38 | genoml2.egg-info/PKG-INFO
39 | genoml2.egg-info/SOURCES.txt
40 | genoml2.egg-info/dependency_links.txt
41 | genoml2.egg-info/entry_points.txt
42 | genoml2.egg-info/requires.txt
43 | genoml2.egg-info/top_level.txt
--------------------------------------------------------------------------------
/genoml2.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/genoml2.egg-info/entry_points.txt:
--------------------------------------------------------------------------------
1 | [console_scripts]
2 | genoml = genoml.__main__:handle_main
3 |
--------------------------------------------------------------------------------
/genoml2.egg-info/requires.txt:
--------------------------------------------------------------------------------
1 | joblib
2 | matplotlib
3 | numpy
4 | tables
5 | pandas
6 | pandas_plink
7 | requests
8 | scikit-learn
9 | scipy
10 | seaborn
11 | statsmodels
12 | xgboost==2.0.3
13 | umap-learn
14 |
--------------------------------------------------------------------------------
/genoml2.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | genoml
2 |
--------------------------------------------------------------------------------
/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/logo.png
--------------------------------------------------------------------------------
/outputs/test_discrete_geno.best_algorithm.txt:
--------------------------------------------------------------------------------
1 | SGDClassifier
--------------------------------------------------------------------------------
/outputs/test_discrete_geno.dataForML.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/outputs/test_discrete_geno.dataForML.h5
--------------------------------------------------------------------------------
/outputs/test_discrete_geno.list_features.txt:
--------------------------------------------------------------------------------
1 | ID
2 | PHENO
3 | snp410
4 | snp403
5 | snp164
6 | snp363
7 | snp439
8 | snp370
9 | snp389
10 | snp475
11 | snp399
12 | snp87
13 | snp308
14 | snp223
15 | snp131
16 | snp94
17 | snp79
18 | snp420
19 | snp344
20 | snp281
21 | snp139
22 | snp379
23 | snp77
24 | snp431
25 | snp445
26 | snp360
27 | snp137
28 | snp27
29 | snp432
30 | snp28
31 | snp29
32 | snp208
33 | snp319
34 | snp30
35 | snp85
36 | snp433
37 | snp380
38 | snp31
39 | snp499
40 | snp443
41 | snp488
42 | snp32
43 | snp33
44 | snp113
45 | snp242
46 | snp369
47 | snp34
48 | snp278
49 | snp35
50 | snp160
51 | snp429
52 | snp159
53 | snp396
54 | snp374
55 | snp437
56 | snp486
57 | snp376
58 | snp268
59 | snp255
60 | snp423
61 | snp250
62 | snp411
63 | snp194
64 | snp267
65 | snp424
66 | snp408
67 | snp260
68 | snp130
69 | snp110
70 | snp372
71 | snp409
72 | snp138
73 | snp37
74 | snp343
75 | snp248
76 | snp352
77 | snp38
78 | snp262
79 | snp441
80 | snp39
81 | snp307
82 | snp393
83 | snp133
84 | snp84
85 | snp158
86 | snp292
87 | snp413
88 | snp336
89 | snp204
90 | snp465
91 | snp213
92 | snp350
93 | snp43
94 | snp44
95 | snp150
96 | snp434
97 | snp483
98 | snp143
99 | snp270
100 | snp98
101 | snp341
102 | snp243
103 | snp495
104 | snp145
105 | snp45
106 | snp155
107 | snp490
108 | snp192
109 | snp383
110 | snp163
111 | snp303
112 | snp148
113 | snp115
114 | snp325
115 | snp455
116 | snp418
117 | snp195
118 | snp210
119 | snp199
120 | snp109
121 | snp263
122 | snp442
123 | snp498
124 | snp168
125 | snp232
126 | snp238
127 | snp482
128 | snp41
129 | snp419
130 | snp42
131 | snp428
132 | snp272
133 | snp114
134 | snp293
135 | snp167
136 | snp55
137 | snp56
138 | snp406
139 | snp47
140 | snp201
141 | snp313
142 | snp449
143 | snp49
144 | snp290
145 | snp458
146 | snp473
147 | snp385
148 | snp276
149 | snp333
150 | snp462
151 | snp479
152 | snp50
153 | snp51
154 | snp52
155 | snp183
156 | snp318
157 | snp166
158 | snp241
159 | snp315
160 | snp53
161 | snp91
162 | snp284
163 | snp280
164 | snp354
165 | snp46
166 | snp464
167 | snp83
168 | snp351
169 | snp353
170 | snp497
171 | snp48
172 | snp100
173 | snp88
174 | snp282
175 | snp178
176 | snp461
177 | snp329
178 | snp259
179 | snp211
180 | snp422
181 | snp346
182 | snp59
183 | snp170
184 | snp73
185 | snp417
186 | snp436
187 | snp57
188 | snp86
189 | snp312
190 | snp99
191 | snp112
192 | snp217
193 | snp427
194 | snp338
195 | snp58
196 | snp275
197 | snp182
198 | snp357
199 | snp89
200 | snp181
201 | snp203
202 | snp302
203 | snp324
204 | snp310
205 | snp298
206 | snp185
207 | snp61
208 | snp392
209 | snp452
210 | snp189
211 | snp62
212 | snp334
213 | snp162
214 | snp348
215 | snp96
216 | snp247
217 | snp253
218 | snp228
219 | snp416
220 | snp172
221 | snp468
222 | snp121
223 | snp400
224 | snp258
225 | snp492
226 | snp477
227 | snp337
228 | snp144
229 | snp63
230 | snp80
231 | snp161
232 | snp330
233 | snp316
234 | snp226
235 | snp246
236 | snp60
237 | snp124
238 | snp218
239 | snp92
240 | snp200
241 | snp322
242 | snp126
243 | snp496
244 | snp361
245 | snp234
246 | snp300
247 | snp64
248 | snp256
249 | snp493
250 | snp141
251 | snp151
252 | snp489
253 | snp212
254 | snp471
255 | snp180
256 | snp412
257 | snp135
258 | snp463
259 | snp65
260 | snp440
261 | snp66
262 | snp111
263 | snp67
264 | snp187
265 | snp221
266 | snp149
267 | snp386
268 | snp240
269 | snp474
270 | snp332
271 | snp407
272 | snp101
273 | snp500
274 | snp75
275 | snp68
276 | snp69
277 | snp106
278 | snp273
279 | snp481
280 | snp70
281 | snp230
282 | snp387
283 | snp390
284 | snp207
285 | snp485
286 | snp430
287 | snp342
288 | snp116
289 | snp494
290 | snp3
291 | snp82
292 | snp469
293 | snp398
294 | snp265
295 | snp266
296 | snp456
297 | snp478
298 | snp251
299 | snp402
300 | snp328
301 | snp184
302 | snp323
303 | snp205
304 | snp447
305 | snp125
306 | snp157
307 | snp146
308 | snp305
309 | snp1
310 | snp373
311 | snp156
312 | snp295
313 | snp2
314 | snp103
315 | snp397
316 | snp71
317 | snp404
318 | snp384
319 | snp4
320 | snp206
321 | snp169
322 | snp134
323 | snp236
324 | snp136
325 | snp237
326 | snp467
327 | snp448
328 | snp271
329 | snp286
330 | snp320
331 | snp6
332 | snp426
333 | snp277
334 | snp105
335 | snp127
336 | snp231
337 | snp142
338 | snp484
339 | snp74
340 | snp365
341 | snp152
342 | snp5
343 | snp239
344 | snp288
345 | snp299
346 | snp401
347 | snp291
348 | snp176
349 | snp487
350 | snp321
351 | snp171
352 | snp301
353 | snp9
354 | snp10
355 | snp173
356 | snp11
357 | snp220
358 | snp274
359 | snp261
360 | snp296
361 | snp287
362 | snp314
363 | snp193
364 | snp108
365 | snp7
366 | snp8
367 | snp326
368 | snp375
369 | snp12
370 | snp371
371 | snp491
372 | snp13
373 | snp229
374 | snp175
375 | snp215
376 | snp191
377 | snp249
378 | snp425
379 | snp283
380 | snp222
381 | snp209
382 | snp14
383 | snp198
384 | snp233
385 | snp444
386 | snp335
387 | snp476
388 | snp219
389 | snp81
390 | snp15
391 | snp76
392 | snp147
393 | snp381
394 | snp190
395 | snp16
396 | snp225
397 | snp245
398 | snp264
399 | snp129
400 | snp285
401 | snp17
402 | snp118
403 | snp18
404 | snp19
405 | snp20
406 | snp93
407 | snp358
408 | snp254
409 | snp188
410 | snp438
411 | snp317
412 | snp154
413 | snp480
414 | snp309
415 | snp347
416 | snp421
417 | snp72
418 | snp23
419 | snp227
420 | snp235
421 | snp78
422 | snp470
423 | snp128
424 | snp331
425 | snp327
426 | snp104
427 | snp457
428 | snp359
429 | snp140
430 | snp340
431 | snp21
432 | snp22
433 | snp177
434 | snp388
435 | snp378
436 | snp202
437 | snp24
438 | snp117
439 | snp102
440 | snp214
441 | snp252
442 | snp25
443 | snp216
444 | snp26
445 | snp289
446 | snp132
447 | snp446
448 | snp311
449 | snp355
450 | snp414
451 | snp364
452 | snp356
453 | snp394
454 | snp454
455 | snp119
456 | snp257
457 | snp122
458 | snp349
459 | snp186
460 | snp345
461 | snp415
462 | snp466
463 | snp36
464 | snp120
465 | snp459
466 | snp297
467 | snp269
468 | snp197
469 | snp460
470 | snp123
471 | snp294
472 | snp366
473 | snp472
474 | snp395
475 | snp244
476 | snp339
477 | snp451
478 | snp196
479 | snp174
480 | snp90
481 | snp95
482 | snp377
483 | snp97
484 | snp382
485 | snp450
486 | snp107
487 | snp224
488 | snp368
489 | snp304
490 | snp306
491 |
--------------------------------------------------------------------------------
/outputs/test_discrete_geno.trainedModel.joblib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/outputs/test_discrete_geno.trainedModel.joblib
--------------------------------------------------------------------------------
/outputs/test_discrete_geno.trainedModel_withheldSample_Predictions.csv:
--------------------------------------------------------------------------------
1 | ID,CASE_REPORTED,CASE_PROBABILITY,CASE_PREDICTED
2 | sample362,1,1.0,1
3 | sample74,0,1.0,1
4 | sample375,0,0.0,0
5 | sample156,1,1.0,1
6 | sample105,1,1.0,1
7 | sample395,1,0.0,0
8 | sample378,1,1.0,1
9 | sample125,0,1.0,1
10 | sample69,1,0.0,0
11 | sample451,1,1.0,1
12 | sample10,1,0.0,0
13 | sample195,1,1.0,1
14 | sample407,0,0.0,0
15 | sample85,1,0.0,0
16 | sample372,1,0.0,0
17 | sample389,0,0.0,0
18 | sample496,1,1.0,1
19 | sample31,1,1.0,1
20 | sample317,1,1.0,1
21 | sample409,0,1.0,1
22 | sample491,1,1.0,1
23 | sample492,1,1.0,1
24 | sample281,0,0.0,0
25 | sample357,1,1.0,1
26 | sample77,1,0.0,0
27 | sample462,1,0.0,0
28 | sample498,1,1.0,1
29 | sample212,0,1.0,1
30 | sample102,1,1.0,1
31 | sample335,0,1.0,1
32 | sample476,1,0.0,0
33 | sample337,1,0.0,0
34 | sample441,1,0.0,0
35 | sample174,1,0.0,0
36 | sample3,1,1.0,1
37 | sample334,0,1.0,1
38 | sample410,1,1.0,1
39 | sample71,1,0.0,0
40 | sample210,1,1.0,1
41 | sample64,1,0.0,0
42 | sample385,1,1.0,1
43 | sample94,0,0.0,0
44 | sample486,1,1.0,1
45 | sample186,1,1.0,1
46 | sample34,0,0.0,0
47 | sample78,1,1.0,1
48 | sample1,1,1.0,1
49 | sample12,1,1.0,1
50 | sample416,0,0.0,0
51 | sample23,1,1.0,1
52 | sample73,0,1.0,1
53 | sample183,1,1.0,1
54 | sample132,1,1.0,1
55 | sample411,1,1.0,1
56 | sample194,1,1.0,1
57 | sample56,1,1.0,1
58 | sample149,1,0.0,0
59 | sample19,0,1.0,1
60 | sample205,1,0.0,0
61 | sample79,1,0.0,0
62 | sample495,1,0.0,0
63 | sample263,1,1.0,1
64 | sample324,0,1.0,1
65 | sample484,1,1.0,1
66 | sample80,0,1.0,1
67 | sample40,1,1.0,1
68 | sample452,0,1.0,1
69 | sample47,0,0.0,0
70 | sample239,1,1.0,1
71 | sample392,0,1.0,1
72 | sample353,1,1.0,1
73 | sample342,1,1.0,1
74 | sample278,1,1.0,1
75 | sample291,1,0.0,0
76 | sample318,1,0.0,0
77 | sample305,1,1.0,1
78 | sample269,1,0.0,0
79 | sample70,1,0.0,0
80 | sample456,1,1.0,1
81 | sample466,1,1.0,1
82 | sample155,1,1.0,1
83 | sample83,1,0.0,0
84 | sample478,1,0.0,0
85 | sample173,1,1.0,1
86 | sample322,1,1.0,1
87 | sample91,1,1.0,1
88 | sample181,1,1.0,1
89 | sample415,1,0.0,0
90 | sample313,1,1.0,1
91 | sample279,1,1.0,1
92 | sample382,1,1.0,1
93 | sample473,1,0.0,0
94 | sample363,1,0.0,0
95 | sample325,1,1.0,1
96 | sample432,1,0.0,0
97 | sample348,0,1.0,1
98 | sample87,1,1.0,1
99 | sample76,1,1.0,1
100 | sample439,1,1.0,1
101 | sample16,1,1.0,1
102 | sample250,0,1.0,1
103 | sample434,1,1.0,1
104 | sample20,0,0.0,0
105 | sample323,1,1.0,1
106 | sample333,1,0.0,0
107 | sample57,0,1.0,1
108 | sample302,1,0.0,0
109 | sample230,1,1.0,1
110 | sample332,1,1.0,1
111 | sample133,1,0.0,0
112 | sample138,0,0.0,0
113 | sample424,1,1.0,1
114 | sample336,1,1.0,1
115 | sample26,0,1.0,1
116 | sample465,0,1.0,1
117 | sample282,1,0.0,0
118 | sample248,0,0.0,0
119 | sample238,0,1.0,1
120 | sample118,1,1.0,1
121 | sample43,1,0.0,0
122 | sample221,1,0.0,0
123 | sample177,0,1.0,1
124 | sample321,0,1.0,1
125 | sample154,0,1.0,1
126 | sample232,1,1.0,1
127 | sample228,1,1.0,1
128 | sample418,1,0.0,0
129 | sample204,1,0.0,0
130 | sample127,0,1.0,1
131 | sample330,1,0.0,0
132 | sample32,1,1.0,1
133 | sample114,0,1.0,1
134 | sample471,1,1.0,1
135 | sample272,1,1.0,1
136 | sample141,0,1.0,1
137 | sample58,1,1.0,1
138 | sample193,1,1.0,1
139 | sample25,1,1.0,1
140 | sample18,1,1.0,1
141 | sample266,1,0.0,0
142 | sample67,1,1.0,1
143 | sample209,1,1.0,1
144 | sample480,1,1.0,1
145 | sample95,1,0.0,0
146 | sample254,1,0.0,0
147 | sample267,0,1.0,1
148 | sample24,1,1.0,1
149 | sample223,0,1.0,1
150 | sample262,1,1.0,1
151 | sample427,1,1.0,1
152 |
--------------------------------------------------------------------------------
/outputs/test_discrete_geno.trainedModel_withheldSample_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/outputs/test_discrete_geno.trainedModel_withheldSample_ROC.png
--------------------------------------------------------------------------------
/outputs/test_discrete_geno.trainedModel_withheldSample_probabilities.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/outputs/test_discrete_geno.trainedModel_withheldSample_probabilities.png
--------------------------------------------------------------------------------
/outputs/test_discrete_geno.training_withheldSamples_performanceMetrics.csv:
--------------------------------------------------------------------------------
1 | Algorithm,AUC_Percent,Accuracy_Percent,Balanced_Accuracy_Percent,Log_Loss,Sensitivity,Specificity,PPV,NPV,Runtime_Seconds
2 | LogisticRegression,49.701028462090406,59.333333333333336,50.2870126763932,1.3083665335134453,0.6814159292035398,0.32432432432432434,0.7549019607843137,0.25,0.17054200172424316
3 | RandomForestClassifier,50.334848122458745,74.0,49.11504424778761,0.5929014943106782,0.9823008849557522,0.0,0.75,0.0,0.189835786819458
4 | AdaBoostClassifier,35.20688830423344,54.666666666666664,42.64530016742406,0.6912231659615806,0.6637168141592921,0.1891891891891892,0.7142857142857143,0.15555555555555556,0.17150497436523438
5 | GradientBoostingClassifier,50.92083233676155,62.66666666666667,49.772781631188714,0.6887642407734422,0.7522123893805309,0.24324324324324326,0.7522123893805309,0.24324324324324326,0.4369809627532959
6 | SGDClassifier,54.3171490074145,62.66666666666667,54.317149007414486,12.894476520766654,0.7079646017699115,0.3783783783783784,0.7766990291262136,0.2978723404255319,0.026642799377441406
7 | SVC,47.40492705094475,75.33333333333333,50.0,0.5824498890799489,1.0,0.0,0.7533333333333333,,0.40736913681030273
8 | MLPClassifier,49.17483855536953,66.0,54.71179143745515,1.2453128289862851,0.7699115044247787,0.32432432432432434,0.7767857142857143,0.3157894736842105,0.4303250312805176
9 | KNeighborsClassifier,53.34848122458742,62.66666666666667,52.49940205692418,1.9597754698972316,0.7256637168141593,0.32432432432432434,0.7663551401869159,0.27906976744186046,0.12562084197998047
10 | LinearDiscriminantAnalysis,44.59459459459459,53.333333333333336,49.94020569241808,6.016274091996749,0.5663716814159292,0.43243243243243246,0.7529411764705882,0.24615384615384617,0.10006976127624512
11 | QuadraticDiscriminantAnalysis,51.80578808897393,50.66666666666667,51.805788088973934,17.039129688155935,0.49557522123893805,0.5405405405405406,0.7671232876712328,0.2597402597402597,0.06195402145385742
12 | BaggingClassifier,47.27337957426453,62.0,50.23917723032767,0.8853380657190238,0.7345132743362832,0.2702702702702703,0.7545454545454545,0.25,0.13388800621032715
13 | XGBClassifier,47.38100932791198,69.33333333333334,56.015307342740975,0.7044975252573689,0.8230088495575221,0.2972972972972973,0.7815126050420168,0.3548387096774194,0.8208780288696289
14 |
--------------------------------------------------------------------------------
/outputs/test_discrete_geno.tunedModel.joblib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/outputs/test_discrete_geno.tunedModel.joblib
--------------------------------------------------------------------------------
/outputs/test_discrete_geno.tunedModel_CV_Summary.csv:
--------------------------------------------------------------------------------
1 | Mean_CV_Score_Baseline,Standard_Dev_CV_Score_Baseline,Min_CV_Score_Baseline,Max_CV_Score_Baseline,Mean_CV_Score_BestTuned,Standard_Dev_CV_Score_BestTuned,Min_CV_Score_BestTuned,Max_CV_Score_BestTuned
2 | 0.8752857471880757,0.012154420363495264,0.8515624999999999,0.886029411764706,0.8770257112311965,0.005740413855556065,0.8662683823529411,0.8828125
3 |
--------------------------------------------------------------------------------
/outputs/test_discrete_geno.tunedModel_allSample_probabilities.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/outputs/test_discrete_geno.tunedModel_allSample_probabilities.png
--------------------------------------------------------------------------------
/outputs/test_discrete_geno.tunedModel_top10Iterations_Summary.csv:
--------------------------------------------------------------------------------
1 | Model_Rank,Mean_Validation_Score,Mean_Standard_Deviation,Parameters
2 | 1,0.8777594154772709,0.00614681385285427,{'n_estimators': 113}
3 | 2,0.8774203861067569,0.01466490966393507,{'n_estimators': 456}
4 | 3,0.8767881328548249,0.01328590033923356,{'n_estimators': 428}
5 | 4,0.8766358059703545,0.01336681019692871,{'n_estimators': 96}
6 | 5,0.8764338493111129,0.016218470814769632,{'n_estimators': 23}
7 | 6,0.8763807390066827,0.005794217414978005,{'n_estimators': 114}
8 | 7,0.876344472747023,0.01900285729384202,{'n_estimators': 790}
9 | 8,0.8762983664714131,0.007999675002041077,{'n_estimators': 201}
10 | 9,0.876111363198856,0.015236886652742407,{'n_estimators': 218}
11 | 10,0.8759275396694441,0.00940246779427247,{'n_estimators': 209}
12 |
--------------------------------------------------------------------------------
/outputs/test_discrete_geno.umap_clustering.joblib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/outputs/test_discrete_geno.umap_clustering.joblib
--------------------------------------------------------------------------------
/outputs/test_discrete_geno.umap_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/outputs/test_discrete_geno.umap_plot.png
--------------------------------------------------------------------------------
/outputs/validation_test_discrete_geno.dataForML.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/outputs/validation_test_discrete_geno.dataForML.h5
--------------------------------------------------------------------------------
/outputs/validation_test_discrete_geno.finalHarmonizedCols_toKeep.txt:
--------------------------------------------------------------------------------
1 | AGE
2 | FAMILY_HISTORY
3 | ID
4 | PHENO
5 | SEX_COV
6 | UPSIT
7 | snp1
8 | snp10
9 | snp100
10 | snp101
11 | snp102
12 | snp103
13 | snp104
14 | snp105
15 | snp106
16 | snp107
17 | snp108
18 | snp109
19 | snp11
20 | snp110
21 | snp111
22 | snp112
23 | snp113
24 | snp114
25 | snp115
26 | snp116
27 | snp117
28 | snp118
29 | snp119
30 | snp12
31 | snp120
32 | snp121
33 | snp122
34 | snp123
35 | snp124
36 | snp125
37 | snp126
38 | snp127
39 | snp128
40 | snp13
41 | snp130
42 | snp131
43 | snp132
44 | snp133
45 | snp134
46 | snp135
47 | snp136
48 | snp137
49 | snp138
50 | snp139
51 | snp14
52 | snp140
53 | snp141
54 | snp142
55 | snp143
56 | snp144
57 | snp145
58 | snp146
59 | snp147
60 | snp148
61 | snp149
62 | snp15
63 | snp150
64 | snp151
65 | snp152
66 | snp154
67 | snp155
68 | snp156
69 | snp157
70 | snp158
71 | snp159
72 | snp16
73 | snp160
74 | snp161
75 | snp162
76 | snp163
77 | snp164
78 | snp166
79 | snp167
80 | snp168
81 | snp169
82 | snp17
83 | snp170
84 | snp171
85 | snp172
86 | snp173
87 | snp174
88 | snp175
89 | snp176
90 | snp177
91 | snp178
92 | snp18
93 | snp180
94 | snp181
95 | snp182
96 | snp183
97 | snp184
98 | snp185
99 | snp186
100 | snp187
101 | snp188
102 | snp189
103 | snp19
104 | snp190
105 | snp191
106 | snp192
107 | snp193
108 | snp194
109 | snp195
110 | snp196
111 | snp197
112 | snp198
113 | snp199
114 | snp2
115 | snp20
116 | snp200
117 | snp201
118 | snp202
119 | snp203
120 | snp204
121 | snp205
122 | snp206
123 | snp207
124 | snp208
125 | snp209
126 | snp21
127 | snp210
128 | snp211
129 | snp212
130 | snp213
131 | snp214
132 | snp215
133 | snp216
134 | snp217
135 | snp218
136 | snp219
137 | snp22
138 | snp220
139 | snp221
140 | snp222
141 | snp223
142 | snp224
143 | snp225
144 | snp226
145 | snp227
146 | snp228
147 | snp229
148 | snp23
149 | snp230
150 | snp231
151 | snp232
152 | snp233
153 | snp234
154 | snp235
155 | snp236
156 | snp237
157 | snp238
158 | snp239
159 | snp24
160 | snp240
161 | snp241
162 | snp242
163 | snp243
164 | snp244
165 | snp245
166 | snp246
167 | snp247
168 | snp248
169 | snp249
170 | snp25
171 | snp250
172 | snp251
173 | snp252
174 | snp253
175 | snp254
176 | snp255
177 | snp256
178 | snp257
179 | snp258
180 | snp259
181 | snp26
182 | snp260
183 | snp261
184 | snp262
185 | snp263
186 | snp264
187 | snp265
188 | snp266
189 | snp267
190 | snp268
191 | snp269
192 | snp27
193 | snp270
194 | snp271
195 | snp272
196 | snp273
197 | snp274
198 | snp275
199 | snp276
200 | snp277
201 | snp278
202 | snp28
203 | snp280
204 | snp281
205 | snp282
206 | snp283
207 | snp284
208 | snp285
209 | snp286
210 | snp287
211 | snp288
212 | snp289
213 | snp29
214 | snp290
215 | snp291
216 | snp292
217 | snp293
218 | snp294
219 | snp295
220 | snp296
221 | snp297
222 | snp298
223 | snp299
224 | snp3
225 | snp30
226 | snp300
227 | snp301
228 | snp302
229 | snp303
230 | snp304
231 | snp305
232 | snp306
233 | snp307
234 | snp308
235 | snp309
236 | snp31
237 | snp310
238 | snp311
239 | snp312
240 | snp313
241 | snp314
242 | snp315
243 | snp316
244 | snp317
245 | snp318
246 | snp319
247 | snp32
248 | snp320
249 | snp321
250 | snp322
251 | snp323
252 | snp324
253 | snp325
254 | snp326
255 | snp327
256 | snp328
257 | snp329
258 | snp33
259 | snp330
260 | snp331
261 | snp332
262 | snp333
263 | snp334
264 | snp335
265 | snp336
266 | snp337
267 | snp338
268 | snp339
269 | snp34
270 | snp340
271 | snp341
272 | snp342
273 | snp343
274 | snp344
275 | snp345
276 | snp346
277 | snp347
278 | snp348
279 | snp349
280 | snp35
281 | snp350
282 | snp351
283 | snp352
284 | snp353
285 | snp354
286 | snp355
287 | snp356
288 | snp357
289 | snp358
290 | snp359
291 | snp36
292 | snp360
293 | snp361
294 | snp363
295 | snp364
296 | snp365
297 | snp366
298 | snp368
299 | snp369
300 | snp37
301 | snp370
302 | snp371
303 | snp372
304 | snp373
305 | snp375
306 | snp376
307 | snp377
308 | snp378
309 | snp379
310 | snp38
311 | snp380
312 | snp381
313 | snp382
314 | snp383
315 | snp384
316 | snp385
317 | snp386
318 | snp387
319 | snp388
320 | snp389
321 | snp39
322 | snp390
323 | snp392
324 | snp393
325 | snp394
326 | snp395
327 | snp396
328 | snp397
329 | snp398
330 | snp399
331 | snp4
332 | snp400
333 | snp401
334 | snp402
335 | snp403
336 | snp404
337 | snp406
338 | snp407
339 | snp408
340 | snp409
341 | snp41
342 | snp410
343 | snp411
344 | snp412
345 | snp413
346 | snp414
347 | snp415
348 | snp416
349 | snp417
350 | snp418
351 | snp419
352 | snp42
353 | snp420
354 | snp421
355 | snp422
356 | snp423
357 | snp424
358 | snp425
359 | snp426
360 | snp427
361 | snp428
362 | snp429
363 | snp43
364 | snp430
365 | snp431
366 | snp432
367 | snp433
368 | snp434
369 | snp436
370 | snp437
371 | snp438
372 | snp439
373 | snp44
374 | snp440
375 | snp441
376 | snp442
377 | snp443
378 | snp444
379 | snp445
380 | snp446
381 | snp447
382 | snp448
383 | snp449
384 | snp45
385 | snp450
386 | snp451
387 | snp452
388 | snp454
389 | snp455
390 | snp456
391 | snp457
392 | snp458
393 | snp459
394 | snp46
395 | snp460
396 | snp461
397 | snp462
398 | snp463
399 | snp464
400 | snp465
401 | snp466
402 | snp467
403 | snp468
404 | snp469
405 | snp47
406 | snp470
407 | snp471
408 | snp472
409 | snp473
410 | snp474
411 | snp475
412 | snp476
413 | snp477
414 | snp478
415 | snp479
416 | snp48
417 | snp480
418 | snp481
419 | snp482
420 | snp483
421 | snp484
422 | snp485
423 | snp486
424 | snp487
425 | snp488
426 | snp489
427 | snp49
428 | snp490
429 | snp491
430 | snp492
431 | snp493
432 | snp494
433 | snp495
434 | snp496
435 | snp497
436 | snp498
437 | snp499
438 | snp5
439 | snp50
440 | snp500
441 | snp51
442 | snp52
443 | snp53
444 | snp55
445 | snp56
446 | snp57
447 | snp58
448 | snp59
449 | snp6
450 | snp60
451 | snp61
452 | snp62
453 | snp63
454 | snp64
455 | snp65
456 | snp66
457 | snp67
458 | snp68
459 | snp69
460 | snp7
461 | snp70
462 | snp71
463 | snp72
464 | snp73
465 | snp74
466 | snp75
467 | snp76
468 | snp77
469 | snp78
470 | snp79
471 | snp8
472 | snp80
473 | snp81
474 | snp82
475 | snp83
476 | snp84
477 | snp85
478 | snp86
479 | snp87
480 | snp88
481 | snp89
482 | snp9
483 | snp90
484 | snp91
485 | snp92
486 | snp93
487 | snp94
488 | snp95
489 | snp96
490 | snp97
491 | snp98
492 | snp99
493 |
--------------------------------------------------------------------------------
/outputs/validation_test_discrete_geno.list_features.txt:
--------------------------------------------------------------------------------
1 | AGE
2 | FAMILY_HISTORY
3 | ID
4 | PHENO
5 | SEX_COV
6 | UPSIT
7 | snp1
8 | snp10
9 | snp100
10 | snp101
11 | snp102
12 | snp103
13 | snp104
14 | snp105
15 | snp106
16 | snp107
17 | snp108
18 | snp109
19 | snp11
20 | snp110
21 | snp111
22 | snp112
23 | snp113
24 | snp114
25 | snp115
26 | snp116
27 | snp117
28 | snp118
29 | snp119
30 | snp12
31 | snp120
32 | snp121
33 | snp122
34 | snp123
35 | snp124
36 | snp125
37 | snp126
38 | snp127
39 | snp128
40 | snp13
41 | snp130
42 | snp131
43 | snp132
44 | snp133
45 | snp134
46 | snp135
47 | snp136
48 | snp137
49 | snp138
50 | snp139
51 | snp14
52 | snp140
53 | snp141
54 | snp142
55 | snp143
56 | snp144
57 | snp145
58 | snp146
59 | snp147
60 | snp148
61 | snp149
62 | snp15
63 | snp150
64 | snp151
65 | snp152
66 | snp154
67 | snp155
68 | snp156
69 | snp157
70 | snp158
71 | snp159
72 | snp16
73 | snp160
74 | snp161
75 | snp162
76 | snp163
77 | snp164
78 | snp166
79 | snp167
80 | snp168
81 | snp169
82 | snp17
83 | snp170
84 | snp171
85 | snp172
86 | snp173
87 | snp174
88 | snp175
89 | snp176
90 | snp177
91 | snp178
92 | snp18
93 | snp180
94 | snp181
95 | snp182
96 | snp183
97 | snp184
98 | snp185
99 | snp186
100 | snp187
101 | snp188
102 | snp189
103 | snp19
104 | snp190
105 | snp191
106 | snp192
107 | snp193
108 | snp194
109 | snp195
110 | snp196
111 | snp197
112 | snp198
113 | snp199
114 | snp2
115 | snp20
116 | snp200
117 | snp201
118 | snp202
119 | snp203
120 | snp204
121 | snp205
122 | snp206
123 | snp207
124 | snp208
125 | snp209
126 | snp21
127 | snp210
128 | snp211
129 | snp212
130 | snp213
131 | snp214
132 | snp215
133 | snp216
134 | snp217
135 | snp218
136 | snp219
137 | snp22
138 | snp220
139 | snp221
140 | snp222
141 | snp223
142 | snp224
143 | snp225
144 | snp226
145 | snp227
146 | snp228
147 | snp229
148 | snp23
149 | snp230
150 | snp231
151 | snp232
152 | snp233
153 | snp234
154 | snp235
155 | snp236
156 | snp237
157 | snp238
158 | snp239
159 | snp24
160 | snp240
161 | snp241
162 | snp242
163 | snp243
164 | snp244
165 | snp245
166 | snp246
167 | snp247
168 | snp248
169 | snp249
170 | snp25
171 | snp250
172 | snp251
173 | snp252
174 | snp253
175 | snp254
176 | snp255
177 | snp256
178 | snp257
179 | snp258
180 | snp259
181 | snp26
182 | snp260
183 | snp261
184 | snp262
185 | snp263
186 | snp264
187 | snp265
188 | snp266
189 | snp267
190 | snp268
191 | snp269
192 | snp27
193 | snp270
194 | snp271
195 | snp272
196 | snp273
197 | snp274
198 | snp275
199 | snp276
200 | snp277
201 | snp278
202 | snp28
203 | snp280
204 | snp281
205 | snp282
206 | snp283
207 | snp284
208 | snp285
209 | snp286
210 | snp287
211 | snp288
212 | snp289
213 | snp29
214 | snp290
215 | snp291
216 | snp292
217 | snp293
218 | snp294
219 | snp295
220 | snp296
221 | snp297
222 | snp298
223 | snp299
224 | snp3
225 | snp30
226 | snp300
227 | snp301
228 | snp302
229 | snp303
230 | snp304
231 | snp305
232 | snp306
233 | snp307
234 | snp308
235 | snp309
236 | snp31
237 | snp310
238 | snp311
239 | snp312
240 | snp313
241 | snp314
242 | snp315
243 | snp316
244 | snp317
245 | snp318
246 | snp319
247 | snp32
248 | snp320
249 | snp321
250 | snp322
251 | snp323
252 | snp324
253 | snp325
254 | snp326
255 | snp327
256 | snp328
257 | snp329
258 | snp33
259 | snp330
260 | snp331
261 | snp332
262 | snp333
263 | snp334
264 | snp335
265 | snp336
266 | snp337
267 | snp338
268 | snp339
269 | snp34
270 | snp340
271 | snp341
272 | snp342
273 | snp343
274 | snp344
275 | snp345
276 | snp346
277 | snp347
278 | snp348
279 | snp349
280 | snp35
281 | snp350
282 | snp351
283 | snp352
284 | snp353
285 | snp354
286 | snp355
287 | snp356
288 | snp357
289 | snp358
290 | snp359
291 | snp36
292 | snp360
293 | snp361
294 | snp363
295 | snp364
296 | snp365
297 | snp366
298 | snp368
299 | snp369
300 | snp37
301 | snp370
302 | snp371
303 | snp372
304 | snp373
305 | snp375
306 | snp376
307 | snp377
308 | snp378
309 | snp379
310 | snp38
311 | snp380
312 | snp381
313 | snp382
314 | snp383
315 | snp384
316 | snp385
317 | snp386
318 | snp387
319 | snp388
320 | snp389
321 | snp39
322 | snp390
323 | snp392
324 | snp393
325 | snp394
326 | snp395
327 | snp396
328 | snp397
329 | snp398
330 | snp399
331 | snp4
332 | snp400
333 | snp401
334 | snp402
335 | snp403
336 | snp404
337 | snp406
338 | snp407
339 | snp408
340 | snp409
341 | snp41
342 | snp410
343 | snp411
344 | snp412
345 | snp413
346 | snp414
347 | snp415
348 | snp416
349 | snp417
350 | snp418
351 | snp419
352 | snp42
353 | snp420
354 | snp421
355 | snp422
356 | snp423
357 | snp424
358 | snp425
359 | snp426
360 | snp427
361 | snp428
362 | snp429
363 | snp43
364 | snp430
365 | snp431
366 | snp432
367 | snp433
368 | snp434
369 | snp436
370 | snp437
371 | snp438
372 | snp439
373 | snp44
374 | snp440
375 | snp441
376 | snp442
377 | snp443
378 | snp444
379 | snp445
380 | snp446
381 | snp447
382 | snp448
383 | snp449
384 | snp45
385 | snp450
386 | snp451
387 | snp452
388 | snp454
389 | snp455
390 | snp456
391 | snp457
392 | snp458
393 | snp459
394 | snp46
395 | snp460
396 | snp461
397 | snp462
398 | snp463
399 | snp464
400 | snp465
401 | snp466
402 | snp467
403 | snp468
404 | snp469
405 | snp47
406 | snp470
407 | snp471
408 | snp472
409 | snp473
410 | snp474
411 | snp475
412 | snp476
413 | snp477
414 | snp478
415 | snp479
416 | snp48
417 | snp480
418 | snp481
419 | snp482
420 | snp483
421 | snp484
422 | snp485
423 | snp486
424 | snp487
425 | snp488
426 | snp489
427 | snp49
428 | snp490
429 | snp491
430 | snp492
431 | snp493
432 | snp494
433 | snp495
434 | snp496
435 | snp497
436 | snp498
437 | snp499
438 | snp5
439 | snp50
440 | snp500
441 | snp51
442 | snp52
443 | snp53
444 | snp55
445 | snp56
446 | snp57
447 | snp58
448 | snp59
449 | snp6
450 | snp60
451 | snp61
452 | snp62
453 | snp63
454 | snp64
455 | snp65
456 | snp66
457 | snp67
458 | snp68
459 | snp69
460 | snp7
461 | snp70
462 | snp71
463 | snp72
464 | snp73
465 | snp74
466 | snp75
467 | snp76
468 | snp77
469 | snp78
470 | snp79
471 | snp8
472 | snp80
473 | snp81
474 | snp82
475 | snp83
476 | snp84
477 | snp85
478 | snp86
479 | snp87
480 | snp88
481 | snp89
482 | snp9
483 | snp90
484 | snp91
485 | snp92
486 | snp93
487 | snp94
488 | snp95
489 | snp96
490 | snp97
491 | snp98
492 | snp99
493 |
--------------------------------------------------------------------------------
/outputs/validation_test_discrete_geno.refColsHarmonize_toKeep.txt:
--------------------------------------------------------------------------------
1 | ID
2 | PHENO
3 | SEX_COV
4 | AGE
5 | UPSIT
6 | FAMILY_HISTORY
7 | snp410
8 | snp403
9 | snp164
10 | snp363
11 | snp439
12 | snp370
13 | snp389
14 | snp475
15 | snp399
16 | snp87
17 | snp308
18 | snp223
19 | snp131
20 | snp94
21 | snp79
22 | snp420
23 | snp344
24 | snp281
25 | snp139
26 | snp379
27 | snp77
28 | snp431
29 | snp445
30 | snp360
31 | snp137
32 | snp27
33 | snp432
34 | snp28
35 | snp29
36 | snp208
37 | snp319
38 | snp30
39 | snp85
40 | snp433
41 | snp380
42 | snp31
43 | snp499
44 | snp443
45 | snp488
46 | snp32
47 | snp33
48 | snp113
49 | snp242
50 | snp369
51 | snp34
52 | snp278
53 | snp35
54 | snp160
55 | snp429
56 | snp159
57 | snp396
58 | snp374
59 | snp437
60 | snp486
61 | snp376
62 | snp268
63 | snp255
64 | snp423
65 | snp250
66 | snp411
67 | snp194
68 | snp267
69 | snp424
70 | snp408
71 | snp260
72 | snp130
73 | snp110
74 | snp372
75 | snp409
76 | snp138
77 | snp37
78 | snp343
79 | snp248
80 | snp352
81 | snp38
82 | snp262
83 | snp441
84 | snp39
85 | snp307
86 | snp393
87 | snp133
88 | snp84
89 | snp158
90 | snp292
91 | snp413
92 | snp336
93 | snp204
94 | snp465
95 | snp213
96 | snp350
97 | snp43
98 | snp44
99 | snp150
100 | snp434
101 | snp483
102 | snp143
103 | snp270
104 | snp98
105 | snp341
106 | snp243
107 | snp495
108 | snp145
109 | snp45
110 | snp155
111 | snp490
112 | snp192
113 | snp383
114 | snp163
115 | snp303
116 | snp148
117 | snp115
118 | snp325
119 | snp455
120 | snp418
121 | snp195
122 | snp210
123 | snp199
124 | snp109
125 | snp263
126 | snp442
127 | snp498
128 | snp168
129 | snp232
130 | snp238
131 | snp482
132 | snp41
133 | snp419
134 | snp42
135 | snp428
136 | snp272
137 | snp114
138 | snp293
139 | snp167
140 | snp55
141 | snp56
142 | snp406
143 | snp47
144 | snp201
145 | snp313
146 | snp449
147 | snp49
148 | snp290
149 | snp458
150 | snp473
151 | snp385
152 | snp276
153 | snp333
154 | snp462
155 | snp479
156 | snp50
157 | snp51
158 | snp52
159 | snp183
160 | snp318
161 | snp166
162 | snp241
163 | snp315
164 | snp53
165 | snp91
166 | snp284
167 | snp280
168 | snp354
169 | snp46
170 | snp464
171 | snp83
172 | snp351
173 | snp353
174 | snp497
175 | snp48
176 | snp100
177 | snp88
178 | snp282
179 | snp178
180 | snp461
181 | snp329
182 | snp259
183 | snp211
184 | snp422
185 | snp346
186 | snp59
187 | snp170
188 | snp73
189 | snp417
190 | snp436
191 | snp57
192 | snp86
193 | snp312
194 | snp99
195 | snp112
196 | snp217
197 | snp427
198 | snp338
199 | snp58
200 | snp275
201 | snp182
202 | snp357
203 | snp89
204 | snp181
205 | snp203
206 | snp302
207 | snp324
208 | snp310
209 | snp298
210 | snp185
211 | snp61
212 | snp392
213 | snp452
214 | snp189
215 | snp62
216 | snp334
217 | snp162
218 | snp348
219 | snp96
220 | snp247
221 | snp253
222 | snp228
223 | snp416
224 | snp172
225 | snp468
226 | snp121
227 | snp400
228 | snp258
229 | snp492
230 | snp477
231 | snp337
232 | snp144
233 | snp63
234 | snp80
235 | snp161
236 | snp330
237 | snp316
238 | snp226
239 | snp246
240 | snp60
241 | snp124
242 | snp218
243 | snp92
244 | snp200
245 | snp322
246 | snp126
247 | snp496
248 | snp361
249 | snp234
250 | snp300
251 | snp64
252 | snp256
253 | snp493
254 | snp141
255 | snp151
256 | snp489
257 | snp212
258 | snp471
259 | snp180
260 | snp412
261 | snp135
262 | snp463
263 | snp65
264 | snp440
265 | snp66
266 | snp111
267 | snp67
268 | snp187
269 | snp221
270 | snp149
271 | snp386
272 | snp240
273 | snp474
274 | snp332
275 | snp407
276 | snp101
277 | snp500
278 | snp75
279 | snp68
280 | snp69
281 | snp106
282 | snp273
283 | snp481
284 | snp70
285 | snp230
286 | snp387
287 | snp390
288 | snp207
289 | snp485
290 | snp430
291 | snp342
292 | snp116
293 | snp494
294 | snp3
295 | snp82
296 | snp469
297 | snp398
298 | snp265
299 | snp266
300 | snp456
301 | snp478
302 | snp251
303 | snp402
304 | snp328
305 | snp184
306 | snp323
307 | snp205
308 | snp447
309 | snp125
310 | snp157
311 | snp146
312 | snp305
313 | snp1
314 | snp373
315 | snp156
316 | snp295
317 | snp2
318 | snp103
319 | snp397
320 | snp71
321 | snp404
322 | snp384
323 | snp4
324 | snp206
325 | snp169
326 | snp134
327 | snp236
328 | snp136
329 | snp237
330 | snp467
331 | snp448
332 | snp271
333 | snp286
334 | snp320
335 | snp6
336 | snp426
337 | snp277
338 | snp105
339 | snp127
340 | snp231
341 | snp142
342 | snp484
343 | snp74
344 | snp365
345 | snp152
346 | snp5
347 | snp239
348 | snp288
349 | snp299
350 | snp401
351 | snp291
352 | snp176
353 | snp487
354 | snp321
355 | snp171
356 | snp301
357 | snp9
358 | snp10
359 | snp173
360 | snp11
361 | snp220
362 | snp274
363 | snp261
364 | snp296
365 | snp287
366 | snp314
367 | snp193
368 | snp108
369 | snp7
370 | snp8
371 | snp326
372 | snp375
373 | snp12
374 | snp371
375 | snp491
376 | snp13
377 | snp229
378 | snp175
379 | snp215
380 | snp191
381 | snp249
382 | snp425
383 | snp283
384 | snp222
385 | snp209
386 | snp14
387 | snp198
388 | snp233
389 | snp444
390 | snp335
391 | snp476
392 | snp219
393 | snp81
394 | snp15
395 | snp76
396 | snp147
397 | snp381
398 | snp190
399 | snp16
400 | snp225
401 | snp245
402 | snp264
403 | snp129
404 | snp285
405 | snp17
406 | snp118
407 | snp18
408 | snp19
409 | snp20
410 | snp93
411 | snp358
412 | snp254
413 | snp188
414 | snp438
415 | snp317
416 | snp154
417 | snp480
418 | snp309
419 | snp347
420 | snp421
421 | snp72
422 | snp23
423 | snp227
424 | snp235
425 | snp78
426 | snp470
427 | snp128
428 | snp331
429 | snp327
430 | snp104
431 | snp457
432 | snp359
433 | snp140
434 | snp340
435 | snp21
436 | snp22
437 | snp177
438 | snp388
439 | snp378
440 | snp202
441 | snp24
442 | snp117
443 | snp102
444 | snp214
445 | snp252
446 | snp25
447 | snp216
448 | snp26
449 | snp289
450 | snp132
451 | snp446
452 | snp311
453 | snp355
454 | snp414
455 | snp364
456 | snp356
457 | snp394
458 | snp454
459 | snp119
460 | snp257
461 | snp122
462 | snp349
463 | snp186
464 | snp345
465 | snp415
466 | snp466
467 | snp36
468 | snp120
469 | snp459
470 | snp297
471 | snp269
472 | snp197
473 | snp460
474 | snp123
475 | snp294
476 | snp366
477 | snp472
478 | snp395
479 | snp244
480 | snp339
481 | snp451
482 | snp196
483 | snp174
484 | snp90
485 | snp95
486 | snp377
487 | snp97
488 | snp382
489 | snp450
490 | snp107
491 | snp224
492 | snp368
493 | snp304
494 | snp306
495 |
--------------------------------------------------------------------------------
/outputs/validation_test_discrete_geno.refSNPs_andAlleles.bed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/outputs/validation_test_discrete_geno.refSNPs_andAlleles.bed
--------------------------------------------------------------------------------
/outputs/validation_test_discrete_geno.refSNPs_andAlleles.fam:
--------------------------------------------------------------------------------
1 | valdiation34 valdiation34 0 0 2 2
2 | valdiation75 valdiation75 0 0 1 2
3 | valdiation65 valdiation65 0 0 2 2
4 | valdiation15 valdiation15 0 0 2 1
5 | valdiation5 valdiation5 0 0 1 2
6 | valdiation24 valdiation24 0 0 1 1
7 | valdiation14 valdiation14 0 0 2 2
8 | valdiation58 valdiation58 0 0 1 2
9 | valdiation10 valdiation10 0 0 1 2
10 | valdiation89 valdiation89 0 0 1 2
11 | valdiation20 valdiation20 0 0 2 1
12 | valdiation59 valdiation59 0 0 1 1
13 | valdiation19 valdiation19 0 0 2 2
14 | valdiation13 valdiation13 0 0 2 2
15 | valdiation21 valdiation21 0 0 2 2
16 | valdiation35 valdiation35 0 0 1 2
17 | valdiation1 valdiation1 0 0 2 1
18 | valdiation92 valdiation92 0 0 1 1
19 | valdiation74 valdiation74 0 0 2 2
20 | valdiation94 valdiation94 0 0 1 2
21 | valdiation2 valdiation2 0 0 2 2
22 | valdiation37 valdiation37 0 0 1 2
23 | valdiation44 valdiation44 0 0 2 1
24 | valdiation48 valdiation48 0 0 1 1
25 | valdiation49 valdiation49 0 0 1 2
26 | valdiation17 valdiation17 0 0 1 2
27 | valdiation18 valdiation18 0 0 1 2
28 | valdiation83 valdiation83 0 0 2 2
29 | valdiation68 valdiation68 0 0 1 2
30 | valdiation50 valdiation50 0 0 1 2
31 | valdiation22 valdiation22 0 0 1 2
32 | valdiation33 valdiation33 0 0 2 1
33 | valdiation43 valdiation43 0 0 2 1
34 | valdiation60 valdiation60 0 0 2 2
35 | valdiation70 valdiation70 0 0 1 2
36 | valdiation99 valdiation99 0 0 1 2
37 | valdiation36 valdiation36 0 0 2 1
38 | valdiation51 valdiation51 0 0 1 2
39 | valdiation76 valdiation76 0 0 2 2
40 | valdiation64 valdiation64 0 0 1 2
41 | valdiation69 valdiation69 0 0 2 1
42 | valdiation32 valdiation32 0 0 1 2
43 | valdiation88 valdiation88 0 0 1 2
44 | valdiation11 valdiation11 0 0 1 1
45 | valdiation3 valdiation3 0 0 1 2
46 | valdiation46 valdiation46 0 0 1 2
47 | valdiation27 valdiation27 0 0 1 2
48 | valdiation63 valdiation63 0 0 1 2
49 | valdiation4 valdiation4 0 0 1 2
50 | valdiation85 valdiation85 0 0 1 2
51 | valdiation23 valdiation23 0 0 1 1
52 | valdiation84 valdiation84 0 0 1 2
53 | valdiation71 valdiation71 0 0 1 2
54 | valdiation54 valdiation54 0 0 1 1
55 | valdiation55 valdiation55 0 0 2 1
56 | valdiation26 valdiation26 0 0 1 2
57 | valdiation56 valdiation56 0 0 1 2
58 | valdiation72 valdiation72 0 0 1 2
59 | valdiation93 valdiation93 0 0 1 1
60 | valdiation8 valdiation8 0 0 1 2
61 | valdiation30 valdiation30 0 0 1 2
62 | valdiation39 valdiation39 0 0 1 1
63 | valdiation81 valdiation81 0 0 1 1
64 | valdiation80 valdiation80 0 0 1 2
65 | valdiation100 valdiation100 0 0 2 2
66 | valdiation9 valdiation9 0 0 1 2
67 | valdiation96 valdiation96 0 0 1 2
68 | valdiation12 valdiation12 0 0 2 2
69 | valdiation6 valdiation6 0 0 1 2
70 | valdiation31 valdiation31 0 0 2 2
71 | valdiation45 valdiation45 0 0 2 2
72 | valdiation87 valdiation87 0 0 1 2
73 | valdiation53 valdiation53 0 0 1 2
74 | valdiation86 valdiation86 0 0 1 1
75 | valdiation91 valdiation91 0 0 1 2
76 | valdiation25 valdiation25 0 0 1 1
77 | valdiation95 valdiation95 0 0 1 2
78 | valdiation62 valdiation62 0 0 2 2
79 | valdiation42 valdiation42 0 0 2 2
80 | valdiation98 valdiation98 0 0 1 2
81 | valdiation16 valdiation16 0 0 2 2
82 | valdiation38 valdiation38 0 0 2 1
83 | valdiation52 valdiation52 0 0 1 2
84 | valdiation57 valdiation57 0 0 1 1
85 | valdiation47 valdiation47 0 0 1 2
86 | valdiation28 valdiation28 0 0 1 2
87 | valdiation78 valdiation78 0 0 2 1
88 | valdiation29 valdiation29 0 0 1 1
89 | valdiation97 valdiation97 0 0 2 2
90 | valdiation40 valdiation40 0 0 1 1
91 | valdiation66 valdiation66 0 0 1 2
92 | valdiation61 valdiation61 0 0 2 2
93 | valdiation77 valdiation77 0 0 2 2
94 | valdiation90 valdiation90 0 0 1 2
95 | valdiation79 valdiation79 0 0 1 2
96 | valdiation41 valdiation41 0 0 1 2
97 | valdiation82 valdiation82 0 0 1 1
98 | valdiation7 valdiation7 0 0 2 2
99 | valdiation67 valdiation67 0 0 1 2
100 | valdiation73 valdiation73 0 0 1 2
101 |
--------------------------------------------------------------------------------
/outputs/validation_test_discrete_geno.testedModel_allSample_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/outputs/validation_test_discrete_geno.testedModel_allSample_ROC.png
--------------------------------------------------------------------------------
/outputs/validation_test_discrete_geno.testedModel_allSample_predictions.csv:
--------------------------------------------------------------------------------
1 | ID,CASE_REPORTED,CASE_PROBABILITY,CASE_PREDICTED
2 | valdiation1,0,0.03148861333566866,0
3 | valdiation2,1,0.9849483263923975,1
4 | valdiation3,1,0.9394837298774558,1
5 | valdiation4,1,0.961536162754122,1
6 | valdiation5,1,0.7495487657093041,1
7 | valdiation6,1,0.9770457209562207,1
8 | valdiation7,1,0.8482988366528482,1
9 | valdiation8,1,0.7736793567653053,1
10 | valdiation9,1,0.6662222480606722,1
11 | valdiation10,1,0.8777924765950229,1
12 | valdiation11,0,0.06308968944624066,0
13 | valdiation12,1,0.9841752543221782,1
14 | valdiation13,1,0.9715629013535524,1
15 | valdiation14,1,0.8865205600711621,1
16 | valdiation15,0,0.09400063147306198,0
17 | valdiation16,1,0.9827536027621778,1
18 | valdiation17,1,0.9852498838419805,1
19 | valdiation18,1,0.9779392459770798,1
20 | valdiation19,1,0.8215149005406969,1
21 | valdiation20,0,0.025766961939865437,0
22 | valdiation21,1,0.369723015891961,0
23 | valdiation22,1,0.992850603956108,1
24 | valdiation23,0,0.22210115032526745,0
25 | valdiation24,0,0.10090341403815763,0
26 | valdiation25,0,0.20967163765538469,0
27 | valdiation26,1,0.9954998695495226,1
28 | valdiation27,1,0.9622843947181122,1
29 | valdiation28,1,0.9926081702605717,1
30 | valdiation29,0,0.05603870368131757,0
31 | valdiation30,1,0.688307955072017,1
32 | valdiation31,1,0.9946113192557112,1
33 | valdiation32,1,0.9766874201362098,1
34 | valdiation33,0,0.06878102918507902,0
35 | valdiation34,1,0.9894603825649909,1
36 | valdiation35,1,0.9437301028345729,1
37 | valdiation36,0,0.21216235062749125,0
38 | valdiation37,1,0.9900018821933396,1
39 | valdiation38,0,0.16901206233073723,0
40 | valdiation39,0,0.13817595449303555,0
41 | valdiation40,0,0.04568187764569424,0
42 | valdiation41,1,0.9846117972454992,1
43 | valdiation42,1,0.9662683691473066,1
44 | valdiation43,0,0.19761096748372325,0
45 | valdiation44,0,0.09155973982361744,0
46 | valdiation45,1,0.8777371456685015,1
47 | valdiation46,1,0.9911946720102857,1
48 | valdiation47,1,0.950193585755543,1
49 | valdiation48,0,0.021267349785308882,0
50 | valdiation49,1,0.9947173924809013,1
51 | valdiation50,1,0.9798199701239221,1
52 | valdiation51,1,0.7785928326337268,1
53 | valdiation52,1,0.9877527076932267,1
54 | valdiation53,1,0.6619314886523919,1
55 | valdiation54,0,0.24334927888057628,0
56 | valdiation55,0,0.023175285267309574,0
57 | valdiation56,1,0.9936299851165581,1
58 | valdiation57,0,0.292972066817925,0
59 | valdiation58,1,0.9453775699409687,1
60 | valdiation59,0,0.08359695641646878,0
61 | valdiation60,1,0.9826793556869285,1
62 | valdiation61,1,0.9804351180970784,1
63 | valdiation62,1,0.9689776045129581,1
64 | valdiation63,1,0.43015270870091094,0
65 | valdiation64,1,0.9842623094116923,1
66 | valdiation65,1,0.9822136540202853,1
67 | valdiation66,1,0.8657350197834053,1
68 | valdiation67,1,0.9896966555486313,1
69 | valdiation68,1,0.9840282172936693,1
70 | valdiation69,0,0.27260734300721334,0
71 | valdiation70,1,0.9840772920766343,1
72 | valdiation71,1,0.9750412686846891,1
73 | valdiation72,1,0.9916573991337486,1
74 | valdiation73,1,0.962258027120737,1
75 | valdiation74,1,0.827550733382712,1
76 | valdiation75,1,0.9716689143266516,1
77 | valdiation76,1,0.9601409855961331,1
78 | valdiation77,1,0.9726736058066694,1
79 | valdiation78,0,0.09561175299797499,0
80 | valdiation79,1,0.9711145567192419,1
81 | valdiation80,1,0.9805958147654943,1
82 | valdiation81,0,0.30162483312327265,0
83 | valdiation82,0,0.02553170681822126,0
84 | valdiation83,1,0.7234706286257632,1
85 | valdiation84,1,0.9923825256192076,1
86 | valdiation85,1,0.9866697978848589,1
87 | valdiation86,0,0.06355608737123379,0
88 | valdiation87,1,0.5943631474810299,1
89 | valdiation88,1,0.9888374151591128,1
90 | valdiation89,1,0.9885651828332006,1
91 | valdiation90,1,0.7357631184903739,1
92 | valdiation91,1,0.9722710446370482,1
93 | valdiation92,0,0.17406641415140675,0
94 | valdiation93,0,0.06171541281506878,0
95 | valdiation94,1,0.9892974833143502,1
96 | valdiation95,1,0.9847112377880253,1
97 | valdiation96,1,0.9920365721699342,1
98 | valdiation97,1,0.8508495360265258,1
99 | valdiation98,1,0.9892594847751349,1
100 | valdiation99,1,0.9175627336719306,1
101 | valdiation100,1,0.9971595269910182,1
102 |
--------------------------------------------------------------------------------
/outputs/validation_test_discrete_geno.testedModel_allSample_probabilities.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GenoML/genoml2/469cf66f3d5594488dcd95497e0051d80602df30/outputs/validation_test_discrete_geno.testedModel_allSample_probabilities.png
--------------------------------------------------------------------------------
/outputs/validation_test_discrete_geno.testedModel_allSamples_performanceMetrics.csv:
--------------------------------------------------------------------------------
1 | AUC_Percent,Accuracy_Percent,Balanced_Accuracy_Percent,Log_Loss,Sensitivity,Specificity,PPV,NPV
2 | 100.0,100.0,100.0,0.002694198189173816,1.0,1.0,1.0,1.0
3 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | joblib
2 | matplotlib
3 | numpy
4 | tables
5 | pandas
6 | pandas_plink
7 | requests
8 | scikit-learn
9 | scipy
10 | seaborn
11 | statsmodels
12 | xgboost==2.0.3
13 | umap-learn
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The GenoML Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
import pathlib

import setuptools

# Resolve packaging inputs relative to this script rather than the current
# working directory, so the build works no matter where it is invoked from.
_HERE = pathlib.Path(__file__).resolve().parent


def _read_requirements(path):
    """Return the requirement specifiers listed in the file at *path*.

    Each line is stripped of surrounding whitespace first; lines that are
    then empty or start with ``#`` are skipped so that blank lines and
    (possibly indented) comments never reach ``install_requires``.
    """
    with open(path, encoding="utf-8") as file:
        stripped_lines = (line.strip() for line in file)
        return [
            line for line in stripped_lines
            if line and not line.startswith("#")
        ]


requires = _read_requirements(_HERE / "requirements.txt")

# Read the README as UTF-8 explicitly: the platform default encoding is not
# guaranteed to be UTF-8, which can break decoding of the long description.
with open(_HERE / "README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name="genoml2",
    version="1.0.1",
    maintainer="The GenoML Development Team",
    maintainer_email="genoml@googlegroups.com",
    description="GenoML is an automated machine learning tool that optimizes"
                " basic machine learning pipelines for genomic data.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://genoml.github.io/",
    download_url="https://github.com/GenoML/genoml2/archive/v1.0.1.tar.gz",
    entry_points={
        'console_scripts':
            ['genoml=genoml.__main__:handle_main'],
    },
    packages=setuptools.find_packages(),
    install_requires=requires,
    classifiers=[
        "Development Status :: 4 - Beta",
        "Programming Language :: Python :: 3.6",
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.6',
    package_data={'genoml': ['misc/*']},
)
50 |
--------------------------------------------------------------------------------