├── .gitignore ├── .idea ├── .gitignore ├── inspectionProfiles │ ├── Project_Default.xml │ └── profiles_settings.xml ├── misc.xml ├── modules.xml ├── permGWAS.iml └── vcs.xml ├── Docker ├── Dockerfile └── requirements.txt ├── LICENSE ├── README.md ├── create_h5_file.py ├── create_plot.py ├── data ├── config.yaml ├── cov_matrix.csv ├── k_matrix.csv ├── k_matrix.h5 ├── x_matrix.csv ├── x_matrix.h5 ├── x_matrix.map ├── x_matrix.ped ├── y_matrix.csv └── y_matrix.pheno ├── docs ├── DATAGUIDE.md ├── INSTALLATION.md ├── OPTIONS.md ├── PERMUTATIONS.md ├── PLOTS.md ├── QUICKSTART.md ├── manhattan.png └── qq_plot.png ├── models ├── __init__.py ├── _base_model.py └── lmm.py ├── optimize ├── __init__.py └── brent.py ├── perform_gwas.py ├── permGWAS.py ├── permGWAS_logo.png ├── postprocess ├── __init__.py └── plot_functions.py ├── preprocess ├── __init__.py └── data_loader.py ├── supplementary_data ├── simulated_phenotypes_her30.h5 └── suppl_data_John_et_al_2022 │ └── AraGWAS_thresholds.csv └── utils ├── __init__.py ├── check_functions.py └── helper_functions.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Datasource local storage ignored files 5 | /dataSources/ 6 | /dataSources.local.xml 7 | # Editor-based HTTP Client requests 8 | /httpRequests/ 9 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 13 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/permGWAS.iml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /Docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:11.5.2-base-ubuntu20.04 2 | RUN apt-get update && apt-get install -y python3 && apt-get install -y python3-pip 3 | RUN apt-get install -y vim 4 | RUN apt-get install -y git 5 | RUN mkdir /configfiles 6 | COPY requirements.txt /configfiles 7 | RUN pip3 install -r /configfiles/requirements.txt 8 | RUN pip3 install torch==1.11.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html 9 | -------------------------------------------------------------------------------- /Docker/requirements.txt: -------------------------------------------------------------------------------- 1 | h5py 2 | matplotlib 3 | numpy 4 | pandas 5 | pandas-plink 6 | scipy 7 | seaborn 8 | pyyaml -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Grimm Lab - Bioinformatics and Machine Learning 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice 
shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Python 3.8](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10-blue)](https://www.python.org/downloads/release/python-3100/) 2 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 3 | 4 | 5 | 6 | ## permGWAS2 7 | 8 | This is an improved version of permGWAS. The original version can be found at [permGWAS Version1](https://github.com/grimmlab/permGWAS/releases/tag/permGWAS) 9 | 10 | permGWAS2 is an open source software tool written in python to efficiently perform genome-wide association studies (GWAS) 11 | with permutation-based thresholds. It uses a batch-wise Linear Mixed Model to compute several univariate tests simultaneously. 12 | permGWAS2 provides support for multiple CPUs as well as for GPUs. 
13 | 14 | In contrast to the original version, permGWAS2 allows for two different permutation strategies: 15 | 16 | x (default): permute the fixed effects matrix including covariates and the SNP of interest (equivalent to permuting y and the covariance matrix) 17 | 18 | y: permute only the phenotype vector (same method as in the original permGWAS) 19 | 20 | Details on the architecture of permGWAS and permGWAS2, benchmarking results of the framework and on permutation-based thresholds can be found in our publications. 21 | 22 | ## How to run permGWAS2 23 | 1. [Requirements & Installation](./docs/INSTALLATION.md) 24 | 2. [Quickstart Guide](./docs/QUICKSTART.md) 25 | 3. [Data Guide](./docs/DATAGUIDE.md) 26 | 4. [permGWAS2 with permutations](./docs/PERMUTATIONS.md) 27 | 5. [Create plots](./docs/PLOTS.md) 28 | 6. [Optional settings](./docs/OPTIONS.md) 29 | 30 | 31 | ## Publications & Citation 32 | 33 | John, M., Korte, A., Todesco M., & Grimm, D. G. (2024). 34 | **Population-aware permutation-based significance thresholds for genome-wide association studies**. 35 | Bioinformatics Advances, 2024 36 | 37 | DOI: [https://doi.org/10.1093/bioadv/vbae168](https://doi.org/10.1093/bioadv/vbae168) 38 | 39 | John, M., Ankenbrand, M. J., Artmann, C., Freudenthal, J. A., Korte, A., & Grimm, D. G. (2022). 40 | **Efficient Permutation-based Genome-wide Association Studies for Normal and Skewed Phenotypic Distributions**. 41 | Bioinformatics, 2022. 
42 | 43 | DOI: [https://doi.org/10.1093/bioinformatics/btac455](https://doi.org/10.1093/bioinformatics/btac455) 44 | -------------------------------------------------------------------------------- /create_h5_file.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pathlib 3 | from preprocess import data_loader 4 | from utils import check_functions 5 | 6 | if __name__ == "__main__": 7 | # Input parameters 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('-x', '--genotype_file', type=str, 10 | help='specify the name of the genotype file, absolute and relative paths are accepted, ' 11 | 'only accept CSV, PLINK and binary PLINK files, ' 12 | 'PLINK and binary PLINK: all required files must be in the same folder with same prefix,' 13 | 'for format CSV files check documentation') 14 | parser.add_argument('-sd', '--save_dir', type=str, default=None, 15 | help='specify a directory to save newly generated H5 file. Optional, if None is specified, ' 16 | 'H5 file will be saved in same directory as original genotype file.') 17 | 18 | args = vars(parser.parse_args()) 19 | args["genotype_file"] = check_functions.check_file(args["genotype_file"]) 20 | if pathlib.Path(args["genotype_file"]).suffix in ('.h5', '.hdf5', '.h5py'): 21 | raise Exception('Genotype file is already in HDF5, H5, H5PY') 22 | if args["save_dir"] is None: 23 | args["save_dir"] = pathlib.Path(args["genotype_file"]).parent 24 | out_file = pathlib.Path(args["genotype_file"]).with_suffix('.h5').stem 25 | args["save_dir"], out_file = check_functions.check_dir_paths(out_dir=args["save_dir"], out_file=out_file, prefix='') 26 | 27 | # load data from file 28 | print('Load data from file ' + str(args["genotype_file"])) 29 | dataset = data_loader.Genotype(genotype_file=args["genotype_file"]) 30 | dataset.load_genotype_data() 31 | 32 | # save data as H5 33 | dataset.save_genotype_hdf5(filename=args["save_dir"].joinpath(out_file)) 34 | 
-------------------------------------------------------------------------------- /create_plot.py: -------------------------------------------------------------------------------- 1 | # create Manhattan and QQ-plots 2 | import pandas as pd 3 | import pathlib 4 | import argparse 5 | 6 | from utils import check_functions 7 | from postprocess import plot_functions 8 | 9 | if __name__ == "__main__": 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('-p_val', '--p_value_file', type=str, default=None, 12 | help='Specify the full path to the p_value file, absolute and relative paths are accepted, ' 13 | 'only accept .csv files. p_value files must at least contain chromosome ids (CHR), ' 14 | 'position ids (POS) and corresponding p_values (p_value).') 15 | parser.add_argument('-min_p_val', '--min_p_value_file', type=str, default=None, 16 | help='Optional, specify the full path to the file containing minimal p-values in order to ' 17 | 'compute permutation-based thresholds, absolute and relative paths are accepted, ' 18 | 'only accept .csv files.') 19 | parser.add_argument('-mplot', '--manhattan', action='store_true', 20 | help='optional, creates manhattan plot') 21 | parser.add_argument('-qqplot', action='store_true', 22 | help='optional, creates QQ-plot') 23 | parser.add_argument('-out_dir', type=str, default=None, 24 | help='Specify the name of the directory plots should be stored in,' 25 | 'absolute and relative paths are accepted. Optional, if not provided, files will be ' 26 | 'stored in same folder as p_value file.') 27 | parser.add_argument('-out_file', type=str, default=None, 28 | help='Specify NAME of plots, will be stored as manhattan_NAME.png or qq_plot_NAME.png,' 29 | 'optional, if not provided name of p_value file will be used.') 30 | parser.add_argument('-sig_level', type=int, default=5, 31 | help='Significance level (percentage values) to compute threshold for Manhattan plot. 
' 32 | 'Optional, default is 5.') 33 | args = vars(parser.parse_args()) 34 | 35 | args["p_value_file"] = check_functions.check_file(args["p_value_file"]) 36 | if args["min_p_value_file"] is not None: 37 | args["min_p_value_file"] = check_functions.check_file(args["min_p_value_file"]) 38 | if args["out_dir"] is None: 39 | args["out_dir"] = pathlib.Path(args["p_value_file"]).parent 40 | if args["out_file"] is None: 41 | args["out_file"] = pathlib.Path(args["p_value_file"]).stem 42 | 43 | df = pd.read_csv(args["p_value_file"]) 44 | if not {'CHR', 'POS', 'p_value'}.issubset(df.columns): 45 | raise Exception('Cannot create Manhattan plot; need CHR, POS and p_value in DataFrame.') 46 | 47 | if args["manhattan"]: 48 | out_dir, out_file = check_functions.check_dir_paths(out_dir=args["out_dir"], out_file=args["out_file"], 49 | prefix='manhattan_') 50 | print('Save Manhattan plot with significance level of %d.' % args["sig_level"]) 51 | if args["min_p_value_file"] is not None: 52 | df_min = pd.read_csv(args["min_p_value_file"]) 53 | if not 'min_p_val' in df_min.columns: 54 | raise Exception('Cannot compute permutation-based threshold, need min_p_val in DataFrame.') 55 | min_p_val = df_min['min_p_val'].values 56 | else: 57 | min_p_val = None 58 | plot_functions.manhattan_plot(df=df, data_dir=out_dir, filename=out_file, 59 | min_p_values=min_p_val, sig_level=args["sig_level"]) 60 | 61 | if args["qqplot"]: 62 | out_dir, out_file = check_functions.check_dir_paths(out_dir=args["out_dir"], out_file=args["out_file"], 63 | prefix='qq_plot_') 64 | print('Save QQ-plot.') 65 | plot_functions.qq_plot(p_values=df['p_value'].values, data_dir=out_dir, filename=out_file) 66 | -------------------------------------------------------------------------------- /data/config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | genotype_file: "./data/x_matrix.h5" 3 | phenotype_file: "./data/y_matrix.csv" 4 | trait: "phenotype_value" 5 | kinship_file: 6 
| covariate_file: 7 | covariate_list: 8 | perm_method: "x" 9 | maf_threshold: 0 10 | perm: 100 -------------------------------------------------------------------------------- /data/cov_matrix.csv: -------------------------------------------------------------------------------- 1 | accession_id,covariate 2 | 9381,1 3 | 9380,1 4 | 9378,1 5 | 9371,0 6 | 9367,1 7 | 9363,0 8 | 9356,0 9 | 9355,0 10 | 9354,0 11 | 9353,0 12 | 9352,1 13 | 9351,0 14 | 9344,1 15 | 9343,0 16 | 9339,1 17 | 9336,1 18 | 9332,0 19 | 9323,0 20 | 9321,1 21 | 9482,1 22 | 9481,0 23 | 9472,0 24 | 9471,1 25 | 9470,1 26 | 9469,0 27 | 9455,0 28 | 9454,1 29 | 9453,0 30 | 9451,1 31 | 9419,1 32 | 9418,0 33 | 9409,1 34 | 9402,0 35 | 9369,1 36 | 9349,1 37 | 9476,1 38 | 9433,1 39 | 9446,1 40 | 9443,1 41 | 9442,1 42 | 997,1 43 | 996,0 44 | 1068,1 45 | 1026,0 46 | 1585,1 47 | 1435,1 48 | 1169,1 49 | 1075,1 50 | 1132,1 51 | 1064,0 52 | 1063,0 53 | 1062,1 54 | 1247,1 55 | 991,1 56 | 1391,0 57 | 1374,0 58 | 1318,0 59 | 1254,1 60 | 1163,1 61 | 1153,1 62 | 1073,1 63 | 1072,1 64 | 394,0 65 | 7,0 66 | 203,1 67 | 236,0 68 | 367,0 69 | 123,1 70 | 395,0 71 | 196,1 72 | 264,0 73 | 185,1 74 | 297,0 75 | 318,0 76 | 323,0 77 | 79,0 78 | 198,0 79 | 371,0 80 | 280,0 81 | 12,1 82 | 347,1 83 | 268,1 84 | 288,1 85 | 377,1 86 | 252,0 87 | 296,1 88 | 341,1 89 | 156,0 90 | 397,1 91 | 263,0 92 | 48,1 93 | 45,1 94 | 210,0 95 | 83,0 96 | 372,1 97 | 393,0 98 | 205,1 99 | 87,0 100 | 62,1 101 | 309,1 102 | 222,1 103 | 160,1 104 | 229,1 105 | 369,0 106 | 227,1 107 | 230,0 108 | 217,0 109 | 194,1 110 | 391,1 111 | 340,1 112 | 167,0 113 | 266,1 114 | 208,1 115 | 335,1 116 | 213,1 117 | 388,1 118 | 331,0 119 | 216,1 120 | 277,1 121 | 85,0 122 | 310,1 123 | 389,1 124 | 387,0 125 | 191,0 126 | 224,0 127 | 82,1 128 | 225,1 129 | 295,1 130 | 169,1 131 | 375,0 132 | 292,1 133 | 215,1 134 | 337,1 135 | 320,1 136 | 171,0 137 | 346,1 138 | 151,1 139 | 137,1 140 | 291,0 141 | 385,0 142 | 84,1 143 | 349,0 144 | 219,0 145 | 322,1 146 | 204,0 147 | 273,1 
148 | 212,1 149 | 146,0 150 | 348,0 151 | 157,1 152 | 214,0 153 | 316,0 154 | 186,0 155 | 314,1 156 | 293,1 157 | 183,1 158 | 287,0 159 | 290,0 160 | 168,0 161 | 343,1 162 | 153,1 163 | 339,1 164 | 60,0 165 | 174,0 166 | 88,1 167 | 359,1 168 | 298,1 169 | 162,1 170 | 311,1 171 | 329,0 172 | 175,0 173 | 163,0 174 | 77,1 175 | 302,1 176 | 231,0 177 | 148,0 178 | 106,1 179 | 283,1 180 | 184,1 181 | 122,1 182 | 170,1 183 | 396,1 184 | 275,0 185 | 244,1 186 | 116,1 187 | 364,0 188 | 121,0 189 | 165,0 190 | 32,1 191 | 201,1 192 | 326,1 193 | 368,1 194 | 332,1 195 | 361,0 196 | 202,0 197 | 200,1 198 | 257,0 199 | 80,0 200 | 9,0 201 | 187,1 202 | 89,0 203 | 207,0 204 | 69,1 205 | 188,1 206 | 306,0 207 | 360,0 208 | 237,1 209 | 327,1 210 | 261,1 211 | 86,1 212 | 228,0 213 | 190,0 214 | 74,1 215 | 8,0 216 | 4,1 217 | 159,1 218 | 262,0 219 | 51,0 220 | 5,1 221 | 363,1 222 | 338,1 223 | 355,1 224 | 269,1 225 | 278,0 226 | 179,0 227 | 6,0 228 | 206,1 229 | 461,1 230 | 466,1 231 | 9490,1 232 | 9496,0 233 | 9504,1 234 | 9499,1 235 | 9308,0 236 | 9305,1 237 | 9302,1 238 | 9309,1 239 | 4980,0 240 | 5444,1 241 | 5394,1 242 | 5461,1 243 | 5494,1 244 | 5398,0 245 | 5466,1 246 | 5450,0 247 | 4675,0 248 | 4632,0 249 | 5769,1 250 | 4757,1 251 | 4827,0 252 | 4820,0 253 | 5159,1 254 | 5759,0 255 | 5739,1 256 | 5738,1 257 | 5770,1 258 | 5826,1 259 | 5745,1 260 | 5744,1 261 | 5774,1 262 | 5760,0 263 | 5746,1 264 | 5762,0 265 | 5711,1 266 | 5802,1 267 | 5740,0 268 | 5716,1 269 | 5772,1 270 | 5722,1 271 | 5751,1 272 | 5721,0 273 | 5812,0 274 | 5792,0 275 | 5735,1 276 | 5767,0 277 | 5817,1 278 | 5807,1 279 | 5777,1 280 | 5736,0 281 | 5763,1 282 | 5813,0 283 | 5741,1 284 | 5731,1 285 | 5819,1 286 | 5724,1 287 | 5789,0 288 | 5141,1 289 | 5175,1 290 | 5145,0 291 | 5469,0 292 | 5106,1 293 | 5299,0 294 | 5335,1 295 | 7121,1 296 | 7106,0 297 | 7104,0 298 | 7113,1 299 | 7116,0 300 | 7149,1 301 | 7228,1 302 | 7301,1 303 | 7109,1 304 | 6987,1 305 | 7028,1 306 | 7029,1 307 | 7030,1 308 | 7013,0 309 | 
7017,0 310 | 7032,1 311 | 7073,0 312 | 242,0 313 | 104,1 314 | 282,1 315 | 96,1 316 | 23,1 317 | 6102,0 318 | 6938,0 319 | 8304,1 320 | 8238,1 321 | 8386,0 322 | 8348,1 323 | 1416,1 324 | 6237,0 325 | 6226,1 326 | 6184,0 327 | 6174,0 328 | 6172,1 329 | 6171,1 330 | 6170,1 331 | 6151,0 332 | 6150,0 333 | 6149,1 334 | 6148,1 335 | 6147,0 336 | 6146,0 337 | 6145,0 338 | 6144,0 339 | 6142,1 340 | 6141,1 341 | 6137,0 342 | 6136,0 343 | 6131,1 344 | 6134,0 345 | 6133,1 346 | 6132,1 347 | 6129,0 348 | 6128,1 349 | 6127,1 350 | 6126,0 351 | 6125,0 352 | 6123,1 353 | 6122,1 354 | 6121,1 355 | 6119,1 356 | 6116,1 357 | 6115,0 358 | 6114,0 359 | 6111,1 360 | 6110,1 361 | 6108,0 362 | 6107,1 363 | 6106,1 364 | 6104,1 365 | 6103,0 366 | 6101,1 367 | 6100,1 368 | 6099,1 369 | 6098,0 370 | 6097,1 371 | 6095,1 372 | 6093,1 373 | 6092,1 374 | 6091,1 375 | 6090,1 376 | 6177,1 377 | 6221,1 378 | 6244,1 379 | 6241,1 380 | 6240,1 381 | 6238,1 382 | 6236,1 383 | 6235,1 384 | 6220,1 385 | 6218,1 386 | 6217,1 387 | 6216,0 388 | 6215,1 389 | 6214,1 390 | 6210,1 391 | 6209,0 392 | 6207,1 393 | 6201,0 394 | 6200,0 395 | 6199,1 396 | 6198,1 397 | 6197,1 398 | 6195,1 399 | 6194,1 400 | 6193,1 401 | 6192,1 402 | 6191,1 403 | 6189,1 404 | 6163,1 405 | 6154,1 406 | 8274,1 407 | 7192,1 408 | 7194,1 409 | 7210,0 410 | 7238,1 411 | 7245,0 412 | 7246,0 413 | 7256,0 414 | 7265,0 415 | 7268,1 416 | 366,1 417 | 5245,0 418 | 5264,0 419 | 7195,0 420 | 7262,0 421 | 7250,1 422 | 1925,1 423 | 5719,1 424 | 5798,1 425 | 5816,1 426 | 5821,0 427 | 5710,1 428 | 5715,1 429 | 5720,0 430 | 5733,1 431 | 5820,1 432 | 5737,1 433 | 5755,0 434 | 5781,1 435 | 5782,1 436 | 5784,1 437 | 5146,0 438 | 5133,0 439 | 5709,1 440 | 5712,0 441 | 5795,0 442 | 5750,0 443 | 5708,0 444 | 5749,1 445 | 5727,0 446 | 5780,0 447 | 5756,1 448 | 5723,1 449 | 5730,0 450 | 5717,0 451 | 5732,1 452 | 5804,1 453 | 5752,1 454 | 5799,1 455 | 5713,1 456 | 5728,1 457 | 5787,1 458 | 5788,1 459 | 5793,0 460 | 5803,0 461 | 5758,1 462 | 7317,0 463 | 
7034,1 464 | 615,0 465 | 627,0 466 | 607,0 467 | 631,0 468 | 623,0 469 | 719,1 470 | 640,0 471 | 827,0 472 | 895,1 473 | 946,1 474 | 936,0 475 | 7717,0 476 | 7787,1 477 | 7837,1 478 | 7847,1 479 | 7867,1 480 | 8077,1 481 | 8122,1 482 | 1743,0 483 | 1799,1 484 | 2175,1 485 | 2160,0 486 | 2171,0 487 | 2148,1 488 | 2180,0 489 | 2157,1 490 | 1948,0 491 | 1941,0 492 | 1949,0 493 | 1965,1 494 | 1981,1 495 | 1992,1 496 | 2016,1 497 | 2011,1 498 | 2019,1 499 | 2020,1 500 | 2151,1 501 | 1862,1 502 | 1872,0 503 | 1864,1 504 | 1871,1 505 | 1857,1 506 | 1865,0 507 | 1873,0 508 | 1850,0 509 | 1858,1 510 | 1874,1 511 | 1868,1 512 | 1829,1 513 | 1853,0 514 | 1926,0 515 | 1966,1 516 | 1918,1 517 | 1959,0 518 | 1936,1 519 | 1952,0 520 | 1960,1 521 | 1968,1 522 | 1938,0 523 | 1963,1 524 | 1720,0 525 | 1736,1 526 | 1744,1 527 | 1752,0 528 | 1729,1 529 | 1745,1 530 | 1753,0 531 | 1722,1 532 | 1730,0 533 | 1738,1 534 | 1724,1 535 | 1740,1 536 | 1733,1 537 | 1718,1 538 | 1726,1 539 | 1750,0 540 | 1782,1 541 | 1719,1 542 | 7566,1 543 | 1751,1 544 | 2214,0 545 | 2201,1 546 | 2204,1 547 | 2294,0 548 | 2280,1 549 | 2338,1 550 | 2292,1 551 | 2300,1 552 | 2316,0 553 | 2283,0 554 | 7584,1 555 | 7580,1 556 | 7578,0 557 | 7570,0 558 | 8608,0 559 | 8727,1 560 | 8760,1 561 | 8768,1 562 | 8616,1 563 | 8617,1 564 | 8770,1 565 | 8730,1 566 | 8619,1 567 | 8629,1 568 | 8612,1 569 | 8724,1 570 | 8631,0 571 | 8725,0 572 | 8640,1 573 | 8759,0 574 | 8774,1 575 | 8824,1 576 | 8557,0 577 | 8791,1 578 | 8777,1 579 | 8811,1 580 | 8787,0 581 | 8805,1 582 | 8534,1 583 | 8687,1 584 | 9045,1 585 | 8673,1 586 | 9041,1 587 | 8701,1 588 | 9053,0 589 | 8985,1 590 | 8957,1 591 | 8966,0 592 | 8695,1 593 | 8967,1 594 | 9004,1 595 | 8690,1 596 | 9012,1 597 | 8969,1 598 | 8961,1 599 | 8970,1 600 | 8954,0 601 | 8962,0 602 | 8965,1 603 | 8973,0 604 | 8975,1 605 | 9006,1 606 | 8976,1 607 | 9007,1 608 | 8977,0 609 | 8992,1 610 | 9008,0 611 | 9001,1 612 | 8719,1 613 | 8996,1 614 | 9011,1 615 | 1742,0 616 | 1749,1 617 | 999,1 
618 | 1061,0 619 | 1404,0 620 | 1552,1 621 | 1257,1 622 | 1158,1 623 | 1070,1 624 | 9452,1 625 | 417,1 626 | 421,0 627 | 407,0 628 | 424,1 629 | 402,1 630 | 403,1 631 | 404,1 632 | 428,0 633 | 429,1 634 | 409,1 635 | 413,0 636 | 5883,1 637 | 5848,0 638 | 6416,1 639 | 5838,1 640 | 6287,1 641 | 6417,0 642 | 5841,1 643 | 5894,1 644 | 5904,0 645 | 5913,0 646 | 5921,1 647 | 5939,1 648 | 5969,0 649 | 5895,0 650 | 5905,1 651 | 5914,0 652 | 5923,0 653 | 5932,1 654 | 5942,1 655 | 5961,1 656 | 5970,1 657 | 5884,0 658 | 5906,1 659 | 5924,0 660 | 5933,1 661 | 5943,1 662 | 5953,1 663 | 5963,0 664 | 5972,0 665 | 5888,0 666 | 5934,1 667 | 5898,1 668 | 5908,1 669 | 5919,0 670 | 5926,0 671 | 5935,1 672 | 5945,1 673 | 5955,1 674 | 5891,1 675 | 5900,0 676 | 5927,0 677 | 5946,1 678 | 5966,0 679 | 5975,0 680 | 5901,1 681 | 5911,0 682 | 5875,1 683 | 5948,1 684 | 5893,1 685 | 5902,1 686 | 5920,1 687 | 5938,1 688 | 5959,1 689 | 5968,1 690 | 5988,1 691 | 5999,1 692 | 6455,0 693 | 5979,1 694 | 6421,0 695 | 5991,1 696 | 5992,1 697 | 6004,0 698 | 6458,0 699 | 5982,1 700 | 5993,0 701 | 6425,0 702 | 6451,1 703 | 6309,1 704 | 5984,0 705 | 5994,0 706 | 6444,1 707 | 5997,0 708 | 6007,0 709 | 6445,0 710 | 6453,0 711 | 5998,1 712 | 6427,1 713 | 6435,0 714 | 6446,0 715 | 6403,1 716 | 5922,1 717 | 5915,0 718 | 5910,1 719 | 6401,0 720 | 6003,0 721 | 5899,0 722 | 6396,0 723 | 5873,0 724 | 6418,0 725 | 5874,1 726 | 5916,1 727 | 5878,1 728 | 5983,1 729 | 5990,1 730 | 5996,1 731 | 5940,1 732 | 5846,1 733 | 5871,0 734 | 6436,1 735 | 5872,1 736 | 5956,1 737 | 6402,1 738 | 4758,1 739 | 5285,0 740 | 9153,0 741 | 9137,0 742 | 9151,1 743 | 9143,0 744 | 9201,1 745 | 6173,0 746 | 6284,0 747 | 6276,1 748 | 6258,1 749 | 6252,0 750 | 6255,0 751 | 6166,0 752 | 6085,1 753 | 6025,1 754 | 6268,1 755 | 6180,1 756 | 6143,1 757 | 6041,1 758 | 5829,1 759 | 8427,1 760 | 8218,1 761 | 6023,0 762 | 5835,1 763 | 5831,1 764 | 5830,1 765 | 6039,0 766 | 6086,1 767 | 6413,1 768 | 6412,1 769 | 6411,1 770 | 6087,0 771 | 6077,0 772 | 
6076,1 773 | 6071,0 774 | 6069,0 775 | 6038,1 776 | 6036,1 777 | 6035,0 778 | 6034,1 779 | 6030,0 780 | 6024,0 781 | 6021,0 782 | 6019,1 783 | 6017,1 784 | 6013,0 785 | 6012,0 786 | 6011,1 787 | 6010,1 788 | 5870,0 789 | 5867,0 790 | 5865,1 791 | 5860,1 792 | 5836,1 793 | 6231,0 794 | 6212,1 795 | 6140,1 796 | 6138,0 797 | 6120,1 798 | 6118,0 799 | 6073,0 800 | 6022,1 801 | 6020,0 802 | 8227,1 803 | 8225,1 804 | 8230,0 805 | 5856,0 806 | 8307,1 807 | 1409,0 808 | 6959,0 809 | 7525,1 810 | 6961,1 811 | 6967,1 812 | 6973,1 813 | 6974,1 814 | 6976,1 815 | 7516,1 816 | 6979,1 817 | 6980,0 818 | 6982,1 819 | 6983,0 820 | 6985,1 821 | 6931,1 822 | 6043,0 823 | 6945,1 824 | 7519,1 825 | 7526,1 826 | 7523,1 827 | 6956,1 828 | 6960,1 829 | 7524,1 830 | 6963,1 831 | 6964,0 832 | 6965,0 833 | 6966,1 834 | 6969,1 835 | 6971,1 836 | 6975,1 837 | 7517,0 838 | 6978,1 839 | 6981,0 840 | 6984,0 841 | 6899,0 842 | 6903,0 843 | 6904,1 844 | 6905,0 845 | 6906,1 846 | 6909,1 847 | 6911,1 848 | 6916,1 849 | 8215,0 850 | 6921,1 851 | 6932,0 852 | 6046,0 853 | 6944,1 854 | 7515,1 855 | 7514,1 856 | 6962,1 857 | 6968,0 858 | 6972,1 859 | 6970,1 860 | 6977,1 861 | 8329,1 862 | 7379,0 863 | 7080,1 864 | 6744,1 865 | 7098,1 866 | 7158,0 867 | 7163,0 868 | 7165,0 869 | 7340,1 870 | 7372,0 871 | 7394,0 872 | 7397,1 873 | 281,1 874 | 8258,0 875 | 8259,1 876 | 8290,1 877 | 7461,1 878 | 7323,1 879 | 8254,0 880 | 8270,1 881 | 8233,1 882 | 8285,0 883 | 6016,1 884 | 8423,0 885 | 8237,0 886 | 6040,1 887 | 6064,0 888 | 6957,0 889 | 8369,1 890 | 8247,1 891 | 8426,0 892 | 8428,0 893 | 9058,1 894 | 8249,0 895 | 9057,1 896 | 7139,1 897 | 7307,1 898 | 7331,1 899 | 7337,1 900 | 7378,0 901 | 7405,0 902 | 66,0 903 | 149,1 904 | 328,1 905 | 334,0 906 | 2274,1 907 | 5753,1 908 | 6709,1 909 | 7000,0 910 | 6989,0 911 | 7031,1 912 | 7062,0 913 | 7460,1 914 | 7123,0 915 | 7128,1 916 | 7145,1 917 | 7147,0 918 | 7166,1 919 | 7255,0 920 | 7275,1 921 | 7258,0 922 | 7291,1 923 | 7310,0 924 | 7330,1 925 | 7333,0 926 | 
7411,0 927 | 178,0 928 | 378,1 929 | 8241,1 930 | 6988,0 931 | 8256,1 932 | 8796,0 933 | 8264,0 934 | 8265,0 935 | 8231,0 936 | 8271,0 937 | 6190,0 938 | 8275,0 939 | 8420,1 940 | 8283,1 941 | 8284,1 942 | 6008,1 943 | 8422,1 944 | 8296,1 945 | 8297,1 946 | 8300,1 947 | 8235,0 948 | 8306,0 949 | 8310,0 950 | 8236,1 951 | 8311,0 952 | 8314,0 953 | 8239,1 954 | 8240,0 955 | 8323,1 956 | 8242,0 957 | 8325,1 958 | 8326,1 959 | 8222,1 960 | 8430,1 961 | 6042,1 962 | 8335,1 963 | 8343,1 964 | 6074,0 965 | 8351,0 966 | 8353,0 967 | 8354,0 968 | 7296,1 969 | 8365,1 970 | 8374,1 971 | 8376,0 972 | 8378,0 973 | 8412,1 974 | 8387,0 975 | 8389,1 976 | 6243,1 977 | 7507,0 978 | 7343,1 979 | 6005,1 980 | 5729,1 981 | 5380,1 982 | 5381,0 983 | 5565,1 984 | 7011,1 985 | 7199,1 986 | 7224,1 987 | 7277,0 988 | 7490,1 989 | 7492,1 990 | 7300,0 991 | 7306,0 992 | 7408,1 993 | 7418,0 994 | 5887,0 995 | 5987,0 996 | 173,0 997 | 357,1 998 | 258,1 999 | 374,0 1000 | 94,1 1001 | 1859,1 1002 | 6188,1 1003 | 5207,0 1004 | 6448,1 1005 | 8312,1 1006 | 8313,0 1007 | 8334,1 1008 | 8337,1 1009 | 8357,1 1010 | 8366,0 1011 | 8411,0 1012 | 8388,1 1013 | 8395,1 1014 | 7014,1 1015 | 7035,0 1016 | 6810,1 1017 | 7498,0 1018 | 7506,0 1019 | 7390,0 1020 | 7284,1 1021 | 7081,1 1022 | 8243,0 1023 | 8245,1 1024 | 7033,0 1025 | 2150,1 1026 | 100000,1 1027 | 8266,0 1028 | 6897,1 1029 | 6898,1 1030 | 5837,0 1031 | 6907,1 1032 | 7438,1 1033 | 6910,1 1034 | 6913,1 1035 | 6914,0 1036 | 6918,1 1037 | 6919,1 1038 | 8214,1 1039 | 6924,1 1040 | 8424,1 1041 | 6926,0 1042 | 6928,1 1043 | 6933,1 1044 | 7520,1 1045 | 7521,0 1046 | 6936,0 1047 | 7522,1 1048 | 6937,0 1049 | 6939,0 1050 | 6900,1 1051 | 6901,1 1052 | 6908,1 1053 | 6009,0 1054 | 6915,0 1055 | 6917,1 1056 | 6920,0 1057 | 6922,0 1058 | 6923,0 1059 | 6927,0 1060 | 6929,1 1061 | 6930,0 1062 | 6940,0 1063 | 6942,1 1064 | 6943,1 1065 | 7518,1 1066 | 6946,1 1067 | 8213,0 1068 | 6951,1 1069 | 6958,1 1070 | 7305,0 1071 | 7376,1 1072 | 7386,1 1073 | 7404,1 1074 | 7403,0 
1075 | 7457,1 1076 | 7463,1 1077 | 7015,1 1078 | 7024,0 1079 | 7079,0 1080 | 7152,1 1081 | 7297,1 1082 | 7381,0 1083 | 7413,0 1084 | 7176,1 1085 | 7352,0 1086 | 2327,0 1087 | 7117,1 1088 | 7172,1 1089 | 7168,1 1090 | 7423,0 1091 | 7425,1 1092 | 7223,1 1093 | 7239,1 1094 | 7276,1 1095 | 7281,1 1096 | 7287,1 1097 | 7292,0 1098 | 7299,0 1099 | 7303,0 1100 | 7309,0 1101 | 7328,0 1102 | 7406,1 1103 | 2320,1 1104 | 7242,1 1105 | 7462,1 1106 | 5385,1 1107 | 5292,0 1108 | 5337,1 1109 | 5350,1 1110 | 5377,1 1111 | 5386,0 1112 | 5310,1 1113 | 5282,1 1114 | 5339,0 1115 | 5322,0 1116 | 5331,1 1117 | 5364,0 1118 | 5373,1 1119 | 4879,1 1120 | 7069,1 1121 | 7496,0 1122 | 7502,1 1123 | 7344,0 1124 | 7346,1 1125 | 7353,0 1126 | 7373,0 1127 | 7384,0 1128 | 81,1 1129 | 373,0 1130 | 383,0 1131 | 1867,0 1132 | 957,0 1133 | 998,1 1134 | 1006,1 1135 | 992,1 1136 | 1002,1 1137 | 1166,1 1138 | 9077,1 1139 | 9104,1 1140 | 9152,0 1141 | 9165,0 1142 | 9179,0 1143 | 6996,0 1144 | 7008,1 1145 | 6729,1 1146 | 7092,1 1147 | 7164,1 1148 | 7169,0 1149 | 7181,0 1150 | 7252,1 1151 | 7446,1 1152 | 7270,1 1153 | 7483,0 1154 | 7316,1 1155 | 7351,1 1156 | 7391,1 1157 | 1,1 1158 | 392,0 1159 | 379,1 1160 | 380,1 1161 | 267,1 1162 | 2057,0 1163 | 5742,0 1164 | 5056,1 1165 | 5122,1 1166 | 5158,0 1167 | 5832,0 1168 | 6994,0 1169 | 7002,1 1170 | 7026,0 1171 | 6730,0 1172 | 7075,1 1173 | 7126,1 1174 | 7227,0 1175 | 7229,0 1176 | 7449,1 1177 | 6847,0 1178 | 6953,1 1179 | 7320,1 1180 | 7354,0 1181 | 7283,0 1182 | 2,0 1183 | 386,1 1184 | 1716,1 1185 | 1967,1 1186 | 5785,1 1187 | 4802,1 1188 | 5116,0 1189 | 5202,1 1190 | 7071,1 1191 | 7064,1 1192 | 7078,1 1193 | 7094,1 1194 | 7141,1 1195 | 7143,0 1196 | 7151,0 1197 | 7150,0 1198 | 7424,0 1199 | 7178,1 1200 | 7188,1 1201 | 7201,1 1202 | 7206,1 1203 | 7205,1 1204 | 7231,0 1205 | 7244,1 1206 | 7260,1 1207 | 7263,1 1208 | 7280,0 1209 | 7282,0 1210 | 7472,0 1211 | 7382,0 1212 | 7392,1 1213 | 7477,1 1214 | 8610,0 1215 | 8692,1 1216 | 6727,1 1217 | 7105,1 1218 | 7479,1 
1219 | 7482,0 1220 | 7504,1 1221 | 7508,0 1222 | 7355,1 1223 | 5896,0 1224 | 166,1 1225 | 223,1 1226 | 126,1 1227 | 390,0 1228 | 321,1 1229 | 259,1 1230 | 362,1 1231 | 260,1 1232 | 91,0 1233 | 641,1 1234 | 5160,1 1235 | 5232,0 1236 | 5606,0 1237 | 5628,0 1238 | 7004,0 1239 | 7100,1 1240 | 7102,0 1241 | 7110,0 1242 | 7135,1 1243 | 7186,1 1244 | 7430,1 1245 | 2187,1 1246 | 6094,0 1247 | 6096,0 1248 | 6109,0 1249 | 6112,1 1250 | 6124,0 1251 | 6169,1 1252 | 6202,1 1253 | 6203,0 1254 | 6242,0 1255 | 6318,1 1256 | 6990,1 1257 | 6992,1 1258 | 6998,0 1259 | 4927,1 1260 | 4935,1 1261 | 4862,1 1262 | 5596,1 1263 | 5517,1 1264 | 5582,0 1265 | 5590,1 1266 | 5536,1 1267 | 5670,0 1268 | 5678,0 1269 | 5645,1 1270 | 2290,1 1271 | 5805,0 1272 | 4997,1 1273 | 5341,1 1274 | 6449,1 1275 | 1366,1 1276 | 1363,0 1277 | 1317,1 1278 | 1313,1 1279 | 1312,1 1280 | 1360,1 1281 | 1362,1 1282 | 1256,1 1283 | 9342,1 1284 | 9450,1 1285 | 9437,0 1286 | 9436,0 1287 | 9434,1 1288 | 9427,1 1289 | 9421,1 1290 | 9416,0 1291 | 9413,0 1292 | 9412,1 1293 | 9411,1 1294 | 9410,0 1295 | 9408,1 1296 | 9407,0 1297 | 9405,1 1298 | 9404,1 1299 | 9399,1 1300 | 9392,0 1301 | 9391,1 1302 | 9390,1 1303 | 9388,0 1304 | 9386,1 1305 | 9385,1 1306 | 9384,1 1307 | 9383,0 1308 | 9382,1 1309 | -------------------------------------------------------------------------------- /data/k_matrix.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grimmlab/permGWAS/3f7a1c2e3e4c63281f5719425ff9ac405f8d9cfc/data/k_matrix.h5 -------------------------------------------------------------------------------- /data/x_matrix.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grimmlab/permGWAS/3f7a1c2e3e4c63281f5719425ff9ac405f8d9cfc/data/x_matrix.h5 -------------------------------------------------------------------------------- /data/x_matrix.map: 
-------------------------------------------------------------------------------- 1 | 1 1_657 0 657 2 | 1 1_3102 0 3102 3 | 1 1_4648 0 4648 4 | 1 1_4880 0 4880 5 | 1 1_5975 0 5975 6 | 1 1_6063 0 6063 7 | 1 1_6449 0 6449 8 | 1 1_6514 0 6514 9 | 1 1_6603 0 6603 10 | 1 1_6768 0 6768 11 | 1 1_7601 0 7601 12 | 1 1_8193 0 8193 13 | 1 1_8617 0 8617 14 | 1 1_10219 0 10219 15 | 1 1_10449 0 10449 16 | 1 1_10969 0 10969 17 | 1 1_11493 0 11493 18 | 1 1_11696 0 11696 19 | 1 1_12584 0 12584 20 | 1 1_12659 0 12659 21 | 1 1_13045 0 13045 22 | 1 1_14385 0 14385 23 | 1 1_19819 0 19819 24 | 1 1_20892 0 20892 25 | 1 1_21043 0 21043 26 | 1 1_21128 0 21128 27 | 1 1_21829 0 21829 28 | 1 1_22522 0 22522 29 | 1 1_23838 0 23838 30 | 1 1_25315 0 25315 31 | 1 1_25365 0 25365 32 | 1 1_25773 0 25773 33 | 1 1_26288 0 26288 34 | 1 1_27265 0 27265 35 | 1 1_28948 0 28948 36 | 1 1_28978 0 28978 37 | 1 1_29291 0 29291 38 | 1 1_30529 0 30529 39 | 1 1_30683 0 30683 40 | 1 1_31515 0 31515 41 | 1 1_31926 0 31926 42 | 1 1_32210 0 32210 43 | 1 1_32807 0 32807 44 | 1 1_34125 0 34125 45 | 1 1_34599 0 34599 46 | 1 1_35856 0 35856 47 | 1 1_37072 0 37072 48 | 1 1_38946 0 38946 49 | 1 1_39751 0 39751 50 | 1 1_41178 0 41178 51 | 1 1_41427 0 41427 52 | 1 1_44567 0 44567 53 | 1 1_45075 0 45075 54 | 1 1_45580 0 45580 55 | 1 1_45683 0 45683 56 | 1 1_46373 0 46373 57 | 1 1_46499 0 46499 58 | 1 1_46912 0 46912 59 | 1 1_47577 0 47577 60 | 1 1_47692 0 47692 61 | 1 1_48118 0 48118 62 | 1 1_48181 0 48181 63 | 1 1_49080 0 49080 64 | 1 1_51392 0 51392 65 | 1 1_51706 0 51706 66 | 1 1_51878 0 51878 67 | 1 1_52202 0 52202 68 | 1 1_53183 0 53183 69 | 1 1_53729 0 53729 70 | 1 1_53901 0 53901 71 | 1 1_55684 0 55684 72 | 1 1_57136 0 57136 73 | 1 1_57686 0 57686 74 | 1 1_59637 0 59637 75 | 1 1_60083 0 60083 76 | 1 1_60772 0 60772 77 | 1 1_61122 0 61122 78 | 1 1_61266 0 61266 79 | 1 1_61405 0 61405 80 | 1 1_61661 0 61661 81 | 1 1_62259 0 62259 82 | 1 1_62935 0 62935 83 | 1 1_63084 0 63084 84 | 1 1_63645 0 63645 85 | 1 1_63759 0 63759 
86 | 1 1_63915 0 63915 87 | 1 1_64149 0 64149 88 | 1 1_64651 0 64651 89 | 1 1_68340 0 68340 90 | 1 1_68880 0 68880 91 | 1 1_69311 0 69311 92 | 1 1_70933 0 70933 93 | 1 1_71326 0 71326 94 | 1 1_71348 0 71348 95 | 1 1_71868 0 71868 96 | 1 1_72138 0 72138 97 | 1 1_72756 0 72756 98 | 1 1_72894 0 72894 99 | 1 1_72924 0 72924 100 | 1 1_73047 0 73047 101 | 1 1_73467 0 73467 102 | 1 1_73691 0 73691 103 | 1 1_73851 0 73851 104 | 1 1_73989 0 73989 105 | 1 1_74169 0 74169 106 | 1 1_74707 0 74707 107 | 1 1_75481 0 75481 108 | 1 1_75721 0 75721 109 | 1 1_75899 0 75899 110 | 1 1_76188 0 76188 111 | 1 1_76217 0 76217 112 | 1 1_76847 0 76847 113 | 1 1_76879 0 76879 114 | 1 1_76906 0 76906 115 | 1 1_77127 0 77127 116 | 1 1_77140 0 77140 117 | 1 1_77243 0 77243 118 | 1 1_77458 0 77458 119 | 1 1_78803 0 78803 120 | 1 1_78975 0 78975 121 | 1 1_79418 0 79418 122 | 1 1_80216 0 80216 123 | 1 1_80374 0 80374 124 | 1 1_80400 0 80400 125 | 1 1_81068 0 81068 126 | 1 1_81496 0 81496 127 | 1 1_81854 0 81854 128 | 1 1_81869 0 81869 129 | 1 1_82197 0 82197 130 | 1 1_82290 0 82290 131 | 1 1_83117 0 83117 132 | 1 1_83177 0 83177 133 | 1 1_83219 0 83219 134 | 1 1_84144 0 84144 135 | 1 1_84379 0 84379 136 | 1 1_84558 0 84558 137 | 1 1_85561 0 85561 138 | 1 1_85860 0 85860 139 | 1 1_86656 0 86656 140 | 1 1_87060 0 87060 141 | 1 1_87791 0 87791 142 | 1 1_87985 0 87985 143 | 1 1_88300 0 88300 144 | 1 1_88658 0 88658 145 | 1 1_89312 0 89312 146 | 1 1_90606 0 90606 147 | 1 1_92353 0 92353 148 | 1 1_92866 0 92866 149 | 1 1_93562 0 93562 150 | 1 1_93740 0 93740 151 | -------------------------------------------------------------------------------- /data/y_matrix.csv: -------------------------------------------------------------------------------- 1 | accession_id,phenotype_value 2 | 9381, 3 | 9380, 4 | 9378, 5 | 9371, 6 | 9367, 7 | 9363, 8 | 9356, 9 | 9355, 10 | 9354, 11 | 9353, 12 | 9352, 13 | 9351, 14 | 9344, 15 | 9343, 16 | 9339, 17 | 9336, 18 | 9332, 19 | 9323, 20 | 9321, 21 | 9482, 22 | 9481, 23 | 
9472, 24 | 9471, 25 | 9470, 26 | 9469, 27 | 9455, 28 | 9454, 29 | 9453, 30 | 9451, 31 | 9419, 32 | 9418, 33 | 9409, 34 | 9402, 35 | 9369, 36 | 9349, 37 | 9476, 38 | 9433, 39 | 9446, 40 | 9443, 41 | 9442, 42 | 997, 43 | 996, 44 | 1068, 45 | 1026, 46 | 1585, 47 | 1435, 48 | 1169, 49 | 1075, 50 | 1132, 51 | 1064, 52 | 1063, 53 | 1062, 54 | 1247, 55 | 991, 56 | 1391, 57 | 1374, 58 | 1318, 59 | 1254, 60 | 1163, 61 | 1153, 62 | 1073, 63 | 1072, 64 | 394, 65 | 7, 66 | 203, 67 | 236, 68 | 367, 69 | 123, 70 | 395, 71 | 196, 72 | 264, 73 | 185, 74 | 297, 75 | 318, 76 | 323, 77 | 79, 78 | 198, 79 | 371, 80 | 280, 81 | 12, 82 | 347, 83 | 268, 84 | 288, 85 | 377, 86 | 252, 87 | 296, 88 | 341, 89 | 156, 90 | 397, 91 | 263, 92 | 48, 93 | 45, 94 | 210, 95 | 83, 96 | 372, 97 | 393, 98 | 205, 99 | 87, 100 | 62, 101 | 309, 102 | 222, 103 | 160, 104 | 229, 105 | 369, 106 | 227, 107 | 230, 108 | 217, 109 | 194, 110 | 391, 111 | 340, 112 | 167, 113 | 266, 114 | 208, 115 | 335, 116 | 213, 117 | 388, 118 | 331, 119 | 216, 120 | 277, 121 | 85, 122 | 310, 123 | 389, 124 | 387, 125 | 191, 126 | 224, 127 | 82, 128 | 225, 129 | 295, 130 | 169, 131 | 375, 132 | 292, 133 | 215, 134 | 337, 135 | 320, 136 | 171, 137 | 346, 138 | 151, 139 | 137, 140 | 291, 141 | 385, 142 | 84, 143 | 349, 144 | 219, 145 | 322, 146 | 204, 147 | 273, 148 | 212, 149 | 146, 150 | 348, 151 | 157, 152 | 214, 153 | 316, 154 | 186, 155 | 314, 156 | 293, 157 | 183, 158 | 287, 159 | 290, 160 | 168, 161 | 343, 162 | 153, 163 | 339, 164 | 60, 165 | 174, 166 | 88, 167 | 359, 168 | 298, 169 | 162, 170 | 311, 171 | 329, 172 | 175, 173 | 163, 174 | 77, 175 | 302, 176 | 231, 177 | 148, 178 | 106, 179 | 283, 180 | 184, 181 | 122, 182 | 170, 183 | 396, 184 | 275, 185 | 244, 186 | 116, 187 | 364, 188 | 121, 189 | 165, 190 | 32, 191 | 201, 192 | 326, 193 | 368, 194 | 332, 195 | 361, 196 | 202, 197 | 200, 198 | 257, 199 | 80, 200 | 9, 201 | 187, 202 | 89, 203 | 207, 204 | 69, 205 | 188, 206 | 306, 207 | 360, 208 | 237, 209 | 327, 210 | 
261, 211 | 86, 212 | 228, 213 | 190, 214 | 74, 215 | 8, 216 | 4, 217 | 159, 218 | 262, 219 | 51, 220 | 5, 221 | 363, 222 | 338, 223 | 355, 224 | 269, 225 | 278, 226 | 179, 227 | 6, 228 | 206, 229 | 461, 230 | 466, 231 | 9490, 232 | 9496, 233 | 9504, 234 | 9499, 235 | 9308, 236 | 9305, 237 | 9302, 238 | 9309, 239 | 4980, 240 | 5444, 241 | 5394, 242 | 5461, 243 | 5494, 244 | 5398, 245 | 5466, 246 | 5450, 247 | 4675, 248 | 4632, 249 | 5769, 250 | 4757, 251 | 4827, 252 | 4820, 253 | 5159, 254 | 5759, 255 | 5739, 256 | 5738, 257 | 5770, 258 | 5826, 259 | 5745, 260 | 5744, 261 | 5774, 262 | 5760, 263 | 5746, 264 | 5762, 265 | 5711, 266 | 5802, 267 | 5740, 268 | 5716, 269 | 5772, 270 | 5722, 271 | 5751, 272 | 5721, 273 | 5812, 274 | 5792, 275 | 5735, 276 | 5767, 277 | 5817, 278 | 5807, 279 | 5777, 280 | 5736, 281 | 5763, 282 | 5813, 283 | 5741, 284 | 5731, 285 | 5819, 286 | 5724, 287 | 5789, 288 | 5141, 289 | 5175, 290 | 5145, 291 | 5469, 292 | 5106, 293 | 5299, 294 | 5335, 295 | 7121, 296 | 7106, 297 | 7104, 298 | 7113, 299 | 7116, 300 | 7149, 301 | 7228, 302 | 7301, 303 | 7109, 304 | 6987, 305 | 7028, 306 | 7029, 307 | 7030, 308 | 7013, 309 | 7017, 310 | 7032, 311 | 7073, 312 | 242, 313 | 104, 314 | 282, 315 | 96, 316 | 23, 317 | 6102, 318 | 6938, 319 | 8304, 320 | 8238, 321 | 8386, 322 | 8348, 323 | 1416, 324 | 6237, 325 | 6226, 326 | 6184, 327 | 6174, 328 | 6172, 329 | 6171, 330 | 6170, 331 | 6151, 332 | 6150, 333 | 6149, 334 | 6148, 335 | 6147, 336 | 6146, 337 | 6145, 338 | 6144, 339 | 6142, 340 | 6141, 341 | 6137, 342 | 6136, 343 | 6131, 344 | 6134, 345 | 6133, 346 | 6132, 347 | 6129, 348 | 6128, 349 | 6127, 350 | 6126, 351 | 6125, 352 | 6123, 353 | 6122, 354 | 6121, 355 | 6119, 356 | 6116, 357 | 6115, 358 | 6114, 359 | 6111, 360 | 6110, 361 | 6108, 362 | 6107, 363 | 6106, 364 | 6104, 365 | 6103, 366 | 6101, 367 | 6100, 368 | 6099, 369 | 6098, 370 | 6097, 371 | 6095, 372 | 6093, 373 | 6092, 374 | 6091, 375 | 6090, 376 | 6177, 377 | 6221, 378 | 6244, 379 | 6241, 380 
| 6240, 381 | 6238, 382 | 6236, 383 | 6235, 384 | 6220, 385 | 6218, 386 | 6217, 387 | 6216, 388 | 6215, 389 | 6214, 390 | 6210, 391 | 6209, 392 | 6207, 393 | 6201, 394 | 6200, 395 | 6199, 396 | 6198, 397 | 6197, 398 | 6195, 399 | 6194, 400 | 6193, 401 | 6192, 402 | 6191, 403 | 6189, 404 | 6163, 405 | 6154, 406 | 8274,74.0 407 | 7192, 408 | 7194, 409 | 7210, 410 | 7238, 411 | 7245, 412 | 7246, 413 | 7256, 414 | 7265, 415 | 7268, 416 | 366, 417 | 5245, 418 | 5264, 419 | 7195, 420 | 7262, 421 | 7250, 422 | 1925, 423 | 5719, 424 | 5798, 425 | 5816, 426 | 5821, 427 | 5710, 428 | 5715, 429 | 5720, 430 | 5733, 431 | 5820, 432 | 5737, 433 | 5755, 434 | 5781, 435 | 5782, 436 | 5784, 437 | 5146, 438 | 5133, 439 | 5709, 440 | 5712, 441 | 5795, 442 | 5750, 443 | 5708, 444 | 5749, 445 | 5727, 446 | 5780, 447 | 5756, 448 | 5723, 449 | 5730, 450 | 5717, 451 | 5732, 452 | 5804, 453 | 5752, 454 | 5799, 455 | 5713, 456 | 5728, 457 | 5787, 458 | 5788, 459 | 5793, 460 | 5803, 461 | 5758, 462 | 7317, 463 | 7034, 464 | 615, 465 | 627, 466 | 607, 467 | 631, 468 | 623, 469 | 719, 470 | 640, 471 | 827, 472 | 895, 473 | 946, 474 | 936, 475 | 7717, 476 | 7787, 477 | 7837, 478 | 7847, 479 | 7867, 480 | 8077, 481 | 8122, 482 | 1743, 483 | 1799, 484 | 2175, 485 | 2160, 486 | 2171, 487 | 2148, 488 | 2180, 489 | 2157, 490 | 1948, 491 | 1941, 492 | 1949, 493 | 1965, 494 | 1981, 495 | 1992, 496 | 2016, 497 | 2011, 498 | 2019, 499 | 2020, 500 | 2151, 501 | 1862, 502 | 1872, 503 | 1864, 504 | 1871, 505 | 1857, 506 | 1865, 507 | 1873, 508 | 1850, 509 | 1858, 510 | 1874, 511 | 1868, 512 | 1829, 513 | 1853, 514 | 1926, 515 | 1966, 516 | 1918, 517 | 1959, 518 | 1936, 519 | 1952, 520 | 1960, 521 | 1968, 522 | 1938, 523 | 1963, 524 | 1720, 525 | 1736, 526 | 1744, 527 | 1752, 528 | 1729, 529 | 1745, 530 | 1753, 531 | 1722, 532 | 1730, 533 | 1738, 534 | 1724, 535 | 1740, 536 | 1733, 537 | 1718, 538 | 1726, 539 | 1750, 540 | 1782, 541 | 1719, 542 | 7566, 543 | 1751, 544 | 2214, 545 | 2201, 546 | 2204, 547 | 
2294, 548 | 2280, 549 | 2338, 550 | 2292, 551 | 2300, 552 | 2316, 553 | 2283, 554 | 7584, 555 | 7580, 556 | 7578, 557 | 7570, 558 | 8608, 559 | 8727, 560 | 8760, 561 | 8768, 562 | 8616, 563 | 8617, 564 | 8770, 565 | 8730, 566 | 8619, 567 | 8629, 568 | 8612, 569 | 8724, 570 | 8631, 571 | 8725, 572 | 8640, 573 | 8759, 574 | 8774, 575 | 8824, 576 | 8557, 577 | 8791, 578 | 8777, 579 | 8811, 580 | 8787, 581 | 8805, 582 | 8534, 583 | 8687, 584 | 9045, 585 | 8673, 586 | 9041, 587 | 8701, 588 | 9053, 589 | 8985, 590 | 8957, 591 | 8966, 592 | 8695, 593 | 8967, 594 | 9004, 595 | 8690, 596 | 9012, 597 | 8969, 598 | 8961, 599 | 8970, 600 | 8954, 601 | 8962, 602 | 8965, 603 | 8973, 604 | 8975, 605 | 9006, 606 | 8976, 607 | 9007, 608 | 8977, 609 | 8992, 610 | 9008, 611 | 9001, 612 | 8719, 613 | 8996, 614 | 9011, 615 | 1742, 616 | 1749, 617 | 999, 618 | 1061, 619 | 1404, 620 | 1552, 621 | 1257, 622 | 1158, 623 | 1070, 624 | 9452, 625 | 417, 626 | 421, 627 | 407, 628 | 424, 629 | 402, 630 | 403, 631 | 404, 632 | 428, 633 | 429, 634 | 409, 635 | 413, 636 | 5883, 637 | 5848, 638 | 6416, 639 | 5838, 640 | 6287, 641 | 6417, 642 | 5841, 643 | 5894, 644 | 5904, 645 | 5913, 646 | 5921, 647 | 5939, 648 | 5969, 649 | 5895, 650 | 5905, 651 | 5914, 652 | 5923, 653 | 5932, 654 | 5942, 655 | 5961, 656 | 5970, 657 | 5884, 658 | 5906, 659 | 5924, 660 | 5933, 661 | 5943, 662 | 5953, 663 | 5963, 664 | 5972, 665 | 5888, 666 | 5934, 667 | 5898, 668 | 5908, 669 | 5919, 670 | 5926, 671 | 5935, 672 | 5945, 673 | 5955, 674 | 5891, 675 | 5900, 676 | 5927, 677 | 5946, 678 | 5966, 679 | 5975, 680 | 5901, 681 | 5911, 682 | 5875, 683 | 5948, 684 | 5893, 685 | 5902, 686 | 5920, 687 | 5938, 688 | 5959, 689 | 5968, 690 | 5988, 691 | 5999, 692 | 6455, 693 | 5979, 694 | 6421, 695 | 5991, 696 | 5992, 697 | 6004, 698 | 6458, 699 | 5982, 700 | 5993, 701 | 6425, 702 | 6451, 703 | 6309, 704 | 5984, 705 | 5994, 706 | 6444, 707 | 5997, 708 | 6007, 709 | 6445, 710 | 6453, 711 | 5998, 712 | 6427, 713 | 6435, 714 | 6446, 
715 | 6403, 716 | 5922, 717 | 5915, 718 | 5910, 719 | 6401, 720 | 6003, 721 | 5899, 722 | 6396, 723 | 5873, 724 | 6418, 725 | 5874, 726 | 5916, 727 | 5878, 728 | 5983, 729 | 5990, 730 | 5996, 731 | 5940, 732 | 5846, 733 | 5871, 734 | 6436, 735 | 5872, 736 | 5956, 737 | 6402, 738 | 4758, 739 | 5285, 740 | 9153, 741 | 9137, 742 | 9151, 743 | 9143, 744 | 9201, 745 | 6173, 746 | 6284, 747 | 6276, 748 | 6258, 749 | 6252, 750 | 6255, 751 | 6166, 752 | 6085, 753 | 6025, 754 | 6268, 755 | 6180, 756 | 6143, 757 | 6041, 758 | 5829, 759 | 8427, 760 | 8218, 761 | 6023, 762 | 5835, 763 | 5831, 764 | 5830, 765 | 6039, 766 | 6086, 767 | 6413, 768 | 6412, 769 | 6411, 770 | 6087, 771 | 6077, 772 | 6076, 773 | 6071, 774 | 6069, 775 | 6038, 776 | 6036, 777 | 6035, 778 | 6034, 779 | 6030, 780 | 6024, 781 | 6021, 782 | 6019, 783 | 6017, 784 | 6013, 785 | 6012, 786 | 6011, 787 | 6010, 788 | 5870, 789 | 5867, 790 | 5865, 791 | 5860, 792 | 5836, 793 | 6231, 794 | 6212, 795 | 6140, 796 | 6138, 797 | 6120, 798 | 6118, 799 | 6073, 800 | 6022, 801 | 6020, 802 | 8227, 803 | 8225, 804 | 8230,97.0 805 | 5856, 806 | 8307, 807 | 1409, 808 | 6959,51.0 809 | 7525,46.0 810 | 6961,46.0 811 | 6967,44.0 812 | 6973,53.0 813 | 6974,103.0 814 | 6976,56.0 815 | 7516,100.0 816 | 6979,44.0 817 | 6980,51.0 818 | 6982,49.0 819 | 6983,71.0 820 | 6985,56.0 821 | 6931,46.0 822 | 6043,90.0 823 | 6945,55.0 824 | 7519,76.0 825 | 7526,53.0 826 | 7523,57.0 827 | 6956,69.0 828 | 6960,47.0 829 | 7524,51.0 830 | 6963,60.0 831 | 6964,93.0 832 | 6965,102.0 833 | 6966,53.0 834 | 6969,70.0 835 | 6971,51.0 836 | 6975,51.0 837 | 7517,107.0 838 | 6978,49.0 839 | 6981,44.0 840 | 6984,53.0 841 | 6899,54.0 842 | 6903,57.0 843 | 6904,66.0 844 | 6905,65.0 845 | 6906,43.0 846 | 6909,51.0 847 | 6911,46.0 848 | 6916,63.0 849 | 8215,51.0 850 | 6921,64.0 851 | 6932,51.0 852 | 6046,93.0 853 | 6944,49.0 854 | 7515,49.0 855 | 7514,58.0 856 | 6962,52.0 857 | 6968,71.0 858 | 6972,63.0 859 | 6970,48.0 860 | 6977,49.0 861 | 8329,46.0 862 | 7379, 
863 | 7080, 864 | 6744, 865 | 7098, 866 | 7158, 867 | 7163,57.0 868 | 7165, 869 | 7340, 870 | 7372, 871 | 7394, 872 | 7397, 873 | 281, 874 | 8258,73.0 875 | 8259,73.0 876 | 8290,50.0 877 | 7461,61.0 878 | 7323,56.0 879 | 8254,52.0 880 | 8270,49.0 881 | 8233,59.0 882 | 8285,70.0 883 | 6016,75.0 884 | 8423,70.0 885 | 8237,97.0 886 | 6040,71.0 887 | 6064,96.0 888 | 6957,84.0 889 | 8369,76.0 890 | 8247,87.0 891 | 8426,49.0 892 | 8428, 893 | 9058,101.0 894 | 8249,81.0 895 | 9057,76.0 896 | 7139, 897 | 7307, 898 | 7331, 899 | 7337, 900 | 7378, 901 | 7405, 902 | 66, 903 | 149, 904 | 328, 905 | 334, 906 | 2274, 907 | 5753, 908 | 6709,52.0 909 | 7000,65.0 910 | 6989, 911 | 7031, 912 | 7062,46.0 913 | 7460,49.0 914 | 7123,59.5 915 | 7128, 916 | 7145, 917 | 7147,71.0 918 | 7166, 919 | 7255,46.0 920 | 7275,46.0 921 | 7258, 922 | 7291, 923 | 7310, 924 | 7330, 925 | 7333, 926 | 7411, 927 | 178, 928 | 378, 929 | 8241,73.0 930 | 6988,48.0 931 | 8256,61.0 932 | 8796, 933 | 8264,46.0 934 | 8265,44.0 935 | 8231,91.0 936 | 8271,49.0 937 | 6190, 938 | 8275,68.0 939 | 8420,56.0 940 | 8283,71.0 941 | 8284,61.0 942 | 6008,60.0 943 | 8422,106.0 944 | 8296,45.0 945 | 8297,73.0 946 | 8300,61.0 947 | 8235,60.0 948 | 8306,96.0 949 | 8310,49.0 950 | 8236,91.0 951 | 8311,49.0 952 | 8314,64.0 953 | 8239,52.0 954 | 8240,93.0 955 | 8323,51.0 956 | 8242,120.0 957 | 8325,49.0 958 | 8326,67.0 959 | 8222,90.0 960 | 8430, 961 | 6042,56.0 962 | 8335,104.0 963 | 8343,62.0 964 | 6074,91.0 965 | 8351,78.0 966 | 8353,41.0 967 | 8354,70.0 968 | 7296,70.0 969 | 8365,51.0 970 | 8374,59.0 971 | 8376,84.0 972 | 8378,56.0 973 | 8412, 974 | 8387,52.0 975 | 8389,63.0 976 | 6243,56.0 977 | 7507, 978 | 7343, 979 | 6005, 980 | 5729, 981 | 5380, 982 | 5381, 983 | 5565, 984 | 7011, 985 | 7199, 986 | 7224, 987 | 7277, 988 | 7490, 989 | 7492, 990 | 7300, 991 | 7306,60.0 992 | 7408, 993 | 7418,63.0 994 | 5887, 995 | 5987, 996 | 173, 997 | 357, 998 | 258, 999 | 374, 1000 | 94, 1001 | 1859, 1002 | 6188, 1003 | 5207, 1004 | 
6448, 1005 | 8312,66.0 1006 | 8313,49.0 1007 | 8334,64.0 1008 | 8337,70.0 1009 | 8357, 1010 | 8366, 1011 | 8411, 1012 | 8388,60.0 1013 | 8395,69.0 1014 | 7014,92.0 1015 | 7035, 1016 | 6810, 1017 | 7498, 1018 | 7506, 1019 | 7390, 1020 | 7284, 1021 | 7081,46.0 1022 | 8243,66.0 1023 | 8245,46.0 1024 | 7033,76.0 1025 | 2150, 1026 | 100000,58.0 1027 | 8266,99.0 1028 | 6897,62.0 1029 | 6898,41.0 1030 | 5837,57.0 1031 | 6907,58.0 1032 | 7438,75.0 1033 | 6910,49.0 1034 | 6913,99.0 1035 | 6914,73.0 1036 | 6918,108.0 1037 | 6919,71.0 1038 | 8214,51.0 1039 | 6924,49.0 1040 | 8424,46.0 1041 | 6926,49.0 1042 | 6928,55.0 1043 | 6933,56.0 1044 | 7520,60.0 1045 | 7521,60.0 1046 | 6936,67.0 1047 | 7522,83.0 1048 | 6937,65.0 1049 | 6939,49.0 1050 | 6900,90.0 1051 | 6901,86.0 1052 | 6908,49.0 1053 | 6009,98.0 1054 | 6915,53.0 1055 | 6917,121.0 1056 | 6920,71.0 1057 | 6922,48.0 1058 | 6923,44.0 1059 | 6927,51.0 1060 | 6929,71.0 1061 | 6930,49.0 1062 | 6940,49.0 1063 | 6942,46.0 1064 | 6943,49.0 1065 | 7518,103.0 1066 | 6946,62.0 1067 | 8213,44.0 1068 | 6951,68.0 1069 | 6958,49.0 1070 | 7305, 1071 | 7376, 1072 | 7386, 1073 | 7404, 1074 | 7403, 1075 | 7457, 1076 | 7463, 1077 | 7015, 1078 | 7024, 1079 | 7079, 1080 | 7152, 1081 | 7297, 1082 | 7381, 1083 | 7413, 1084 | 7176, 1085 | 7352, 1086 | 2327, 1087 | 7117, 1088 | 7172, 1089 | 7168, 1090 | 7423, 1091 | 7425, 1092 | 7223, 1093 | 7239, 1094 | 7276, 1095 | 7281, 1096 | 7287, 1097 | 7292, 1098 | 7299, 1099 | 7303, 1100 | 7309, 1101 | 7328, 1102 | 7406, 1103 | 2320, 1104 | 7242, 1105 | 7462, 1106 | 5385, 1107 | 5292, 1108 | 5337, 1109 | 5350, 1110 | 5377, 1111 | 5386, 1112 | 5310, 1113 | 5282, 1114 | 5339, 1115 | 5322, 1116 | 5331, 1117 | 5364, 1118 | 5373, 1119 | 4879, 1120 | 7069, 1121 | 7496, 1122 | 7502, 1123 | 7344, 1124 | 7346,64.0 1125 | 7353, 1126 | 7373, 1127 | 7384, 1128 | 81, 1129 | 373, 1130 | 383, 1131 | 1867, 1132 | 957, 1133 | 998, 1134 | 1006, 1135 | 992, 1136 | 1002, 1137 | 1166, 1138 | 9077, 1139 | 9104, 1140 | 9152, 
1141 | 9165, 1142 | 9179, 1143 | 6996, 1144 | 7008, 1145 | 6729, 1146 | 7092, 1147 | 7164, 1148 | 7169, 1149 | 7181, 1150 | 7252, 1151 | 7446, 1152 | 7270, 1153 | 7483, 1154 | 7316, 1155 | 7351, 1156 | 7391, 1157 | 1, 1158 | 392, 1159 | 379, 1160 | 380, 1161 | 267, 1162 | 2057, 1163 | 5742, 1164 | 5056, 1165 | 5122, 1166 | 5158, 1167 | 5832, 1168 | 6994, 1169 | 7002, 1170 | 7026, 1171 | 6730, 1172 | 7075, 1173 | 7126, 1174 | 7227, 1175 | 7229, 1176 | 7449, 1177 | 6847, 1178 | 6953, 1179 | 7320, 1180 | 7354, 1181 | 7283, 1182 | 2, 1183 | 386, 1184 | 1716, 1185 | 1967, 1186 | 5785, 1187 | 4802, 1188 | 5116, 1189 | 5202, 1190 | 7071, 1191 | 7064,79.0 1192 | 7078, 1193 | 7094,58.5 1194 | 7141, 1195 | 7143, 1196 | 7151, 1197 | 7150, 1198 | 7424,43.0 1199 | 7178, 1200 | 7188, 1201 | 7201, 1202 | 7206, 1203 | 7205, 1204 | 7231,46.0 1205 | 7244, 1206 | 7260, 1207 | 7263, 1208 | 7280, 1209 | 7282,51.0 1210 | 7472, 1211 | 7382, 1212 | 7392, 1213 | 7477,59.0 1214 | 8610, 1215 | 8692, 1216 | 6727, 1217 | 7105, 1218 | 7479, 1219 | 7482, 1220 | 7504, 1221 | 7508, 1222 | 7355, 1223 | 5896, 1224 | 166, 1225 | 223, 1226 | 126, 1227 | 390, 1228 | 321, 1229 | 259, 1230 | 362, 1231 | 260, 1232 | 91, 1233 | 641, 1234 | 5160, 1235 | 5232, 1236 | 5606, 1237 | 5628, 1238 | 7004, 1239 | 7100, 1240 | 7102, 1241 | 7110, 1242 | 7135, 1243 | 7186, 1244 | 7430, 1245 | 2187, 1246 | 6094, 1247 | 6096, 1248 | 6109, 1249 | 6112, 1250 | 6124, 1251 | 6169, 1252 | 6202, 1253 | 6203, 1254 | 6242, 1255 | 6318, 1256 | 6990, 1257 | 6992, 1258 | 6998, 1259 | 4927, 1260 | 4935, 1261 | 4862, 1262 | 5596, 1263 | 5517, 1264 | 5582, 1265 | 5590, 1266 | 5536, 1267 | 5670, 1268 | 5678, 1269 | 5645, 1270 | 2290, 1271 | 5805, 1272 | 4997, 1273 | 5341, 1274 | 6449, 1275 | 1366, 1276 | 1363, 1277 | 1317, 1278 | 1313, 1279 | 1312, 1280 | 1360, 1281 | 1362, 1282 | 1256, 1283 | 9342, 1284 | 9450, 1285 | 9437, 1286 | 9436, 1287 | 9434, 1288 | 9427, 1289 | 9421, 1290 | 9416, 1291 | 9413, 1292 | 9412, 1293 | 9411, 1294 | 
9410, 1295 | 9408, 1296 | 9407, 1297 | 9405, 1298 | 9404, 1299 | 9399, 1300 | 9392, 1301 | 9391, 1302 | 9390, 1303 | 9388, 1304 | 9386, 1305 | 9385, 1306 | 9384, 1307 | 9383, 1308 | 9382, 1309 | -------------------------------------------------------------------------------- /data/y_matrix.pheno: -------------------------------------------------------------------------------- 1 | FID IID phenotype_value 2 | 5837 5837 57.0 3 | 6008 6008 60.0 4 | 6009 6009 98.0 5 | 6016 6016 75.0 6 | 6040 6040 71.0 7 | 6042 6042 56.0 8 | 6043 6043 90.0 9 | 6046 6046 93.0 10 | 6064 6064 96.0 11 | 6074 6074 91.0 12 | 6243 6243 56.0 13 | 6709 6709 52.0 14 | 6897 6897 62.0 15 | 6898 6898 41.0 16 | 6899 6899 54.0 17 | 6900 6900 90.0 18 | 6901 6901 86.0 19 | 6903 6903 57.0 20 | 6904 6904 66.0 21 | 6905 6905 65.0 22 | 6906 6906 43.0 23 | 6907 6907 58.0 24 | 6908 6908 49.0 25 | 6909 6909 51.0 26 | 6910 6910 49.0 27 | 6911 6911 46.0 28 | 6913 6913 99.0 29 | 6914 6914 73.0 30 | 6915 6915 53.0 31 | 6916 6916 63.0 32 | 6917 6917 121.0 33 | 6918 6918 108.0 34 | 6919 6919 71.0 35 | 6920 6920 71.0 36 | 6921 6921 64.0 37 | 6922 6922 48.0 38 | 6923 6923 44.0 39 | 6924 6924 49.0 40 | 6926 6926 49.0 41 | 6927 6927 51.0 42 | 6928 6928 55.0 43 | 6929 6929 71.0 44 | 6930 6930 49.0 45 | 6931 6931 46.0 46 | 6932 6932 51.0 47 | 6933 6933 56.0 48 | 6936 6936 67.0 49 | 6937 6937 65.0 50 | 6939 6939 49.0 51 | 6940 6940 49.0 52 | 6942 6942 46.0 53 | 6943 6943 49.0 54 | 6944 6944 49.0 55 | 6945 6945 55.0 56 | 6946 6946 62.0 57 | 6951 6951 68.0 58 | 6956 6956 69.0 59 | 6957 6957 84.0 60 | 6958 6958 49.0 61 | 6959 6959 51.0 62 | 6960 6960 47.0 63 | 6961 6961 46.0 64 | 6962 6962 52.0 65 | 6963 6963 60.0 66 | 6964 6964 93.0 67 | 6965 6965 102.0 68 | 6966 6966 53.0 69 | 6967 6967 44.0 70 | 6968 6968 71.0 71 | 6969 6969 70.0 72 | 6970 6970 48.0 73 | 6971 6971 51.0 74 | 6972 6972 63.0 75 | 6973 6973 53.0 76 | 6974 6974 103.0 77 | 6975 6975 51.0 78 | 6976 6976 56.0 79 | 6977 6977 49.0 80 | 6978 6978 49.0 81 | 6979 6979 
44.0 82 | 6980 6980 51.0 83 | 6981 6981 44.0 84 | 6982 6982 49.0 85 | 6983 6983 71.0 86 | 6984 6984 53.0 87 | 6985 6985 56.0 88 | 6988 6988 48.0 89 | 7000 7000 65.0 90 | 7014 7014 92.0 91 | 7033 7033 76.0 92 | 7062 7062 46.0 93 | 7064 7064 79.0 94 | 7081 7081 46.0 95 | 7094 7094 58.5 96 | 7123 7123 59.5 97 | 7147 7147 71.0 98 | 7163 7163 57.0 99 | 7231 7231 46.0 100 | 7255 7255 46.0 101 | 7275 7275 46.0 102 | 7282 7282 51.0 103 | 7296 7296 70.0 104 | 7306 7306 60.0 105 | 7323 7323 56.0 106 | 7346 7346 64.0 107 | 7418 7418 63.0 108 | 7424 7424 43.0 109 | 7438 7438 75.0 110 | 7460 7460 49.0 111 | 7461 7461 61.0 112 | 7477 7477 59.0 113 | 7514 7514 58.0 114 | 7515 7515 49.0 115 | 7516 7516 100.0 116 | 7517 7517 107.0 117 | 7518 7518 103.0 118 | 7519 7519 76.0 119 | 7520 7520 60.0 120 | 7521 7521 60.0 121 | 7522 7522 83.0 122 | 7523 7523 57.0 123 | 7524 7524 51.0 124 | 7525 7525 46.0 125 | 7526 7526 53.0 126 | 8213 8213 44.0 127 | 8214 8214 51.0 128 | 8215 8215 51.0 129 | 8222 8222 90.0 130 | 8230 8230 97.0 131 | 8231 8231 91.0 132 | 8233 8233 59.0 133 | 8235 8235 60.0 134 | 8236 8236 91.0 135 | 8237 8237 97.0 136 | 8239 8239 52.0 137 | 8240 8240 93.0 138 | 8241 8241 73.0 139 | 8242 8242 120.0 140 | 8243 8243 66.0 141 | 8245 8245 46.0 142 | 8247 8247 87.0 143 | 8249 8249 81.0 144 | 8254 8254 52.0 145 | 8256 8256 61.0 146 | 8258 8258 73.0 147 | 8259 8259 73.0 148 | 8264 8264 46.0 149 | 8265 8265 44.0 150 | 8266 8266 99.0 151 | 8270 8270 49.0 152 | 8271 8271 49.0 153 | 8274 8274 74.0 154 | 8275 8275 68.0 155 | 8283 8283 71.0 156 | 8284 8284 61.0 157 | 8285 8285 70.0 158 | 8290 8290 50.0 159 | 8296 8296 45.0 160 | 8297 8297 73.0 161 | 8300 8300 61.0 162 | 8306 8306 96.0 163 | 8310 8310 49.0 164 | 8311 8311 49.0 165 | 8312 8312 66.0 166 | 8313 8313 49.0 167 | 8314 8314 64.0 168 | 8323 8323 51.0 169 | 8325 8325 49.0 170 | 8326 8326 67.0 171 | 8329 8329 46.0 172 | 8334 8334 64.0 173 | 8335 8335 104.0 174 | 8337 8337 70.0 175 | 8343 8343 62.0 176 | 8351 8351 78.0 177 | 8353 
8353 41.0 178 | 8354 8354 70.0 179 | 8365 8365 51.0 180 | 8369 8369 76.0 181 | 8374 8374 59.0 182 | 8376 8376 84.0 183 | 8378 8378 56.0 184 | 8387 8387 52.0 185 | 8388 8388 60.0 186 | 8389 8389 63.0 187 | 8395 8395 69.0 188 | 8420 8420 56.0 189 | 8422 8422 106.0 190 | 8423 8423 70.0 191 | 8424 8424 46.0 192 | 8426 8426 49.0 193 | 9057 9057 76.0 194 | 9058 9058 101.0 195 | 100000 100000 58.0 196 | -------------------------------------------------------------------------------- /docs/DATAGUIDE.md: -------------------------------------------------------------------------------- 1 | # Data Guide 2 | 3 | The minimal requirement is to provide a genotype and a phenotype file. We provide test data in the folder `data`. 4 | permGWAS2 is designed to work with several genotype file formats: 5 | 6 | ## Genotype file 7 | permGWAS needs **fully imputed** genotypes. We support our custom HDF5/H5/H5PY file, CSV PLINK and binary PLINK files. 8 | We recommend to use permGWAS2 with HDF5/H5/H5PY files. For this we provide a function to create an H5 file which satisfies 9 | our requirements and takes CSV, PLINK and binary PLINK genotype files as an input. For more info on how to use this function, 10 | see the section **Create H5 file** below. 11 | 12 | ### HDF5/H5/H5PY 13 | The file has to contain the following keys: 14 | 15 | - snps: genotype matrix, additively encoded (012) 16 | - sample_ids: vector containing corresponding sample ids 17 | - position_index: vector containing the positions of all SNPs 18 | - chr_index: vector containing the corresponding chromosome number 19 | 20 | ```shell 21 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv 22 | ``` 23 | 24 | ### CSV 25 | The **first column** should be the **sample ids**. The **column names** should be the **SNP identifiers** in the form 26 | "CHR_POSITION" (e.g. Chr1_657). The values should be the genotype matrix in **additive encoding**. 
27 | 28 | ```shell 29 | python3 permGWAS.py -x ./data/x_matrix.csv -y ./data/y_matrix.csv 30 | ``` 31 | 32 | ### PLINK 33 | To use PLINK data, a .map and .ped file with the same prefix need to be in the same folder. 34 | To run permGWAS2 with PLINK files, you can use PREFIX.map or PREFIX.ped as option for the genotype file. 35 | 36 | ```shell 37 | python3 permGWAS.py -x ./data/x_matrix.map -y ./data/y_matrix.pheno 38 | ``` 39 | 40 | ### binary PLINK 41 | To use binary PLINK data, a .bed, .bim and .fam file with the same prefix need to be in the same folder. 42 | To run permGWAS2 with binary PLINK files, you can use PREFIX.bed, PREFIX.bim or PREFIX.fam as option for the genotype file. 43 | 44 | 45 | ## Phenotype file 46 | permGWAS2 currently only accepts CSV, PHENO and TXT files for the phenotype. Here the **first column** should contain 47 | the **sample ids**. The remaining columns should contain the phenotype values with the phenotype name as column name. 48 | For TXT and PHENO files it is assumed that the values are separated by a **single space**. The samples need not be in 49 | the same order as in the genotype file. permGWAS2 automatically matched genotype and phenotype and discards all samples 50 | where only one of both is available. 51 | It is possible to run permGWAS with several traits one after another as long as they are stored in the same 52 | phenotype file. 53 | 54 | ```shell 55 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -trait phenotype_value phenotype_2 56 | ``` 57 | You can also run permGWAS2 for all available phenotypes in your phenotype file: 58 | 59 | ```shell 60 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -trait all 61 | ``` 62 | 63 | ## Kinship file 64 | Per default permGWAS2 computes the realized relationship kernel as kinship matrix. 65 | It is also possible to provide a kinship matrix. Currently, permGWAS only accepts CSV, H5, HDF5, H5PY files as 66 | kinship file. 
For CSV files the first column should contain the sample ids. For H5, HDF5, H5PY files the kinship 67 | matrix should have the key 'kinship' and the corresponding sample ids the key 'sample_ids' 68 | The sample ids need to match those of the genotype matrix. 69 | 70 | ```shell 71 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -k ./data/k_matrix.csv 72 | ``` 73 | 74 | ## Covariates file 75 | It is possible to run permGWAS2 with covariates. If no covariates file is provided, only the intercept will be used as 76 | fixed effect. Currently, permGWAS2 only accepts CSV files for covariates. Here the first column should contain the 77 | sample ids. The sample ids must match those of the phenotype file. 78 | 79 | ```shell 80 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -cov ./data/cov_matrix.csv 81 | ``` 82 | 83 | ## create H5 file 84 | We provide a function to create an H5 file which satisfies our requirements. It is possible to create the H5 based on a 85 | CSV, PLINK or binary PLINK files which have to fulfil the same requirements as above. The function takes the genotype 86 | file path via the option `-x` and additionally one can specify a new directory to save the H5 file via `-sd` if the save 87 | directory is not specified, the new file will be stored in the same directory as the input file. 88 | 89 | ```shell 90 | python3 create_h5_file.py -x ./data/x_matrix.map -sd ./data/test 91 | ``` -------------------------------------------------------------------------------- /docs/INSTALLATION.md: -------------------------------------------------------------------------------- 1 | # Requirements 2 | 3 | To ensure a stable working environment, we recommend using [Docker](https://www.docker.com). To follow this recommendation, 4 | docker needs to be installed and running on your machine. We provide a Dockerfile based on CUDA 11.5 and Ubuntu 20.4. 
5 | 6 | If you want to use permGWAS2 without Docker, you need to install all packages mentioned in the 7 | [requirements file](../Docker/requirements.txt). 8 | 9 | # Installation Guide 10 | 11 | 1. Clone the repository into the directory where you want to set up the project 12 | 13 | ```shell 14 | git clone https://github.com/grimmlab/permGWAS.git 15 | ``` 16 | 17 | 2. To use permGWAS2 within a Docker environment, navigate to `Docker` and build a Docker image using the provided Dockerfile. 18 | 19 | ```shell 20 | cd permGWAS/Docker 21 | docker build -t IMAGENAME . 22 | ``` 23 | 24 | 3. Run an interactive Docker container based on the created image.\ 25 | You have to mount the directory where the repository is located on your machine in the Docker container. 26 | If you want to work on GPU, specify the GPUs to mount. 27 | 28 | ```shell 29 | docker run -it -v PATH_TO_REPO_FOLDER:/NAME_OF_DIRECTORY_IN_CONTAINER --gpus device=DEVICE_NUMBER --name CONTAINERNAME IMAGENAME 30 | ``` 31 | 32 | ### Example 33 | 34 | 1. Assume our repository is located in a folder called `/myhome` and we want to name our image `permGWAS_image` 35 | 36 | ```shell 37 | cd /myhome/permGWAS/Docker 38 | docker build -t permGWAS_image . 39 | ``` 40 | 41 | 2. Further, assume that we want to call our container `permGWAS_container`, our data is located in (subfolders of) 42 | `/myhome` (i.e. we only need to mount one directory) and we want to use GPU 1. Then we have to run the following command: 43 | 44 | ```shell 45 | docker run -it -v /myhome/:/myhome_in_container/ --gpus device=1 --name permGWAS_container permGWAS_image 46 | ``` 47 | 48 | 3. If we need to mount a second directory (e.g. we want to save our results in a different folder called `/results`), 49 | we can run the following: 50 | 51 | ```shell 52 | docker run -it -v /myhome/:/myhome_in_container/ -v /results/:/results/ --gpus device=1 --name permGWAS_container permGWAS_image 53 | ``` 54 | 55 | With this the setup is finished. 
For details on how to run permGWAS, see our [Quickstart Guide](./QUICKSTART.md). -------------------------------------------------------------------------------- /docs/OPTIONS.md: -------------------------------------------------------------------------------- 1 | # Optional settings 2 | ## Minor allele frequency (MAF) 3 | It is possible to filter the markers for minor allele frequency. For this use the flag `-maf` and specify an integer 4 | value between 0 and 30. For example to remove all SNPs with MAF<10%: 5 | ```shell 6 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -maf 10 7 | ``` 8 | Per default permGWAS2 does not filter for MAF. 9 | 10 | ## GPU usage 11 | For faster computations, permGWAS2 supports GPU usage. If one or several GPUs are available permGWAS2 will per default use 12 | the GPU device 0 for its computations. If no GPUs are available, permGWAS will perform all computations on CPUs only. 13 | To change the GPU you can use the flag `-device` and specify the number of the GPU to use. If you do NOT want to use 14 | GPUs, although they are available, you can use the flag `disable_gpu`: 15 | ```shell 16 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -device 1 17 | 18 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -disable_gpu 19 | ``` 20 | 21 | ## Batch size 22 | It is possible to adjust the batch size for the simultaneous computation of univariate tests via `-batch`. Here the 23 | default is set to 50000. If you run into memory errors while using permGWAS2 we suggest reducing the batch size. 24 | ```shell 25 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -batch 10000 26 | ``` 27 | When using permGWAS2 with permutations, several univariate tests will be computed for all permutations at once. 28 | To prevent running into memory errors, one can adjust the batch size for permutations separately via `-batch_perm`. 29 | Here the default value is set to 1000. 
We suggest adjusting this parameter depending on the number of samples and number 30 | of permutations. For more information about permutations see [permGWAS2 with permutations](./PERMUTATIONS.md) 31 | ```shell 32 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -perm 100 -batch_perm 500 33 | ``` 34 | 35 | ## Batch-wise loading of genotype 36 | As memory is a limiting factor, permGWAS2 is also capable of loading the genotype matrix batch-wise from file under certain 37 | conditions. For this you have to provide a precomputed kinship matrix (see [DataGuide](./DATAGUIDE.md)) and the genotype matrix 38 | must be provided via an HDF5 file (see [DataGuide](./DATAGUIDE.md) for a function to create an HDF5 file). 39 | 40 | However, if memory is not an issue, we recommend loading the genotype file completely to improve the speed of permGWAS2. 41 | When no precomputed kinship is provided, the genotype matrix will be loaded completely per default. It is also possible 42 | to force permGWAS2 to load the genotype matrix completely even if a kinship is provided via the flag `-load_genotype`. 43 | ```shell 44 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -load_genotype 45 | ``` 46 | 47 | ## Model (coming soon) 48 | permGWAS computes test statistics and p-values based on a Linear Mixed Model (LMM). In the future there will be other 49 | models available. The model can be chosen via `-model`. Currently, only `lmm` is available. 50 | 51 | ## Non-additive encoding 52 | permGWAS assumes that the genotypes are in additive encoding (i.e. number of minor alleles) and produces an error if the genotypes 53 | are encoded differently. If your data is **not additively encoded**, you can use the flag `-not_add`. For example if you 54 | are working with other data than SNP data. However, our framework was developed for SNP data, and we give no guarantee that it 55 | works for other purposes. 
56 | 57 | 58 | See [Quickstart](./QUICKSTART.md), [permGWAS2 with permutations](./PERMUTATIONS.md) and [Create plots](./PLOTS.md) for 59 | detailed explanations of other flags and options. 60 | 61 | ## Overview of all flags and options 62 | |**flag**|**description**| 63 | |---|---| 64 | |-x (--genotype_file) | absolute or relative path to genotype file | 65 | |-y (--phenotype_file) | absolute or relative path to phenotype file | 66 | |-trait (--y_name)| name of phenotype (column) to be used in phenotype file, optional, default is "phenotype_value"| 67 | |-k (-kinship_file) | absolute or relative path to kinship file, optional| 68 | |-cov (--covariate_file) | absolute or relative path to covariates file, optional| 69 | |-cov_list (--covariate_list) | names of covariates to use from covariate_file, optional | 70 | |-maf (--maf_threshold) | minor allele frequency threshold as percentage value, optional, default is 0| 71 | |-load_genotype | choose whether to load full genotype from file or batch-wise during computations, optional, default is False| 72 | |-config (--config_file) | full path to yaml config file| 73 | |-model | specify model name, only relevant if you define your own models, currently only lmm is available| 74 | |-out_dir | name of the directory result-files should be stored in, optional, if not provided, files will be stored in folder "results" in current directory| 75 | |-out_file | NAME of result files, will be stored as NAME_p_values and NAME_min_p_values, optional, if not provided name of phenotype will be used| 76 | |-disable_gpu | use if you want to perform computations on CPU only though GPU would be available| 77 | |-device | GPU device to be used, optional, default is 0| 78 | |-perm | number of permutations to be performed, optional, default is 0| 79 | |-perm_method | method to use for permutations: y - permute only y, x - permute y and kinship matrix, default is x| 80 | |-adj_p_value | additionally compute permutation-based adjusted p-values 
and store them in the p-value file, optional default is False| 81 | |-batch (--batch_size) | number of SNPs to work on simultaneously, optional, default is 50000| 82 | |-batch_perm (--perm_batch_size) | number of SNPs to work on simultaneously while using permutations, optional, default is 1000| 83 | |-mplot (--plot, --manhattan)| creates Manhattan plot, optional| 84 | |-qqplot | creates QQ-plot, optional| 85 | |-not_add | use when genotype is not in additive encoding| 86 | -------------------------------------------------------------------------------- /docs/PERMUTATIONS.md: -------------------------------------------------------------------------------- 1 | # permGWAS2 with permutations 2 | 3 | The main purpose of permGWAS2 is to perform GWAS with permutation-based thresholds. To use permGWAS2 with permutations, 4 | you have to specify the number of permutations *q* via the flag `-perm`: 5 | ```shell 6 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -perm 100 7 | ``` 8 | This creates an additional result file `min_p_values_NAME.csv` containing for each permutation the seed and the minimal 9 | p-value. Additionally, the `summary_statistics_NAME.txt` output file now contains permutation-based significance 10 | thresholds for common significance levels $\alpha$. 11 | 12 | ### General workflow of permGWAS2 with permutations 13 | 1. Compute p-values for all available SNPs during normal GWAS run 14 | 2. Create *q* permutations 15 | 3. Compute the test statistic for each permutation and SNP in batches 16 | 4. For each permutation find the maximal test statistic over all SNPs and compute the corresponding minimal p-value 17 | 5. The permutation-based threshold is given as the ($1-\alpha$)th percentile for a significance level $\alpha$ 18 | (*maxT/minP method*) 19 | 20 | ### Additional settings 21 | - permGWAS2 supports two different permutation strategies which can be selected via the flag `-perm_method`: 22 | 1. 
`x`(default): permutes the fixed effects matrix including SNP of interest and covariates (equivalent to permuting 23 | the phenotype and covariance matrix). This method considers the population structure while permuting. 24 | 2. `y`: only permute the phenotype vector. This method is faster but breaks the population structure between the 25 | samples 26 | ```shell 27 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -perm 100 -perm_method x 28 | 29 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -perm 100 -perm_method y 30 | ``` 31 | - permGWAS2 supports computations on GPUs. If GPUs are available, it will automatically use the 0th GPU. If no GPUs are 32 | available, permGWAS will perform all computations on CPUs only. To change the GPU you can use the flag `-device` and 33 | specify the number of the GPU to use. If you do NOT want to use GPUs, although they are available, you can use the flag 34 | `-disable_gpu`: 35 | ```shell 36 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -perm 100 -device 1 37 | 38 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -perm 100 -disable_gpu 39 | ``` 40 | - Since permGWAS2 computes the test statistics for different SNPs and permutations simultaneously in batches, the 41 | available VRAM poses a limitation. To avoid running into memory errors (when using GPUs), you can manually adjust the 42 | batch-size, i.e. the number of SNPs to be processed simultaneously for all permutations, via the flag `-batch_perm` 43 | (The default is 1000 SNPs): 44 | ```shell 45 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -perm 100 -batch_perm 500 46 | ``` 47 | - permGWAS is also able to compute permutation-based adjusted p-values and save them in the p_value output file via the 48 | flag `-adj_p_value`. However, it should be noted that in order to get meaningful adjusted p-values, millions of 49 | permutations are needed. 
50 | ```shell 51 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -perm 100 -adj_p_value 52 | ``` 53 | -------------------------------------------------------------------------------- /docs/PLOTS.md: -------------------------------------------------------------------------------- 1 | # Create plots 2 | 3 | permGWAS is able to create Manhattan and QQ-plots during a GWAS run and from existing p-value files via the 4 | `create_plot.py` script. 5 | 6 | ## Manhattan plot 7 | 8 | 9 | - While running permGWAS, you can use the flag `-mplot` to generate and save a Manhattan plot with Bonferroni 10 | significance threshold for significance level $\alpha=0.05$. If you use permGWAS2 with permutations, additionally the 11 | permutation-based threshold will be plotted. 12 | - If you already have result files generated by permGWAS, you can also create a Manhattan plot afterward. You only need 13 | to specify the p-value file (relative or absolute path) and use the flag `-mplot`: 14 | ```shell 15 | python3 create_plot.py -p_val PATH_TO_RESULT_P_VALUE_FILE -mplot 16 | ``` 17 | - By default, it uses a significance level of 5%. You can change it via the flag `-sig_level`, which expects an integer 18 | value, e.g. 19 | ```shell 20 | python3 create_plot.py -p_val PATH_TO_RESULT_P_VALUE_FILE -mplot -sig_level 1 21 | ``` 22 | - If you have a corresponding minimal p-value file available, you can additionally plot the permutation-based significance 23 | threshold by giving the path to the file via the flag `-min_p_val`: 24 | ```shell 25 | python3 create_plot.py -p_val PATH_TO_RESULT_P_VALUE_FILE -min_p_val PATH_TO_MIN_P_VALUE_FILE -mplot 26 | ``` 27 | - The resulting Manhattan plot will be saved in the same folder where the p-value file is stored, unless you specify a 28 | different directory via `-out_dir`. If no other name is specified via `-out_file`, the plot will be named 29 | `manhattan_PHENOTYPE_NAME.png`. 
30 | 31 | 32 | ### QQ-plot 33 | 34 | 35 | - While running permGWAS, you can use the flag `-qqplot` to generate and save a simple QQ-plot including the inflation 36 | factor lambda. 37 | - To generate a QQ-plot afterward based on existing p-value result files, you only need to specify the p-value file 38 | (relative or absolute path) and use the flag `-qqplot`: 39 | ```shell 40 | python3 create_plot.py -p_val PATH_TO_RESULT_P_VALUE_FILE -qqplot 41 | ``` 42 | - The resulting QQ-plot will be saved in the same folder where the p-value file is stored, unless you specify a 43 | different directory via `-out_dir`. If no other name is specified via `-out_file`, the plot will be named 44 | `qq_plot_PHENOTYPE_NAME.png`. -------------------------------------------------------------------------------- /docs/QUICKSTART.md: -------------------------------------------------------------------------------- 1 | # Quickstart Guide 2 | 3 | ## Simple workflow using Docker 4 | 5 | 1. Create a new Docker container using our [Installation Guide](./INSTALLATION.md) or start an existing container with: 6 | 7 | ```shell 8 | docker start -i CONTAINERNAME 9 | ``` 10 | 11 | 2. Navigate to the directory where the permGWAS2 repository is located: 12 | 13 | ```shell 14 | cd /REPO_DIRECTORY/permGWAS 15 | ``` 16 | 17 | 3. Run the script with the test data provided in the `./data` folder: 18 | 19 | ```shell 20 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv 21 | ``` 22 | 23 | To use permGWAS2 without Docker, simply omit the first step. 24 | 25 | 26 | ## Basic settings 27 | ### 1. Input Data 28 | Details on the supported data types can be found in the [Data Guide](./DATAGUIDE.md). 29 | ###### Genotype & Phenotype 30 | - The minimal requirement is to provide a genotype and a phenotype file (as relative or absolute paths) via the 31 | flags `-x` and `-y`, respectively. 32 | - By default, permGWAS assumes that the phenotype in the phenotype file is called `phenotype_value`. 
You can specify a 33 | different name via the flag `-trait`: 34 | ```shell 35 | python3 permGWAS.py -x PATH_TO_GENOTYPE -y PATH_TO_PHENOTYPE -trait PHENO_NAME 36 | ``` 37 | - It is possible to run permGWAS2 for several phenotypes located in the same phenotype file one after another. You can 38 | either specify a list of phenotypes or run permGWAS2 for all available phenotypes in the file by using the key word `all`: 39 | ```shell 40 | python3 permGWAS.py -x PATH_TO_GENOTYPE -y PATH_TO_PHENOTYPE -trait PHENO_1 PHENO_2 PHENO_3 41 | 42 | python3 permGWAS.py -x PATH_TO_GENOTYPE -y PATH_TO_PHENOTYPE -trait all 43 | ``` 44 | 45 | ###### Kinship 46 | By default, permGWAS2 computes the realized relationship kernel as kinship matrix. You can use a pre-computed genomic 47 | relationship matrix via the flag `-k`: 48 | ```shell 49 | python3 permGWAS.py -x PATH_TO_GENOTYPE -y PATH_TO_PHENOTYPE -k PATH_TO_KINSHIP 50 | ``` 51 | 52 | ###### Covariates 53 | It is possible to run permGWAS2 with additional covariates. To specify the covariate file, use the flag `cov`. 54 | By default, this uses all available covariates in the file. If you only want to use certain columns/covariates, you 55 | have to use the flag `-cov_list` and specify the covariate names as a list: 56 | ```shell 57 | python3 permGWAS.py -x PATH_TO_GENOTYPE -y PATH_TO_PHENOTYPE -cov PATH_TO_COVARIATE_FILE 58 | 59 | python3 permGWAS.py -x PATH_TO_GENOTYPE -y PATH_TO_PHENOTYPE -cov PATH_TO_COVARIATE_FILE -cov_list COV_1 COV_2 COV_3 60 | ``` 61 | 62 | ### 2. 
Config file 63 | permGWAS2 accepts yaml config files where you can specify all flags and options instead of passing them all separately: 64 | ```shell 65 | python3 permGWAS.py -config ./data/config.yaml 66 | ``` 67 | The config file should have the following structure: 68 | ```YAML 69 | --- 70 | genotype_file: "PATH_TO_GENOTYPE" 71 | phenotype_file: "PATH_TO_PHENOTYPE" 72 | trait: "PHENO_NAME" 73 | kinship_file: "PATH_TO_KINSHIP" 74 | covariate_file: "PATH_TO_COVARIATE_FILE" 75 | covariate_list: 76 | - "COV_1" 77 | - "COV_2" 78 | - "COV_3" 79 | ``` 80 | 81 | ### 3. Output files 82 | Per default permGWAS2 creates a CSV output file and saves it in a directory called `results`. You can also specify a 83 | different directory for the output files via the flag `-out_dir`. The output file will be saved under the name 84 | `p_values_NAME.csv`, where NAME will be the phenotype name by default, but can also be changed via `-out_file`. 85 | ```shell 86 | python3 permGWAS.py -x PATH_TO_GENOTYPE -y PATH_TO_PHENOTYPE -out_dir RESULT_FILE_DIR -out_file RESULT_FILE_NAME 87 | ``` 88 | The result file contains for each analyzed SNP: 89 | - CHR: chromosome number 90 | - POS: position within chromosome 91 | - p_value: computed p-value 92 | - test_stat: computed test statistic 93 | - maf: minor allele frequency of SNP 94 | - SE: standard error 95 | - effect_size: coefficient beta 96 | 97 | Additionally, a TXT file with summary statistics will be saved. 98 | This file contains the estimates of the variance components of the null model, 99 | the narrow-sense heritability, the Bonferroni threshold and, 100 | if activated, the permutation-based threshold. 101 | 102 | 103 | ## Further options 104 | The table below shows all available flags. For detailed explanations of further flags and options go to 105 | [permGWAS2 with permutations](./PERMUTATIONS.md), [Create plots](./PLOTS.md) and [Optional settings](./OPTIONS.md). 
106 | 107 | |**flag**|**description**| 108 | |---|---| 109 | |-x (--genotype_file) | absolute or relative path to genotype file | 110 | |-y (--phenotype_file) | absolute or relative path to phenotype file | 111 | |-trait (--y_name)| name of phenotype (column) to be used in phenotype file, optional, default is "phenotype_value"| 112 | |-k (-kinship_file) | absolute or relative path to kinship file, optional| 113 | |-cov (--covariate_file) | absolute or relative path to covariates file, optional| 114 | |-cov_list (--covariate_list) | names of covariates to use from covariate_file, optional | 115 | |-maf (--maf_threshold) | minor allele frequency threshold as percentage value, optional, default is 0| 116 | |-load_genotype | choose whether to load full genotype from file or batch-wise during computations, optional, default is False| 117 | |-config (--config_file) | full path to yaml config file| 118 | |-model | specify model name, only relevant if you define your own models, currently only lmm is available| 119 | |-out_dir | name of the directory result-files should be stored in, optional, if not provided, files will be stored in folder "results" in current directory| 120 | |-out_file | NAME of result files, will be stored as NAME_p_values and NAME_min_p_values, optional, if not provided name of phenotype will be used| 121 | |-disable_gpu | use if you want to perform computations on CPU only though GPU would be available| 122 | |-device | GPU device to be used, optional, default is 0| 123 | |-perm | number of permutations to be performed, optional, default is 0| 124 | |-perm_method | method to use for permutations: y - permute only y, x - permute y and kinship matrix, default is x| 125 | |-adj_p_value | additionally compute permutation-based adjusted p-values and store them in the p-value file, optional default is False| 126 | |-batch (--batch_size) | number of SNPs to work on simultaneously, optional, default is 50000| 127 | |-batch_perm (--perm_batch_size) | number 
of SNPs to work on simultaneously while using permutations, optional, default is 1000| 128 | |-mplot (--plot, --manhattan)| creates Manhattan plot, optional| 129 | |-qqplot | creates QQ-plot, optional| 130 | |-not_add | use when genotype is not in additive encoding| -------------------------------------------------------------------------------- /docs/manhattan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grimmlab/permGWAS/3f7a1c2e3e4c63281f5719425ff9ac405f8d9cfc/docs/manhattan.png -------------------------------------------------------------------------------- /docs/qq_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grimmlab/permGWAS/3f7a1c2e3e4c63281f5719425ff9ac405f8d9cfc/docs/qq_plot.png -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["_base_model", "lmm"] -------------------------------------------------------------------------------- /models/_base_model.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import torch 3 | import pathlib 4 | import pandas as pd 5 | import numpy as np 6 | 7 | from preprocess import data_loader 8 | from postprocess import plot_functions 9 | 10 | 11 | class BaseModel(abc.ABC): 12 | 13 | def __init__(self, dataset: data_loader.Dataset, batch_size: int, device: torch.device, perm: int = None, 14 | perm_batch_size: int = None): 15 | self.dataset = dataset 16 | self.batch_size = batch_size 17 | self.device = device 18 | self.perm_batch_size = perm_batch_size 19 | self.perm = perm 20 | self.v_g = None # genetic variance component for LMM 21 | self.v_e = None # residual variance component for LMM 22 | self.delta = None # v_e/v_g 23 | self.effect_size = None # effect sizes for all SNPs 24 | 
self.SE = None  # standard errors for all SNPs 25 |         self.test_stat = None  # test statistics for all SNPs 26 |         self.p_value = None  # p_values for all SNPs 27 |         self.seeds = None  # seeds for permutation with numpy generator 28 |         self.perm_p_val = None  # permutation-based p-values 29 |         self.min_p_value = None  # minimal p-values for all permutations 30 | 31 |     @abc.abstractmethod 32 |     def gwas(self): 33 |         """ 34 |         Function to perform batch-wise computation of univariate test 35 | 36 |         """ 37 | 38 |     @abc.abstractmethod 39 |     def perm_gwas(self, **kwargs): 40 |         """ 41 |         Function to perform batch-wise computation of permutation-based test 42 | 43 |         """ 44 | 45 |     # general methods 46 |     def perm_seeds(self) -> np.array: 47 |         """ 48 |         get seeds for permutations (self.perm distinct seeds drawn without replacement) 49 | 50 |         :return: array with seeds 51 |         """ 52 |         rng = np.random.default_rng() 53 |         return rng.choice(1000000, self.perm, replace=False) 54 | 55 |     def permute(self, data: torch.tensor) -> torch.tensor: 56 |         """ 57 |         Create tensor with permutations of input data, one permutation per stored seed 58 | 59 |         :param data: input data to permute of shape (n,c) or (n) 60 | 61 |         :return: tensor with permuted data of shape (p,n,c) or (n,p) 62 |         """ 63 |         data = data.to(torch.device("cpu")) 64 |         x_perm = [] 65 |         for seed in self.seeds: 66 |             tmp = np.random.default_rng(seed=seed) 67 |             x_perm.append(tmp.permutation(data, axis=0)) 68 |         if data.ndim == 1: 69 |             return torch.t(torch.tensor(np.array(x_perm), dtype=torch.float64, device=self.device)) 70 |         else: 71 |             return torch.tensor(np.array(x_perm), dtype=torch.float64, device=self.device) 72 | 73 |     def save_results(self, data_dir: pathlib.Path, filename: str): 74 |         """ 75 |         Save p-values results to csv file as p_values_filename. If permutations were computed, also save 76 |         minimal p-values as min_p_values_filename. 
77 | 78 | :param data_dir: full path to results directory 79 | :param filename: name of results file 80 | """ 81 | df = pd.DataFrame({'CHR': self.dataset.chromosomes, 82 | 'POS': self.dataset.positions, 83 | 'p_value': self.p_value, 84 | 'test_stat': self.test_stat, 85 | 'maf': self.dataset.maf, 86 | 'SE': self.SE, 87 | 'effect_size': self.effect_size}) 88 | if self.perm_p_val is not None: 89 | df['adjusted_p_val'] = self.perm_p_val 90 | df.to_csv(data_dir.joinpath('p_values_' + filename), index=False) 91 | if self.min_p_value is not None: 92 | df_min = pd.DataFrame({'seed': self.seeds, 93 | 'min_p_val': self.min_p_value}) 94 | df_min.to_csv(data_dir.joinpath('min_p_values_' + filename), index=False) 95 | 96 | def manhattan_plot(self, data_dir: pathlib.Path, filename: str, sig_level: int = 5): 97 | """ 98 | Save Manhattan plot as manhattan_FILENAME.png to data_dir 99 | 100 | :param data_dir: full path to save directory 101 | :param filename: name of file 102 | :param sig_level: significance level for Bonferroni and perm thresholds, default is 5 103 | """ 104 | df = pd.DataFrame({'CHR': self.dataset.chromosomes, 105 | 'POS': self.dataset.positions, 106 | 'p_value': self.p_value}) 107 | 108 | plot_functions.manhattan_plot(df=df, data_dir=data_dir, filename=filename, min_p_values=self.min_p_value, 109 | sig_level=sig_level) 110 | 111 | def qq_plot(self, data_dir: pathlib.Path, filename: str): 112 | """ 113 | Save QQ-plot as qq_plot_FILENAME.png to data_dir 114 | 115 | :param data_dir: full path to save directory 116 | :param filename: name of file 117 | """ 118 | plot_functions.qq_plot(p_values=self.p_value, data_dir=data_dir, filename=filename) 119 | -------------------------------------------------------------------------------- /models/lmm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import time 4 | import scipy.stats as stats 5 | 6 | from . 
import _base_model 7 | from preprocess import data_loader 8 | from optimize import brent 9 | 10 | 11 | class LMM(_base_model.BaseModel): 12 | 13 | def __init__(self, dataset: data_loader.Dataset, batch_size: int, device: torch.device, perm: int = None, 14 | perm_batch_size: int = None): 15 | super().__init__(dataset=dataset, batch_size=batch_size, device=device, perm=perm, 16 | perm_batch_size=perm_batch_size) 17 | self.D = None # eigenvalues of K 18 | self.U = None # unitary matrix of eigenvectors of K 19 | self.freedom_deg = None # adjusted degrees of freedom = n_samples - degrees of freedom = int 20 | self.Uy = None # y after linear transformation with eigenvectors 21 | self.UZ = None # fixed effects after linear transformation with eigenvectors 22 | 23 | def gwas(self): 24 | """ 25 | Perform batch-wise computation of univariate test with linear mixed model (EMMAX): 26 | (1) compute spectral decomposition of K=UDU' 27 | (2) transform data: U'y, U'Z 28 | (3) estimate delta and compute variance components 29 | (4) calculate residual sum of squares of null model 30 | (5) batch-wise: 31 | (a) linearly transform marker 32 | (b) calculate effect size, residual sum of squares and standard error 33 | (c) calculate test statistic 34 | (6) calculate p-values 35 | Dataset: 36 | X: genotype matrix of shape (n,m) or (n,b) if batch-wise 37 | y: phenotype vector of shape (n) 38 | K: kinship matrix of shape (n,n) 39 | fixed: vector/matrix of fixed effects of shape (n) or (n,c) 40 | """ 41 | start = time.time() 42 | self.freedom_deg = self.dataset.n_samples - self.dataset.fixed.shape[1] 43 | # get spectral decomposition 44 | self.D, self.U = torch.linalg.eigh(self.dataset.K) 45 | # linearly transform data, i.e. 
compute U'y and U'Z for fixed effects Z 46 | self.Uy = self.transform_input(X=self.dataset.y, U=self.U) # shape (n) 47 | self.UZ = self.transform_input(X=self.dataset.fixed, U=self.U) # shape (n) or (n,c) 48 | # estimate delta and compute variance components 49 | self.delta = self.estimate_delta(gridlen=100, logdelta_min=-10, logdelta_max=10, reml=True) 50 | D = self.D + self.delta 51 | ZD = self._zd(UZ=self.UZ, D=D) 52 | ZDZ = self._zdz(UZ=self.UZ, ZD=ZD) 53 | self.v_g, self.v_e = self.compute_var_components(D=D, UZ=self.UZ, ZD=ZD, ZDZ=ZDZ, reml=True) 54 | # calculate residual sum of squares of null model 55 | RSS_0 = self.get_rss_h0() # shape: (1) 56 | self.freedom_deg -= 1 57 | # in batches: 58 | SE = [] 59 | effect_size = [] 60 | test_stat = [] 61 | for batch in range(int(np.ceil(self.dataset.n_snps / self.batch_size))): 62 | # set bounds for SNP batch 63 | lower_bound, upper_bound = self._bounds(batch_size=self.batch_size, batch=batch) 64 | # load and transform batch of SNPs 65 | US = self._s_matrix(lower_bound=lower_bound, upper_bound=upper_bound) # shape: (n,b) 66 | # transform data 67 | US = self.transform_input(X=US, U=self.U) 68 | # calculate effect size, residual sum of squares and standard error 69 | RSS_1, stds, betas = self.get_rss_and_se(D=D, S=US, ZD=ZD, ZDZ=ZDZ) 70 | SE.append(stds.to(torch.device("cpu"))) 71 | effect_size.append(betas.to(torch.device("cpu"))) 72 | # calculate test statistic 73 | test_stat.append(self.get_f_score(rss0=RSS_0, rss1=RSS_1).to(torch.device("cpu"))) 74 | # free GPU space 75 | if self.device.type != "cpu": 76 | with torch.cuda.device(self.device): 77 | del RSS_1 78 | del US 79 | del stds 80 | del betas 81 | torch.cuda.empty_cache() 82 | self.SE = torch.cat(SE, dim=0) # shape: (m) 83 | self.effect_size = torch.cat(effect_size, dim=0) # shape: (m) 84 | self.test_stat = torch.cat(test_stat, dim=0) # shape: (m) 85 | time_test_stats = time.time() 86 | print("Have test statistics of %d SNPs. 
Elapsed time: %f" % (self.test_stat.shape[0], time_test_stats - start)) 87 | print("Calculate P-values now") 88 | # compute p-values 89 | self.p_value = torch.tensor(list(map(self.get_p_value, self.test_stat))) 90 | print("Have P-values. Elapsed time: ", time.time() - time_test_stats) 91 | if self.device.type != "cpu": 92 | with torch.cuda.device(self.device): 93 | del D 94 | del ZD 95 | del ZDZ 96 | del self.dataset.K 97 | torch.cuda.empty_cache() 98 | 99 | def perm_gwas(self, perm_method: str = 'x', adj_p_value: bool = False): 100 | """ 101 | Perform batch-wise computation of permutation-based test with linear mixed model (EMMAX): 102 | reuse spectral decomposition of K=UDU' 103 | perm method y: 104 | (1) permute phenotype p times 105 | (2) transform data: U'y 106 | (3) estimate delta and compute variance components for each permutation 107 | (4) calculate residual sum of squares of null model 108 | (5) batch-wise: 109 | (a) linearly transform marker 110 | (b) calculate residual sum of squares 111 | (c) calculate test statistic 112 | perm method x: 113 | (1) permute fixed effects p times 114 | (2) transform data: U'Z 115 | (3) estimate delta and compute variance components for each permutation 116 | (4) calculate residual sum of squares of null model 117 | (5) batch-wise: 118 | (a) permute marker p times 119 | (b) linearly transform marker 120 | (c) calculate residual sum of squares 121 | (d) calculate test statistic 122 | (6) calculate minimal p-values for Westfall-Young permutation-based threshold 123 | optional: (7) calculate permutation-based p-values 124 | Dataset: 125 | X: genotype matrix of shape (n,m) or (n,b) if batch-wise 126 | y: phenotype vector of shape (n) 127 | K: kinship matrix of shape (n,n) 128 | fixed: vector/matrix of fixed effects of shape (n) or (n,c) 129 | 130 | :param perm_method: y to permute phenotype or x to permute fixed effects + marker 131 | :param adj_p_value: if True compute adjusted p-values, default is false 132 | """ 133 | start 
= time.time() 134 | if self.test_stat is None: 135 | raise Exception('Need to first calculate true test statistics using LMM.gwas().') 136 | self.freedom_deg = self.dataset.n_samples - self.dataset.fixed.shape[1] 137 | self.seeds = self.perm_seeds() 138 | if perm_method == 'y': 139 | # compute permutations of y 140 | self.Uy = self.permute(data=self.dataset.y) # shape: (n,p) 141 | self.Uy = torch.unsqueeze(torch.t(self.transform_input(X=self.Uy, U=self.U)), 2) # shape: (p,n,1) 142 | # estimate variance components for each permutation 143 | self.delta = self.estimate_delta_perm(gridlen=100, logdelta_min=-10, logdelta_max=10, reml=True) 144 | self.D = self._d_delta(delta=self.delta, batch_size=self.perm) # shape: (p,1,n) 145 | self.UZ = self.get_3d_copy(v=self.UZ, batch_size=self.perm) # shape: (p,n,c) 146 | ZD = self._zd(UZ=self.UZ, D=self.D) # shape: (p,c,n) 147 | ZDZ = self._zdz(UZ=self.UZ, ZD=ZD) # shape: (p,c,c) 148 | v_g, _ = self.compute_var_components(D=self.D, UZ=self.UZ, ZD=ZD, ZDZ=ZDZ, reml=True) # shape: (p) 149 | elif perm_method == 'x': 150 | self.Uy = self.get_3d_copy(v=self.Uy, batch_size=self.perm) # shape: (p,n,1) 151 | if self.dataset.fixed.shape[1] > 1: 152 | # permute and transform fixed effects 153 | self.UZ = self.permute(data=self.dataset.fixed) # shape: (p,n,c) 154 | self.UZ = self.transform_input(X=self.UZ, U=self.U) # shape: (p,n,c) 155 | # estimate variance components 156 | self.delta = self.estimate_delta_perm(gridlen=100, logdelta_min=-10, logdelta_max=10, reml=True) 157 | self.D = self._d_delta(delta=self.delta, batch_size=self.perm) # shape: (p,1,n) 158 | ZD = self._zd(UZ=self.UZ, D=self.D) # shape: (p,c,n) 159 | ZDZ = self._zdz(UZ=self.UZ, ZD=ZD) # shape: (p,c,c) 160 | v_g, _ = self.compute_var_components(D=self.D, UZ=self.UZ, ZD=ZD, ZDZ=ZDZ, reml=True) # shape: (p) 161 | else: 162 | # reuse UZ, delta, sigma and get 3D copies 163 | self.D = self._d_delta(delta=self.delta, batch_size=self.perm) # shape: (p,1,n) 164 | self.UZ = 
self.get_3d_copy(v=self.UZ, batch_size=self.perm) # shape: (p,n,c) 165 | ZD = self._zd(UZ=self.UZ, D=self.D) # shape: (p,c,n) 166 | ZDZ = self._zdz(UZ=self.UZ, ZD=ZD) # shape: (p,c,c) 167 | v_g = self.v_g.repeat(self.perm) # shape: (p) 168 | else: 169 | raise Exception('Choose either permutation method x or y.') 170 | # calculate rss for null model 171 | RSS_0 = self.get_rss_h0().repeat(self.perm) # shape: (p) 172 | self.freedom_deg -= 1 173 | if self.device.type != "cpu": 174 | with torch.cuda.device(self.device): 175 | del self.delta 176 | del self.dataset.y 177 | del self.dataset.fixed 178 | torch.cuda.empty_cache() 179 | var_comp_time = time.time() 180 | print("Have variance components. Elapsed time: ", var_comp_time - start) 181 | test_stat = [] 182 | for batch in range(int(np.ceil(self.dataset.n_snps / self.perm_batch_size))): 183 | # set bounds for SNP batch 184 | lower_bound, upper_bound = self._bounds(batch_size=self.perm_batch_size, batch=batch) 185 | # load and transform batch of SNPs 186 | print("\rCalculate perm test statistics for SNPs %d to %d" % (lower_bound, upper_bound), end='') 187 | if perm_method == 'y': 188 | US = self._s_matrix(lower_bound=lower_bound, upper_bound=upper_bound, save_meta=False) # shape: (n,b) 189 | # transform data 190 | US = self.transform_input(X=US, U=self.U) 191 | # get 3D copy of S for permutations 192 | US = self.get_3d_copy(v=US, batch_size=self.perm) # shape: (p,n,b) 193 | else: 194 | US = self._s_matrix(lower_bound=lower_bound, upper_bound=upper_bound, device=torch.device("cpu"), 195 | save_meta=False) # shape: (n,b) 196 | US = self.permute(data=US) # shape: (p,n,b) 197 | # transform data 198 | US = self.transform_input(X=US, U=self.U) # shape: (p,n,b) 199 | # calculate residual sum of squares 200 | RSS = self.get_rss_perm(S=US, ZD=ZD, ZDZ=ZDZ, v_g=v_g) # shape: (p,b) 201 | # calculate test statistics 202 | test_stat.append(self.get_f_score(rss0=torch.t(RSS_0.repeat(RSS.shape[1], 1)), 203 | 
def estimate_delta(self, gridlen: int = 100, logdelta_min: int = -10, logdelta_max: int = 10,
                   reml: bool = True) -> torch.tensor:
    """
    Estimate ratio of variance components delta of LMM.
    Get grid of evenly divided delta values on logarithmic scale, compute the negative loglikelihood for each
    grid point and refine the best candidate with Brent search.

    :param gridlen: length of grid, default=100
    :param logdelta_min: lower bound for delta (log value), default=-10
    :param logdelta_max: upper bound for delta (log value), default=10
    :param reml: if True use REML estimate, if False use ML, default=True

    :return: optimal delta
    """
    deltas = torch.exp(torch.linspace(start=logdelta_min, end=logdelta_max, steps=gridlen + 1, device=self.device))
    neglogs = self.negloglikelihood(delta=deltas, Uy=self.Uy, UZ=self.UZ, reml=reml)
    # bugfix: Tensor.to() is not in-place, its result must be re-assigned (the return value was discarded before)
    neglogs = neglogs.to(self.device)
    delta_opt = self._minimize(Uy=self.Uy, UZ=self.UZ, deltas=deltas, neglogs=neglogs, gridlen=gridlen, reml=reml)
    return delta_opt

def _minimize(self, Uy: torch.tensor, UZ: torch.tensor, deltas: torch.tensor, neglogs: torch.tensor,
              gridlen: int = 100, reml: bool = True) -> torch.tensor:
    """
    Minimize negative loglikelihood function with Brent search.

    :param Uy: transformed phenotype vector U'y
    :param UZ: transformed vector of fixed effects U'Z
    :param deltas: tensor with possible delta values in ascending order
    :param neglogs: tensor with negative loglikelihood value for each delta
    :param gridlen: length of delta grid, default=100
    :param reml: if True use REML estimate, if False use ML, default=True

    :return: optimal delta
    """
    tmp = torch.argmin(neglogs)
    delta_opt = deltas[tmp]
    neglog_opt = neglogs[tmp]
    # use brent search for each local minimum of the grid (triple whose middle value is smallest)
    for i in range(gridlen - 1):
        if (neglogs[i + 1] < neglogs[i]) and (neglogs[i + 1] < neglogs[i + 2]):
            delta_tmp, neglog_tmp, niters = brent.brent_search(f=self.negloglikelihood, a=deltas[i],
                                                               b=deltas[i + 2], x=deltas[i + 1], fx=neglogs[i + 1],
                                                               Uy=Uy, UZ=UZ, reml=reml)
            if neglog_tmp < neglog_opt:
                delta_opt = delta_tmp
                neglog_opt = neglog_tmp
    return delta_opt

def negloglikelihood(self, delta: torch.tensor, UZ: torch.tensor, Uy: torch.tensor, reml: bool = True) \
        -> torch.tensor:
    """
    Compute negative loglikelihood for one delta value or several values in parallel.

    :param delta: ratio of variance components
    :param UZ: transformed fixed effects U'Z
    :param Uy: transformed phenotype U'y
    :param reml: if True use REML estimate, if False use ML, default=True

    :return: negative loglikelihood
    """
    if delta.ndim == 0:
        # scalar delta: shift all eigenvalues directly
        D = self.D + delta
    else:
        D = self._d_delta(delta=delta, batch_size=len(delta))  # shape: (b,1,n)
    ZD = self._zd(UZ=UZ, D=D)
    ZDZ = self._zdz(UZ=UZ, ZD=ZD)
    beta = self._beta(ZDZ=ZDZ, ZDy=torch.matmul(ZD, Uy))
    sigma = self._sigma(D=D, Uy=Uy, UZ=UZ, beta=beta, reml=reml)
    if D.ndim == 1:
        logdetD = torch.sum(torch.log(D))
    else:
        # batched D: sum log-eigenvalues per batch entry
        logdetD = torch.sum(torch.squeeze(torch.log(D)), 1)
    if not reml:
        return (self.dataset.n_samples*torch.log(2*torch.pi*sigma) + logdetD + self.dataset.n_samples) / 2
    else:
        # REML adds the log-determinant correction terms for the fixed effects
        if UZ.ndim == 2:
            logdetZ = torch.logdet(torch.matmul(torch.t(UZ), UZ))
        elif UZ.ndim == 3:
            logdetZ = torch.logdet(torch.matmul(torch.transpose(UZ, dim0=1, dim1=2), UZ))
        else:
            logdetZ = torch.logdet(torch.matmul(torch.transpose(UZ, dim0=2, dim1=3), UZ))
        logdetZDZ = torch.logdet(ZDZ)
        return (self.freedom_deg*torch.log(2*torch.pi*sigma) + logdetD + self.freedom_deg - logdetZ + logdetZDZ) / 2

def compute_var_components(self, D: torch.tensor, UZ: torch.tensor, ZD: torch.tensor, ZDZ: torch.tensor,
                           reml: bool = True) -> tuple:
    """
    Compute variance components v_g^2 and v_e^2 with Var(y) = v_g^2*K + v_e^2*I.

    :param D: vector with eigenvalues of K
    :param UZ: transformed fixed effects U'Z
    :param ZD: precomputed matrix product of (U'Z)'D^-1
    :param ZDZ: precomputed matrix product of (U'Z)'D^-1(U'Z)
    :param reml: if True use REML estimate, if False use ML, default=True

    :return: v_g^2 and v_e^2
    """
    beta = self._beta(ZDZ=ZDZ, ZDy=torch.matmul(ZD, self.Uy))
    v_g = self._sigma(D=D, Uy=self.Uy, UZ=UZ, beta=beta, reml=reml)
    # v_e follows from the estimated ratio delta = v_e / v_g
    v_e = self.delta * v_g
    return v_g, v_e

def get_rss_h0(self, sigma_opt: bool = True, reml: bool = True) -> torch.tensor:
    """
    Compute residual sum of squares of H0 (marker has no effect on phenotype),
    i.e. for fixed effects Z, covariance matrix V and phenotype y compute:
        b = (Z'V^{-1}Z)^{-1}Z'V^{-1}y
        rss = (y-Zb)'V^{-1}(y-Zb)
    Note that for optimal sigma_g the rss collapses to n-c (REML) or n (ML),
    so no explicit computation is needed in that case.

    :param sigma_opt: if True return degrees of freedom, default is True
    :param reml: if True use REML estimate, if False use ML, default=True

    :return: residual sum of squares
    """
    if sigma_opt:
        if reml:
            return torch.tensor(self.dataset.n_samples - self.dataset.fixed.shape[1], device=self.device)
        else:
            return torch.tensor(self.dataset.n_samples, device=self.device)
    else:
        raise NotImplementedError
def get_rss_and_se(self, D: torch.tensor, S: torch.tensor, ZD: torch.tensor, ZDZ: torch.tensor) -> tuple:
    """
    Compute residual sum of squares of alternative hypothesis (marker has effect on phenotype),
    i.e. for a 3D tensor with batches of fixed effects X and 3D tensor with copies of phenotype y:
        beta = (X'D^{-1}X)^{-1}X'D^{-1}y
        rss = (y-Xbeta)'D^{-1}(y-Xbeta)
    Use block-wise computation for beta, i.e., for computation of beta use the fact that X=[Z,s] for fixed
    effects Z and SNP s.

    :param D: vector with eigenvalues of K + ratio of variance components delta; shape: (n)
    :param S: matrix containing several markers in batches; shape: (n,b)
    :param ZD: precomputed matrix product of (U'Z)'D^-1; shape: (c,n)
    :param ZDZ: precomputed matrix product of (U'Z)'D^-1(U'Z); shape: (c,c)

    :return: residual sum of squares, standard error and effect size in batches
    """
    batch_size = S.shape[1]
    # get (X'D^{-1}X)^{-1}
    SD, XDX = self._xdx(D=D, S=S, ZD=ZD, ZDZ=ZDZ)
    XDX = torch.linalg.pinv(XDX, hermitian=True)
    # compute Z'Dy
    ZDy = self.get_3d_copy(v=torch.matmul(ZD, self.Uy), batch_size=batch_size)  # shape: (b,c,1)
    # compute X'Dy
    SD = torch.matmul(SD, self.Uy).reshape(batch_size, 1, 1)  # shape: (b,1,1)
    # put together 3D tensor
    SD = torch.cat((ZDy, SD), dim=1)  # shape: (b,c+1,1)
    # compute beta
    beta = torch.matmul(XDX, SD)  # shape: (b,c+1,1)
    # compute rss (S is re-used as a buffer to keep peak memory low)
    S = self._x_batch(X=S, fixed=self.UZ)  # shape (b,n,c+1)
    S = torch.matmul(S, beta)  # shape (b,n,1)
    S = self.get_3d_copy(v=self.Uy, batch_size=batch_size) - S  # shape (b,n,1)
    resD = torch.div(S, torch.unsqueeze(D, 1))
    S = torch.squeeze(torch.matmul(torch.transpose(resD, dim0=1, dim1=2), S)) / self.v_g
    # get standard error from the last diagonal entry of (X'D^{-1}X)^{-1} (the SNP column)
    diag = torch.diagonal(XDX, dim1=1, dim2=2)[:, -1]
    se = torch.sqrt(self.v_g * diag)
    return S, se, torch.squeeze(beta[:, -1])

def get_f_score(self, rss0: torch.tensor, rss1: torch.tensor) -> torch.tensor:
    """
    Compute tensor of F-test statistics (1 numerator degree of freedom).

    :param rss0: residual sum of squares of H0: marker has no effect on phenotype
    :param rss1: residual sum of squares of H1: marker has effect on phenotype

    :return: F-test statistic
    """
    return self.freedom_deg * (rss0 - rss1) / rss1

def get_p_value(self, f_score: float) -> float:
    """
    Compute p-value using survival function of the F distribution.

    :param f_score: F-test statistic

    :return: p-value
    """
    return stats.f.sf(f_score, 1, self.freedom_deg)

# functions for permutations
def estimate_delta_perm(self, gridlen: int = 100, logdelta_min: int = -10, logdelta_max: int = 10,
                        reml: bool = True) -> torch.tensor:
    """
    Estimate ratio of variance components delta of LMM for permutations.
    Get grid of evenly divided delta values on logarithmic scale and compute neg loglikelihood for each.

    :param gridlen: length of grid, default=100
    :param logdelta_min: lower bound for delta (log value), default=-10
    :param logdelta_max: upper bound for delta (log value), default=10
    :param reml: if True use REML estimate, if False use ML, default=True

    :return: tensor with optimal delta for each permutation
    """
    deltas = torch.exp(torch.linspace(start=logdelta_min, end=logdelta_max, steps=gridlen + 1, device=self.device))
    if self.UZ.ndim == 2:
        # for perm method y: same U'Z for each permutation
        neglogs = self.negloglikelihood(delta=deltas, Uy=self.get_4d_copy(v=self.Uy, batch_size=len(deltas)),
                                        UZ=self.UZ, reml=reml)
    else:
        # for perm method x: have different U'Z for each permutation
        neglogs = self.negloglikelihood(delta=deltas, Uy=self.get_4d_copy(v=self.Uy, batch_size=len(deltas)),
                                        UZ=self.get_4d_copy(v=self.UZ, batch_size=len(deltas)), reml=reml)
    # bugfix: Tensor.to() is not in-place, its result must be re-assigned (the return value was discarded before)
    neglogs = neglogs.to(self.device)
    delta_opt = []
    if self.UZ.ndim == 2:
        # for perm method y: same U'Z for each permutation
        for i in range(self.perm):
            # bugfix: forward the user-supplied gridlen/reml instead of hard-coding 100/True
            delta_opt.append(self._minimize(Uy=self.Uy[i, :, 0], UZ=self.UZ, deltas=deltas, neglogs=neglogs[i, :],
                                            gridlen=gridlen, reml=reml))
    else:
        # for perm method x: have different U'Z for each permutation
        for i in range(self.perm):
            delta_opt.append(self._minimize(Uy=self.Uy[i, :, 0], UZ=self.UZ[i, :, :], deltas=deltas,
                                            neglogs=neglogs[i, :], gridlen=gridlen, reml=reml))
    return torch.tensor(delta_opt, device=self.device)
def get_rss_perm(self, S: torch.tensor, ZD: torch.tensor, ZDZ: torch.tensor, v_g: torch.tensor) -> torch.tensor:
    """
    Compute residual sum of squares of alternative hypothesis (marker has effect on phenotype) with permutations,
    i.e. for a 4D tensor with copies of batches of fixed effects Z and markers S and 4D tensor with copies of
    permutations of phenotype y:
        b = (X'X)^{-1}X'y
        rss = (y-Xb)'(y-Xb)
    Use block-wise computation for beta, i.e., for computation of beta use the fact that X=[Z,s] for fixed
    effects Z and SNP s.

    :param S: matrix containing batch of markers, shape: (p,n,b)
    :param ZD: 3D tensor containing matrix product (U'Z)'D^{-1} for each permutation, shape: (p,c,n)
    :param ZDZ: 3D tensor containing matrix product (U'Z)'D^{-1}(U'Z) for each permutation, shape: (p,c,c)
    :param v_g: tensor containing genetic variance component for each permutation, shape: (p)

    :return: residual sum of squares in batches
    """
    batch_size = S.shape[2]
    y_batch = self.get_4d_copy(v=self.Uy, batch_size=batch_size)  # shape: (p,b,n,1)
    beta = self._beta_perm(S=S, ZD=ZD, ZDZ=ZDZ, y_batch=y_batch, batch_size=batch_size)  # shape: (p,b,c+1,1)
    # compute residuals; S is re-used as a buffer to keep peak memory low
    S = self._x_batch(X=S, fixed=self.UZ)  # shape: (p,b,n,c+1)
    S = y_batch - torch.matmul(S, beta)  # shape: (p,b,n,1)
    # compute residual sum of squares, weighted by D^{-1}
    rss = torch.div(torch.transpose(S, dim0=2, dim1=3), self.get_4d_copy(v=self.D, batch_size=batch_size))
    rss = torch.squeeze(torch.matmul(rss, S))  # shape: (p,b)
    # scale each permutation's rss by its genetic variance component
    return torch.t(torch.div(torch.t(rss), torch.unsqueeze(v_g, dim=0)))

def get_perm_p_value(self, perm_test_stats: torch.tensor) -> torch.tensor:
    """
    Compute permutation-based p-values via
        p = R/(qm)
    with R being the number of permuted test statistics bigger than the observed test statistic,
    q the number of permutations and m the number of SNPs (so n = q*m below).

    :param perm_test_stats: matrix containing test-statistics for all permutations and SNPs, dim (p,m)

    :return: adjusted p-values
    """
    sorted_test_stats, ind = torch.sort(perm_test_stats.flatten())
    n = sorted_test_stats.shape[0]
    # count how many permuted statistics exceed each observed statistic via binary search
    test_stats_ind = torch.searchsorted(sorted_test_stats.contiguous(), self.test_stat.contiguous(), right=True)
    adj_p_value = ((n - test_stats_ind) / n).type(torch.float64)
    # a p-value of exactly 0 is replaced by the smallest resolvable value 1/n
    return torch.where(adj_p_value == 0., 1 / n, adj_p_value)

def get_min_p_value(self, test_stat: torch.tensor) -> torch.tensor:
    """
    Compute minimal p-values for each permutation:
    First search the maximal test statistic for each permutation; since the survival function is decreasing, this
    gives the minimal p-value.

    :param test_stat: matrix containing test-statistics for all permutations and SNPs, dim (p,m)

    :return: vector containing the minimal p-value for each permutation
    """
    max_test_stats, _ = torch.max(test_stat, dim=1)
    min_p_val = []
    for test in max_test_stats:
        min_p_val.append(self.get_p_value(f_score=test))
    return torch.tensor(min_p_val)

# functions to compute intermediate results
@staticmethod
def _zd(UZ: torch.tensor, D: torch.tensor) -> torch.tensor:
    """
    Compute (U'Z)'D^{-1} for fixed effects Z of shape (n,c) or (p,n,c).

    :param UZ: transformed fixed effects U'Z
    :param D: vector with eigenvalues of K + ratio of variance components delta

    :return: Z'D^{-1}
    """
    # D is diagonal, so D^{-1} multiplication is an element-wise division
    if UZ.ndim == 2:
        return torch.div(torch.t(UZ), D)
    elif UZ.ndim == 3:
        return torch.div(torch.transpose(UZ, dim0=1, dim1=2), D)
    elif UZ.ndim == 4:
        return torch.div(torch.transpose(UZ, dim0=2, dim1=3), D)

@staticmethod
def _zdz(UZ: torch.tensor, ZD: torch.tensor) -> torch.tensor:
    """
    Compute (U'Z)'D^{-1}(U'Z) for fixed effects Z, giving shape (c,c) or (p,c,c).

    :param UZ: transformed fixed effects U'Z
    :param ZD: precomputed (U'Z)'D^{-1}

    :return: (U'Z)'D^{-1}(U'Z)
    """
    return torch.matmul(ZD, UZ)
torch.tensor: 518 | """ 519 | Compute (U'Z)'D^{-1}(U'Z) for fixed effects Z of shape (c,c) or (p,c,c) 520 | 521 | :param UZ: transformed fixed effects U'Z 522 | :param ZD: precomputed (U'Z)'D^{-1} 523 | 524 | :return: (U'Z)'D^{-1}(U'Z) 525 | """ 526 | return torch.matmul(ZD, UZ) 527 | 528 | @staticmethod 529 | def _beta(ZDZ: torch.tensor, ZDy: torch.tensor) -> torch.tensor: 530 | """ 531 | compute effect size beta = ((U'Z)'D^-1(U'Z))^-1(U'Z)'D^-1(U'y) 532 | 533 | :param ZDZ: precomputed matrix product of (U'Z)'D^-1(U'Z) 534 | :param ZDy: precomputed matrix product of (U'Z)'D^-1(U'y) 535 | 536 | :return: beta 537 | """ 538 | return torch.linalg.solve(ZDZ, ZDy) 539 | 540 | def _sigma(self, D: torch.tensor, Uy: torch.tensor, UZ: torch.tensor, beta: torch.tensor, reml: bool = True) \ 541 | -> torch.tensor: 542 | """ 543 | compute variance component v_g^2 = ((U'y)-(U'Z)beta)'D^-1((U'y)-(U'Z)beta)/(n-c) 544 | 545 | :param D: vector with eigenvalues of K + ratio of variance components delta 546 | :param Uy: transformed phenotype U'y, shape (n) 547 | :param UZ: transformed fixed effects U'Z, shape (n,c) 548 | :param beta: effect size, shape (c) 549 | :param reml: if True use REML estimate, if False use ML, default=True 550 | 551 | :return: v_g^2 552 | """ 553 | if D.ndim == 3: 554 | if Uy.ndim == 1: 555 | Uy = self.get_3d_copy(v=Uy, batch_size=D.shape[0]) 556 | if beta.ndim == 2: 557 | beta = torch.unsqueeze(beta, 2) 558 | res = Uy - torch.matmul(UZ, beta) 559 | res = torch.multiply(res, res) 560 | if D.ndim == 1: 561 | res = torch.sum(torch.div(res, D)) 562 | elif res.ndim == 3: 563 | res = torch.div(torch.transpose(res, dim0=1, dim1=2), D) 564 | res = torch.sum(torch.squeeze(res), 1) 565 | elif res.ndim == 4: 566 | res = torch.div(torch.transpose(res, dim0=2, dim1=3), D) 567 | res = torch.sum(torch.squeeze(res), 2) 568 | if not reml: 569 | return res / self.dataset.n_samples 570 | else: 571 | return res / self.freedom_deg 572 | 573 | def _xdx(self, D: torch.tensor, S: 
torch.tensor, ZD: torch.tensor, ZDZ: torch.tensor) -> tuple: 574 | """ 575 | Compute (X'D^{-1}X)^{-1} for X=([Z,s_i],...,[Z,s_{i+b-1}]) of shape (b,n,c+1) for fixed effects Z of shape (n,c) 576 | and SNPs s_j 577 | For permutations compute 4D version 578 | 579 | :param D: vector with eigenvalues of K + ratio of variance components delta; shape: (n) or (p,1,n) 580 | :param S: matrix with batch of b SNPs (n,b) or (p,n,b) 581 | :param ZD: Z'D^{-1} for fixed effects Z and matrix of eigenvalues+delta D (c,n) or (p,c,n) for perm 582 | :param ZDZ: Z'D^{-1}Z for fixed effects Z and matrix of eigenvalues+delta D (c,c) or (p,c,c) for perm 583 | 584 | :return: S'D^{-1} and (X'D^{-1}X)^{-1} 585 | """ 586 | if ZD.ndim == 2: 587 | batch_size = S.shape[1] 588 | # compute Z'Ds_i for each SNP s_i in batches 589 | ZDS = torch.unsqueeze(torch.t(torch.matmul(ZD, S)), dim=2) # shape: (b,c,1) 590 | # compute s_iDs_i for all SNPs in batch 591 | SD = torch.unsqueeze(torch.div(torch.t(S), D), dim=1) # shape: (b,1,n) 592 | XDX = torch.bmm(SD, torch.unsqueeze(torch.t(S), dim=2)) # shape: (b,1,1) 593 | # put together 3D tensor for XDX 594 | XDX = torch.cat((torch.cat((self.get_3d_copy(v=ZDZ, batch_size=batch_size), ZDS), dim=2), 595 | torch.cat((torch.transpose(ZDS, dim0=1, dim1=2), XDX), dim=2)), dim=1) # shape: (b,c+1,c+1) 596 | elif ZD.ndim == 3: 597 | batch_size = S.shape[2] 598 | # get 4D copy of ZDZ for batch 599 | ZDZ_4d = self.get_4d_copy(v=ZDZ, batch_size=batch_size) # shape: (p,b,c,c) 600 | # compute Z'D^{-1}S 601 | ZDS = torch.unsqueeze(torch.transpose(torch.matmul(ZD, S), dim0=1, dim1=2), 3) # shape: (p,b,c,1) 602 | # compute S'D^{-1}S 603 | St = torch.transpose(S, dim0=1, dim1=2) # shape: (p,b,n) 604 | SD = torch.unsqueeze(torch.divide(St, self.D), dim=2) # shape: (p,b,1,n) 605 | # compute S'D^{-1}S 606 | XDX = torch.matmul(SD, torch.unsqueeze(St, dim=3)) 607 | # put together X'D^{-1}X 608 | XDX = torch.concat((torch.transpose(ZDS, dim0=2, dim1=3), XDX), dim=3) 609 | XDX = 
torch.concat((torch.concat((ZDZ_4d, ZDS), dim=3), XDX), dim=2) # shape: (p,b,c+1,c+1) 610 | else: 611 | raise Exception('Can only compute XDX for 2D or 3D version of ZD.') 612 | return SD, XDX 613 | 614 | def _beta_perm(self, S: torch.tensor, ZD: torch.tensor, ZDZ: torch.tensor, y_batch: torch.tensor, batch_size: int) \ 615 | -> torch.tensor: 616 | """ 617 | Compute betas for permutations in 4D tensor using block-wise computations 618 | 619 | :param S: matrix containing batch of markers, shape: (p,n,b) 620 | :param ZD: 3D tensor containing matrix product (U'Z)'D^{-1} for each permutation, shape: (p,c,n) 621 | :param ZDZ: 3D tensor containing matrix product (U'Z)'D^{-1}(U'Z) for each permutation, shape: (p,c,c) 622 | :param y_batch: 4D copy of permutations of phenotype vector, shape: (p,b,n,1) 623 | :param batch_size: number of markers 624 | 625 | :return: 4D tensor with beta values for all markers nad permutations, shape: (p,b,c+1,1) 626 | """ 627 | # get S'D^{-1}S and X'D^{-1}X 628 | SD, XDX = self._xdx(D=self.D, S=S, ZD=ZD, ZDZ=ZDZ) # shape: (p,b,1,n), (p,b,c+1,c+1) 629 | # get X'D^{-1}y 630 | XDy = self.get_4d_copy(v=torch.matmul(ZD, self.Uy), batch_size=batch_size) # shape: (p,b,c,1) 631 | SD = torch.matmul(SD, y_batch) # shape: (p,b,1,1) 632 | XDy = torch.concat((XDy, SD), dim=2) # shape: (p,b,c+1,1) 633 | # get beta of shape: (p,b,c+1,1) 634 | return self._beta(ZDZ=XDX, ZDy=XDy) 635 | 636 | # functions for data transformation 637 | @staticmethod 638 | def transform_input(X: torch.tensor, U: torch.tensor) -> torch.tensor: 639 | """ 640 | compute U'X 641 | 642 | :param X: input vector/matrix 643 | :param U: input matrix 644 | 645 | :return: product with transpose 646 | """ 647 | return torch.matmul(torch.t(U), X) 648 | 649 | def _d_delta(self, delta: torch.tensor, batch_size: int): 650 | """ 651 | get 3D tensor with D + delta*I as batches for diagonal matrix with eigenvalues D and different variance 652 | component ratios delta. 
If delta is one value, return tensor with b copies of D+delta. 653 | 654 | :param delta: variance component ratio shape: (b) or (1) 655 | :param batch_size: number of needed copies of D 656 | 657 | :return: D + delta of shape (b,1,n) 658 | """ 659 | if delta.ndim == 1: 660 | return torch.unsqueeze(self.D.repeat(batch_size, 1) + torch.unsqueeze(delta, 1), 1) 661 | else: 662 | return torch.unsqueeze((self.D + delta).repeat(batch_size, 1), 1) 663 | 664 | def _s_matrix(self, lower_bound: int, upper_bound: int, device=None, save_meta: bool = True) -> torch.tensor: 665 | """ 666 | load batch of markers to specified device 667 | 668 | :param lower_bound: lower bound of marker batch 669 | :param upper_bound: upper bound of marker batch 670 | :param device: either cpu or cuda device 671 | :param save_meta: if genotype is loaded batch-wise, set to False for permutations to prevent saving of meta info 672 | 673 | :return: matrix with markers of shape (n,upper_bound-lower_bound) 674 | """ 675 | if device is None: 676 | device = self.device 677 | if self.dataset.X is None: 678 | # load X batch-wise 679 | self.dataset.load_genotype_batch_wise(device=device, save_meta=save_meta, snp_lower_index=lower_bound, 680 | snp_upper_index=upper_bound) # shape: (n,b) 681 | S = self.dataset.X # shape: (n,b) 682 | self.dataset.reset_genotype() 683 | else: 684 | # get X_batch if X was completely loaded before 685 | S = self.dataset.X[:, lower_bound:upper_bound].to(device) # shape: (n,b) 686 | return S 687 | 688 | def _x_batch(self, X: torch.tensor, fixed: torch.tensor) -> torch.tensor: 689 | """ 690 | Create 3D or 4D tensor where each matrix in the 3D tensor contains the same fixed effects and a different SNP, 691 | and the 4D tensor contains copies of the 3D tensors 692 | 693 | :param X: genotype matrix/tensor of shape (n,b) or (p,n,b) 694 | :param fixed: matrix/tensor of fixed effects of shape (n,c) or (p,n,c) 695 | 696 | :return: tensor of shape (b,n,c+1) or (p,b,n,c+1) 697 | """ 698 | if 
X.ndim == 2: 699 | b = self.get_3d_copy(v=fixed, batch_size=X.shape[1]) 700 | return torch.cat((b, torch.transpose(torch.unsqueeze(X, 0), 0, 2)), dim=2) 701 | elif X.ndim == 3: 702 | b = self.get_4d_copy(v=fixed, batch_size=X.shape[2]) 703 | return torch.cat((b, torch.unsqueeze(torch.transpose(X, dim0=1, dim1=2), 3)), dim=3) 704 | 705 | @staticmethod 706 | def get_3d_copy(v: torch.tensor, batch_size: int) -> torch.tensor: 707 | """ 708 | Create 3D tensor with copies of input tensor 709 | 710 | :param v: vector/matrix of shape (n) or (n,c) 711 | :param batch_size: batch size of new 3D tensor 712 | 713 | :return: tensor of copies of v with shape (batch_size,n,1) or (batch_size,n,c) 714 | """ 715 | if v.ndim == 1: 716 | return torch.unsqueeze(v.expand(batch_size, v.shape[0]), 2) 717 | if v.ndim == 2: 718 | return v.expand(batch_size, v.shape[0], v.shape[1]) 719 | 720 | @staticmethod 721 | def get_4d_copy(v: torch.tensor, batch_size: int) -> torch.tensor: 722 | """ 723 | Create 4D tensor with copies of input tensor 724 | 725 | :param v: tensor of shape (p,n,c) 726 | :param batch_size: batch size of new 4D tensor 727 | 728 | :return: tensor of copies of v with shape (p,b,n,c) 729 | """ 730 | return torch.transpose(v.expand(batch_size, v.shape[0], v.shape[1], v.shape[2]), dim0=0, dim1=1) 731 | 732 | # helper functions 733 | def _bounds(self, batch_size: int, batch: int) -> tuple: 734 | """ 735 | compute upper and lower bound for natch-wise computations 736 | 737 | :param batch_size: number of markers within batch 738 | :param batch: number of batch 739 | 740 | :return: lower and upper bound 741 | """ 742 | lower_bound = batch * batch_size 743 | upper_bound = (batch + 1) * batch_size 744 | if upper_bound > self.dataset.n_snps: 745 | upper_bound = self.dataset.n_snps 746 | return lower_bound, upper_bound 747 | -------------------------------------------------------------------------------- /optimize/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/grimmlab/permGWAS/3f7a1c2e3e4c63281f5719425ff9ac405f8d9cfc/optimize/__init__.py -------------------------------------------------------------------------------- /optimize/brent.py: -------------------------------------------------------------------------------- 1 | # Brent's method 2 | 3 | def brent_search(f, a: float, b: float, x: float = None, fx: float = None, rel_tol: float = 1.48e-08, 4 | abs_tol: float = 1.48e-08, max_iter: int = 500, **kwargs) -> tuple: 5 | """ 6 | Find minimum of a function using Brent's method (see Numerical Recipes 3rd Edition: The Art of Scientific Computing) 7 | Given a function f with minimum in interval [a,b], find local minimum. 8 | 9 | :param f: function to be minimized 10 | :param a: lower bound of interval 11 | :param b: upper bound of interval 12 | :param x: starting point (initial guess of minimum) 13 | :param fx: function value of f 14 | :param rel_tol: relative tolerance, default=1.48e-08 15 | :param abs_tol: absolute tolerance, default=1.48e-08 16 | :param max_iter: maximal number of iterations, default=500 17 | :param kwargs: additional arguments of f 18 | 19 | :return: minimum x, function value of minimum f(x) and number of iterations 20 | """ 21 | 22 | golden = 0.381966011250105097 23 | if a > b: 24 | raise ValueError('Interval boundaries do not fit. 
a must be smaller or equal to b.') 25 | if x is None: 26 | x = a + golden * (b-a) 27 | if fx is None: 28 | fx = f(x, **kwargs) 29 | if not (a <= x <= b): 30 | raise ValueError('Starting value x needs to be within interval boundaries.') 31 | 32 | # initialize values 33 | x_sec, fx_sec = x, fx # second best value and function value 34 | x_trd, fx_trd = x, fx # third best value and function value 35 | d, e = 0.0, 0.0 # step size and direction of last two iterations 36 | i = -1 37 | 38 | for i in range(max_iter): 39 | mid = 0.5 * (a + b) 40 | tol1 = rel_tol * abs(x) + abs_tol 41 | tol2 = 2.0 * tol1 42 | 43 | # check stopping crit 44 | if abs(x - mid) <= tol2 - 0.5 * (b - a): 45 | break 46 | 47 | # compute Lagrange polynomial through (x, f(x)), (x_sec, f(x_sec)) and (x_trd, f(x_trd)) 48 | if abs(e) > tol1: 49 | tmp1 = (x - x_sec) * (fx - fx_trd) 50 | denominator = (x - x_trd) * (fx - fx_sec) 51 | numerator = (x - x_trd) * denominator - (x - x_sec) * tmp1 52 | denominator = 2.0 * (denominator - tmp1) 53 | if denominator > 0.0: 54 | numerator = -numerator 55 | denominator = abs(denominator) 56 | tmp1 = e 57 | e = d 58 | 59 | if (abs(numerator) >= abs(0.5 * denominator * tmp1)) or (numerator <= denominator * (a-x)) or \ 60 | (numerator >= denominator * (b-x)): 61 | # golden section step 62 | e = b-x if x < mid else a-x 63 | d = golden * e 64 | else: 65 | # polynomial interpolation step 66 | d = numerator / denominator 67 | x_new = x + d 68 | if (x_new - a < tol2) or (b - x_new < tol2): 69 | d = tol1 if x < mid else -tol1 70 | else: 71 | # golden section step 72 | e = b - x if x < mid else a - x 73 | d = golden * e 74 | 75 | # function must not be evaluated too close to x 76 | if tol1 <= abs(d): 77 | x_new = x + d 78 | elif 0.0 < d: 79 | x_new = x + tol1 80 | else: 81 | x_new = x - tol1 82 | fx_new = f(x_new, **kwargs) 83 | 84 | # check if x_new is better than previous x 85 | if fx_new <= fx: 86 | # decrease interval size 87 | if x_new >= x: 88 | a = x 89 | else: 90 | b = x 
91 | # replace previous best 3 with current best 3 92 | x_trd, fx_trd = x_sec, fx_sec 93 | x_sec, fx_sec = x, fx 94 | x, fx = x_new, fx_new 95 | else: 96 | # decrease interval size 97 | if x_new < x: 98 | a = x_new 99 | else: 100 | b = x_new 101 | # check if x_new better than second or third and replace accordingly 102 | if fx_new <= fx_sec or x_sec == x: 103 | x_trd, fx_trd = x_sec, fx_sec 104 | x_sec, fx_sec = x_new, fx_new 105 | elif fx_new <= fx_trd or x_trd == x or x_trd == x_sec: 106 | x_trd, fx_trd = x_new, fx_new 107 | 108 | return x, fx, i+1 109 | -------------------------------------------------------------------------------- /perform_gwas.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import time 3 | import torch 4 | 5 | from preprocess import data_loader 6 | from utils import helper_functions 7 | 8 | 9 | def run(genotype_file: pathlib.Path, phenotype_file: pathlib.Path, model: str, trait: str = 'phenotype_value', 10 | kinship_file: pathlib.Path = None, covariate_file: pathlib.Path = None, covariate_list: list = None, 11 | maf_threshold: int = 0, load_genotype: bool = False, 12 | out_dir: pathlib.Path = pathlib.Path.cwd().joinpath('results'), out_file: str = None, 13 | device: torch.device = torch.device('cpu'), perm: int = 0, perm_method: str = 'x', 14 | adj_p_value: bool = False, batch_size: int = 50000, perm_batch_size: int = 1000, manhattan: bool = False, 15 | qqplot: bool = False, not_add: bool = False): 16 | # check user specified arguments 17 | start = time.time() 18 | print('Start loading data now') 19 | 20 | # load data 21 | dataset = data_loader.Dataset(genotype_file=genotype_file, phenotype_file=phenotype_file, trait=trait, 22 | maf_threshold=maf_threshold, load_genotype=load_genotype, kinship_file=kinship_file, 23 | covariate_file=covariate_file, covariate_list=covariate_list, not_add=not_add) 24 | dataset.to_device(device=device) 25 | have_data = time.time() 26 | print('Loaded 
data, elapsed time: %f s.' % (have_data - start)) 27 | print('Start performing GWAS on phenotype %s for %d samples and %d SNPs.' 28 | % (trait, dataset.n_samples, dataset.n_snps)) 29 | 30 | # perform GWAS 31 | gwas_model = helper_functions.get_model_class_name(model_name=model)(dataset=dataset, batch_size=batch_size, 32 | device=device, perm=perm, 33 | perm_batch_size=perm_batch_size) 34 | gwas_model.gwas() 35 | done_gwas = time.time() 36 | print('Done performing GWAS on phenotype %s for %d samples and %d SNPs.\n' 37 | 'Elapsed time: %f s' % (trait, dataset.n_samples, len(dataset.positions), done_gwas - have_data)) 38 | 39 | # perform GWAS with permutations 40 | if perm > 0: 41 | print('Start performing GWAS with %d permutations.' % perm) 42 | gwas_model.perm_gwas(perm_method=perm_method, adj_p_value=adj_p_value) 43 | done_perm = time.time() 44 | print('Done performing GWAS with %d permutations.\n' 45 | 'Elapsed time: %f s' % (perm, done_perm - done_gwas)) 46 | 47 | # save results 48 | print('Save results.') 49 | gwas_model.save_results(data_dir=out_dir, filename=out_file) 50 | total_time = time.time() - start 51 | print('Total time: ', total_time) 52 | 53 | # plots 54 | if manhattan: 55 | print('Save Manhattan plot with significance level of 5%.') 56 | gwas_model.manhattan_plot(data_dir=out_dir, filename=out_file, sig_level=5) 57 | total_time = time.time() - start 58 | if qqplot: 59 | print('Save QQ-plot.') 60 | gwas_model.qq_plot(data_dir=out_dir, filename=out_file) 61 | total_time = time.time() - start 62 | 63 | # summary statistics 64 | if not load_genotype: 65 | # reset number of SNPs in case of batch-wise loading 66 | dataset.n_snps = len(dataset.positions) 67 | helper_functions.get_summary_stats(out_dir=out_dir, out_file=out_file, genotype_file=genotype_file, 68 | phenotype_file=phenotype_file, trait=trait, samples=dataset.n_samples, 69 | snps=dataset.n_snps, model=model, maf_threshold=maf_threshold, perm=perm, 70 | v_g=gwas_model.v_g.item(), 
v_e=gwas_model.v_e.item(), 71 | min_p_val=gwas_model.min_p_value, time=total_time, kinship_file=kinship_file, 72 | covariate_file=covariate_file, covariate_list=covariate_list, 73 | perm_method=perm_method) 74 | -------------------------------------------------------------------------------- /permGWAS.py: -------------------------------------------------------------------------------- 1 | # run the script here 2 | import argparse 3 | import pathlib 4 | 5 | from utils import check_functions 6 | import perform_gwas 7 | 8 | 9 | if __name__ == "__main__": 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('-x', '--genotype_file', type=str, default=None, 12 | help='Specify the full path to the genotype file, absolute and relative paths are accepted, ' 13 | 'only accept .h5, .hdf5, .h5py, .csv, PLINK and binary PLINK files, ' 14 | 'PLINK and binary PLINK: all required files must be in the same folder with same prefix. ' 15 | 'See documentation for correct format.') 16 | parser.add_argument('-y', '--phenotype_file', type=str, default=None, 17 | help='Specify the full path to the phenotype file, absolute and relative paths are ' 18 | 'accepted, only accept .csv, .txt and .pheno files. See documentation for correct format.') 19 | parser.add_argument('-trait', '--trait', '--y_name', nargs='+', type=str, default=['phenotype_value'], 20 | help='Specify the name of phenotype (column) to be used in phenotype file,' 21 | 'default is "phenotype_value". You can run permGWAS on several phenotypes one after ' 22 | 'another if they are in the same phenotype_file. Juste name the phenotypes, ' 23 | 'e.g. --trait pheno1 pheno2 if you want to use all available traits use --trait all') 24 | parser.add_argument('-k', '--kinship_file', '--k', '--kinship', type=str, default=None, 25 | help='Specify the the full path to the kinship file, absolute and relative paths are accepted,' 26 | 'only accept .csv and .h5/.h5py/.hdf5 files. See documentation for correct format. 
' 27 | 'Optional, if not provided realized relationship kernel will be calculated') 28 | parser.add_argument('-cov', '--covariate_file', '--cov', '--cov_file', type=str, default=None, 29 | help='Specify the full path to the covariates file, absolute and relative paths are accepted,' 30 | 'currently only accept .csv files. Optional, if not provided only intercept will be used ' 31 | 'as fixed effect.') 32 | parser.add_argument('-cov_list', '--covariate_list', nargs='+', type=str, default=None, 33 | help='Specify the covariates (column headers) to use from the covariates file. Optional, if ' 34 | 'not provided, will use all available columns as covariates.') 35 | parser.add_argument('-maf', '--maf_threshold', '--maf', type=int, choices=range(0, 31), default=0, 36 | help='Specify minor allele frequency threshold as percentage value. ' 37 | 'Optional, if not provided no maf filtering will be performed.') 38 | parser.add_argument('-load_genotype', action='store_true', 39 | help='If used, genotype matrix will be completely loaded from file during preprocessing. ' 40 | 'Otherwise load genotype batch-wise during computations of test statistics. ' 41 | 'Batch-wise loading is only possible, if kinship file is provided. Default is False') 42 | parser.add_argument('-config', '--config_file', type=str, default=None, 43 | help='Specify the full path to the yaml config file. Specify all required arguments to use in ' 44 | 'this config file and just give the config file instead of all required parameters. ' 45 | 'For more info regarding the required format see the documentation.') 46 | parser.add_argument('-model', type=str, default='lmm', 47 | help='Specify the model to use for GWAS. Currently only lmm (linear mixed model) is ' 48 | 'implemented.') 49 | parser.add_argument('-out_dir', '--out_dir', type=str, default=pathlib.Path.cwd().joinpath('results'), 50 | help='Specify the name of the directory result-files should be stored in,' 51 | 'absolute and relative paths are accepted. 
Optional, if not provided, files will be ' 52 | 'stored in folder "results" in current directory,') 53 | parser.add_argument('-out_file', '--out_file', type=str, default=None, 54 | help='Specify NAME of result files, will be stored as p_values_NAME and min_p_values_NAME,' 55 | 'optional, if not provided name of phenotype will be used. If you run permGWAS with ' 56 | 'several phenotypes, will always use name of phenotype.') 57 | parser.add_argument('-disable_gpu', action='store_true', 58 | help='If used, GPUs will be disabled and only CPUs will be used for computations.') 59 | parser.add_argument('-device', '--device', type=int, default=0, 60 | help='Specify GPU device to be used, default is 0.') 61 | parser.add_argument('-perm', '--perm', type=int, default=0, 62 | help='Specify the number of permutations (integer value) to be performed, optional, if not ' 63 | 'provided no permutations will be performed') 64 | parser.add_argument('-perm_method', type=str, default='x', 65 | help='Specify the method to use for permutations: x or y,' 66 | 'for x permute fixed effects matrix including SNP of interest, which is equivalent to ' 67 | 'permuting the phenotype and the covariance matrix; for y permute only the phenotype ' 68 | 'vector as in permGWAS Version1. 
Default is x.') 69 | parser.add_argument('-adj_p_value', action='store_true', 70 | help='If used, will additionally compute adjusted permutation-based p-values for each SNP.') 71 | parser.add_argument('-batch', '--batch_size', '--batch', type=int, default=50000, 72 | help='Specify number of SNPs to work on simultaneously, default is 50000') 73 | parser.add_argument('-batch_perm', '--perm_batch_size', '--batch_perm', type=int, default=1000, 74 | help='Specify number of SNPs to work on simultaneously, default is 1000') 75 | parser.add_argument('-mplot', '--manhattan', '--plot', action='store_true', 76 | help='optional, creates manhattan plot') 77 | parser.add_argument('-qqplot', '--qqplot', action='store_true', 78 | help='optional, creates QQ-plot') 79 | parser.add_argument('-not_add', '--not_add', action='store_true', 80 | help='optional, use if genotype has different encoding.') 81 | args = vars(parser.parse_args()) 82 | # check config file 83 | args = check_functions.check_all_arguments(args=args) 84 | phenotypes = args["trait"] 85 | 86 | # run pipeline 87 | for trait in phenotypes: 88 | print('Working on phenotype ', trait) 89 | args["trait"] = trait 90 | args = check_functions.check_output_files(args=args) 91 | print('Checked if all specified files exist.') 92 | try: 93 | perform_gwas.run(**args) 94 | args["out_file"] = None 95 | except Exception as exc: 96 | print("Failure when running permGWAS2.0") 97 | print(exc) 98 | continue 99 | -------------------------------------------------------------------------------- /permGWAS_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grimmlab/permGWAS/3f7a1c2e3e4c63281f5719425ff9ac405f8d9cfc/permGWAS_logo.png -------------------------------------------------------------------------------- /postprocess/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/grimmlab/permGWAS/3f7a1c2e3e4c63281f5719425ff9ac405f8d9cfc/postprocess/__init__.py -------------------------------------------------------------------------------- /postprocess/plot_functions.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import numpy as np 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | import scipy.stats as stats 7 | plt.rc('axes', axisbelow=True) 8 | plt.rcParams['axes.labelsize'] = 16 9 | plt.rcParams['xtick.labelsize'] = 14 10 | plt.rcParams['ytick.labelsize'] = 14 11 | plt.rcParams['legend.fontsize'] = 16 12 | plt.rcParams['axes.titlesize'] = 20 13 | 14 | from utils import helper_functions 15 | 16 | 17 | def manhattan_plot(df: pd.DataFrame, data_dir: pathlib.Path, filename: str, min_p_values: np.array = None, 18 | sig_level: int = 5): 19 | """ 20 | Save Manhattan plot as manhattan_FILENAME.png to data_dir 21 | 22 | :param df: DataFrame containing chromosome (CHR) and position (POS) identifiers, and corresponding p_values 23 | :param data_dir: full path to save directory 24 | :param filename: name of file 25 | :param min_p_values: array containing minimal p_values to compute permutation-based threshold 26 | :param sig_level: significance level for Bonferroni and perm thresholds, default is 5 27 | """ 28 | if not {'CHR', 'POS', 'p_value'}.issubset(df.columns): 29 | raise Exception('Cannot create Manhattan plot; need CHR, POS and p_value in DataFrame.') 30 | n_snps = len(df) 31 | df = df[df['p_value'] <= 0.01].copy() 32 | if isinstance(df['CHR'].values[0], str): 33 | try: 34 | df['CHR'] = [int(x.replace('Chr', '')) for x in df['CHR']] 35 | except Exception as exc: 36 | print("Chromosome identifier might be wrong. 
Use the chromosome number.") 37 | print(exc) 38 | running_pos = 0 39 | cumul_pos = [] 40 | for chrom, group_df in df.groupby('CHR'): 41 | cumul_pos.append(group_df['POS'] + running_pos) 42 | running_pos += group_df['POS'].max() 43 | df['cumul_pos'] = pd.concat(cumul_pos) 44 | 45 | fig, ax = plt.subplots(1, 1, figsize=(20, 5), constrained_layout=True) 46 | sns.scatterplot(ax=ax, data=df, x='cumul_pos', y='p_value', hue='CHR', palette='colorblind', linewidth=0, s=20, 47 | legend=None) 48 | ax.spines['top'].set_visible(False) 49 | ax.spines['right'].set_visible(False) 50 | ax.set_yscale("log") 51 | ax.invert_yaxis() 52 | ax.minorticks_off() 53 | ax.set_xlabel('Chromosome') 54 | ax.set_ylabel(r'$-log_{10}$(p-value)') 55 | ax.set_xticks(df.groupby('CHR')['cumul_pos'].median()) 56 | ax.set_xticklabels(np.unique(df['CHR'])) 57 | 58 | if min_p_values is not None: 59 | ax.axhline(helper_functions.compute_perm_threshold(min_p_values, sig_level), linewidth=1.5, color='blue', 60 | label='permGWAS2') 61 | ax.axhline(helper_functions.compute_bonf_threshold(n_snps, sig_level), linewidth=1.5, color='red', 62 | label='Bonferroni') 63 | ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.13), fancybox=True, ncol=2, frameon=True) 64 | fig.savefig(data_dir.joinpath('manhattan_' + pathlib.Path(filename).with_suffix('.png').as_posix())) 65 | fig.clf() 66 | 67 | 68 | def qq_plot(p_values: np.array, data_dir: pathlib.Path, filename: str): 69 | """ 70 | Save QQ-plot as qq_plot_FILENAME.png to data_dir 71 | 72 | :param p_values: array containing p_values 73 | :param data_dir: full path to save directory 74 | :param filename: name of file 75 | """ 76 | n_snps = len(p_values) 77 | observed_p = -np.log10(np.sort(p_values)) 78 | expected_p = -np.log10(np.arange(1.0 / float(n_snps), 1, 1.0 / float(n_snps + 1))) 79 | inflation_factor = np.median(stats.chi2.isf(p_values, 1)) / 0.456 80 | 81 | plt.figure(figsize=(6, 6)) 82 | plt.plot(expected_p, observed_p, '.', markersize=4, markeredgewidth=0, 
alpha=0.8) 83 | plt.plot(expected_p, expected_p, 'k--', linewidth=0.75) 84 | plt.text(3.5, 0.5, "$\lambda=%.2f$" % inflation_factor) 85 | plt.xlabel('Expected $-log10(p-value)$') 86 | plt.ylabel('Observed $-log10(p-value)$') 87 | plt.savefig(data_dir.joinpath('qq_plot_' + pathlib.Path(filename).with_suffix('.png').as_posix())) 88 | plt.clf() 89 | -------------------------------------------------------------------------------- /preprocess/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grimmlab/permGWAS/3f7a1c2e3e4c63281f5719425ff9ac405f8d9cfc/preprocess/__init__.py -------------------------------------------------------------------------------- /preprocess/data_loader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import pandas as pd 4 | import h5py 5 | import pathlib 6 | from pandas_plink import read_plink1_bin 7 | 8 | 9 | class Genotype: 10 | """ 11 | Class for loading of genotype data. 
12 | 13 | **Attributes** 14 | 15 | - genotype_file (*pathlib.Path*): full path to genotype file for data loading 16 | - X (*torch.tensor*): matrix containing genotype values 17 | - sample_ids (*numpy.array*): ids of genotype samples 18 | - chromosomes (*numpy.array*): chromosome identifier of SNPs 19 | - positions (*numpy.array*): position identifier of SNPs 20 | - maf (*torch.tensor*): vector containing minor allele frequencies 21 | - sample_index (*numpy.array*): indices of the samples to load from the genotype matrix 22 | - n_samples (*int*): number of samples 23 | - n_snps (*int*): number of SNPs 24 | - maf_threshold (*int*): threshold for minor allele frequency filtering 25 | 26 | **Functions** 27 | 28 | - load_genotype_ids(load_genotype): load sample_ids from .h5/.hdf5/.h5py file 29 | - load_genotype_data(): load and encode genotype data from file, calls the following functions: 30 | - load_genotype_hdf5_file(sample_index, snp_lower_index, snp_upper_index): load genotype data from 31 | .h5/.hdf5/.h5py files 32 | - load_genotype_csv_file(): load genotype data from .csv files 33 | - load_genotype_binary_plink_file(): load genotype data from binary PLINK files 34 | - load_genotype_plink_file(): load genotype data from PLINK files 35 | - encode_genotype(): check encoding of genotype, change to additive if necessary, create torch.tensor, 36 | calls the following functions: 37 | - check_encoding() 38 | - get_additive_encoding() 39 | - load_genotype_batch_wise(maf_threshold, snp_lower_index, snp_upper_index): batch-wise loading and filtering 40 | of genotype data 41 | - filter_monomorphic_snps(): remove monomorphic SNPs 42 | - get_minor_allele_freq(): compute minor allele frequencies 43 | - use_maf_filter(maf_threshold): filter for minor allele frequency 44 | - save_genotype_hdf5(filename): save genotype data as .h5 file 45 | - reset_genotype(): delete X for batch-wise loading 46 | - get_matched_data(data, row_index): filter samples of data 47 | 48 | :param 
genotype_file: full path to genotype file 49 | :param maf_threshold: threshold for minor allele frequency filtering 50 | :param not_add: use if genotype has different / not additive encoding 51 | """ 52 | 53 | def __init__(self, genotype_file: pathlib.Path, maf_threshold: int = 0, not_add: bool = False): 54 | self.genotype_file = genotype_file 55 | self.maf_threshold = maf_threshold 56 | self.not_add = not_add 57 | self.sample_ids = None 58 | self.chromosomes = None 59 | self.positions = None 60 | self.X = None 61 | self.maf = None 62 | self.sample_index = None 63 | self.n_samples = None 64 | self.n_snps = None 65 | 66 | def load_genotype_ids(self, load_genotype: bool = False) -> np.array: 67 | """ 68 | Load sample_ids from .h5/.hdf5/.h5py genotype file. 69 | """ 70 | if self.genotype_file.suffix not in ('.h5', '.hdf5', '.h5py'): 71 | raise Exception('Can only load genotype IDs from .h5/.hdf5/.h5py files.') 72 | with h5py.File(self.genotype_file, "r") as gt: 73 | self.sample_ids = gt['sample_ids'][:].astype(str) 74 | if not load_genotype: 75 | self.n_snps = len(gt['position_index'][:]) 76 | 77 | def load_genotype_data(self): 78 | """ 79 | Load and encode genotype data. Accepts PLINK files, binary PLINK files, .csv and .h5, .hdf5, .h5py files. 80 | For .h5/.hdf5/.h5py files only load needed samples defined in self.sample_index. 81 | After loading check encoding of genotype and change to additive if necessary. 82 | Return genotype matrix as torch.tensor, chromosomes, positions and sample_ids as np.arrays. 
83 | """ 84 | suffix = self.genotype_file.suffix 85 | if suffix in ('.h5', '.hdf5', '.h5py'): 86 | self.X, self.chromosomes, self.positions = self.load_genotype_hdf5_file() 87 | elif suffix == '.csv': 88 | self.X, self.sample_ids, self.chromosomes, self.positions = self.load_genotype_csv_file() 89 | elif suffix in ('.bed', '.bim', '.fam'): 90 | self.X, self.sample_ids, self.chromosomes, self.positions = self.load_genotype_binary_plink_file() 91 | elif suffix in ('.map', '.ped'): 92 | self.X, self.sample_ids, self.chromosomes, self.positions = self.load_genotype_plink_file() 93 | # check if genotype is in additive encoding, change encoding if not 94 | # change X from np.array to torch.tensor 95 | self.encode_genotype() 96 | self.n_samples = len(self.sample_ids) 97 | self.n_snps = len(self.positions) 98 | 99 | def load_genotype_batch_wise(self, device: torch.device = torch.device("cpu"), save_meta: bool = True, 100 | snp_lower_index: int = None, snp_upper_index: int = None): 101 | """ 102 | Load and encode genotype data batch-wise. After loading filter for monomorphic snps and minor allele frequency. 103 | Only accept .h5(.hdf5/.h5py files. 
104 | 105 | :param device: device (cpu/gpu) for computations 106 | :param save_meta: save chromosome and position identifiers if True 107 | :param snp_lower_index: lower bound of batch 108 | :param snp_upper_index: upper bound of batch 109 | """ 110 | self.X, chromosomes, positions = self.load_genotype_hdf5_file(snp_lower_index=snp_lower_index, 111 | snp_upper_index=snp_upper_index) 112 | self.encode_genotype() 113 | chromosomes, positions = self.filter_monomorphic_snps(chromosomes=chromosomes, positions=positions) 114 | maf = self.get_minor_allele_freq() 115 | if self.maf_threshold != 0: 116 | maf, chromosomes, positions = self.use_maf_filter(maf=maf, chromosomes=chromosomes, positions=positions) 117 | self.X = self.X.to(device) 118 | 119 | if save_meta: 120 | if self.chromosomes is None: 121 | self.chromosomes = chromosomes 122 | self.positions = positions 123 | self.maf = maf 124 | else: 125 | self.chromosomes = np.concatenate((self.chromosomes, chromosomes)) 126 | self.positions = np.concatenate((self.positions, positions)) 127 | self.maf = torch.cat((self.maf, maf)) 128 | 129 | def load_genotype_hdf5_file(self, snp_lower_index: int = None, snp_upper_index: int = None) -> tuple: 130 | """ 131 | Load genotype matrix from .h5/.hdf5/.h5py file. 132 | Only load needed samples and SNPs batch wise: 133 | will only load specified samples given in sample_index 134 | if snp_upper_bound/snp_lower_bound is given, will load SNPs batch-wise, else will load all SNPs 135 | H5, HDF5, H5PY files need to have the following structure: 136 | snps: genotype matrix either in additive encoding or in raw nucleotide encoding (biallelic 137 | notation (i.e. 'AA', 'AT', ...) or iupac notation (i.e. 
'A', 'W', ...)) with samples as 138 | rows and markers as columns 139 | sample_ids: sample identifier in the same order as the rows of the genotype matrix 140 | chr_index: chromosome identifier in the same order as the columns of the genotype matrix 141 | position_index: position number (integer) in the same order as the columns of the genotype matrix 142 | 143 | :param snp_lower_index: lower bound of batch 144 | :param snp_upper_index: upper bound of batch 145 | 146 | :return: Genotype values, chromosomes and positions and sample_ids if no sample_index is specified 147 | """ 148 | with h5py.File(self.genotype_file, "r") as gt: 149 | chromosomes = gt['chr_index'][snp_lower_index:snp_upper_index].astype(str) 150 | positions = gt['position_index'][snp_lower_index:snp_upper_index].astype(int) 151 | if isinstance(self.sample_index, (np.ndarray, list)): 152 | # using sample indices directly does not work for h5py --> use workaround 153 | indices, inverse = np.unique(self.sample_index, return_inverse=True) 154 | X = gt['snps'][indices, snp_lower_index:snp_upper_index] 155 | X = X[inverse, :] 156 | return X, chromosomes, positions 157 | else: 158 | raise Exception('sample_index needs to be a list in order to load certain genotype samples only.') 159 | 160 | def load_genotype_csv_file(self) -> (np.array, np.array, np.array, np.array): 161 | """ 162 | Load .csv genotype file. File must have the following structure: 163 | First column must contain the sample ids, the column names should be the SNP ids as CHROMOSOME_POSITION. 164 | The values should be the genotype matrix either in additive encoding or in raw nucleotide encoding (biallelic 165 | notation (i.e. 'AA', 'AT', ...) or iupac notation (i.e. 'A', 'W', ...)). 
166 | 167 | :return: Genotype values, sample_ids, chromosomes and positions 168 | """ 169 | gt = pd.read_csv(self.genotype_file, index_col=0) 170 | snp_ids = np.array(list(map(lambda a: a.split("_"), gt.columns.values))) 171 | chromosomes = snp_ids[:, 0] 172 | positions = snp_ids[:, 1].astype(int) 173 | sample_ids = np.asarray(gt.index, dtype=str) 174 | X = np.asarray(gt.values) 175 | return X, sample_ids, chromosomes, positions 176 | 177 | def load_genotype_binary_plink_file(self) -> (np.array, np.array, np.array, np.array): 178 | """ 179 | Load binary PLINK file, .bim, .fam, .bed files with same prefix need to be in same folder. 180 | 181 | :return: Genotype values, sample_ids, chromosomes and positions 182 | """ 183 | prefix = self.genotype_file.with_suffix('').as_posix() 184 | gt = read_plink1_bin(prefix + '.bed', prefix + '.bim', prefix + '.fam', ref="a0", verbose=False) 185 | sample_ids = np.array(gt['fid'], dtype=str).flatten() 186 | positions = np.array(gt['pos']).flatten() 187 | chromosomes = np.array(gt['chrom']).flatten() 188 | X = np.asarray(gt.values) 189 | return X, sample_ids, chromosomes, positions 190 | 191 | def load_genotype_plink_file(self) -> (np.array, np.array, np.array, np.array): 192 | """ 193 | Load PLINK files, .map and .ped file with same prefix need to be in same folder. 
194 | Accepts GENOTYPENAME.ped and GENOTYPENAME.map as input 195 | 196 | :return: Genotype values, sample_ids, chromosomes and positions 197 | """ 198 | prefix = self.genotype_file.with_suffix('').as_posix() 199 | with open(prefix + '.map', 'r') as f: 200 | chromosomes = [] 201 | positions = [] 202 | for line in f: 203 | tmp = line.strip().split(" ") 204 | chromosomes.append(tmp[0].strip()) 205 | positions.append(int(float(tmp[-1].strip()))) 206 | chromosomes = np.array(chromosomes) 207 | positions = np.array(positions) 208 | iupac_map = {"AA": "A", "GG": "G", "TT": "T", "CC": "C", "AG": "R", "GA": "R", "RR": "R", "CT": "Y", "TC": "Y", 209 | "YY": "Y", "GC": "S", "CG": "S", "SS": "S", "AT": "W", "TA": "W", "WW": "W", "GT": "K", "TG": "K", 210 | "KK": "K", "AC": "M", "CA": "M", "MM": "M"} 211 | with open(prefix + '.ped', 'r') as f: 212 | sample_ids = [] 213 | X = [] 214 | for line in f: 215 | tmp = line.strip().split(" ") 216 | sample_ids.append(tmp[1].strip()) 217 | snps = [] 218 | j = 6 219 | while j < len(tmp) - 1: 220 | snps.append(iupac_map[tmp[j] + tmp[j + 1]]) 221 | j += 2 222 | X.append(snps) 223 | sample_ids = np.array(sample_ids, dtype=str) 224 | X = np.array(X) 225 | return X, sample_ids, chromosomes, positions 226 | 227 | def encode_genotype(self): 228 | """ 229 | first check encoding of genotype, then change to additive if necessary, finally change X from np.array 230 | to torch.tensor 231 | """ 232 | if self.not_add: 233 | print('Genotype might not be in additive encoding. 
Will not check encoding of genotype.') 234 | self.X = torch.tensor(self.X, dtype=torch.float64) 235 | else: 236 | enc_of_X = self.check_encoding() 237 | # if genotype in biallelic notation, will change to iupac notation and then encode additively 238 | if enc_of_X == 'biallelic': 239 | iupac_map = {"AA": "A", "GG": "G", "TT": "T", "CC": "C", "AG": "R", "GA": "R", "CT": "Y", "TC": "Y", 240 | "GC": "S", "CG": "S", "AT": "W", "TA": "W", "GT": "K", "TG": "K", "AC": "M", "CA": "M"} 241 | self.X = np.vectorize(iupac_map.__getitem__)(self.X.astype(str)) 242 | enc_of_X = 'iupac' 243 | if enc_of_X == 'iupac': 244 | self.X = torch.tensor(self.get_additive_encoding(), dtype=torch.float64) 245 | elif enc_of_X == 'additive': 246 | self.X = torch.tensor(self.X, dtype=torch.float64) 247 | else: 248 | raise Exception('Genotype in wrong encoding. Can only deal with additive, iupac and biallelic ' 249 | 'encoding. If you want to use different encoding use flag -not_add.') 250 | 251 | def check_encoding(self): 252 | """ 253 | Check the encoding of the genotype matrix 254 | 255 | :return: encoding of the genotype matrix 256 | """ 257 | if self.X[0, 0].astype(str) in ['A', 'C', 'G', 'T', 'M', 'R', 'W', 'S', 'Y', 'K']: 258 | return 'iupac' 259 | elif self.X[0, 0] in [0, 1, 2]: 260 | return 'additive' 261 | elif self.X[0, 0] in ["AA", "GG", "TT", "CC", "AG", "GA", "CT", "TC", "GC", "CG", "AT", "TA", "GT", "TG", 262 | "AC", "CA"]: 263 | return 'biallelic' 264 | else: 265 | raise Exception('Genotype in wrong encoding. Can only deal with additive, iupac and biallelic encoding. 
' 266 | 'Please check again.') 267 | 268 | def get_additive_encoding(self): 269 | """ 270 | Function to compute additive encoding of genotype matrix with 271 | 0: homozygous major allele 272 | 1: heterozygous 273 | 2: homozygous minor allele 274 | 275 | :return: gnotype in additive encoding 276 | """ 277 | alleles = [] 278 | index_arr = [] 279 | pairs = [['A', 'C'], ['A', 'G'], ['A', 'T'], ['C', 'G'], ['C', 'T'], ['G', 'T']] 280 | heterozygous_nuc = ['M', 'R', 'W', 'S', 'Y', 'K'] 281 | for i, col in enumerate(np.transpose(self.X)): 282 | unique, inv, counts = np.unique(col, return_counts=True, return_inverse=True) 283 | unique = unique.astype(str) 284 | boolean = (unique == 'A') | (unique == 'T') | (unique == 'C') | (unique == 'G') 285 | tmp = np.zeros(3) 286 | if len(unique) > 3: 287 | raise Exception('More than two alleles encountered at snp ' + str(i)) 288 | elif len(unique) == 3: 289 | hetero = unique[~boolean][0] 290 | homozygous = unique[boolean] 291 | for j, pair in enumerate(pairs): 292 | if all(h in pair for h in homozygous) and hetero != heterozygous_nuc[j]: 293 | raise Exception('More than two alleles encountered at snp ' + str(i)) 294 | tmp[~boolean] = 1.0 295 | tmp[np.argmin(counts[boolean])] = 2.0 296 | elif len(unique) == 2: 297 | if list(unique) in pairs: 298 | tmp[np.argmin(counts)] = 2.0 299 | else: 300 | tmp[(~boolean).nonzero()] = 1.0 301 | else: 302 | if unique[0] in heterozygous_nuc: 303 | tmp[0] = 1.0 304 | alleles.append(tmp) 305 | index_arr.append(inv) 306 | alleles = np.transpose(np.array(alleles)) 307 | index_arr = np.transpose(np.array(index_arr)) 308 | cols = np.arange(alleles.shape[1]) 309 | return alleles[index_arr, cols] 310 | 311 | def filter_monomorphic_snps(self, chromosomes: np.array = None, positions: np.array = None) -> (np.array, np.array): 312 | """ 313 | Remove monomorphic SNPs, i.e., SNPs that are constant 314 | 315 | :param chromosomes: vector with chromosome identifiers 316 | :param positions: vector with position 
identifiers 317 | 318 | :return filtered chromosomes and positions 319 | """ 320 | tmp = self.X == self.X[0, :] 321 | self.X = self.X[:, ~tmp.all(0)] 322 | if chromosomes is None: 323 | self.chromosomes = self.chromosomes[~tmp.all(0)] 324 | self.positions = self.positions[~tmp.all(0)] 325 | else: 326 | return chromosomes[~tmp.all(0)], positions[~tmp.all(0)] 327 | 328 | def get_minor_allele_freq(self): 329 | """ 330 | Function to calculate minor allele frequencies of each SNP 331 | 332 | :return: vector containing frequencies 333 | """ 334 | 335 | return (torch.sum(self.X, 0)) / (2 * self.X.shape[0]) 336 | 337 | def use_maf_filter(self, maf: torch.tensor = None, chromosomes: np.array = None, positions: np.array = None) \ 338 | -> (torch.tensor, np.array, np.array): 339 | """ 340 | filter genotype by minor allele frequency 341 | 342 | :param maf: vector containing minor allele frequencies 343 | :param chromosomes: vector with chromosome identifiers 344 | :param positions: vector with position identifiers 345 | 346 | :return: tensor with filtered maf frequencies, chromosomes and positions 347 | """ 348 | if maf is None: 349 | tmp = self.maf > (self.maf_threshold / 100) 350 | self.X = self.X[:, tmp] 351 | self.chromosomes = self.chromosomes[tmp] 352 | self.positions = self.positions[tmp] 353 | self.maf = self.maf[tmp] 354 | else: 355 | # for batch-wise loading 356 | tmp = maf > (self.maf_threshold / 100) 357 | self.X = self.X[:, tmp] 358 | return maf[tmp], chromosomes[tmp], positions[tmp] 359 | 360 | def save_genotype_hdf5(self, filename: pathlib.Path): 361 | """ 362 | Save genotype data to .h5 file 363 | 364 | :param filename: Full path to new genotype file 365 | """ 366 | if any(elem is None for elem in [self.X, self.sample_ids, self.chromosomes, self.positions]): 367 | raise Exception('Cannot save genotype file. 
Some values are None, please check again.') 368 | print('Save genotype data as ' + filename.as_posix() + '.\nThis might take some time.') 369 | with h5py.File(filename.with_suffix('.h5'), 'w') as f: 370 | f.create_dataset('sample_ids', data=self.sample_ids.astype(bytes), chunks=True, compression="gzip") 371 | f.create_dataset('chr_index', data=self.chromosomes.astype(bytes), chunks=True, compression="gzip") 372 | f.create_dataset('position_index', data=self.positions.astype(int), chunks=True, compression="gzip") 373 | f.create_dataset('snps', data=self.X, chunks=True, compression="gzip", compression_opts=7) 374 | print('Done saving H5 file.') 375 | 376 | def reset_genotype(self): 377 | """ 378 | Delete X for batchwise loading 379 | """ 380 | self.X = None 381 | 382 | @staticmethod 383 | def get_matched_data(data, row_index: np.array): 384 | """ 385 | Get rows of data specified in index array 386 | 387 | :param data: data to match, either np.array or torch.tensor 388 | :param row_index: row-index array for filtering / matching 389 | """ 390 | if data.ndim == 2: 391 | return data[row_index, :] 392 | if data.ndim == 1: 393 | return data[row_index] 394 | else: 395 | raise Exception('Cannot match data, dimensions are wrong. Expected dimension 1 or 2 but got ' 396 | + str(data.ndim) + ' instead. Please check again.') 397 | 398 | 399 | class Dataset(Genotype): 400 | """ 401 | Class for loading and preparation of genotype, phenotype, kinship and covariates. 
402 | 403 | **Attributes** 404 | 405 | - genotype_file (*pathlib.Path*): full path to genotype file for data loading 406 | - X (*torch.tensor*): matrix containing genotype values 407 | - sample_ids (*numpy.array*): ids of genotype samples 408 | - chromosomes (*numpy.array*): chromosome identifier of SNPs 409 | - positions (*numpy.array*): position identifier of SNPs 410 | - y (*torch.tensor*): tensor containing phenotypic values 411 | - K (*torch.tensor*): kinship matrix 412 | - fixed (*torch.tensor*): matrix containing fixed effects, i.e. vector of ones and covariates if available 413 | - maf (*torch.tensor*): vector containing minor allele frequencies 414 | - sample_index (*np.array*): vector containing sample indices for batch-wise loading of X 415 | - n_samples (*int*): number of samples 416 | - n_snps (*int*): number of SNPs 417 | - maf_threshold (*int*): threshold for minor allele frequency filtering 418 | 419 | **Functions** 420 | 421 | - load_and_prepare_data(): load load and match data, calls the following functions: 422 | - see class Genotype for all genotype specific functions 423 | - load_phenotype(phenotype_file, trait): load phenotype fom file 424 | - load_kinship(kinship_file): load kinship matrix from file 425 | - compute_rrk_kinship(): compute realized relationship kernel 426 | - normalize_kinship(): normalize kinship matrix using a Gower's centered matrix 427 | - load_covariates(covariates_file, column_list): load covariates from file 428 | - get_fixed_effects(): create fixed effects vector/matrix 429 | - match_data(data_ids1, data_ids2): match ids of two datasets 430 | - to_device(device): move tensors to device 431 | 432 | :param genotype_file: full path to genotype file 433 | :param phenotype_file: full path to phenotype file 434 | :param trait: name of phenotypic trait to use 435 | :param maf_threshold: minor allele frequency threshold to use for SNP filtering, default is 0 (no filtering) 436 | :param load_genotype: bool, if False load 
def __init__(self, genotype_file: pathlib.Path, phenotype_file: pathlib.Path, trait: str, maf_threshold: int = 0,
             load_genotype: bool = False, kinship_file: pathlib.Path = None, covariate_file: pathlib.Path = None,
             covariate_list: list = None, not_add: bool = False):
    # Genotype-specific state (X, sample_ids, positions, ...) is initialized by the base class.
    super().__init__(genotype_file=genotype_file, maf_threshold=maf_threshold, not_add=not_add)

    self.y = None      # phenotype tensor, set in load_and_prepare_data
    self.K = None      # kinship matrix, set in load_and_prepare_data
    self.fixed = None  # fixed-effects matrix, set in load_and_prepare_data
    self.load_and_prepare_data(phenotype_file=phenotype_file, trait=trait, load_genotype=load_genotype,
                               kinship_file=kinship_file, covariate_file=covariate_file,
                               covariate_list=covariate_list)

def load_and_prepare_data(self, phenotype_file: pathlib.Path, trait: str, load_genotype: bool = False,
                          kinship_file: pathlib.Path = None, covariate_file: pathlib.Path = None,
                          covariate_list: list = None):
    """
    Load and match genotype, phenotype, kinship and covariates.
    1. Load phenotype from file.
    2. Load genotype and match with pheno:
        If load_genotype is False, only load geno sample_ids from file and match data.
        Otherwise load geno sample_ids, match with pheno and load geno data only for needed samples.
    3. Filter genotype for monomorphic SNPs and minor allele frequency (only when genotype is in memory).
    4. Load kinship from file and match with geno, or compute kinship from geno data.
    5. If available load covariates from file.

    :param phenotype_file: full path to phenotype file
    :param trait: name of phenotypic trait to use
    :param load_genotype: bool, if False load genotype batch-wise during computations, default is False
    :param kinship_file: full path to kinship file, optional, if missing, compute rrk kinship
    :param covariate_file: full path to covariate file, optional
    :param covariate_list: list of covariates to use, optional
    """
    # load phenotype
    y, y_ids = self.load_phenotype(phenotype_file=phenotype_file, trait=trait)
    # load genotype
    if not load_genotype:
        # only load and match sample ids of genotype, values will be loaded batch-wise during computations
        self.load_genotype_ids(load_genotype=False)
        pheno_index, self.sample_index = self.match_data(data_ids1=y_ids, data_ids2=self.sample_ids)
        if len(pheno_index) == 0:
            raise Exception("Samples of genotype and phenotype do not match.")
    else:
        if self.genotype_file.suffix in ('.h5', '.hdf5', '.h5py'):
            # load genotype sample ids, match data and only load genotype values for needed samples
            self.load_genotype_ids()
            pheno_index, self.sample_index = self.match_data(data_ids1=y_ids, data_ids2=self.sample_ids)
            if len(pheno_index) == 0:
                raise Exception("Samples of genotype and phenotype do not match.")
            self.load_genotype_data()
        else:
            # non-HDF5 formats: must load full matrix first, then subset the matched rows
            self.load_genotype_data()
            pheno_index, self.sample_index = self.match_data(data_ids1=y_ids, data_ids2=self.sample_ids)
            if len(pheno_index) == 0:
                raise Exception("Samples of genotype and phenotype do not match.")
            self.X = self.get_matched_data(data=self.X, row_index=self.sample_index)
        # SNP filtering is only possible when the genotype matrix is in memory
        self.filter_monomorphic_snps()
        self.maf = self.get_minor_allele_freq()
        if self.maf_threshold != 0:
            self.use_maf_filter()
    self.n_snps = len(self.positions)
    self.y = self.get_matched_data(data=y, row_index=pheno_index)
    self.sample_ids = self.get_matched_data(data=self.sample_ids, row_index=self.sample_index)
    self.n_samples = len(self.y)
    # kinship
    if kinship_file is None:
        # compute kinship matrix
        self.K = self.compute_rrk_kinship()
    else:
        # load kinship from file and reorder rows AND columns to the matched samples
        self.K, K_ids = self.load_kinship(kinship_file=kinship_file)
        _, K_index = self.match_data(data_ids1=self.sample_ids, data_ids2=K_ids)
        if len(K_index) == len(self.sample_ids):
            self.K = self.K[K_index, :][:, K_index]
        else:
            raise Exception("Sample ids of genotype and kinship matrix do not match. Please check again")
    self.normalize_kinship()
    # fixed effects
    if covariate_file is not None:
        # load covariates from file
        cov = self.load_covariates(covariate_file=covariate_file, covariate_list=covariate_list)
        cov_ids = np.asarray(cov.index, dtype=y_ids.dtype).flatten()
        _, cov_index = self.match_data(data_ids1=self.sample_ids, data_ids2=cov_ids)
        if len(cov_index) == len(self.sample_ids):
            # BUGFIX: select matched ROWS before any reshaping. The previous
            # .flatten()[cov_index] collapsed an (n, c) covariate matrix to a 1-D
            # vector first, so with more than one covariate the row indices picked
            # arbitrary interleaved values. A (n, 1) result feeds the 2-D branch of
            # get_fixed_effects and yields the same fixed-effects matrix as before
            # for the single-covariate case.
            self.fixed = torch.tensor(cov.values, dtype=torch.float64)[cov_index]
        else:
            raise Exception('Sample ids of covariates and phenotype do not match.')
    self.get_fixed_effects()

def load_phenotype(self, phenotype_file: pathlib.Path, trait: str) -> (torch.Tensor, np.array):
    """
    Load phenotype from file. Accept .csv and single white space separated .txt and .pheno files.
    Phenotype data needs to contain sample identifiers as first column and phenotypic traits as remaining columns.
    The trait name should be the respective column name. Can contain more than one phenotype column.
    Will drop NAN values during preparation and compute mean over replicates.

    :param phenotype_file: full path to phenotype file
    :param trait: name of phenotypic trait / column to use

    :return: tensor containing phenotypic traits and array containing respective sample_ids
    """
    suffix = phenotype_file.suffix
    if suffix == ".csv":
        y = pd.read_csv(phenotype_file)
    elif suffix in (".txt", ".pheno"):
        # both formats are single-space separated; .pheno may additionally carry PLINK FID/IID columns
        y = pd.read_csv(phenotype_file, sep=" ")
        if suffix == ".pheno" and {'FID', 'IID'}.issubset(set(y.columns)):
            # keep IID as the sample identifier column, drop the redundant family id
            y.drop(columns='FID', inplace=True)
    else:
        raise NotImplementedError('Only accept CSV, PHENO and TXT phenotype files')
    # account for replicates: average all rows sharing the same sample id
    y = y.sort_values(y.columns[0]).groupby(y.columns[0]).mean()
    if trait not in y.columns:
        raise Exception('Phenotype ' + trait + ' is not in phenotype file ' + phenotype_file.as_posix())
    y = y[[trait]].dropna()
    return torch.tensor(y.values, dtype=torch.float64).flatten(), np.asarray(y.index, dtype=str).flatten()
def load_kinship(self, kinship_file: pathlib.Path) -> (torch.tensor, np.array):
    """
    Load kinship matrix from file. Only take .csv or .h5/.hdf5/.h5py files.
    For .csv files sample ids have to be in first column, .h5/.hdf5/.h5py files need to contain the kinship matrix
    with key 'kinship' and the corresponding sample ids with key 'sample_ids'.

    :param kinship_file: full path to kinship file

    :return: torch.tensor (float64) containing kinship matrix and array with sample ids
    """
    suffix = kinship_file.suffix
    if suffix == ".csv":
        kin = pd.read_csv(kinship_file, index_col=0)
        # FIX: cast explicitly to float64 like the HDF5 branch below; an integer-valued
        # CSV previously produced an int64 tensor that breaks downstream float math
        # (e.g. the Gower normalization).
        K = torch.tensor(kin.values, dtype=torch.float64)
        sample_ids = np.array(kin.index, dtype=str)
    elif suffix in (".h5", ".hdf5", ".h5py"):
        with h5py.File(kinship_file, "r") as f:
            K = torch.tensor(f['kinship'][:], dtype=torch.float64)
            sample_ids = f['sample_ids'][:].astype(str)
    else:
        raise NotImplementedError('Only accept .csv, .h5, .hdf5, .h5py kinship files')
    return K, sample_ids

def compute_rrk_kinship(self) -> torch.tensor:
    """
    Compute realized relationship kernel as kinship matrix: standardize each SNP column,
    then K = X_stand @ X_stand^T / n_snps, with negative entries clipped to zero.

    :return: kinship matrix

    :raises Exception: if the genotype matrix is not loaded in memory
    """
    if self.X is None:
        raise Exception('Cannot compute kinship matrix, no genotype matrix available.')
    # column-wise standardization (monomorphic SNPs were filtered earlier, so std != 0)
    X_stand = (self.X - self.X.mean(axis=0)) / self.X.std(axis=0)
    K = torch.matmul(X_stand, torch.t(X_stand)) / self.X.shape[1]
    # set negative values in K to zero
    return torch.where(K > 0, K, 0.)
def normalize_kinship(self):
    """
    Normalize kinship matrix using a Gower's centered matrix:
    K <- (n-1) / sum(P * K) * K with P = I - 11^T / n, so that the mean diagonal is ~1.
    """
    n = self.K.shape[0]
    P = (torch.eye(n, dtype=self.K.dtype, device=self.K.device) -
         torch.ones(n, n, dtype=self.K.dtype, device=self.K.device) / n)
    self.K = (n - 1) / torch.sum(torch.mul(P, self.K)) * self.K

def load_covariates(self, covariate_file: pathlib.Path, covariate_list: list = None) -> pd.DataFrame:
    """
    Only take .csv files: sample ids have to be in first column, if covariate_list is available, will load all
    columns specified, else will load all available columns.
    Averages replicate rows per sample id and drops rows containing NaN.

    :param covariate_file: full path to covariates file
    :param covariate_list: list containing column names/headers of covariates to load

    :return: pandas DataFrame containing covariates with sample ids as index
    """
    # FIX: return annotation corrected from torch.tensor to pd.DataFrame — the function
    # returns the DataFrame (as its docstring always said); the caller converts to a tensor.
    if covariate_file.suffix == ".csv":
        covs = pd.read_csv(covariate_file)
        covs = covs.sort_values(covs.columns[0]).groupby(covs.columns[0]).mean().dropna()
        if covariate_list is not None:
            if set(covariate_list).issubset(set(covs.columns)):
                covs = covs[covariate_list]
            else:
                raise Exception('Specified covariates are not available in covariate file. Please check again.')
    else:
        raise NotImplementedError('Only accept .csv covariates files')
    return covs

def get_fixed_effects(self):
    """
    Check for covariates and create fixed effects matrix with ones as first column and covariates as remaining
    columns if available --> dim: (n, c+1)
    """
    if self.fixed is None:
        # intercept only
        self.fixed = torch.ones((len(self.y), 1), dtype=torch.float64)
    elif self.fixed.ndim == 1:
        # single covariate given as 1-D vector
        self.fixed = torch.stack((torch.ones(len(self.y), dtype=torch.float64), self.fixed), dim=1)
    else:
        # covariate matrix (n, c)
        self.fixed = torch.cat((torch.ones((len(self.y), 1), dtype=torch.float64), self.fixed), dim=1)

def to_device(self, device: torch.device):
    """
    Move phenotype, kinship and fixed effects tensors to device.

    :param device: cpu or cuda
    """
    self.y = self.y.to(device)
    self.K = self.K.to(device)
    self.fixed = self.fixed.to(device)

@staticmethod
def match_data(data_ids1: np.array, data_ids2: np.array) -> (np.array, np.array):
    """
    Match two datasets by their sample ids via a broadcasted equality matrix.

    :param data_ids1: ids of first dataset
    :param data_ids2: ids of second dataset

    :return: two arrays with indices of matched data (indices into data_ids1 and data_ids2)
    """
    return (np.reshape(data_ids1, (data_ids1.shape[0], 1)) == data_ids2.astype(data_ids1.dtype)).nonzero()
import pathlib
import torch
import pandas as pd
from utils import helper_functions
import models


def check_all_arguments(args: dict) -> dict:
    """
    Check user specified arguments for plausibility and turn all file paths to pathlib.Path objects.
    Mutates and returns the same args dict: normalizes trait/covariate/permutation settings,
    resolves the compute device and validates the model name.

    :param args: dict with user specified arguments (command line / config values)

    :return: the validated and normalized args dict
    """
    # a config file, if given, overrides/extends the command line arguments
    if args["config_file"] is not None:
        args = helper_functions.parse_config_file(args=args)
        del args["config_file"]
    # check if specified files exist
    args["genotype_file"] = check_file(filepath=args["genotype_file"])
    args["phenotype_file"] = check_file(filepath=args["phenotype_file"])
    args["kinship_file"] = check_file(filepath=args["kinship_file"])
    args["covariate_file"] = check_file(filepath=args["covariate_file"])
    if args["trait"] is None:
        args["trait"] = 'phenotype_value'
    elif (args["trait"] == 'all') or (args["trait"] == ['all']):
        # expand 'all' to the list of every phenotype column in the file
        print('Will perform computations on all available phenotypes.')
        args["out_file"] = None
        suffix = args["phenotype_file"].suffix
        if suffix == ".csv":
            df = pd.read_csv(args["phenotype_file"], index_col=0)
        # load PHENO or TXT
        elif suffix == ".txt":
            df = pd.read_csv(args["phenotype_file"], index_col=0, sep=" ")
        elif suffix == ".pheno":
            df = pd.read_csv(args["phenotype_file"], index_col=0, sep=" ")
            # PLINK-style files may carry FID/IID columns that are not phenotypes
            if 'FID' in df.columns:
                df.drop(columns='FID', inplace=True)
            if 'IID' in df.columns:
                df.drop(columns='IID', inplace=True)
        else:
            raise Exception('Only accept .txt, .pheno or .csv phenotype files.')
        args["trait"] = df.columns.tolist()
    elif isinstance(args["trait"], str):
        args["trait"] = [args["trait"]]
    elif isinstance(args["trait"], list):
        args["out_file"] = None
    else:
        raise Exception('Something is wrong with the trait name. Please check again.')
    # sanity checks for fast loading and batch-wise loading:
    # batch-wise loading needs a precomputed kinship and an HDF5 genotype file
    if args["kinship_file"] is None:
        args["load_genotype"] = True
    if args["genotype_file"].suffix not in ('.h5', '.hdf5', '.h5py'):
        args["load_genotype"] = True
    # check gpu
    if torch.cuda.is_available() and not args["disable_gpu"]:
        dev = "cuda:" + str(args["device"])
        print('GPU is available. Perform computations on device ', dev)
    else:
        dev = "cpu"
        print('GPU is not available. Perform computations on device ', dev)
    del args["disable_gpu"]
    args["device"] = torch.device(dev)
    # check model
    if args["model"] is None:
        args["model"] = 'lmm'
    if args["model"] not in models.__all__:
        raise NotImplementedError('Specified model not implemented')

    # sanity checks
    if args["maf_threshold"] is None:
        args["maf_threshold"] = 0
    if isinstance(args["covariate_list"], str):
        args["covariate_list"] = [args["covariate_list"]]
    # check permutation method
    if args["perm"] is None:
        args["perm"] = 0
    if args["perm"] > 0:
        if args["perm_method"] not in ('x', 'y'):
            raise NotImplementedError(' Can only perform permutation methods x and y. Please check again.')
    if args["adj_p_value"] and args["perm"] == 0:
        raise Exception('Can not compute adjusted p-values with 0 permutations. Please check again.')
    return args


def check_output_files(args: dict) -> dict:
    """
    Derive default output directory/file and de-collide existing result file names.

    NOTE(review): args["trait"] is assumed to be a single str here; check_all_arguments
    may have turned it into a list (out_file is then set to None) — presumably the main
    loop re-sets trait per phenotype before calling this. TODO confirm against caller.

    :param args: dict with user specified arguments

    :return: args dict with out_dir/out_file set to checked pathlib objects
    """
    # check output directory and file
    if args["out_file"] is None:
        args["out_file"] = args["trait"] + '.csv'
    if args["out_dir"] is None:
        args["out_dir"] = pathlib.Path.cwd().joinpath('results')
    args["out_dir"], args["out_file"] = check_dir_paths(args["out_dir"], args["out_file"])
    return args


def check_file(filepath: str):
    """
    Check if specified file exists.

    :param filepath: full path to file, may be None

    :return: path to file as Path object, or None if no path was given

    :raises FileNotFoundError: if the path is given but does not point to a file
    """
    if filepath is None:
        return None
    else:
        filepath = pathlib.Path(filepath)
        if filepath.is_file():
            return filepath
        else:
            raise FileNotFoundError('There is no file ', filepath.as_posix())


def check_dir_paths(out_dir: str, out_file: str, prefix: str = 'p_values_') -> (pathlib.Path, pathlib.Path):
    """
    Check if directory for result files exists, if not, create directory.
    Then check if result files already exist, if they already exist, rename result file by adding (i) to the
    end of the file. For .h5 outputs an existing file is an error instead of being renamed.

    :param out_dir: directory to save result files
    :param out_file: result file
    :param prefix: prefix to use when checking for existing files, default is p_values_

    :return: tuple of (directory path, possibly renamed file name)
    """
    my_path = pathlib.Path(out_dir)
    # the prefix implies which artifact (and thus file suffix) is being written
    if prefix in ('manhattan_', 'qq_plot_'):
        suffix = '.png'
    elif prefix == '':
        suffix = '.h5'
    else:
        suffix = '.csv'
    out_file = pathlib.Path(out_file).with_suffix(suffix).as_posix()
    if my_path.is_dir():
        if my_path.joinpath(prefix + out_file).exists():
            if suffix == '.h5':
                # .h5 result files must not be overwritten or duplicated
                raise Exception('File %s already exists in chosen directory %s.'
                                % (out_file, out_dir))
            # append (i) with the smallest i that yields a fresh file name
            i = 1
            new_file = pathlib.Path(out_file).with_suffix('').as_posix() + '(' + str(i) + ')' + suffix
            new_path = my_path.joinpath(prefix + new_file)
            while new_path.exists():
                i += 1
                new_file = pathlib.Path(out_file).with_suffix('').as_posix() + '(' + str(i) + ')' + suffix
                new_path = my_path.joinpath(prefix + new_file)
            print('The file %s already exists in chosen directory %s. Changed filename to %s.'
                  % (prefix + out_file, out_dir, prefix + new_file))
        else:
            new_file = out_file
    else:
        new_file = out_file
        my_path.mkdir(parents=True, exist_ok=True)
    return my_path, new_file
import pathlib
import importlib
import inspect
import numpy as np


def parse_config_file(args: dict) -> dict:
    """
    Read yaml config file to update all user specified arguments.

    :param args: dict with user specified arguments

    :return: updated dict with arguments

    :raises FileNotFoundError: if the config file does not exist
    """
    # lazy import: PyYAML is only required when a config file is actually used
    import yaml

    config_path = pathlib.Path(args["config_file"])
    if not config_path.is_file():
        raise FileNotFoundError('Specified config file does not exist. Please check again.')
    if config_path.suffix not in ['.yaml', '.yml']:
        raise Exception('Only accept yaml config files. Please check again.')
    # FIX: use a context manager so the config file handle is closed again
    # (previously yaml.safe_load(open(...)) leaked the handle)
    with open(config_path) as config_handle:
        config = yaml.safe_load(config_handle)
    args.update(config)
    return args


def get_model_class_name(model_name: str = 'lmm'):
    """
    Get class of model for user input.

    :param model_name: user input of model name

    :return: model class

    :raises NotImplementedError: if no matching model class exists
    """
    # lazy import of the project's models package (needed for __all__ lookup)
    import models

    if model_name in models.__all__:
        model_name = 'models.' + model_name
        for name, cls in inspect.getmembers(importlib.import_module(model_name), inspect.isclass):
            # only classes defined in the module itself, not imported ones
            if cls.__module__ == model_name:
                return cls
        else:
            # for/else: loop finished without returning -> no class defined in the module
            raise NotImplementedError('No class named ', model_name)
    else:
        raise NotImplementedError('No class named ', model_name)


def estimate_heritability(v_g: float, v_e: float) -> float:
    """
    Compute narrow sense heritability h2 = v_g / (v_g + v_e).

    :param v_g: genetic variance component
    :param v_e: residual variance component

    :return: narrow sense heritability
    """
    return v_g / (v_g + v_e)


def compute_perm_threshold(min_p_val: np.array, sig_level: int) -> float:
    """
    Compute permutation-based threshold as the sig_level-th percentile of the minimal p-values.

    :param min_p_val: array with minimal p-values
    :param sig_level: significance level as percentage value

    :return: threshold
    """
    return np.percentile(min_p_val, sig_level)


def compute_bonf_threshold(number_snps: int, sig_level: int) -> float:
    """
    Compute Bonferroni threshold (sig_level / 100) / number_snps.

    :param number_snps: number of SNPs
    :param sig_level: significance level as percentage value

    :return: threshold
    """
    return (sig_level / 100) / number_snps


def print_summary_stats(genotype_file: pathlib.Path, phenotype_file: pathlib.Path, trait: str, samples: int, snps: int,
                        model: str, maf_threshold: int, perm: int, v_g: float, v_e: float, h2: float, bonf1: float,
                        bonf5: float, perm1: float, perm5: float, time: float, kinship_file: pathlib.Path = None,
                        covariate_file: pathlib.Path = None, covariate_list: list = None, perm_method: str = None):
    """
    Print summary statistics.

    :param genotype_file: full path to genotype file
    :param phenotype_file: full path to phenotype file
    :param trait: name of phenotypic trait
    :param samples: number of samples used
    :param snps: number of SNPs used
    :param model: model used for GWAS
    :param maf_threshold: threshold used for maf filtering
    :param perm: number of permutations
    :param v_g: genetic variance component
    :param v_e: residual variance component
    :param h2: narrow-sense heritability
    :param bonf1: Bonferroni threshold significance level 1%
    :param bonf5: Bonferroni threshold significance level 5%
    :param perm1: permutation-based threshold significance level 1%
    :param perm5: permutation-based threshold significance level 5%
    :param time: total runtime in seconds
    :param kinship_file: full path to kinship file, optional
    :param covariate_file: full path to covariate file, optional
    :param covariate_list: list containing covariates
    :param perm_method: method used for permutations
    """
    print('\n')
    print('+++++++++ Summary Statistics +++++++++')
    print('## Genotype file: ' + genotype_file.as_posix())
    print('## Phenotype file: ' + phenotype_file.as_posix())
    print('## Phenotype: ' + trait)
    if covariate_file is not None:
        print('## Covariate file: ' + covariate_file.as_posix())
        if covariate_list is not None:
            print('## Used covariates: ' + ",".join(covariate_list))
        else:
            print('## Used all available covariates')
    if kinship_file is not None:
        print('## Kinship file: ' + kinship_file.as_posix())
    print('## Number of individuals: ' + str(samples))
    print('## Number of SNPs: ' + str(snps))
    print('## Model: ' + model)
    print('## MAF threshold: ' + str(maf_threshold))
    print('## Number of permutations: ' + str(perm))
    if perm_method is not None:
        print('## permutation method: ' + perm_method)
    if model == 'lmm':
        # variance components / heritability are only estimated by the LMM
        print('## v_g estimate in null model: ' + str(v_g))
        print('## v_e estimate in null model: ' + str(v_e))
        print('## Narrow-sense heritability estimate: ' + str(h2))
    print('## Bonferroni threshold (1% significance level): ' + str(bonf1))
    print('## Bonferroni threshold (5% significance level): ' + str(bonf5))
    if perm1 is not None:
        print('## Permutation-based threshold (1% significance level): ' + str(perm1))
        print('## Permutation-based threshold (5% significance level): ' + str(perm5))
    print('## Total time: %.2f s' % time)
    print('+++++++++++++++++++++++++++')
    print('\n')
def write_summary_stats(out_dir: pathlib.Path, out_file: str, genotype_file: pathlib.Path, phenotype_file: pathlib.Path,
                        trait: str, samples: int, snps: int, model: str, maf_threshold: int, perm: int, v_g: float,
                        v_e: float, h2: float, bonf1: float, bonf5: float, perm1: float, perm5: float, time: float,
                        kinship_file: pathlib.Path = None, covariate_file: pathlib.Path = None,
                        covariate_list: list = None, perm_method: str = None):
    """
    Save summary statistics to a txt file named 'summary_statistics_<out_file>.txt' in out_dir.

    :param out_dir: directory to write the summary file to
    :param out_file: base name used to derive the summary file name
    :param genotype_file: full path to genotype file
    :param phenotype_file: full path to phenotype file
    :param trait: name of phenotypic trait
    :param samples: number of samples used
    :param snps: number of SNPs used
    :param model: model used for GWAS
    :param maf_threshold: threshold used for maf filtering
    :param perm: number of permutations
    :param v_g: genetic variance component
    :param v_e: residual variance component
    :param h2: narrow-sense heritability
    :param bonf1: Bonferroni threshold significance level 1%
    :param bonf5: Bonferroni threshold significance level 5%
    :param perm1: permutation-based threshold significance level 1%
    :param perm5: permutation-based threshold significance level 5%
    :param time: total runtime in seconds
    :param kinship_file: full path to kinship file, optional
    :param covariate_file: full path to covariate file, optional
    :param covariate_list: list containing covariates
    :param perm_method: method used for permutations
    """
    # collect all report lines first, then write them in a single pass
    rows = ['Summary Statistics:']
    rows.append('## Genotype file:\t' + genotype_file.as_posix())
    rows.append('## Phenotype file:\t' + phenotype_file.as_posix())
    rows.append('## Phenotype:\t' + trait)
    if covariate_file is not None:
        rows.append('## Covariate file:\t' + covariate_file.as_posix())
        if covariate_list is not None:
            rows.append('## Used covariates:\t' + ",".join(covariate_list))
        else:
            rows.append('## Used all available covariates')
    if kinship_file is not None:
        rows.append('## Kinship file:\t' + kinship_file.as_posix())
    rows.append('## Number of individuals:\t' + str(samples))
    rows.append('## Number of SNPs:\t' + str(snps))
    rows.append('## Model:\t' + model)
    rows.append('## MAF threshold:\t' + str(maf_threshold))
    rows.append('## Number of permutations:\t' + str(perm))
    if perm_method is not None:
        rows.append('## permutation method:\t' + perm_method)
    if model == 'lmm':
        # variance components / heritability are only estimated by the LMM
        rows.append('## v_g estimate in null model:\t' + str(v_g))
        rows.append('## v_e estimate in null model:\t' + str(v_e))
        rows.append('## Narrow-sense heritability estimate:\t' + str(h2))
    rows.append('## Bonferroni threshold (1% significance level):\t' + str(bonf1))
    rows.append('## Bonferroni threshold (5% significance level):\t' + str(bonf5))
    if perm1 is not None:
        rows.append('## Permutation-based threshold (1% significance level):\t' + str(perm1))
        rows.append('## Permutation-based threshold (5% significance level):\t' + str(perm5))
    rows.append('## Total time:\t' + str(time) + ' s')
    target = out_dir.joinpath('summary_statistics_' + pathlib.Path(out_file).with_suffix('.txt').as_posix())
    with open(target, 'w') as handle:
        handle.write('\n'.join(rows) + '\n')


def get_summary_stats(out_dir: pathlib.Path, out_file: str, genotype_file: pathlib.Path, phenotype_file: pathlib.Path,
                      trait: str, samples: int, snps: int, model: str, maf_threshold: int, perm: int, v_g: float,
                      v_e: float, min_p_val: np.array, time: float, kinship_file: pathlib.Path = None,
                      covariate_file: pathlib.Path = None, covariate_list: list = None, perm_method: str = None):
    """
    Compute summary statistics (heritability, Bonferroni and permutation thresholds),
    then save them to file and print them.

    :param out_dir: directory to write the summary file to
    :param out_file: base name used to derive the summary file name
    :param genotype_file: full path to genotype file
    :param phenotype_file: full path to phenotype file
    :param trait: name of phenotypic trait
    :param samples: number of samples used
    :param snps: number of SNPs used
    :param model: model used for GWAS
    :param maf_threshold: threshold used for maf filtering
    :param perm: number of permutations
    :param v_g: genetic variance component
    :param v_e: residual variance component
    :param min_p_val: minimal p-values from permutations, or None
    :param time: total runtime in seconds
    :param kinship_file: full path to kinship file, optional
    :param covariate_file: full path to covariate file, optional
    :param covariate_list: list containing covariates
    :param perm_method: method used for permutations
    """
    # heritability is only defined for the LMM's variance components
    heritability = estimate_heritability(v_g=v_g, v_e=v_e) if model == 'lmm' else None
    bonf1 = compute_bonf_threshold(number_snps=snps, sig_level=1)
    bonf5 = compute_bonf_threshold(number_snps=snps, sig_level=5)
    if min_p_val is None:
        perm1 = None
        perm5 = None
    else:
        perm1 = compute_perm_threshold(min_p_val=min_p_val, sig_level=1)
        perm5 = compute_perm_threshold(min_p_val=min_p_val, sig_level=5)
    # identical keyword payload for both the file writer and the console printer
    shared = dict(genotype_file=genotype_file, phenotype_file=phenotype_file, trait=trait, samples=samples,
                  snps=snps, model=model, maf_threshold=maf_threshold, perm=perm, v_g=v_g, v_e=v_e,
                  h2=heritability, bonf1=bonf1, bonf5=bonf5, perm1=perm1, perm5=perm5, time=time,
                  kinship_file=kinship_file, covariate_file=covariate_file, covariate_list=covariate_list,
                  perm_method=perm_method)
    write_summary_stats(out_dir=out_dir, out_file=out_file, **shared)
    print_summary_stats(**shared)