├── .gitignore
├── .idea
│   ├── .gitignore
│   ├── inspectionProfiles
│   │   ├── Project_Default.xml
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── permGWAS.iml
│   └── vcs.xml
├── Docker
│   ├── Dockerfile
│   └── requirements.txt
├── LICENSE
├── README.md
├── create_h5_file.py
├── create_plot.py
├── data
│   ├── config.yaml
│   ├── cov_matrix.csv
│   ├── k_matrix.csv
│   ├── k_matrix.h5
│   ├── x_matrix.csv
│   ├── x_matrix.h5
│   ├── x_matrix.map
│   ├── x_matrix.ped
│   ├── y_matrix.csv
│   └── y_matrix.pheno
├── docs
│   ├── DATAGUIDE.md
│   ├── INSTALLATION.md
│   ├── OPTIONS.md
│   ├── PERMUTATIONS.md
│   ├── PLOTS.md
│   ├── QUICKSTART.md
│   ├── manhattan.png
│   └── qq_plot.png
├── models
│   ├── __init__.py
│   ├── _base_model.py
│   └── lmm.py
├── optimize
│   ├── __init__.py
│   └── brent.py
├── perform_gwas.py
├── permGWAS.py
├── permGWAS_logo.png
├── postprocess
│   ├── __init__.py
│   └── plot_functions.py
├── preprocess
│   ├── __init__.py
│   └── data_loader.py
├── supplementary_data
│   ├── simulated_phenotypes_her30.h5
│   └── suppl_data_John_et_al_2022
│       └── AraGWAS_thresholds.csv
└── utils
    ├── __init__.py
    ├── check_functions.py
    └── helper_functions.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Datasource local storage ignored files
5 | /dataSources/
6 | /dataSources.local.xml
7 | # Editor-based HTTP Client requests
8 | /httpRequests/
9 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/permGWAS.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/Docker/Dockerfile:
--------------------------------------------------------------------------------
# GPU-enabled base image: CUDA 11.5 runtime on Ubuntu 20.04, matching the
# cu113 PyTorch wheel installed below (CUDA minor versions are compatible).
FROM nvidia/cuda:11.5.2-base-ubuntu20.04
# Python 3 interpreter and pip for installing the project requirements.
RUN apt-get update && apt-get install -y python3 && apt-get install -y python3-pip
# Convenience tooling for working inside the container.
RUN apt-get install -y vim
RUN apt-get install -y git
# Copy the pinned requirements into the image and install them.
RUN mkdir /configfiles
COPY requirements.txt /configfiles
RUN pip3 install -r /configfiles/requirements.txt
# Torch is installed separately so the CUDA-specific (+cu113) build is used
# instead of the default CPU wheel from PyPI.
RUN pip3 install torch==1.11.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
--------------------------------------------------------------------------------
/Docker/requirements.txt:
--------------------------------------------------------------------------------
1 | h5py
2 | matplotlib
3 | numpy
4 | pandas
5 | pandas-plink
6 | scipy
7 | seaborn
8 | pyyaml
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Grimm Lab - Bioinformatics and Machine Learning
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [](https://www.python.org/downloads/release/python-3100/)
2 | [](https://opensource.org/licenses/MIT)
3 |
4 |
5 |
6 | ## permGWAS2
7 |
8 | This is an improved version of permGWAS. The original version can be found at [permGWAS Version1](https://github.com/grimmlab/permGWAS/releases/tag/permGWAS)
9 |
10 | permGWAS2 is an open source software tool written in python to efficiently perform genome-wide association studies (GWAS)
11 | with permutation-based thresholds. It uses a batch-wise Linear Mixed Model to compute several univariate tests simultaneously.
12 | permGWAS2 provides support for multiple CPUs as well as for GPUs.
13 |
14 | In contrast to the original version, permGWAS2 allows for two different permutation strategies:
15 |
16 | x (default): permute the fixed effects matrix including covariates and the SNP of interest (equivalent to permuting y and the covariance matrix)
17 |
18 | y: permute only the phenotype vector (same method as in the original permGWAS)
19 |
20 | Details on the architecture of permGWAS and permGWAS2, benchmarking results of the framework and on permutation-based thresholds can be found in our publications.
21 |
22 | ## How to run permGWAS2
23 | 1. [Requirements & Installation](./docs/INSTALLATION.md)
24 | 2. [Quickstart Guide](./docs/QUICKSTART.md)
25 | 3. [Data Guide](./docs/DATAGUIDE.md)
26 | 4. [permGWAS2 with permutations](./docs/PERMUTATIONS.md)
27 | 5. [Create plots](./docs/PLOTS.md)
28 | 6. [Optional settings](./docs/OPTIONS.md)
29 |
30 |
31 | ## Publications & Citation
32 |
33 | John, M., Korte, A., Todesco M., & Grimm, D. G. (2024).
34 | **Population-aware permutation-based significance thresholds for genome-wide association studies**.
35 | Bioinformatics Advances, 2024
36 |
37 | DOI: [https://doi.org/10.1093/bioadv/vbae168](https://doi.org/10.1093/bioadv/vbae168)
38 |
39 | John, M., Ankenbrand, M. J., Artmann, C., Freudenthal, J. A., Korte, A., & Grimm, D. G. (2022).
40 | **Efficient Permutation-based Genome-wide Association Studies for Normal and Skewed Phenotypic Distributions**.
41 | Bioinformatics, 2022.
42 |
43 | DOI: [https://doi.org/10.1093/bioinformatics/btac455](https://doi.org/10.1093/bioinformatics/btac455)
44 |
--------------------------------------------------------------------------------
/create_h5_file.py:
--------------------------------------------------------------------------------
"""Convert a CSV/PLINK/binary PLINK genotype file into an H5 file.

Command-line tool: reads the genotype file given via -x/--genotype_file,
loads it with preprocess.data_loader.Genotype and writes it back out as
HDF5, either next to the original file or into -sd/--save_dir.
"""
import argparse
import pathlib
from preprocess import data_loader
from utils import check_functions

if __name__ == "__main__":
    # Input parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('-x', '--genotype_file', type=str,
                        help='specify the name of the genotype file, absolute and relative paths are accepted, '
                             'only accept CSV, PLINK and binary PLINK files, '
                             'PLINK and binary PLINK: all required files must be in the same folder with same prefix,'
                             'for format CSV files check documentation')
    parser.add_argument('-sd', '--save_dir', type=str, default=None,
                        help='specify a directory to save newly generated H5 file. Optional, if None is specified, '
                             'H5 file will be saved in same directory as original genotype file.')

    args = vars(parser.parse_args())
    args["genotype_file"] = check_functions.check_file(args["genotype_file"])
    # Build the Path once instead of re-parsing the filename three times.
    genotype_path = pathlib.Path(args["genotype_file"])
    # Nothing to do if the input is already stored in an HDF5 container.
    if genotype_path.suffix in ('.h5', '.hdf5', '.h5py'):
        raise Exception('Genotype file is already in HDF5, H5, H5PY')
    if args["save_dir"] is None:
        # Default: save the new H5 file next to the original genotype file.
        args["save_dir"] = genotype_path.parent
    # .stem already drops the original suffix, so the former
    # .with_suffix('.h5').stem round-trip was redundant.
    out_file = genotype_path.stem
    args["save_dir"], out_file = check_functions.check_dir_paths(out_dir=args["save_dir"], out_file=out_file, prefix='')

    # load data from file
    print('Load data from file ' + str(args["genotype_file"]))
    dataset = data_loader.Genotype(genotype_file=args["genotype_file"])
    dataset.load_genotype_data()

    # save data as H5
    dataset.save_genotype_hdf5(filename=args["save_dir"].joinpath(out_file))
--------------------------------------------------------------------------------
/create_plot.py:
--------------------------------------------------------------------------------
# create Manhattan and QQ-plots
import pandas as pd
import pathlib
import argparse

from utils import check_functions
from postprocess import plot_functions

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-p_val', '--p_value_file', type=str, default=None,
                        help='Specify the full path to the p_value file, absolute and relative paths are accepted, '
                             'only accept .csv files. p_value files must at least contain chromosome ids (CHR), '
                             'position ids (POS) and corresponding p_values (p_value).')
    parser.add_argument('-min_p_val', '--min_p_value_file', type=str, default=None,
                        help='Optional, specify the full path to the file containing minimal p-values in order to '
                             'compute permutation-based thresholds, absolute and relative paths are accepted, '
                             'only accept .csv files.')
    parser.add_argument('-mplot', '--manhattan', action='store_true',
                        help='optional, creates manhattan plot')
    parser.add_argument('-qqplot', action='store_true',
                        help='optional, creates QQ-plot')
    parser.add_argument('-out_dir', type=str, default=None,
                        help='Specify the name of the directory plots should be stored in,'
                             'absolute and relative paths are accepted. Optional, if not provided, files will be '
                             'stored in same folder as p_value file.')
    parser.add_argument('-out_file', type=str, default=None,
                        help='Specify NAME of plots, will be stored as manhattan_NAME.png or qq_plot_NAME.png,'
                             'optional, if not provided name of p_value file will be used.')
    parser.add_argument('-sig_level', type=int, default=5,
                        help='Significance level (percentage values) to compute threshold for Manhattan plot. '
                             'Optional, default is 5.')
    args = vars(parser.parse_args())

    args["p_value_file"] = check_functions.check_file(args["p_value_file"])
    if args["min_p_value_file"] is not None:
        args["min_p_value_file"] = check_functions.check_file(args["min_p_value_file"])
    # Default output directory/name are derived from the p_value file.
    if args["out_dir"] is None:
        args["out_dir"] = pathlib.Path(args["p_value_file"]).parent
    if args["out_file"] is None:
        args["out_file"] = pathlib.Path(args["p_value_file"]).stem

    df = pd.read_csv(args["p_value_file"])
    # p_value is required by both plot types; CHR/POS are only needed for the
    # Manhattan plot, so they are checked on that path below instead of
    # rejecting QQ-only runs up front.
    if 'p_value' not in df.columns:
        raise Exception('Cannot create plots; need p_value in DataFrame.')

    if args["manhattan"]:
        if not {'CHR', 'POS', 'p_value'}.issubset(df.columns):
            raise Exception('Cannot create Manhattan plot; need CHR, POS and p_value in DataFrame.')
        out_dir, out_file = check_functions.check_dir_paths(out_dir=args["out_dir"], out_file=args["out_file"],
                                                            prefix='manhattan_')
        print('Save Manhattan plot with significance level of %d.' % args["sig_level"])
        if args["min_p_value_file"] is not None:
            df_min = pd.read_csv(args["min_p_value_file"])
            if 'min_p_val' not in df_min.columns:
                raise Exception('Cannot compute permutation-based threshold, need min_p_val in DataFrame.')
            min_p_val = df_min['min_p_val'].values
        else:
            # No permutation results supplied; plot without that threshold.
            min_p_val = None
        plot_functions.manhattan_plot(df=df, data_dir=out_dir, filename=out_file,
                                      min_p_values=min_p_val, sig_level=args["sig_level"])

    if args["qqplot"]:
        out_dir, out_file = check_functions.check_dir_paths(out_dir=args["out_dir"], out_file=args["out_file"],
                                                            prefix='qq_plot_')
        print('Save QQ-plot.')
        plot_functions.qq_plot(p_values=df['p_value'].values, data_dir=out_dir, filename=out_file)
--------------------------------------------------------------------------------
/data/config.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | genotype_file: "./data/x_matrix.h5"
3 | phenotype_file: "./data/y_matrix.csv"
4 | trait: "phenotype_value"
5 | kinship_file:
6 | covariate_file:
7 | covariate_list:
8 | perm_method: "x"
9 | maf_threshold: 0
10 | perm: 100
--------------------------------------------------------------------------------
/data/cov_matrix.csv:
--------------------------------------------------------------------------------
1 | accession_id,covariate
2 | 9381,1
3 | 9380,1
4 | 9378,1
5 | 9371,0
6 | 9367,1
7 | 9363,0
8 | 9356,0
9 | 9355,0
10 | 9354,0
11 | 9353,0
12 | 9352,1
13 | 9351,0
14 | 9344,1
15 | 9343,0
16 | 9339,1
17 | 9336,1
18 | 9332,0
19 | 9323,0
20 | 9321,1
21 | 9482,1
22 | 9481,0
23 | 9472,0
24 | 9471,1
25 | 9470,1
26 | 9469,0
27 | 9455,0
28 | 9454,1
29 | 9453,0
30 | 9451,1
31 | 9419,1
32 | 9418,0
33 | 9409,1
34 | 9402,0
35 | 9369,1
36 | 9349,1
37 | 9476,1
38 | 9433,1
39 | 9446,1
40 | 9443,1
41 | 9442,1
42 | 997,1
43 | 996,0
44 | 1068,1
45 | 1026,0
46 | 1585,1
47 | 1435,1
48 | 1169,1
49 | 1075,1
50 | 1132,1
51 | 1064,0
52 | 1063,0
53 | 1062,1
54 | 1247,1
55 | 991,1
56 | 1391,0
57 | 1374,0
58 | 1318,0
59 | 1254,1
60 | 1163,1
61 | 1153,1
62 | 1073,1
63 | 1072,1
64 | 394,0
65 | 7,0
66 | 203,1
67 | 236,0
68 | 367,0
69 | 123,1
70 | 395,0
71 | 196,1
72 | 264,0
73 | 185,1
74 | 297,0
75 | 318,0
76 | 323,0
77 | 79,0
78 | 198,0
79 | 371,0
80 | 280,0
81 | 12,1
82 | 347,1
83 | 268,1
84 | 288,1
85 | 377,1
86 | 252,0
87 | 296,1
88 | 341,1
89 | 156,0
90 | 397,1
91 | 263,0
92 | 48,1
93 | 45,1
94 | 210,0
95 | 83,0
96 | 372,1
97 | 393,0
98 | 205,1
99 | 87,0
100 | 62,1
101 | 309,1
102 | 222,1
103 | 160,1
104 | 229,1
105 | 369,0
106 | 227,1
107 | 230,0
108 | 217,0
109 | 194,1
110 | 391,1
111 | 340,1
112 | 167,0
113 | 266,1
114 | 208,1
115 | 335,1
116 | 213,1
117 | 388,1
118 | 331,0
119 | 216,1
120 | 277,1
121 | 85,0
122 | 310,1
123 | 389,1
124 | 387,0
125 | 191,0
126 | 224,0
127 | 82,1
128 | 225,1
129 | 295,1
130 | 169,1
131 | 375,0
132 | 292,1
133 | 215,1
134 | 337,1
135 | 320,1
136 | 171,0
137 | 346,1
138 | 151,1
139 | 137,1
140 | 291,0
141 | 385,0
142 | 84,1
143 | 349,0
144 | 219,0
145 | 322,1
146 | 204,0
147 | 273,1
148 | 212,1
149 | 146,0
150 | 348,0
151 | 157,1
152 | 214,0
153 | 316,0
154 | 186,0
155 | 314,1
156 | 293,1
157 | 183,1
158 | 287,0
159 | 290,0
160 | 168,0
161 | 343,1
162 | 153,1
163 | 339,1
164 | 60,0
165 | 174,0
166 | 88,1
167 | 359,1
168 | 298,1
169 | 162,1
170 | 311,1
171 | 329,0
172 | 175,0
173 | 163,0
174 | 77,1
175 | 302,1
176 | 231,0
177 | 148,0
178 | 106,1
179 | 283,1
180 | 184,1
181 | 122,1
182 | 170,1
183 | 396,1
184 | 275,0
185 | 244,1
186 | 116,1
187 | 364,0
188 | 121,0
189 | 165,0
190 | 32,1
191 | 201,1
192 | 326,1
193 | 368,1
194 | 332,1
195 | 361,0
196 | 202,0
197 | 200,1
198 | 257,0
199 | 80,0
200 | 9,0
201 | 187,1
202 | 89,0
203 | 207,0
204 | 69,1
205 | 188,1
206 | 306,0
207 | 360,0
208 | 237,1
209 | 327,1
210 | 261,1
211 | 86,1
212 | 228,0
213 | 190,0
214 | 74,1
215 | 8,0
216 | 4,1
217 | 159,1
218 | 262,0
219 | 51,0
220 | 5,1
221 | 363,1
222 | 338,1
223 | 355,1
224 | 269,1
225 | 278,0
226 | 179,0
227 | 6,0
228 | 206,1
229 | 461,1
230 | 466,1
231 | 9490,1
232 | 9496,0
233 | 9504,1
234 | 9499,1
235 | 9308,0
236 | 9305,1
237 | 9302,1
238 | 9309,1
239 | 4980,0
240 | 5444,1
241 | 5394,1
242 | 5461,1
243 | 5494,1
244 | 5398,0
245 | 5466,1
246 | 5450,0
247 | 4675,0
248 | 4632,0
249 | 5769,1
250 | 4757,1
251 | 4827,0
252 | 4820,0
253 | 5159,1
254 | 5759,0
255 | 5739,1
256 | 5738,1
257 | 5770,1
258 | 5826,1
259 | 5745,1
260 | 5744,1
261 | 5774,1
262 | 5760,0
263 | 5746,1
264 | 5762,0
265 | 5711,1
266 | 5802,1
267 | 5740,0
268 | 5716,1
269 | 5772,1
270 | 5722,1
271 | 5751,1
272 | 5721,0
273 | 5812,0
274 | 5792,0
275 | 5735,1
276 | 5767,0
277 | 5817,1
278 | 5807,1
279 | 5777,1
280 | 5736,0
281 | 5763,1
282 | 5813,0
283 | 5741,1
284 | 5731,1
285 | 5819,1
286 | 5724,1
287 | 5789,0
288 | 5141,1
289 | 5175,1
290 | 5145,0
291 | 5469,0
292 | 5106,1
293 | 5299,0
294 | 5335,1
295 | 7121,1
296 | 7106,0
297 | 7104,0
298 | 7113,1
299 | 7116,0
300 | 7149,1
301 | 7228,1
302 | 7301,1
303 | 7109,1
304 | 6987,1
305 | 7028,1
306 | 7029,1
307 | 7030,1
308 | 7013,0
309 | 7017,0
310 | 7032,1
311 | 7073,0
312 | 242,0
313 | 104,1
314 | 282,1
315 | 96,1
316 | 23,1
317 | 6102,0
318 | 6938,0
319 | 8304,1
320 | 8238,1
321 | 8386,0
322 | 8348,1
323 | 1416,1
324 | 6237,0
325 | 6226,1
326 | 6184,0
327 | 6174,0
328 | 6172,1
329 | 6171,1
330 | 6170,1
331 | 6151,0
332 | 6150,0
333 | 6149,1
334 | 6148,1
335 | 6147,0
336 | 6146,0
337 | 6145,0
338 | 6144,0
339 | 6142,1
340 | 6141,1
341 | 6137,0
342 | 6136,0
343 | 6131,1
344 | 6134,0
345 | 6133,1
346 | 6132,1
347 | 6129,0
348 | 6128,1
349 | 6127,1
350 | 6126,0
351 | 6125,0
352 | 6123,1
353 | 6122,1
354 | 6121,1
355 | 6119,1
356 | 6116,1
357 | 6115,0
358 | 6114,0
359 | 6111,1
360 | 6110,1
361 | 6108,0
362 | 6107,1
363 | 6106,1
364 | 6104,1
365 | 6103,0
366 | 6101,1
367 | 6100,1
368 | 6099,1
369 | 6098,0
370 | 6097,1
371 | 6095,1
372 | 6093,1
373 | 6092,1
374 | 6091,1
375 | 6090,1
376 | 6177,1
377 | 6221,1
378 | 6244,1
379 | 6241,1
380 | 6240,1
381 | 6238,1
382 | 6236,1
383 | 6235,1
384 | 6220,1
385 | 6218,1
386 | 6217,1
387 | 6216,0
388 | 6215,1
389 | 6214,1
390 | 6210,1
391 | 6209,0
392 | 6207,1
393 | 6201,0
394 | 6200,0
395 | 6199,1
396 | 6198,1
397 | 6197,1
398 | 6195,1
399 | 6194,1
400 | 6193,1
401 | 6192,1
402 | 6191,1
403 | 6189,1
404 | 6163,1
405 | 6154,1
406 | 8274,1
407 | 7192,1
408 | 7194,1
409 | 7210,0
410 | 7238,1
411 | 7245,0
412 | 7246,0
413 | 7256,0
414 | 7265,0
415 | 7268,1
416 | 366,1
417 | 5245,0
418 | 5264,0
419 | 7195,0
420 | 7262,0
421 | 7250,1
422 | 1925,1
423 | 5719,1
424 | 5798,1
425 | 5816,1
426 | 5821,0
427 | 5710,1
428 | 5715,1
429 | 5720,0
430 | 5733,1
431 | 5820,1
432 | 5737,1
433 | 5755,0
434 | 5781,1
435 | 5782,1
436 | 5784,1
437 | 5146,0
438 | 5133,0
439 | 5709,1
440 | 5712,0
441 | 5795,0
442 | 5750,0
443 | 5708,0
444 | 5749,1
445 | 5727,0
446 | 5780,0
447 | 5756,1
448 | 5723,1
449 | 5730,0
450 | 5717,0
451 | 5732,1
452 | 5804,1
453 | 5752,1
454 | 5799,1
455 | 5713,1
456 | 5728,1
457 | 5787,1
458 | 5788,1
459 | 5793,0
460 | 5803,0
461 | 5758,1
462 | 7317,0
463 | 7034,1
464 | 615,0
465 | 627,0
466 | 607,0
467 | 631,0
468 | 623,0
469 | 719,1
470 | 640,0
471 | 827,0
472 | 895,1
473 | 946,1
474 | 936,0
475 | 7717,0
476 | 7787,1
477 | 7837,1
478 | 7847,1
479 | 7867,1
480 | 8077,1
481 | 8122,1
482 | 1743,0
483 | 1799,1
484 | 2175,1
485 | 2160,0
486 | 2171,0
487 | 2148,1
488 | 2180,0
489 | 2157,1
490 | 1948,0
491 | 1941,0
492 | 1949,0
493 | 1965,1
494 | 1981,1
495 | 1992,1
496 | 2016,1
497 | 2011,1
498 | 2019,1
499 | 2020,1
500 | 2151,1
501 | 1862,1
502 | 1872,0
503 | 1864,1
504 | 1871,1
505 | 1857,1
506 | 1865,0
507 | 1873,0
508 | 1850,0
509 | 1858,1
510 | 1874,1
511 | 1868,1
512 | 1829,1
513 | 1853,0
514 | 1926,0
515 | 1966,1
516 | 1918,1
517 | 1959,0
518 | 1936,1
519 | 1952,0
520 | 1960,1
521 | 1968,1
522 | 1938,0
523 | 1963,1
524 | 1720,0
525 | 1736,1
526 | 1744,1
527 | 1752,0
528 | 1729,1
529 | 1745,1
530 | 1753,0
531 | 1722,1
532 | 1730,0
533 | 1738,1
534 | 1724,1
535 | 1740,1
536 | 1733,1
537 | 1718,1
538 | 1726,1
539 | 1750,0
540 | 1782,1
541 | 1719,1
542 | 7566,1
543 | 1751,1
544 | 2214,0
545 | 2201,1
546 | 2204,1
547 | 2294,0
548 | 2280,1
549 | 2338,1
550 | 2292,1
551 | 2300,1
552 | 2316,0
553 | 2283,0
554 | 7584,1
555 | 7580,1
556 | 7578,0
557 | 7570,0
558 | 8608,0
559 | 8727,1
560 | 8760,1
561 | 8768,1
562 | 8616,1
563 | 8617,1
564 | 8770,1
565 | 8730,1
566 | 8619,1
567 | 8629,1
568 | 8612,1
569 | 8724,1
570 | 8631,0
571 | 8725,0
572 | 8640,1
573 | 8759,0
574 | 8774,1
575 | 8824,1
576 | 8557,0
577 | 8791,1
578 | 8777,1
579 | 8811,1
580 | 8787,0
581 | 8805,1
582 | 8534,1
583 | 8687,1
584 | 9045,1
585 | 8673,1
586 | 9041,1
587 | 8701,1
588 | 9053,0
589 | 8985,1
590 | 8957,1
591 | 8966,0
592 | 8695,1
593 | 8967,1
594 | 9004,1
595 | 8690,1
596 | 9012,1
597 | 8969,1
598 | 8961,1
599 | 8970,1
600 | 8954,0
601 | 8962,0
602 | 8965,1
603 | 8973,0
604 | 8975,1
605 | 9006,1
606 | 8976,1
607 | 9007,1
608 | 8977,0
609 | 8992,1
610 | 9008,0
611 | 9001,1
612 | 8719,1
613 | 8996,1
614 | 9011,1
615 | 1742,0
616 | 1749,1
617 | 999,1
618 | 1061,0
619 | 1404,0
620 | 1552,1
621 | 1257,1
622 | 1158,1
623 | 1070,1
624 | 9452,1
625 | 417,1
626 | 421,0
627 | 407,0
628 | 424,1
629 | 402,1
630 | 403,1
631 | 404,1
632 | 428,0
633 | 429,1
634 | 409,1
635 | 413,0
636 | 5883,1
637 | 5848,0
638 | 6416,1
639 | 5838,1
640 | 6287,1
641 | 6417,0
642 | 5841,1
643 | 5894,1
644 | 5904,0
645 | 5913,0
646 | 5921,1
647 | 5939,1
648 | 5969,0
649 | 5895,0
650 | 5905,1
651 | 5914,0
652 | 5923,0
653 | 5932,1
654 | 5942,1
655 | 5961,1
656 | 5970,1
657 | 5884,0
658 | 5906,1
659 | 5924,0
660 | 5933,1
661 | 5943,1
662 | 5953,1
663 | 5963,0
664 | 5972,0
665 | 5888,0
666 | 5934,1
667 | 5898,1
668 | 5908,1
669 | 5919,0
670 | 5926,0
671 | 5935,1
672 | 5945,1
673 | 5955,1
674 | 5891,1
675 | 5900,0
676 | 5927,0
677 | 5946,1
678 | 5966,0
679 | 5975,0
680 | 5901,1
681 | 5911,0
682 | 5875,1
683 | 5948,1
684 | 5893,1
685 | 5902,1
686 | 5920,1
687 | 5938,1
688 | 5959,1
689 | 5968,1
690 | 5988,1
691 | 5999,1
692 | 6455,0
693 | 5979,1
694 | 6421,0
695 | 5991,1
696 | 5992,1
697 | 6004,0
698 | 6458,0
699 | 5982,1
700 | 5993,0
701 | 6425,0
702 | 6451,1
703 | 6309,1
704 | 5984,0
705 | 5994,0
706 | 6444,1
707 | 5997,0
708 | 6007,0
709 | 6445,0
710 | 6453,0
711 | 5998,1
712 | 6427,1
713 | 6435,0
714 | 6446,0
715 | 6403,1
716 | 5922,1
717 | 5915,0
718 | 5910,1
719 | 6401,0
720 | 6003,0
721 | 5899,0
722 | 6396,0
723 | 5873,0
724 | 6418,0
725 | 5874,1
726 | 5916,1
727 | 5878,1
728 | 5983,1
729 | 5990,1
730 | 5996,1
731 | 5940,1
732 | 5846,1
733 | 5871,0
734 | 6436,1
735 | 5872,1
736 | 5956,1
737 | 6402,1
738 | 4758,1
739 | 5285,0
740 | 9153,0
741 | 9137,0
742 | 9151,1
743 | 9143,0
744 | 9201,1
745 | 6173,0
746 | 6284,0
747 | 6276,1
748 | 6258,1
749 | 6252,0
750 | 6255,0
751 | 6166,0
752 | 6085,1
753 | 6025,1
754 | 6268,1
755 | 6180,1
756 | 6143,1
757 | 6041,1
758 | 5829,1
759 | 8427,1
760 | 8218,1
761 | 6023,0
762 | 5835,1
763 | 5831,1
764 | 5830,1
765 | 6039,0
766 | 6086,1
767 | 6413,1
768 | 6412,1
769 | 6411,1
770 | 6087,0
771 | 6077,0
772 | 6076,1
773 | 6071,0
774 | 6069,0
775 | 6038,1
776 | 6036,1
777 | 6035,0
778 | 6034,1
779 | 6030,0
780 | 6024,0
781 | 6021,0
782 | 6019,1
783 | 6017,1
784 | 6013,0
785 | 6012,0
786 | 6011,1
787 | 6010,1
788 | 5870,0
789 | 5867,0
790 | 5865,1
791 | 5860,1
792 | 5836,1
793 | 6231,0
794 | 6212,1
795 | 6140,1
796 | 6138,0
797 | 6120,1
798 | 6118,0
799 | 6073,0
800 | 6022,1
801 | 6020,0
802 | 8227,1
803 | 8225,1
804 | 8230,0
805 | 5856,0
806 | 8307,1
807 | 1409,0
808 | 6959,0
809 | 7525,1
810 | 6961,1
811 | 6967,1
812 | 6973,1
813 | 6974,1
814 | 6976,1
815 | 7516,1
816 | 6979,1
817 | 6980,0
818 | 6982,1
819 | 6983,0
820 | 6985,1
821 | 6931,1
822 | 6043,0
823 | 6945,1
824 | 7519,1
825 | 7526,1
826 | 7523,1
827 | 6956,1
828 | 6960,1
829 | 7524,1
830 | 6963,1
831 | 6964,0
832 | 6965,0
833 | 6966,1
834 | 6969,1
835 | 6971,1
836 | 6975,1
837 | 7517,0
838 | 6978,1
839 | 6981,0
840 | 6984,0
841 | 6899,0
842 | 6903,0
843 | 6904,1
844 | 6905,0
845 | 6906,1
846 | 6909,1
847 | 6911,1
848 | 6916,1
849 | 8215,0
850 | 6921,1
851 | 6932,0
852 | 6046,0
853 | 6944,1
854 | 7515,1
855 | 7514,1
856 | 6962,1
857 | 6968,0
858 | 6972,1
859 | 6970,1
860 | 6977,1
861 | 8329,1
862 | 7379,0
863 | 7080,1
864 | 6744,1
865 | 7098,1
866 | 7158,0
867 | 7163,0
868 | 7165,0
869 | 7340,1
870 | 7372,0
871 | 7394,0
872 | 7397,1
873 | 281,1
874 | 8258,0
875 | 8259,1
876 | 8290,1
877 | 7461,1
878 | 7323,1
879 | 8254,0
880 | 8270,1
881 | 8233,1
882 | 8285,0
883 | 6016,1
884 | 8423,0
885 | 8237,0
886 | 6040,1
887 | 6064,0
888 | 6957,0
889 | 8369,1
890 | 8247,1
891 | 8426,0
892 | 8428,0
893 | 9058,1
894 | 8249,0
895 | 9057,1
896 | 7139,1
897 | 7307,1
898 | 7331,1
899 | 7337,1
900 | 7378,0
901 | 7405,0
902 | 66,0
903 | 149,1
904 | 328,1
905 | 334,0
906 | 2274,1
907 | 5753,1
908 | 6709,1
909 | 7000,0
910 | 6989,0
911 | 7031,1
912 | 7062,0
913 | 7460,1
914 | 7123,0
915 | 7128,1
916 | 7145,1
917 | 7147,0
918 | 7166,1
919 | 7255,0
920 | 7275,1
921 | 7258,0
922 | 7291,1
923 | 7310,0
924 | 7330,1
925 | 7333,0
926 | 7411,0
927 | 178,0
928 | 378,1
929 | 8241,1
930 | 6988,0
931 | 8256,1
932 | 8796,0
933 | 8264,0
934 | 8265,0
935 | 8231,0
936 | 8271,0
937 | 6190,0
938 | 8275,0
939 | 8420,1
940 | 8283,1
941 | 8284,1
942 | 6008,1
943 | 8422,1
944 | 8296,1
945 | 8297,1
946 | 8300,1
947 | 8235,0
948 | 8306,0
949 | 8310,0
950 | 8236,1
951 | 8311,0
952 | 8314,0
953 | 8239,1
954 | 8240,0
955 | 8323,1
956 | 8242,0
957 | 8325,1
958 | 8326,1
959 | 8222,1
960 | 8430,1
961 | 6042,1
962 | 8335,1
963 | 8343,1
964 | 6074,0
965 | 8351,0
966 | 8353,0
967 | 8354,0
968 | 7296,1
969 | 8365,1
970 | 8374,1
971 | 8376,0
972 | 8378,0
973 | 8412,1
974 | 8387,0
975 | 8389,1
976 | 6243,1
977 | 7507,0
978 | 7343,1
979 | 6005,1
980 | 5729,1
981 | 5380,1
982 | 5381,0
983 | 5565,1
984 | 7011,1
985 | 7199,1
986 | 7224,1
987 | 7277,0
988 | 7490,1
989 | 7492,1
990 | 7300,0
991 | 7306,0
992 | 7408,1
993 | 7418,0
994 | 5887,0
995 | 5987,0
996 | 173,0
997 | 357,1
998 | 258,1
999 | 374,0
1000 | 94,1
1001 | 1859,1
1002 | 6188,1
1003 | 5207,0
1004 | 6448,1
1005 | 8312,1
1006 | 8313,0
1007 | 8334,1
1008 | 8337,1
1009 | 8357,1
1010 | 8366,0
1011 | 8411,0
1012 | 8388,1
1013 | 8395,1
1014 | 7014,1
1015 | 7035,0
1016 | 6810,1
1017 | 7498,0
1018 | 7506,0
1019 | 7390,0
1020 | 7284,1
1021 | 7081,1
1022 | 8243,0
1023 | 8245,1
1024 | 7033,0
1025 | 2150,1
1026 | 100000,1
1027 | 8266,0
1028 | 6897,1
1029 | 6898,1
1030 | 5837,0
1031 | 6907,1
1032 | 7438,1
1033 | 6910,1
1034 | 6913,1
1035 | 6914,0
1036 | 6918,1
1037 | 6919,1
1038 | 8214,1
1039 | 6924,1
1040 | 8424,1
1041 | 6926,0
1042 | 6928,1
1043 | 6933,1
1044 | 7520,1
1045 | 7521,0
1046 | 6936,0
1047 | 7522,1
1048 | 6937,0
1049 | 6939,0
1050 | 6900,1
1051 | 6901,1
1052 | 6908,1
1053 | 6009,0
1054 | 6915,0
1055 | 6917,1
1056 | 6920,0
1057 | 6922,0
1058 | 6923,0
1059 | 6927,0
1060 | 6929,1
1061 | 6930,0
1062 | 6940,0
1063 | 6942,1
1064 | 6943,1
1065 | 7518,1
1066 | 6946,1
1067 | 8213,0
1068 | 6951,1
1069 | 6958,1
1070 | 7305,0
1071 | 7376,1
1072 | 7386,1
1073 | 7404,1
1074 | 7403,0
1075 | 7457,1
1076 | 7463,1
1077 | 7015,1
1078 | 7024,0
1079 | 7079,0
1080 | 7152,1
1081 | 7297,1
1082 | 7381,0
1083 | 7413,0
1084 | 7176,1
1085 | 7352,0
1086 | 2327,0
1087 | 7117,1
1088 | 7172,1
1089 | 7168,1
1090 | 7423,0
1091 | 7425,1
1092 | 7223,1
1093 | 7239,1
1094 | 7276,1
1095 | 7281,1
1096 | 7287,1
1097 | 7292,0
1098 | 7299,0
1099 | 7303,0
1100 | 7309,0
1101 | 7328,0
1102 | 7406,1
1103 | 2320,1
1104 | 7242,1
1105 | 7462,1
1106 | 5385,1
1107 | 5292,0
1108 | 5337,1
1109 | 5350,1
1110 | 5377,1
1111 | 5386,0
1112 | 5310,1
1113 | 5282,1
1114 | 5339,0
1115 | 5322,0
1116 | 5331,1
1117 | 5364,0
1118 | 5373,1
1119 | 4879,1
1120 | 7069,1
1121 | 7496,0
1122 | 7502,1
1123 | 7344,0
1124 | 7346,1
1125 | 7353,0
1126 | 7373,0
1127 | 7384,0
1128 | 81,1
1129 | 373,0
1130 | 383,0
1131 | 1867,0
1132 | 957,0
1133 | 998,1
1134 | 1006,1
1135 | 992,1
1136 | 1002,1
1137 | 1166,1
1138 | 9077,1
1139 | 9104,1
1140 | 9152,0
1141 | 9165,0
1142 | 9179,0
1143 | 6996,0
1144 | 7008,1
1145 | 6729,1
1146 | 7092,1
1147 | 7164,1
1148 | 7169,0
1149 | 7181,0
1150 | 7252,1
1151 | 7446,1
1152 | 7270,1
1153 | 7483,0
1154 | 7316,1
1155 | 7351,1
1156 | 7391,1
1157 | 1,1
1158 | 392,0
1159 | 379,1
1160 | 380,1
1161 | 267,1
1162 | 2057,0
1163 | 5742,0
1164 | 5056,1
1165 | 5122,1
1166 | 5158,0
1167 | 5832,0
1168 | 6994,0
1169 | 7002,1
1170 | 7026,0
1171 | 6730,0
1172 | 7075,1
1173 | 7126,1
1174 | 7227,0
1175 | 7229,0
1176 | 7449,1
1177 | 6847,0
1178 | 6953,1
1179 | 7320,1
1180 | 7354,0
1181 | 7283,0
1182 | 2,0
1183 | 386,1
1184 | 1716,1
1185 | 1967,1
1186 | 5785,1
1187 | 4802,1
1188 | 5116,0
1189 | 5202,1
1190 | 7071,1
1191 | 7064,1
1192 | 7078,1
1193 | 7094,1
1194 | 7141,1
1195 | 7143,0
1196 | 7151,0
1197 | 7150,0
1198 | 7424,0
1199 | 7178,1
1200 | 7188,1
1201 | 7201,1
1202 | 7206,1
1203 | 7205,1
1204 | 7231,0
1205 | 7244,1
1206 | 7260,1
1207 | 7263,1
1208 | 7280,0
1209 | 7282,0
1210 | 7472,0
1211 | 7382,0
1212 | 7392,1
1213 | 7477,1
1214 | 8610,0
1215 | 8692,1
1216 | 6727,1
1217 | 7105,1
1218 | 7479,1
1219 | 7482,0
1220 | 7504,1
1221 | 7508,0
1222 | 7355,1
1223 | 5896,0
1224 | 166,1
1225 | 223,1
1226 | 126,1
1227 | 390,0
1228 | 321,1
1229 | 259,1
1230 | 362,1
1231 | 260,1
1232 | 91,0
1233 | 641,1
1234 | 5160,1
1235 | 5232,0
1236 | 5606,0
1237 | 5628,0
1238 | 7004,0
1239 | 7100,1
1240 | 7102,0
1241 | 7110,0
1242 | 7135,1
1243 | 7186,1
1244 | 7430,1
1245 | 2187,1
1246 | 6094,0
1247 | 6096,0
1248 | 6109,0
1249 | 6112,1
1250 | 6124,0
1251 | 6169,1
1252 | 6202,1
1253 | 6203,0
1254 | 6242,0
1255 | 6318,1
1256 | 6990,1
1257 | 6992,1
1258 | 6998,0
1259 | 4927,1
1260 | 4935,1
1261 | 4862,1
1262 | 5596,1
1263 | 5517,1
1264 | 5582,0
1265 | 5590,1
1266 | 5536,1
1267 | 5670,0
1268 | 5678,0
1269 | 5645,1
1270 | 2290,1
1271 | 5805,0
1272 | 4997,1
1273 | 5341,1
1274 | 6449,1
1275 | 1366,1
1276 | 1363,0
1277 | 1317,1
1278 | 1313,1
1279 | 1312,1
1280 | 1360,1
1281 | 1362,1
1282 | 1256,1
1283 | 9342,1
1284 | 9450,1
1285 | 9437,0
1286 | 9436,0
1287 | 9434,1
1288 | 9427,1
1289 | 9421,1
1290 | 9416,0
1291 | 9413,0
1292 | 9412,1
1293 | 9411,1
1294 | 9410,0
1295 | 9408,1
1296 | 9407,0
1297 | 9405,1
1298 | 9404,1
1299 | 9399,1
1300 | 9392,0
1301 | 9391,1
1302 | 9390,1
1303 | 9388,0
1304 | 9386,1
1305 | 9385,1
1306 | 9384,1
1307 | 9383,0
1308 | 9382,1
1309 |
--------------------------------------------------------------------------------
/data/k_matrix.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/grimmlab/permGWAS/3f7a1c2e3e4c63281f5719425ff9ac405f8d9cfc/data/k_matrix.h5
--------------------------------------------------------------------------------
/data/x_matrix.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/grimmlab/permGWAS/3f7a1c2e3e4c63281f5719425ff9ac405f8d9cfc/data/x_matrix.h5
--------------------------------------------------------------------------------
/data/x_matrix.map:
--------------------------------------------------------------------------------
1 | 1 1_657 0 657
2 | 1 1_3102 0 3102
3 | 1 1_4648 0 4648
4 | 1 1_4880 0 4880
5 | 1 1_5975 0 5975
6 | 1 1_6063 0 6063
7 | 1 1_6449 0 6449
8 | 1 1_6514 0 6514
9 | 1 1_6603 0 6603
10 | 1 1_6768 0 6768
11 | 1 1_7601 0 7601
12 | 1 1_8193 0 8193
13 | 1 1_8617 0 8617
14 | 1 1_10219 0 10219
15 | 1 1_10449 0 10449
16 | 1 1_10969 0 10969
17 | 1 1_11493 0 11493
18 | 1 1_11696 0 11696
19 | 1 1_12584 0 12584
20 | 1 1_12659 0 12659
21 | 1 1_13045 0 13045
22 | 1 1_14385 0 14385
23 | 1 1_19819 0 19819
24 | 1 1_20892 0 20892
25 | 1 1_21043 0 21043
26 | 1 1_21128 0 21128
27 | 1 1_21829 0 21829
28 | 1 1_22522 0 22522
29 | 1 1_23838 0 23838
30 | 1 1_25315 0 25315
31 | 1 1_25365 0 25365
32 | 1 1_25773 0 25773
33 | 1 1_26288 0 26288
34 | 1 1_27265 0 27265
35 | 1 1_28948 0 28948
36 | 1 1_28978 0 28978
37 | 1 1_29291 0 29291
38 | 1 1_30529 0 30529
39 | 1 1_30683 0 30683
40 | 1 1_31515 0 31515
41 | 1 1_31926 0 31926
42 | 1 1_32210 0 32210
43 | 1 1_32807 0 32807
44 | 1 1_34125 0 34125
45 | 1 1_34599 0 34599
46 | 1 1_35856 0 35856
47 | 1 1_37072 0 37072
48 | 1 1_38946 0 38946
49 | 1 1_39751 0 39751
50 | 1 1_41178 0 41178
51 | 1 1_41427 0 41427
52 | 1 1_44567 0 44567
53 | 1 1_45075 0 45075
54 | 1 1_45580 0 45580
55 | 1 1_45683 0 45683
56 | 1 1_46373 0 46373
57 | 1 1_46499 0 46499
58 | 1 1_46912 0 46912
59 | 1 1_47577 0 47577
60 | 1 1_47692 0 47692
61 | 1 1_48118 0 48118
62 | 1 1_48181 0 48181
63 | 1 1_49080 0 49080
64 | 1 1_51392 0 51392
65 | 1 1_51706 0 51706
66 | 1 1_51878 0 51878
67 | 1 1_52202 0 52202
68 | 1 1_53183 0 53183
69 | 1 1_53729 0 53729
70 | 1 1_53901 0 53901
71 | 1 1_55684 0 55684
72 | 1 1_57136 0 57136
73 | 1 1_57686 0 57686
74 | 1 1_59637 0 59637
75 | 1 1_60083 0 60083
76 | 1 1_60772 0 60772
77 | 1 1_61122 0 61122
78 | 1 1_61266 0 61266
79 | 1 1_61405 0 61405
80 | 1 1_61661 0 61661
81 | 1 1_62259 0 62259
82 | 1 1_62935 0 62935
83 | 1 1_63084 0 63084
84 | 1 1_63645 0 63645
85 | 1 1_63759 0 63759
86 | 1 1_63915 0 63915
87 | 1 1_64149 0 64149
88 | 1 1_64651 0 64651
89 | 1 1_68340 0 68340
90 | 1 1_68880 0 68880
91 | 1 1_69311 0 69311
92 | 1 1_70933 0 70933
93 | 1 1_71326 0 71326
94 | 1 1_71348 0 71348
95 | 1 1_71868 0 71868
96 | 1 1_72138 0 72138
97 | 1 1_72756 0 72756
98 | 1 1_72894 0 72894
99 | 1 1_72924 0 72924
100 | 1 1_73047 0 73047
101 | 1 1_73467 0 73467
102 | 1 1_73691 0 73691
103 | 1 1_73851 0 73851
104 | 1 1_73989 0 73989
105 | 1 1_74169 0 74169
106 | 1 1_74707 0 74707
107 | 1 1_75481 0 75481
108 | 1 1_75721 0 75721
109 | 1 1_75899 0 75899
110 | 1 1_76188 0 76188
111 | 1 1_76217 0 76217
112 | 1 1_76847 0 76847
113 | 1 1_76879 0 76879
114 | 1 1_76906 0 76906
115 | 1 1_77127 0 77127
116 | 1 1_77140 0 77140
117 | 1 1_77243 0 77243
118 | 1 1_77458 0 77458
119 | 1 1_78803 0 78803
120 | 1 1_78975 0 78975
121 | 1 1_79418 0 79418
122 | 1 1_80216 0 80216
123 | 1 1_80374 0 80374
124 | 1 1_80400 0 80400
125 | 1 1_81068 0 81068
126 | 1 1_81496 0 81496
127 | 1 1_81854 0 81854
128 | 1 1_81869 0 81869
129 | 1 1_82197 0 82197
130 | 1 1_82290 0 82290
131 | 1 1_83117 0 83117
132 | 1 1_83177 0 83177
133 | 1 1_83219 0 83219
134 | 1 1_84144 0 84144
135 | 1 1_84379 0 84379
136 | 1 1_84558 0 84558
137 | 1 1_85561 0 85561
138 | 1 1_85860 0 85860
139 | 1 1_86656 0 86656
140 | 1 1_87060 0 87060
141 | 1 1_87791 0 87791
142 | 1 1_87985 0 87985
143 | 1 1_88300 0 88300
144 | 1 1_88658 0 88658
145 | 1 1_89312 0 89312
146 | 1 1_90606 0 90606
147 | 1 1_92353 0 92353
148 | 1 1_92866 0 92866
149 | 1 1_93562 0 93562
150 | 1 1_93740 0 93740
151 |
--------------------------------------------------------------------------------
/data/y_matrix.csv:
--------------------------------------------------------------------------------
1 | accession_id,phenotype_value
2 | 9381,
3 | 9380,
4 | 9378,
5 | 9371,
6 | 9367,
7 | 9363,
8 | 9356,
9 | 9355,
10 | 9354,
11 | 9353,
12 | 9352,
13 | 9351,
14 | 9344,
15 | 9343,
16 | 9339,
17 | 9336,
18 | 9332,
19 | 9323,
20 | 9321,
21 | 9482,
22 | 9481,
23 | 9472,
24 | 9471,
25 | 9470,
26 | 9469,
27 | 9455,
28 | 9454,
29 | 9453,
30 | 9451,
31 | 9419,
32 | 9418,
33 | 9409,
34 | 9402,
35 | 9369,
36 | 9349,
37 | 9476,
38 | 9433,
39 | 9446,
40 | 9443,
41 | 9442,
42 | 997,
43 | 996,
44 | 1068,
45 | 1026,
46 | 1585,
47 | 1435,
48 | 1169,
49 | 1075,
50 | 1132,
51 | 1064,
52 | 1063,
53 | 1062,
54 | 1247,
55 | 991,
56 | 1391,
57 | 1374,
58 | 1318,
59 | 1254,
60 | 1163,
61 | 1153,
62 | 1073,
63 | 1072,
64 | 394,
65 | 7,
66 | 203,
67 | 236,
68 | 367,
69 | 123,
70 | 395,
71 | 196,
72 | 264,
73 | 185,
74 | 297,
75 | 318,
76 | 323,
77 | 79,
78 | 198,
79 | 371,
80 | 280,
81 | 12,
82 | 347,
83 | 268,
84 | 288,
85 | 377,
86 | 252,
87 | 296,
88 | 341,
89 | 156,
90 | 397,
91 | 263,
92 | 48,
93 | 45,
94 | 210,
95 | 83,
96 | 372,
97 | 393,
98 | 205,
99 | 87,
100 | 62,
101 | 309,
102 | 222,
103 | 160,
104 | 229,
105 | 369,
106 | 227,
107 | 230,
108 | 217,
109 | 194,
110 | 391,
111 | 340,
112 | 167,
113 | 266,
114 | 208,
115 | 335,
116 | 213,
117 | 388,
118 | 331,
119 | 216,
120 | 277,
121 | 85,
122 | 310,
123 | 389,
124 | 387,
125 | 191,
126 | 224,
127 | 82,
128 | 225,
129 | 295,
130 | 169,
131 | 375,
132 | 292,
133 | 215,
134 | 337,
135 | 320,
136 | 171,
137 | 346,
138 | 151,
139 | 137,
140 | 291,
141 | 385,
142 | 84,
143 | 349,
144 | 219,
145 | 322,
146 | 204,
147 | 273,
148 | 212,
149 | 146,
150 | 348,
151 | 157,
152 | 214,
153 | 316,
154 | 186,
155 | 314,
156 | 293,
157 | 183,
158 | 287,
159 | 290,
160 | 168,
161 | 343,
162 | 153,
163 | 339,
164 | 60,
165 | 174,
166 | 88,
167 | 359,
168 | 298,
169 | 162,
170 | 311,
171 | 329,
172 | 175,
173 | 163,
174 | 77,
175 | 302,
176 | 231,
177 | 148,
178 | 106,
179 | 283,
180 | 184,
181 | 122,
182 | 170,
183 | 396,
184 | 275,
185 | 244,
186 | 116,
187 | 364,
188 | 121,
189 | 165,
190 | 32,
191 | 201,
192 | 326,
193 | 368,
194 | 332,
195 | 361,
196 | 202,
197 | 200,
198 | 257,
199 | 80,
200 | 9,
201 | 187,
202 | 89,
203 | 207,
204 | 69,
205 | 188,
206 | 306,
207 | 360,
208 | 237,
209 | 327,
210 | 261,
211 | 86,
212 | 228,
213 | 190,
214 | 74,
215 | 8,
216 | 4,
217 | 159,
218 | 262,
219 | 51,
220 | 5,
221 | 363,
222 | 338,
223 | 355,
224 | 269,
225 | 278,
226 | 179,
227 | 6,
228 | 206,
229 | 461,
230 | 466,
231 | 9490,
232 | 9496,
233 | 9504,
234 | 9499,
235 | 9308,
236 | 9305,
237 | 9302,
238 | 9309,
239 | 4980,
240 | 5444,
241 | 5394,
242 | 5461,
243 | 5494,
244 | 5398,
245 | 5466,
246 | 5450,
247 | 4675,
248 | 4632,
249 | 5769,
250 | 4757,
251 | 4827,
252 | 4820,
253 | 5159,
254 | 5759,
255 | 5739,
256 | 5738,
257 | 5770,
258 | 5826,
259 | 5745,
260 | 5744,
261 | 5774,
262 | 5760,
263 | 5746,
264 | 5762,
265 | 5711,
266 | 5802,
267 | 5740,
268 | 5716,
269 | 5772,
270 | 5722,
271 | 5751,
272 | 5721,
273 | 5812,
274 | 5792,
275 | 5735,
276 | 5767,
277 | 5817,
278 | 5807,
279 | 5777,
280 | 5736,
281 | 5763,
282 | 5813,
283 | 5741,
284 | 5731,
285 | 5819,
286 | 5724,
287 | 5789,
288 | 5141,
289 | 5175,
290 | 5145,
291 | 5469,
292 | 5106,
293 | 5299,
294 | 5335,
295 | 7121,
296 | 7106,
297 | 7104,
298 | 7113,
299 | 7116,
300 | 7149,
301 | 7228,
302 | 7301,
303 | 7109,
304 | 6987,
305 | 7028,
306 | 7029,
307 | 7030,
308 | 7013,
309 | 7017,
310 | 7032,
311 | 7073,
312 | 242,
313 | 104,
314 | 282,
315 | 96,
316 | 23,
317 | 6102,
318 | 6938,
319 | 8304,
320 | 8238,
321 | 8386,
322 | 8348,
323 | 1416,
324 | 6237,
325 | 6226,
326 | 6184,
327 | 6174,
328 | 6172,
329 | 6171,
330 | 6170,
331 | 6151,
332 | 6150,
333 | 6149,
334 | 6148,
335 | 6147,
336 | 6146,
337 | 6145,
338 | 6144,
339 | 6142,
340 | 6141,
341 | 6137,
342 | 6136,
343 | 6131,
344 | 6134,
345 | 6133,
346 | 6132,
347 | 6129,
348 | 6128,
349 | 6127,
350 | 6126,
351 | 6125,
352 | 6123,
353 | 6122,
354 | 6121,
355 | 6119,
356 | 6116,
357 | 6115,
358 | 6114,
359 | 6111,
360 | 6110,
361 | 6108,
362 | 6107,
363 | 6106,
364 | 6104,
365 | 6103,
366 | 6101,
367 | 6100,
368 | 6099,
369 | 6098,
370 | 6097,
371 | 6095,
372 | 6093,
373 | 6092,
374 | 6091,
375 | 6090,
376 | 6177,
377 | 6221,
378 | 6244,
379 | 6241,
380 | 6240,
381 | 6238,
382 | 6236,
383 | 6235,
384 | 6220,
385 | 6218,
386 | 6217,
387 | 6216,
388 | 6215,
389 | 6214,
390 | 6210,
391 | 6209,
392 | 6207,
393 | 6201,
394 | 6200,
395 | 6199,
396 | 6198,
397 | 6197,
398 | 6195,
399 | 6194,
400 | 6193,
401 | 6192,
402 | 6191,
403 | 6189,
404 | 6163,
405 | 6154,
406 | 8274,74.0
407 | 7192,
408 | 7194,
409 | 7210,
410 | 7238,
411 | 7245,
412 | 7246,
413 | 7256,
414 | 7265,
415 | 7268,
416 | 366,
417 | 5245,
418 | 5264,
419 | 7195,
420 | 7262,
421 | 7250,
422 | 1925,
423 | 5719,
424 | 5798,
425 | 5816,
426 | 5821,
427 | 5710,
428 | 5715,
429 | 5720,
430 | 5733,
431 | 5820,
432 | 5737,
433 | 5755,
434 | 5781,
435 | 5782,
436 | 5784,
437 | 5146,
438 | 5133,
439 | 5709,
440 | 5712,
441 | 5795,
442 | 5750,
443 | 5708,
444 | 5749,
445 | 5727,
446 | 5780,
447 | 5756,
448 | 5723,
449 | 5730,
450 | 5717,
451 | 5732,
452 | 5804,
453 | 5752,
454 | 5799,
455 | 5713,
456 | 5728,
457 | 5787,
458 | 5788,
459 | 5793,
460 | 5803,
461 | 5758,
462 | 7317,
463 | 7034,
464 | 615,
465 | 627,
466 | 607,
467 | 631,
468 | 623,
469 | 719,
470 | 640,
471 | 827,
472 | 895,
473 | 946,
474 | 936,
475 | 7717,
476 | 7787,
477 | 7837,
478 | 7847,
479 | 7867,
480 | 8077,
481 | 8122,
482 | 1743,
483 | 1799,
484 | 2175,
485 | 2160,
486 | 2171,
487 | 2148,
488 | 2180,
489 | 2157,
490 | 1948,
491 | 1941,
492 | 1949,
493 | 1965,
494 | 1981,
495 | 1992,
496 | 2016,
497 | 2011,
498 | 2019,
499 | 2020,
500 | 2151,
501 | 1862,
502 | 1872,
503 | 1864,
504 | 1871,
505 | 1857,
506 | 1865,
507 | 1873,
508 | 1850,
509 | 1858,
510 | 1874,
511 | 1868,
512 | 1829,
513 | 1853,
514 | 1926,
515 | 1966,
516 | 1918,
517 | 1959,
518 | 1936,
519 | 1952,
520 | 1960,
521 | 1968,
522 | 1938,
523 | 1963,
524 | 1720,
525 | 1736,
526 | 1744,
527 | 1752,
528 | 1729,
529 | 1745,
530 | 1753,
531 | 1722,
532 | 1730,
533 | 1738,
534 | 1724,
535 | 1740,
536 | 1733,
537 | 1718,
538 | 1726,
539 | 1750,
540 | 1782,
541 | 1719,
542 | 7566,
543 | 1751,
544 | 2214,
545 | 2201,
546 | 2204,
547 | 2294,
548 | 2280,
549 | 2338,
550 | 2292,
551 | 2300,
552 | 2316,
553 | 2283,
554 | 7584,
555 | 7580,
556 | 7578,
557 | 7570,
558 | 8608,
559 | 8727,
560 | 8760,
561 | 8768,
562 | 8616,
563 | 8617,
564 | 8770,
565 | 8730,
566 | 8619,
567 | 8629,
568 | 8612,
569 | 8724,
570 | 8631,
571 | 8725,
572 | 8640,
573 | 8759,
574 | 8774,
575 | 8824,
576 | 8557,
577 | 8791,
578 | 8777,
579 | 8811,
580 | 8787,
581 | 8805,
582 | 8534,
583 | 8687,
584 | 9045,
585 | 8673,
586 | 9041,
587 | 8701,
588 | 9053,
589 | 8985,
590 | 8957,
591 | 8966,
592 | 8695,
593 | 8967,
594 | 9004,
595 | 8690,
596 | 9012,
597 | 8969,
598 | 8961,
599 | 8970,
600 | 8954,
601 | 8962,
602 | 8965,
603 | 8973,
604 | 8975,
605 | 9006,
606 | 8976,
607 | 9007,
608 | 8977,
609 | 8992,
610 | 9008,
611 | 9001,
612 | 8719,
613 | 8996,
614 | 9011,
615 | 1742,
616 | 1749,
617 | 999,
618 | 1061,
619 | 1404,
620 | 1552,
621 | 1257,
622 | 1158,
623 | 1070,
624 | 9452,
625 | 417,
626 | 421,
627 | 407,
628 | 424,
629 | 402,
630 | 403,
631 | 404,
632 | 428,
633 | 429,
634 | 409,
635 | 413,
636 | 5883,
637 | 5848,
638 | 6416,
639 | 5838,
640 | 6287,
641 | 6417,
642 | 5841,
643 | 5894,
644 | 5904,
645 | 5913,
646 | 5921,
647 | 5939,
648 | 5969,
649 | 5895,
650 | 5905,
651 | 5914,
652 | 5923,
653 | 5932,
654 | 5942,
655 | 5961,
656 | 5970,
657 | 5884,
658 | 5906,
659 | 5924,
660 | 5933,
661 | 5943,
662 | 5953,
663 | 5963,
664 | 5972,
665 | 5888,
666 | 5934,
667 | 5898,
668 | 5908,
669 | 5919,
670 | 5926,
671 | 5935,
672 | 5945,
673 | 5955,
674 | 5891,
675 | 5900,
676 | 5927,
677 | 5946,
678 | 5966,
679 | 5975,
680 | 5901,
681 | 5911,
682 | 5875,
683 | 5948,
684 | 5893,
685 | 5902,
686 | 5920,
687 | 5938,
688 | 5959,
689 | 5968,
690 | 5988,
691 | 5999,
692 | 6455,
693 | 5979,
694 | 6421,
695 | 5991,
696 | 5992,
697 | 6004,
698 | 6458,
699 | 5982,
700 | 5993,
701 | 6425,
702 | 6451,
703 | 6309,
704 | 5984,
705 | 5994,
706 | 6444,
707 | 5997,
708 | 6007,
709 | 6445,
710 | 6453,
711 | 5998,
712 | 6427,
713 | 6435,
714 | 6446,
715 | 6403,
716 | 5922,
717 | 5915,
718 | 5910,
719 | 6401,
720 | 6003,
721 | 5899,
722 | 6396,
723 | 5873,
724 | 6418,
725 | 5874,
726 | 5916,
727 | 5878,
728 | 5983,
729 | 5990,
730 | 5996,
731 | 5940,
732 | 5846,
733 | 5871,
734 | 6436,
735 | 5872,
736 | 5956,
737 | 6402,
738 | 4758,
739 | 5285,
740 | 9153,
741 | 9137,
742 | 9151,
743 | 9143,
744 | 9201,
745 | 6173,
746 | 6284,
747 | 6276,
748 | 6258,
749 | 6252,
750 | 6255,
751 | 6166,
752 | 6085,
753 | 6025,
754 | 6268,
755 | 6180,
756 | 6143,
757 | 6041,
758 | 5829,
759 | 8427,
760 | 8218,
761 | 6023,
762 | 5835,
763 | 5831,
764 | 5830,
765 | 6039,
766 | 6086,
767 | 6413,
768 | 6412,
769 | 6411,
770 | 6087,
771 | 6077,
772 | 6076,
773 | 6071,
774 | 6069,
775 | 6038,
776 | 6036,
777 | 6035,
778 | 6034,
779 | 6030,
780 | 6024,
781 | 6021,
782 | 6019,
783 | 6017,
784 | 6013,
785 | 6012,
786 | 6011,
787 | 6010,
788 | 5870,
789 | 5867,
790 | 5865,
791 | 5860,
792 | 5836,
793 | 6231,
794 | 6212,
795 | 6140,
796 | 6138,
797 | 6120,
798 | 6118,
799 | 6073,
800 | 6022,
801 | 6020,
802 | 8227,
803 | 8225,
804 | 8230,97.0
805 | 5856,
806 | 8307,
807 | 1409,
808 | 6959,51.0
809 | 7525,46.0
810 | 6961,46.0
811 | 6967,44.0
812 | 6973,53.0
813 | 6974,103.0
814 | 6976,56.0
815 | 7516,100.0
816 | 6979,44.0
817 | 6980,51.0
818 | 6982,49.0
819 | 6983,71.0
820 | 6985,56.0
821 | 6931,46.0
822 | 6043,90.0
823 | 6945,55.0
824 | 7519,76.0
825 | 7526,53.0
826 | 7523,57.0
827 | 6956,69.0
828 | 6960,47.0
829 | 7524,51.0
830 | 6963,60.0
831 | 6964,93.0
832 | 6965,102.0
833 | 6966,53.0
834 | 6969,70.0
835 | 6971,51.0
836 | 6975,51.0
837 | 7517,107.0
838 | 6978,49.0
839 | 6981,44.0
840 | 6984,53.0
841 | 6899,54.0
842 | 6903,57.0
843 | 6904,66.0
844 | 6905,65.0
845 | 6906,43.0
846 | 6909,51.0
847 | 6911,46.0
848 | 6916,63.0
849 | 8215,51.0
850 | 6921,64.0
851 | 6932,51.0
852 | 6046,93.0
853 | 6944,49.0
854 | 7515,49.0
855 | 7514,58.0
856 | 6962,52.0
857 | 6968,71.0
858 | 6972,63.0
859 | 6970,48.0
860 | 6977,49.0
861 | 8329,46.0
862 | 7379,
863 | 7080,
864 | 6744,
865 | 7098,
866 | 7158,
867 | 7163,57.0
868 | 7165,
869 | 7340,
870 | 7372,
871 | 7394,
872 | 7397,
873 | 281,
874 | 8258,73.0
875 | 8259,73.0
876 | 8290,50.0
877 | 7461,61.0
878 | 7323,56.0
879 | 8254,52.0
880 | 8270,49.0
881 | 8233,59.0
882 | 8285,70.0
883 | 6016,75.0
884 | 8423,70.0
885 | 8237,97.0
886 | 6040,71.0
887 | 6064,96.0
888 | 6957,84.0
889 | 8369,76.0
890 | 8247,87.0
891 | 8426,49.0
892 | 8428,
893 | 9058,101.0
894 | 8249,81.0
895 | 9057,76.0
896 | 7139,
897 | 7307,
898 | 7331,
899 | 7337,
900 | 7378,
901 | 7405,
902 | 66,
903 | 149,
904 | 328,
905 | 334,
906 | 2274,
907 | 5753,
908 | 6709,52.0
909 | 7000,65.0
910 | 6989,
911 | 7031,
912 | 7062,46.0
913 | 7460,49.0
914 | 7123,59.5
915 | 7128,
916 | 7145,
917 | 7147,71.0
918 | 7166,
919 | 7255,46.0
920 | 7275,46.0
921 | 7258,
922 | 7291,
923 | 7310,
924 | 7330,
925 | 7333,
926 | 7411,
927 | 178,
928 | 378,
929 | 8241,73.0
930 | 6988,48.0
931 | 8256,61.0
932 | 8796,
933 | 8264,46.0
934 | 8265,44.0
935 | 8231,91.0
936 | 8271,49.0
937 | 6190,
938 | 8275,68.0
939 | 8420,56.0
940 | 8283,71.0
941 | 8284,61.0
942 | 6008,60.0
943 | 8422,106.0
944 | 8296,45.0
945 | 8297,73.0
946 | 8300,61.0
947 | 8235,60.0
948 | 8306,96.0
949 | 8310,49.0
950 | 8236,91.0
951 | 8311,49.0
952 | 8314,64.0
953 | 8239,52.0
954 | 8240,93.0
955 | 8323,51.0
956 | 8242,120.0
957 | 8325,49.0
958 | 8326,67.0
959 | 8222,90.0
960 | 8430,
961 | 6042,56.0
962 | 8335,104.0
963 | 8343,62.0
964 | 6074,91.0
965 | 8351,78.0
966 | 8353,41.0
967 | 8354,70.0
968 | 7296,70.0
969 | 8365,51.0
970 | 8374,59.0
971 | 8376,84.0
972 | 8378,56.0
973 | 8412,
974 | 8387,52.0
975 | 8389,63.0
976 | 6243,56.0
977 | 7507,
978 | 7343,
979 | 6005,
980 | 5729,
981 | 5380,
982 | 5381,
983 | 5565,
984 | 7011,
985 | 7199,
986 | 7224,
987 | 7277,
988 | 7490,
989 | 7492,
990 | 7300,
991 | 7306,60.0
992 | 7408,
993 | 7418,63.0
994 | 5887,
995 | 5987,
996 | 173,
997 | 357,
998 | 258,
999 | 374,
1000 | 94,
1001 | 1859,
1002 | 6188,
1003 | 5207,
1004 | 6448,
1005 | 8312,66.0
1006 | 8313,49.0
1007 | 8334,64.0
1008 | 8337,70.0
1009 | 8357,
1010 | 8366,
1011 | 8411,
1012 | 8388,60.0
1013 | 8395,69.0
1014 | 7014,92.0
1015 | 7035,
1016 | 6810,
1017 | 7498,
1018 | 7506,
1019 | 7390,
1020 | 7284,
1021 | 7081,46.0
1022 | 8243,66.0
1023 | 8245,46.0
1024 | 7033,76.0
1025 | 2150,
1026 | 100000,58.0
1027 | 8266,99.0
1028 | 6897,62.0
1029 | 6898,41.0
1030 | 5837,57.0
1031 | 6907,58.0
1032 | 7438,75.0
1033 | 6910,49.0
1034 | 6913,99.0
1035 | 6914,73.0
1036 | 6918,108.0
1037 | 6919,71.0
1038 | 8214,51.0
1039 | 6924,49.0
1040 | 8424,46.0
1041 | 6926,49.0
1042 | 6928,55.0
1043 | 6933,56.0
1044 | 7520,60.0
1045 | 7521,60.0
1046 | 6936,67.0
1047 | 7522,83.0
1048 | 6937,65.0
1049 | 6939,49.0
1050 | 6900,90.0
1051 | 6901,86.0
1052 | 6908,49.0
1053 | 6009,98.0
1054 | 6915,53.0
1055 | 6917,121.0
1056 | 6920,71.0
1057 | 6922,48.0
1058 | 6923,44.0
1059 | 6927,51.0
1060 | 6929,71.0
1061 | 6930,49.0
1062 | 6940,49.0
1063 | 6942,46.0
1064 | 6943,49.0
1065 | 7518,103.0
1066 | 6946,62.0
1067 | 8213,44.0
1068 | 6951,68.0
1069 | 6958,49.0
1070 | 7305,
1071 | 7376,
1072 | 7386,
1073 | 7404,
1074 | 7403,
1075 | 7457,
1076 | 7463,
1077 | 7015,
1078 | 7024,
1079 | 7079,
1080 | 7152,
1081 | 7297,
1082 | 7381,
1083 | 7413,
1084 | 7176,
1085 | 7352,
1086 | 2327,
1087 | 7117,
1088 | 7172,
1089 | 7168,
1090 | 7423,
1091 | 7425,
1092 | 7223,
1093 | 7239,
1094 | 7276,
1095 | 7281,
1096 | 7287,
1097 | 7292,
1098 | 7299,
1099 | 7303,
1100 | 7309,
1101 | 7328,
1102 | 7406,
1103 | 2320,
1104 | 7242,
1105 | 7462,
1106 | 5385,
1107 | 5292,
1108 | 5337,
1109 | 5350,
1110 | 5377,
1111 | 5386,
1112 | 5310,
1113 | 5282,
1114 | 5339,
1115 | 5322,
1116 | 5331,
1117 | 5364,
1118 | 5373,
1119 | 4879,
1120 | 7069,
1121 | 7496,
1122 | 7502,
1123 | 7344,
1124 | 7346,64.0
1125 | 7353,
1126 | 7373,
1127 | 7384,
1128 | 81,
1129 | 373,
1130 | 383,
1131 | 1867,
1132 | 957,
1133 | 998,
1134 | 1006,
1135 | 992,
1136 | 1002,
1137 | 1166,
1138 | 9077,
1139 | 9104,
1140 | 9152,
1141 | 9165,
1142 | 9179,
1143 | 6996,
1144 | 7008,
1145 | 6729,
1146 | 7092,
1147 | 7164,
1148 | 7169,
1149 | 7181,
1150 | 7252,
1151 | 7446,
1152 | 7270,
1153 | 7483,
1154 | 7316,
1155 | 7351,
1156 | 7391,
1157 | 1,
1158 | 392,
1159 | 379,
1160 | 380,
1161 | 267,
1162 | 2057,
1163 | 5742,
1164 | 5056,
1165 | 5122,
1166 | 5158,
1167 | 5832,
1168 | 6994,
1169 | 7002,
1170 | 7026,
1171 | 6730,
1172 | 7075,
1173 | 7126,
1174 | 7227,
1175 | 7229,
1176 | 7449,
1177 | 6847,
1178 | 6953,
1179 | 7320,
1180 | 7354,
1181 | 7283,
1182 | 2,
1183 | 386,
1184 | 1716,
1185 | 1967,
1186 | 5785,
1187 | 4802,
1188 | 5116,
1189 | 5202,
1190 | 7071,
1191 | 7064,79.0
1192 | 7078,
1193 | 7094,58.5
1194 | 7141,
1195 | 7143,
1196 | 7151,
1197 | 7150,
1198 | 7424,43.0
1199 | 7178,
1200 | 7188,
1201 | 7201,
1202 | 7206,
1203 | 7205,
1204 | 7231,46.0
1205 | 7244,
1206 | 7260,
1207 | 7263,
1208 | 7280,
1209 | 7282,51.0
1210 | 7472,
1211 | 7382,
1212 | 7392,
1213 | 7477,59.0
1214 | 8610,
1215 | 8692,
1216 | 6727,
1217 | 7105,
1218 | 7479,
1219 | 7482,
1220 | 7504,
1221 | 7508,
1222 | 7355,
1223 | 5896,
1224 | 166,
1225 | 223,
1226 | 126,
1227 | 390,
1228 | 321,
1229 | 259,
1230 | 362,
1231 | 260,
1232 | 91,
1233 | 641,
1234 | 5160,
1235 | 5232,
1236 | 5606,
1237 | 5628,
1238 | 7004,
1239 | 7100,
1240 | 7102,
1241 | 7110,
1242 | 7135,
1243 | 7186,
1244 | 7430,
1245 | 2187,
1246 | 6094,
1247 | 6096,
1248 | 6109,
1249 | 6112,
1250 | 6124,
1251 | 6169,
1252 | 6202,
1253 | 6203,
1254 | 6242,
1255 | 6318,
1256 | 6990,
1257 | 6992,
1258 | 6998,
1259 | 4927,
1260 | 4935,
1261 | 4862,
1262 | 5596,
1263 | 5517,
1264 | 5582,
1265 | 5590,
1266 | 5536,
1267 | 5670,
1268 | 5678,
1269 | 5645,
1270 | 2290,
1271 | 5805,
1272 | 4997,
1273 | 5341,
1274 | 6449,
1275 | 1366,
1276 | 1363,
1277 | 1317,
1278 | 1313,
1279 | 1312,
1280 | 1360,
1281 | 1362,
1282 | 1256,
1283 | 9342,
1284 | 9450,
1285 | 9437,
1286 | 9436,
1287 | 9434,
1288 | 9427,
1289 | 9421,
1290 | 9416,
1291 | 9413,
1292 | 9412,
1293 | 9411,
1294 | 9410,
1295 | 9408,
1296 | 9407,
1297 | 9405,
1298 | 9404,
1299 | 9399,
1300 | 9392,
1301 | 9391,
1302 | 9390,
1303 | 9388,
1304 | 9386,
1305 | 9385,
1306 | 9384,
1307 | 9383,
1308 | 9382,
1309 |
--------------------------------------------------------------------------------
/data/y_matrix.pheno:
--------------------------------------------------------------------------------
1 | FID IID phenotype_value
2 | 5837 5837 57.0
3 | 6008 6008 60.0
4 | 6009 6009 98.0
5 | 6016 6016 75.0
6 | 6040 6040 71.0
7 | 6042 6042 56.0
8 | 6043 6043 90.0
9 | 6046 6046 93.0
10 | 6064 6064 96.0
11 | 6074 6074 91.0
12 | 6243 6243 56.0
13 | 6709 6709 52.0
14 | 6897 6897 62.0
15 | 6898 6898 41.0
16 | 6899 6899 54.0
17 | 6900 6900 90.0
18 | 6901 6901 86.0
19 | 6903 6903 57.0
20 | 6904 6904 66.0
21 | 6905 6905 65.0
22 | 6906 6906 43.0
23 | 6907 6907 58.0
24 | 6908 6908 49.0
25 | 6909 6909 51.0
26 | 6910 6910 49.0
27 | 6911 6911 46.0
28 | 6913 6913 99.0
29 | 6914 6914 73.0
30 | 6915 6915 53.0
31 | 6916 6916 63.0
32 | 6917 6917 121.0
33 | 6918 6918 108.0
34 | 6919 6919 71.0
35 | 6920 6920 71.0
36 | 6921 6921 64.0
37 | 6922 6922 48.0
38 | 6923 6923 44.0
39 | 6924 6924 49.0
40 | 6926 6926 49.0
41 | 6927 6927 51.0
42 | 6928 6928 55.0
43 | 6929 6929 71.0
44 | 6930 6930 49.0
45 | 6931 6931 46.0
46 | 6932 6932 51.0
47 | 6933 6933 56.0
48 | 6936 6936 67.0
49 | 6937 6937 65.0
50 | 6939 6939 49.0
51 | 6940 6940 49.0
52 | 6942 6942 46.0
53 | 6943 6943 49.0
54 | 6944 6944 49.0
55 | 6945 6945 55.0
56 | 6946 6946 62.0
57 | 6951 6951 68.0
58 | 6956 6956 69.0
59 | 6957 6957 84.0
60 | 6958 6958 49.0
61 | 6959 6959 51.0
62 | 6960 6960 47.0
63 | 6961 6961 46.0
64 | 6962 6962 52.0
65 | 6963 6963 60.0
66 | 6964 6964 93.0
67 | 6965 6965 102.0
68 | 6966 6966 53.0
69 | 6967 6967 44.0
70 | 6968 6968 71.0
71 | 6969 6969 70.0
72 | 6970 6970 48.0
73 | 6971 6971 51.0
74 | 6972 6972 63.0
75 | 6973 6973 53.0
76 | 6974 6974 103.0
77 | 6975 6975 51.0
78 | 6976 6976 56.0
79 | 6977 6977 49.0
80 | 6978 6978 49.0
81 | 6979 6979 44.0
82 | 6980 6980 51.0
83 | 6981 6981 44.0
84 | 6982 6982 49.0
85 | 6983 6983 71.0
86 | 6984 6984 53.0
87 | 6985 6985 56.0
88 | 6988 6988 48.0
89 | 7000 7000 65.0
90 | 7014 7014 92.0
91 | 7033 7033 76.0
92 | 7062 7062 46.0
93 | 7064 7064 79.0
94 | 7081 7081 46.0
95 | 7094 7094 58.5
96 | 7123 7123 59.5
97 | 7147 7147 71.0
98 | 7163 7163 57.0
99 | 7231 7231 46.0
100 | 7255 7255 46.0
101 | 7275 7275 46.0
102 | 7282 7282 51.0
103 | 7296 7296 70.0
104 | 7306 7306 60.0
105 | 7323 7323 56.0
106 | 7346 7346 64.0
107 | 7418 7418 63.0
108 | 7424 7424 43.0
109 | 7438 7438 75.0
110 | 7460 7460 49.0
111 | 7461 7461 61.0
112 | 7477 7477 59.0
113 | 7514 7514 58.0
114 | 7515 7515 49.0
115 | 7516 7516 100.0
116 | 7517 7517 107.0
117 | 7518 7518 103.0
118 | 7519 7519 76.0
119 | 7520 7520 60.0
120 | 7521 7521 60.0
121 | 7522 7522 83.0
122 | 7523 7523 57.0
123 | 7524 7524 51.0
124 | 7525 7525 46.0
125 | 7526 7526 53.0
126 | 8213 8213 44.0
127 | 8214 8214 51.0
128 | 8215 8215 51.0
129 | 8222 8222 90.0
130 | 8230 8230 97.0
131 | 8231 8231 91.0
132 | 8233 8233 59.0
133 | 8235 8235 60.0
134 | 8236 8236 91.0
135 | 8237 8237 97.0
136 | 8239 8239 52.0
137 | 8240 8240 93.0
138 | 8241 8241 73.0
139 | 8242 8242 120.0
140 | 8243 8243 66.0
141 | 8245 8245 46.0
142 | 8247 8247 87.0
143 | 8249 8249 81.0
144 | 8254 8254 52.0
145 | 8256 8256 61.0
146 | 8258 8258 73.0
147 | 8259 8259 73.0
148 | 8264 8264 46.0
149 | 8265 8265 44.0
150 | 8266 8266 99.0
151 | 8270 8270 49.0
152 | 8271 8271 49.0
153 | 8274 8274 74.0
154 | 8275 8275 68.0
155 | 8283 8283 71.0
156 | 8284 8284 61.0
157 | 8285 8285 70.0
158 | 8290 8290 50.0
159 | 8296 8296 45.0
160 | 8297 8297 73.0
161 | 8300 8300 61.0
162 | 8306 8306 96.0
163 | 8310 8310 49.0
164 | 8311 8311 49.0
165 | 8312 8312 66.0
166 | 8313 8313 49.0
167 | 8314 8314 64.0
168 | 8323 8323 51.0
169 | 8325 8325 49.0
170 | 8326 8326 67.0
171 | 8329 8329 46.0
172 | 8334 8334 64.0
173 | 8335 8335 104.0
174 | 8337 8337 70.0
175 | 8343 8343 62.0
176 | 8351 8351 78.0
177 | 8353 8353 41.0
178 | 8354 8354 70.0
179 | 8365 8365 51.0
180 | 8369 8369 76.0
181 | 8374 8374 59.0
182 | 8376 8376 84.0
183 | 8378 8378 56.0
184 | 8387 8387 52.0
185 | 8388 8388 60.0
186 | 8389 8389 63.0
187 | 8395 8395 69.0
188 | 8420 8420 56.0
189 | 8422 8422 106.0
190 | 8423 8423 70.0
191 | 8424 8424 46.0
192 | 8426 8426 49.0
193 | 9057 9057 76.0
194 | 9058 9058 101.0
195 | 100000 100000 58.0
196 |
--------------------------------------------------------------------------------
/docs/DATAGUIDE.md:
--------------------------------------------------------------------------------
1 | # Data Guide
2 |
3 | The minimal requirement is to provide a genotype and a phenotype file. We provide test data in the folder `data`.
4 | permGWAS2 is designed to work with several genotype file formats:
5 |
6 | ## Genotype file
7 | permGWAS needs **fully imputed** genotypes. We support our custom HDF5/H5/H5PY file, CSV, PLINK and binary PLINK files.
8 | We recommend to use permGWAS2 with HDF5/H5/H5PY files. For this we provide a function to create an H5 file which satisfies
9 | our requirements and takes CSV, PLINK and binary PLINK genotype files as an input. For more info on how to use this function,
10 | see the section **Create H5 file** below.
11 |
12 | ### HDF5/H5/H5PY
13 | The file has to contain the following keys:
14 |
15 | - snps: genotype matrix, additively encoded (012)
16 | - sample_ids: vector containing corresponding sample ids
17 | - position_index: vector containing the positions of all SNPs
18 | - chr_index: vector containing the corresponding chromosome number
19 |
20 | ```shell
21 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv
22 | ```
23 |
24 | ### CSV
25 | The **first column** should be the **sample ids**. The **column names** should be the **SNP identifiers** in the form
26 | "CHR_POSITION" (e.g. Chr1_657). The values should be the genotype matrix in **additive encoding**.
27 |
28 | ```shell
29 | python3 permGWAS.py -x ./data/x_matrix.csv -y ./data/y_matrix.csv
30 | ```
31 |
32 | ### PLINK
33 | To use PLINK data, a .map and .ped file with the same prefix need to be in the same folder.
34 | To run permGWAS2 with PLINK files, you can use PREFIX.map or PREFIX.ped as option for the genotype file.
35 |
36 | ```shell
37 | python3 permGWAS.py -x ./data/x_matrix.map -y ./data/y_matrix.pheno
38 | ```
39 |
40 | ### binary PLINK
41 | To use binary PLINK data, a .bed, .bim and .fam file with the same prefix need to be in the same folder.
42 | To run permGWAS2 with binary PLINK files, you can use PREFIX.bed, PREFIX.bim or PREFIX.fam as option for the genotype file.
43 |
44 |
45 | ## Phenotype file
46 | permGWAS2 currently only accepts CSV, PHENO and TXT files for the phenotype. Here the **first column** should contain
47 | the **sample ids**. The remaining columns should contain the phenotype values with the phenotype name as column name.
48 | For TXT and PHENO files it is assumed that the values are separated by a **single space**. The samples need not be in
49 | the same order as in the genotype file. permGWAS2 automatically matches genotype and phenotype and discards all samples
50 | for which only one of the two is available.
51 | It is possible to run permGWAS with several traits one after another as long as they are stored in the same
52 | phenotype file.
53 |
54 | ```shell
55 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -trait phenotype_value phenotype_2
56 | ```
57 | You can also run permGWAS2 for all available phenotypes in your phenotype file:
58 |
59 | ```shell
60 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -trait all
61 | ```
62 |
63 | ## Kinship file
64 | Per default permGWAS2 computes the realized relationship kernel as kinship matrix.
65 | It is also possible to provide a kinship matrix. Currently, permGWAS only accepts CSV, H5, HDF5, H5PY files as
66 | kinship file. For CSV files the first column should contain the sample ids. For H5, HDF5, H5PY files the kinship
67 | matrix should have the key 'kinship' and the corresponding sample ids the key 'sample_ids'.
68 | The sample ids need to match those of the genotype matrix.
69 |
70 | ```shell
71 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -k ./data/k_matrix.csv
72 | ```
73 |
74 | ## Covariates file
75 | It is possible to run permGWAS2 with covariates. If no covariates file is provided, only the intercept will be used as
76 | fixed effect. Currently, permGWAS2 only accepts CSV files for covariates. Here the first column should contain the
77 | sample ids. The sample ids must match those of the phenotype file.
78 |
79 | ```shell
80 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -cov ./data/cov_matrix.csv
81 | ```
82 |
83 | ## Create H5 file
84 | We provide a function to create an H5 file which satisfies our requirements. It is possible to create the H5 file based on
85 | CSV, PLINK or binary PLINK files, which have to fulfil the same requirements as above. The function takes the genotype
86 | file path via the option `-x`. Additionally, one can specify a new directory to save the H5 file via `-sd`. If the save
87 | directory is not specified, the new file will be stored in the same directory as the input file.
88 |
89 | ```shell
90 | python3 create_h5_file.py -x ./data/x_matrix.map -sd ./data/test
91 | ```
--------------------------------------------------------------------------------
/docs/INSTALLATION.md:
--------------------------------------------------------------------------------
1 | # Requirements
2 |
3 | To ensure a stable working environment, we recommend using [Docker](https://www.docker.com). To follow this recommendation,
4 | Docker needs to be installed and running on your machine. We provide a Dockerfile based on CUDA 11.5 and Ubuntu 20.04.
5 |
6 | If you want to use permGWAS2 without Docker, you need to install all packages mentioned in the
7 | [requirements file](../Docker/requirements.txt).
8 |
9 | # Installation Guide
10 |
11 | 1. Clone the repository into the directory where you want to set up the project
12 |
13 | ```shell
14 | git clone https://github.com/grimmlab/permGWAS.git
15 | ```
16 |
17 | 2. To use permGWAS2 within a Docker environment, navigate to `Docker` and build a Docker image using the provided Dockerfile.
18 |
19 | ```shell
20 | cd permGWAS/Docker
21 | docker build -t IMAGENAME .
22 | ```
23 |
24 | 3. Run an interactive Docker container based on the created image.\
25 | You have to mount the directory where the repository is located on your machine in the Docker container.
26 | If you want to work on GPU, specify the GPUs to mount.
27 |
28 | ```shell
29 | docker run -it -v PATH_TO_REPO_FOLDER:/NAME_OF_DIRECTORY_IN_CONTAINER --gpus device=DEVICE_NUMBER --name CONTAINERNAME IMAGENAME
30 | ```
31 |
32 | ### Example
33 |
34 | 1. Assume our repository is located in a folder called `/myhome` and we want to name our image `permGWAS_image`
35 |
36 | ```shell
37 | cd /myhome/permGWAS/Docker
38 | docker build -t permGWAS_image .
39 | ```
40 |
41 | 2. Further, assume that we want to call our container `permGWAS_container`, our data is located in (subfolders of)
42 | `/myhome` (i.e. we only need to mount one directory) and we want to use GPU 1. Then we have to run the following command:
43 |
44 | ```shell
45 | docker run -it -v /myhome/:/myhome_in_container/ --gpus device=1 --name permGWAS_container permGWAS_image
46 | ```
47 |
48 | 3. If we need to mount a second directory (e.g. we want to save our results in a different folder called `/results`),
49 | we can run the following:
50 |
51 | ```shell
52 | docker run -it -v /myhome/:/myhome_in_container/ -v /results/:/results/ --gpus device=1 --name permGWAS_container permGWAS_image
53 | ```
54 |
55 | With this the setup is finished. For details on how to run permGWAS, see our [Quickstart Guide](./QUICKSTART.md).
--------------------------------------------------------------------------------
/docs/OPTIONS.md:
--------------------------------------------------------------------------------
1 | # Optional settings
2 | ## Minor allele frequency (MAF)
3 | It is possible to filter the markers for minor allele frequency. For this use the flag `-maf` and specify an integer
4 | value between 0 and 30. For example to remove all SNPs with MAF<10%:
5 | ```shell
6 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -maf 10
7 | ```
8 | Per default permGWAS2 does not filter for MAF.
9 |
10 | ## GPU usage
11 | For faster computations, permGWAS2 supports GPU usage. If one or several GPUs are available permGWAS2 will per default use
12 | the GPU device 0 for its computations. If no GPUs are available, permGWAS will perform all computations on CPUs only.
13 | To change the GPU you can use the flag `-device` and specify the number of the GPU to use. If you do NOT want to use
14 | GPUs, although they are available, you can use the flag `-disable_gpu`:
15 | ```shell
16 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -device 1
17 |
18 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -disable_gpu
19 | ```
20 |
21 | ## Batch size
22 | It is possible to adjust the batch size for the simultaneous computation of univariate tests via `-batch`. Here the
23 | default is set to 50000. If you run into memory errors while using permGWAS2 we suggest reducing the batch size.
24 | ```shell
25 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -batch 10000
26 | ```
27 | When using permGWAS2 with permutations, several univariate tests will be computed for all permutations at once.
28 | To prevent running into memory errors, one can adjust the batch size for permutations separately via `-batch_perm`.
29 | Here the default value is set to 1000. We suggest adjusting this parameter depending on the number of samples and number
30 | of permutations. For more information about permutations see [permGWAS2 with permutations](./PERMUTATIONS.md)
31 | ```shell
32 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -perm 100 -batch_perm 500
33 | ```
34 |
35 | ## Batch-wise loading of genotype
36 | As memory is a limiting factor, permGWAS2 is also capable to load the genotype matrix batch-wise from file under certain
37 | conditions. For this you have to provide a precomputed kinship matrix (see [DataGuide](./DATAGUIDE.md)) and the genotype matrix
38 | must be provided via an HDF5 file (see [DataGuide](./DATAGUIDE.md) for a function to create an HDF5 file).
39 |
40 | However, if memory is not an issue, we recommend loading the genotype file completely to improve the speed of permGWAS2.
41 | When no precomputed kinship is provided, the genotype matrix will be loaded completely per default. It is also possible
42 | to force permGWAS2 to load the genotype matrix completely even if a kinship is provided via the flag `-load_genotype`.
43 | ```shell
44 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -load_genotype
45 | ```
46 |
47 | ## Model (coming soon)
48 | permGWAS computes test statistics and p-values based on a Linear Mixed Model (LMM). In the future there will be other
49 | models available. The model can be chosen via `-model`. Currently, only `lmm` is available.
50 |
51 | ## Non-additive encoding
52 | permGWAS assumes that the genotypes are in additive encoding (i.e. number of minor alleles) and produces an error if the genotypes
53 | are encoded differently. If your data is **not additively encoded**, you can use the flag `-not_add`. For example if you
54 | are working with other data than SNP data. However, our framework was developed for SNP data, and we give no guarantee that it
55 | works for other purposes.
56 |
57 |
58 | See [Quickstart](./QUICKSTART.md), [permGWAS2 with permutations](./PERMUTATIONS.md) and [Create plots](./PLOTS.md) for
59 | detailed explanations of other flags and options.
60 |
61 | ## Overview of all flags and options
62 | |**flag**|**description**|
63 | |---|---|
64 | |-x (--genotype_file) | absolute or relative path to genotype file |
65 | |-y (--phenotype_file) | absolute or relative path to phenotype file |
66 | |-trait (--y_name)| name of phenotype (column) to be used in phenotype file, optional, default is "phenotype_value"|
67 | |-k (--kinship_file) | absolute or relative path to kinship file, optional|
68 | |-cov (--covariate_file) | absolute or relative path to covariates file, optional|
69 | |-cov_list (--covariate_list) | names of covariates to use from covariate_file, optional |
70 | |-maf (--maf_threshold) | minor allele frequency threshold as percentage value, optional, default is 0|
71 | |-load_genotype | choose whether to load full genotype from file or batch-wise during computations, optional, default is False|
72 | |-config (--config_file) | full path to yaml config file|
73 | |-model | specify model name, only relevant if you define your own models, currently only lmm is available|
74 | |-out_dir | name of the directory result-files should be stored in, optional, if not provided, files will be stored in folder "results" in current directory|
75 | |-out_file | NAME of result files, will be stored as p_values_NAME and min_p_values_NAME, optional, if not provided name of phenotype will be used|
76 | |-disable_gpu | use if you want to perform computations on CPU only though GPU would be available|
77 | |-device | GPU device to be used, optional, default is 0|
78 | |-perm | number of permutations to be performed, optional, default is 0|
79 | |-perm_method | method to use for permutations: y - permute only y, x - permute y and kinship matrix, default is x|
80 | |-adj_p_value | additionally compute permutation-based adjusted p-values and store them in the p-value file, optional default is False|
81 | |-batch (--batch_size) | number of SNPs to work on simultaneously, optional, default is 50000|
82 | |-batch_perm (--perm_batch_size) | number of SNPs to work on simultaneously while using permutations, optional, default is 1000|
83 | |-mplot (--plot, --manhattan)| creates Manhattan plot, optional|
84 | |-qqplot | creates QQ-plot, optional|
85 | |-not_add | use when genotype is not in additive encoding|
86 |
--------------------------------------------------------------------------------
/docs/PERMUTATIONS.md:
--------------------------------------------------------------------------------
1 | # permGWAS2 with permutations
2 |
3 | The main purpose of permGWAS2 is to perform GWAS with permutation-based thresholds. To use permGWAS2 with permutations,
4 | you have to specify the number of permutations *q* via the flag `-perm`:
5 | ```shell
6 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -perm 100
7 | ```
8 | This creates an additional result file `min_p_values_NAME.csv` containing for each permutation the seed and the minimal
9 | p-value. Additionally, the `summary_statistics_NAME.txt` output file now contains permutation-based significance
10 | thresholds for common significance levels $\alpha$.
11 |
12 | ### General workflow of permGWAS2 with permutations
13 | 1. Compute p-values for all available SNPs during normal GWAS run
14 | 2. Create *q* permutations
15 | 3. Compute the test statistic for each permutation and SNP in batches
16 | 4. For each permutation find the maximal test statistic over all SNPs and compute the corresponding minimal p-value
17 | 5. The permutation-based threshold is given as the ($1-\alpha$)th percentile for a significance level $\alpha$
18 | (*maxT/minP method*)
19 |
20 | ### Additional settings
21 | - permGWAS2 supports two different permutation strategies which can be selected via the flag `-perm_method`:
22 | 1. `x`(default): permutes the fixed effects matrix including SNP of interest and covariates (equivalent to permuting
23 | the phenotype and covariance matrix). This method considers the population structure while permuting.
24 | 2. `y`: only permute the phenotype vector. This method is faster but breaks the population structure between the
25 | samples
26 | ```shell
27 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -perm 100 -perm_method x
28 |
29 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -perm 100 -perm_method y
30 | ```
31 | - permGWAS2 supports computations on GPUs. If GPUs are available, it will automatically use the 0th GPU. If no GPUs are
32 | available, permGWAS will perform all computations on CPUs only. To change the GPU you can use the flag `-device` and
33 | specify the number of the GPU to use. If you do NOT want to use GPUs, although they are available, you can use the flag
34 | `-disable_gpu`:
35 | ```shell
36 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -perm 100 -device 1
37 |
38 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -perm 100 -disable_gpu
39 | ```
40 | - Since permGWAS2 computes the test statistics for different SNPs and permutations simultaneously in batches, the
41 | available VRAM poses a limitation. To avoid running into memory errors (when using GPUs), you can manually adjust the
42 | batch-size, i.e. the number of SNPs to be processed simultaneously for all permutations, via the flag `-batch_perm`
43 | (The default are 1000 SNPs):
44 | ```shell
45 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -perm 100 -batch_perm 500
46 | ```
47 | - permGWAS is also able to compute permutation-based adjusted p-values and save them in the p_value output file via the
48 | flag `-adj_p_value`. However, it should be noted that in order to get meaningful adjusted p-values, millions of
49 | permutations are needed.
50 | ```shell
51 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv -perm 100 -adj_p_value
52 | ```
53 |
--------------------------------------------------------------------------------
/docs/PLOTS.md:
--------------------------------------------------------------------------------
1 | # Create plots
2 |
3 | permGWAS is able to create Manhattan and QQ-plots during a GWAS run and from existing p-value files via the
4 | `create_plot.py` script.
5 |
6 | ## Manhattan plot
7 |
8 |
9 | - While running permGWAS, you can use the flag `-mplot` to generate and save a Manhattan plot with Bonferroni
10 | significance threshold for significance level $\alpha=0.05$. If you use permGWAS2 with permutations, additionally the
11 | permutation-based threshold will be plotted.
12 | - If you already have result files generated by permGWAS, you can also create a Manhattan plot afterward. You only need
13 | to specify the p-value file (relative or absolute path) and use the flag `-mplot`:
14 | ```shell
15 | python3 create_plot.py -p_val PATH_TO_RESULT_P_VALUE_FILE -mplot
16 | ```
17 | - By default, it uses a significance level of 5%. You can change it via the flag `-sig_level`, which expects an integer
18 | value, e.g.
19 | ```shell
20 | python3 create_plot.py -p_val PATH_TO_RESULT_P_VALUE_FILE -mplot -sig_level 1
21 | ```
22 | - If you have a corresponding minimal p-value file available, you can additionally plot the permutation-based significance
23 | threshold by giving the path to the file via the flag `-min_p_val`:
24 | ```shell
25 | python3 create_plot.py -p_val PATH_TO_RESULT_P_VALUE_FILE -min_p_val PATH_TO_MIN_P_VALUE_FILE -mplot
26 | ```
27 | - The resulting Manhattan plot will be saved in the same folder where the p-value file is stored, unless you specify a
28 | different directory via `-out_dir`. If no other name is specified via `-out_file`, the plot will be named
29 | `manhattan_PHENOTYPE_NAME.png`.
30 |
31 |
32 | ### QQ-plot
33 |
34 |
35 | - While running permGWAS, you can use the flag `-qqplot` to generate and save a simple QQ-plot including the inflation
36 | factor lambda.
37 | - To generate a QQ-plot afterward based on existing p-value result files, you only need to specify the p-value file
38 | (relative or absolute path) and use the flag `-qqplot`:
39 | ```shell
40 | python3 create_plot.py -p_val PATH_TO_RESULT_P_VALUE_FILE -qqplot
41 | ```
42 | - The resulting QQ-plot will be saved in the same folder where the p-value file is stored, unless you specify a
43 | different directory via `-out_dir`. If no other name is specified via `-out_file`, the plot will be named
44 | `qq_plot_PHENOTYPE_NAME.png`.
--------------------------------------------------------------------------------
/docs/QUICKSTART.md:
--------------------------------------------------------------------------------
1 | # Quickstart Guide
2 |
3 | ## Simple workflow using Docker
4 |
5 | 1. Create a new Docker container using our [Installation Guide](./INSTALLATION.md) or start an existing container with:
6 |
7 | ```shell
8 | docker start -i CONTAINERNAME
9 | ```
10 |
11 | 2. Navigate to the directory where the permGWAS2 repository is located:
12 |
13 | ```shell
14 | cd /REPO_DIRECTORY/permGWAS
15 | ```
16 |
17 | 3. Run the script with the test data provided in the `./data` folder:
18 |
19 | ```shell
20 | python3 permGWAS.py -x ./data/x_matrix.h5 -y ./data/y_matrix.csv
21 | ```
22 |
23 | To use permGWAS2 without Docker, simply omit the first step.
24 |
25 |
26 | ## Basic settings
27 | ### 1. Input Data
28 | Details on the supported data types can be found in the [Data Guide](./DATAGUIDE.md).
29 | ###### Genotype & Phenotype
30 | - The minimal requirement is to provide a genotype and a phenotype file (as relative or absolute paths) via the
31 | flags `-x` and `-y`, respectively.
32 | - By default, permGWAS assumes that the phenotype in the phenotype file is called `phenotype_value`. You can specify a
33 | different name via the flag `-trait`:
34 | ```shell
35 | python3 permGWAS.py -x PATH_TO_GENOTYPE -y PATH_TO_PHENOTYPE -trait PHENO_NAME
36 | ```
37 | - It is possible to run permGWAS2 for several phenotypes located in the same phenotype file one after another. You can
38 | either specify a list of phenotypes or run permGWAS2 for all available phenotypes in the file by using the key word `all`:
39 | ```shell
40 | python3 permGWAS.py -x PATH_TO_GENOTYPE -y PATH_TO_PHENOTYPE -trait PHENO_1 PHENO_2 PHENO_3
41 |
42 | python3 permGWAS.py -x PATH_TO_GENOTYPE -y PATH_TO_PHENOTYPE -trait all
43 | ```
44 |
45 | ###### Kinship
46 | By default, permGWAS2 computes the realized relationship kernel as kinship matrix. You can use a pre-computed genomic
47 | relationship matrix via the flag `-k`:
48 | ```shell
49 | python3 permGWAS.py -x PATH_TO_GENOTYPE -y PATH_TO_PHENOTYPE -k PATH_TO_KINSHIP
50 | ```
51 |
52 | ###### Covariates
53 | It is possible to run permGWAS2 with additional covariates. To specify the covariate file, use the flag `-cov`.
54 | By default, this uses all available covariates in the file. If you only want to use certain columns/covariates, you
55 | have to use the flag `-cov_list` and specify the covariate names as a list:
56 | ```shell
57 | python3 permGWAS.py -x PATH_TO_GENOTYPE -y PATH_TO_PHENOTYPE -cov PATH_TO_COVARIATE_FILE
58 |
59 | python3 permGWAS.py -x PATH_TO_GENOTYPE -y PATH_TO_PHENOTYPE -cov PATH_TO_COVARIATE_FILE -cov_list COV_1 COV_2 COV_3
60 | ```
61 |
62 | ### 2. Config file
63 | permGWAS2 accepts yaml config files where you can specify all flags and options instead of passing them all separately:
64 | ```shell
65 | python3 permGWAS.py -config ./data/config.yaml
66 | ```
67 | The config file should have the following structure:
68 | ```YAML
69 | ---
70 | genotype_file: "PATH_TO_GENOTYPE"
71 | phenotype_file: "PATH_TO_PHENOTYPE"
72 | trait: "PHENO_NAME"
73 | kinship_file: "PATH_TO_KINSHIP"
74 | covariate_file: "PATH_TO_COVARIATE_FILE"
75 | covariate_list:
76 | - "COV_1"
77 | - "COV_2"
78 | - "COV_3"
79 | ```
80 |
81 | ### 3. Output files
82 | Per default permGWAS2 creates a CSV output file and saves it in a directory called `results`. You can also specify a
83 | different directory for the output files via the flag `-out_dir`. The output file will be saved under the name
84 | `p_values_NAME.csv`, where NAME will be the phenotype name by default, but can also be changed via `-out_file`.
85 | ```shell
86 | python3 permGWAS.py -x PATH_TO_GENOTYPE -y PATH_TO_PHENOTYPE -out_dir RESULT_FILE_DIR -out_file RESULT_FILE_NAME
87 | ```
88 | The result file contains for each analyzed SNP:
89 | - CHR: chromosome number
90 | - POS: position within chromosome
91 | - p_value: computed p-value
92 | - test_stat: computed test statistic
93 | - maf: minor allele frequency of SNP
94 | - SE: standard error
95 | - effect_size: coefficient beta
96 |
97 | Additionally, a TXT file with summary statistics will be saved.
98 | This file contains the estimates of the variance components of the null model,
99 | the narrow-sense heritability, the Bonferroni threshold and,
100 | if activated, the permutation-based threshold.
101 |
102 |
103 | ## Further options
104 | The table below shows all available flags. For detailed explanations of further flags and options go to
105 | [permGWAS2 with permutations](./PERMUTATIONS.md), [Create plots](./PLOTS.md) and [Optional settings](./OPTIONS.md).
106 |
107 | |**flag**|**description**|
108 | |---|---|
109 | |-x (--genotype_file) | absolute or relative path to genotype file |
110 | |-y (--phenotype_file) | absolute or relative path to phenotype file |
111 | |-trait (--y_name)| name of phenotype (column) to be used in phenotype file, optional, default is "phenotype_value"|
112 | |-k (--kinship_file) | absolute or relative path to kinship file, optional|
113 | |-cov (--covariate_file) | absolute or relative path to covariates file, optional|
114 | |-cov_list (--covariate_list) | names of covariates to use from covariate_file, optional |
115 | |-maf (--maf_threshold) | minor allele frequency threshold as percentage value, optional, default is 0|
116 | |-load_genotype | choose whether to load full genotype from file or batch-wise during computations, optional, default is False|
117 | |-config (--config_file) | full path to yaml config file|
118 | |-model | specify model name, only relevant if you define your own models, currently only lmm is available|
119 | |-out_dir | name of the directory result-files should be stored in, optional, if not provided, files will be stored in folder "results" in current directory|
120 | |-out_file | NAME of result files, will be stored as p_values_NAME and min_p_values_NAME, optional, if not provided name of phenotype will be used|
121 | |-disable_gpu | use if you want to perform computations on CPU only though GPU would be available|
122 | |-device | GPU device to be used, optional, default is 0|
123 | |-perm | number of permutations to be performed, optional, default is 0|
124 | |-perm_method | method to use for permutations: y - permute only y, x - permute y and kinship matrix, default is x|
125 | |-adj_p_value | additionally compute permutation-based adjusted p-values and store them in the p-value file, optional default is False|
126 | |-batch (--batch_size) | number of SNPs to work on simultaneously, optional, default is 50000|
127 | |-batch_perm (--perm_batch_size) | number of SNPs to work on simultaneously while using permutations, optional, default is 1000|
128 | |-mplot (--plot, --manhattan)| creates Manhattan plot, optional|
129 | |-qqplot | creates QQ-plot, optional|
130 | |-not_add | use when genotype is not in additive encoding|
--------------------------------------------------------------------------------
/docs/manhattan.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/grimmlab/permGWAS/3f7a1c2e3e4c63281f5719425ff9ac405f8d9cfc/docs/manhattan.png
--------------------------------------------------------------------------------
/docs/qq_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/grimmlab/permGWAS/3f7a1c2e3e4c63281f5719425ff9ac405f8d9cfc/docs/qq_plot.png
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ["_base_model", "lmm"]
--------------------------------------------------------------------------------
/models/_base_model.py:
--------------------------------------------------------------------------------
1 | import abc
2 | import torch
3 | import pathlib
4 | import pandas as pd
5 | import numpy as np
6 |
7 | from preprocess import data_loader
8 | from postprocess import plot_functions
9 |
10 |
class BaseModel(abc.ABC):
    """Abstract base class for GWAS models.

    Stores the dataset, computation settings and result containers shared by all
    concrete models, and provides helpers for seeding/applying permutations,
    saving results and creating plots. Subclasses must implement :meth:`gwas`
    and :meth:`perm_gwas`.
    """

    def __init__(self, dataset: data_loader.Dataset, batch_size: int, device: torch.device, perm: int = None,
                 perm_batch_size: int = None):
        # computation setup
        self.dataset = dataset
        self.batch_size = batch_size
        self.device = device
        self.perm = perm
        self.perm_batch_size = perm_batch_size
        # variance components of the LMM
        self.v_g = None  # genetic variance component
        self.v_e = None  # residual variance component
        self.delta = None  # ratio v_e/v_g
        # per-SNP statistics, filled during the GWAS run
        self.effect_size = None  # effect sizes for all SNPs
        self.SE = None  # standard errors for all SNPs
        self.test_stat = None  # test statistics for all SNPs
        self.p_value = None  # p-values for all SNPs
        # permutation results
        self.seeds = None  # seeds for permutation with numpy generator
        self.perm_p_val = None  # permutation-based p-values
        self.min_p_value = None  # minimal p-values for all permutations

    @abc.abstractmethod
    def gwas(self):
        """
        Function to perform batch-wise computation of univariate test

        """

    @abc.abstractmethod
    def perm_gwas(self, **kwargs):
        """
        Function to perform batch-wise computation of permutation-based test

        """

    # general methods
    def perm_seeds(self) -> np.array:
        """
        Draw distinct random seeds, one per permutation.

        :return: array with seeds
        """
        return np.random.default_rng().choice(1000000, self.perm, replace=False)

    def permute(self, data: torch.tensor) -> torch.tensor:
        """
        Stack permuted copies of the input data, one per seed in self.seeds.

        :param data: input data to permute of shape (n,c) or (n)

        :return: tensor with permuted data of shape (p,n,c) or (n,p)
        """
        data = data.to(torch.device("cpu"))
        # each permutation is reproducible via its own seeded generator
        shuffled = [np.random.default_rng(seed=seed).permutation(data, axis=0) for seed in self.seeds]
        stacked = torch.tensor(np.array(shuffled), dtype=torch.float64, device=self.device)
        # for 1-dim input return shape (n,p) instead of (p,n)
        return torch.t(stacked) if data.ndim == 1 else stacked

    def save_results(self, data_dir: pathlib.Path, filename: str):
        """
        Save p-values results to csv file as p_values_filename. If permutations were computed, also save
        minimal p-values as min_p_values_filename.

        :param data_dir: full path to results directory
        :param filename: name of results file
        """
        table = pd.DataFrame({'CHR': self.dataset.chromosomes,
                              'POS': self.dataset.positions,
                              'p_value': self.p_value,
                              'test_stat': self.test_stat,
                              'maf': self.dataset.maf,
                              'SE': self.SE,
                              'effect_size': self.effect_size})
        if self.perm_p_val is not None:
            # adjusted p-values only exist after a permutation run
            table['adjusted_p_val'] = self.perm_p_val
        table.to_csv(data_dir.joinpath('p_values_' + filename), index=False)
        if self.min_p_value is not None:
            minima = pd.DataFrame({'seed': self.seeds,
                                   'min_p_val': self.min_p_value})
            minima.to_csv(data_dir.joinpath('min_p_values_' + filename), index=False)

    def manhattan_plot(self, data_dir: pathlib.Path, filename: str, sig_level: int = 5):
        """
        Save Manhattan plot as manhattan_FILENAME.png to data_dir

        :param data_dir: full path to save directory
        :param filename: name of file
        :param sig_level: significance level for Bonferroni and perm thresholds, default is 5
        """
        plot_data = pd.DataFrame({'CHR': self.dataset.chromosomes,
                                  'POS': self.dataset.positions,
                                  'p_value': self.p_value})
        plot_functions.manhattan_plot(df=plot_data, data_dir=data_dir, filename=filename,
                                      min_p_values=self.min_p_value, sig_level=sig_level)

    def qq_plot(self, data_dir: pathlib.Path, filename: str):
        """
        Save QQ-plot as qq_plot_FILENAME.png to data_dir

        :param data_dir: full path to save directory
        :param filename: name of file
        """
        plot_functions.qq_plot(p_values=self.p_value, data_dir=data_dir, filename=filename)
119 |
--------------------------------------------------------------------------------
/models/lmm.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import time
4 | import scipy.stats as stats
5 |
6 | from . import _base_model
7 | from preprocess import data_loader
8 | from optimize import brent
9 |
10 |
11 | class LMM(_base_model.BaseModel):
12 |
13 | def __init__(self, dataset: data_loader.Dataset, batch_size: int, device: torch.device, perm: int = None,
14 | perm_batch_size: int = None):
15 | super().__init__(dataset=dataset, batch_size=batch_size, device=device, perm=perm,
16 | perm_batch_size=perm_batch_size)
17 | self.D = None # eigenvalues of K
18 | self.U = None # unitary matrix of eigenvectors of K
19 | self.freedom_deg = None # adjusted degrees of freedom = n_samples - degrees of freedom = int
20 | self.Uy = None # y after linear transformation with eigenvectors
21 | self.UZ = None # fixed effects after linear transformation with eigenvectors
22 |
    def gwas(self):
        """
        Perform batch-wise computation of univariate test with linear mixed model (EMMAX):
        (1) compute spectral decomposition of K=UDU'
        (2) transform data: U'y, U'Z
        (3) estimate delta and compute variance components
        (4) calculate residual sum of squares of null model
        (5) batch-wise:
            (a) linearly transform marker
            (b) calculate effect size, residual sum of squares and standard error
            (c) calculate test statistic
        (6) calculate p-values
        Dataset:
            X: genotype matrix of shape (n,m) or (n,b) if batch-wise
            y: phenotype vector of shape (n)
            K: kinship matrix of shape (n,n)
            fixed: vector/matrix of fixed effects of shape (n) or (n,c)
        """
        start = time.time()
        # degrees of freedom of the null model: n_samples minus number of fixed effects
        self.freedom_deg = self.dataset.n_samples - self.dataset.fixed.shape[1]
        # get spectral decomposition K = U D U'
        self.D, self.U = torch.linalg.eigh(self.dataset.K)
        # linearly transform data, i.e. compute U'y and U'Z for fixed effects Z
        self.Uy = self.transform_input(X=self.dataset.y, U=self.U)  # shape (n)
        self.UZ = self.transform_input(X=self.dataset.fixed, U=self.U)  # shape (n) or (n,c)
        # estimate delta = v_e/v_g on a log-spaced grid and compute variance components (REML)
        self.delta = self.estimate_delta(gridlen=100, logdelta_min=-10, logdelta_max=10, reml=True)
        D = self.D + self.delta
        ZD = self._zd(UZ=self.UZ, D=D)
        ZDZ = self._zdz(UZ=self.UZ, ZD=ZD)
        self.v_g, self.v_e = self.compute_var_components(D=D, UZ=self.UZ, ZD=ZD, ZDZ=ZDZ, reml=True)
        # calculate residual sum of squares of null model
        RSS_0 = self.get_rss_h0()  # shape: (1)
        # alternative model contains one additional fixed effect (the tested SNP)
        self.freedom_deg -= 1
        # in batches:
        SE = []
        effect_size = []
        test_stat = []
        for batch in range(int(np.ceil(self.dataset.n_snps / self.batch_size))):
            # set bounds for SNP batch
            lower_bound, upper_bound = self._bounds(batch_size=self.batch_size, batch=batch)
            # load and transform batch of SNPs
            US = self._s_matrix(lower_bound=lower_bound, upper_bound=upper_bound)  # shape: (n,b)
            # transform data
            US = self.transform_input(X=US, U=self.U)
            # calculate effect size, residual sum of squares and standard error
            RSS_1, stds, betas = self.get_rss_and_se(D=D, S=US, ZD=ZD, ZDZ=ZDZ)
            # collect results on CPU so GPU memory stays free for the next batch
            SE.append(stds.to(torch.device("cpu")))
            effect_size.append(betas.to(torch.device("cpu")))
            # calculate test statistic
            test_stat.append(self.get_f_score(rss0=RSS_0, rss1=RSS_1).to(torch.device("cpu")))
            # free GPU space
            if self.device.type != "cpu":
                with torch.cuda.device(self.device):
                    del RSS_1
                    del US
                    del stds
                    del betas
                    torch.cuda.empty_cache()
        self.SE = torch.cat(SE, dim=0)  # shape: (m)
        self.effect_size = torch.cat(effect_size, dim=0)  # shape: (m)
        self.test_stat = torch.cat(test_stat, dim=0)  # shape: (m)
        time_test_stats = time.time()
        print("Have test statistics of %d SNPs. Elapsed time: %f" % (self.test_stat.shape[0], time_test_stats - start))
        print("Calculate P-values now")
        # compute p-values
        self.p_value = torch.tensor(list(map(self.get_p_value, self.test_stat)))
        print("Have P-values. Elapsed time: ", time.time() - time_test_stats)
        # free GPU memory: K and the rotated intermediates are no longer needed;
        # U, Uy, UZ and delta are kept since perm_gwas() reuses them
        if self.device.type != "cpu":
            with torch.cuda.device(self.device):
                del D
                del ZD
                del ZDZ
                del self.dataset.K
                torch.cuda.empty_cache()
98 |
    def perm_gwas(self, perm_method: str = 'x', adj_p_value: bool = False):
        """
        Perform batch-wise computation of permutation-based test with linear mixed model (EMMAX):
        reuse spectral decomposition of K=UDU'
        perm method y:
            (1) permute phenotype p times
            (2) transform data: U'y
            (3) estimate delta and compute variance components for each permutation
            (4) calculate residual sum of squares of null model
            (5) batch-wise:
                (a) linearly transform marker
                (b) calculate residual sum of squares
                (c) calculate test statistic
        perm method x:
            (1) permute fixed effects p times
            (2) transform data: U'Z
            (3) estimate delta and compute variance components for each permutation
            (4) calculate residual sum of squares of null model
            (5) batch-wise:
                (a) permute marker p times
                (b) linearly transform marker
                (c) calculate residual sum of squares
                (d) calculate test statistic
        (6) calculate minimal p-values for Westfall-Young permutation-based threshold
        optional: (7) calculate permutation-based p-values
        Dataset:
            X: genotype matrix of shape (n,m) or (n,b) if batch-wise
            y: phenotype vector of shape (n)
            K: kinship matrix of shape (n,n)
            fixed: vector/matrix of fixed effects of shape (n) or (n,c)

        :param perm_method: y to permute phenotype or x to permute fixed effects + marker
        :param adj_p_value: if True compute adjusted p-values, default is False

        :raises Exception: if LMM.gwas() was not run first, or perm_method is neither 'x' nor 'y'
        """
        start = time.time()
        if self.test_stat is None:
            # the observed test statistics are needed for the adjusted p-values below
            raise Exception('Need to first calculate true test statistics using LMM.gwas().')
        self.freedom_deg = self.dataset.n_samples - self.dataset.fixed.shape[1]
        self.seeds = self.perm_seeds()
        if perm_method == 'y':
            # compute permutations of y
            self.Uy = self.permute(data=self.dataset.y)  # shape: (n,p)
            self.Uy = torch.unsqueeze(torch.t(self.transform_input(X=self.Uy, U=self.U)), 2)  # shape: (p,n,1)
            # estimate variance components for each permutation separately
            self.delta = self.estimate_delta_perm(gridlen=100, logdelta_min=-10, logdelta_max=10, reml=True)
            self.D = self._d_delta(delta=self.delta, batch_size=self.perm)  # shape: (p,1,n)
            self.UZ = self.get_3d_copy(v=self.UZ, batch_size=self.perm)  # shape: (p,n,c)
            ZD = self._zd(UZ=self.UZ, D=self.D)  # shape: (p,c,n)
            ZDZ = self._zdz(UZ=self.UZ, ZD=ZD)  # shape: (p,c,c)
            v_g, _ = self.compute_var_components(D=self.D, UZ=self.UZ, ZD=ZD, ZDZ=ZDZ, reml=True)  # shape: (p)
        elif perm_method == 'x':
            self.Uy = self.get_3d_copy(v=self.Uy, batch_size=self.perm)  # shape: (p,n,1)
            if self.dataset.fixed.shape[1] > 1:
                # non-trivial covariates present: permute and transform fixed effects
                self.UZ = self.permute(data=self.dataset.fixed)  # shape: (p,n,c)
                self.UZ = self.transform_input(X=self.UZ, U=self.U)  # shape: (p,n,c)
                # estimate variance components per permutation, since U'Z changed
                self.delta = self.estimate_delta_perm(gridlen=100, logdelta_min=-10, logdelta_max=10, reml=True)
                self.D = self._d_delta(delta=self.delta, batch_size=self.perm)  # shape: (p,1,n)
                ZD = self._zd(UZ=self.UZ, D=self.D)  # shape: (p,c,n)
                ZDZ = self._zdz(UZ=self.UZ, ZD=ZD)  # shape: (p,c,c)
                v_g, _ = self.compute_var_components(D=self.D, UZ=self.UZ, ZD=ZD, ZDZ=ZDZ, reml=True)  # shape: (p)
            else:
                # intercept-only model: permuting Z has no effect, so reuse UZ, delta, sigma and get 3D copies
                self.D = self._d_delta(delta=self.delta, batch_size=self.perm)  # shape: (p,1,n)
                self.UZ = self.get_3d_copy(v=self.UZ, batch_size=self.perm)  # shape: (p,n,c)
                ZD = self._zd(UZ=self.UZ, D=self.D)  # shape: (p,c,n)
                ZDZ = self._zdz(UZ=self.UZ, ZD=ZD)  # shape: (p,c,c)
                v_g = self.v_g.repeat(self.perm)  # shape: (p)
        else:
            raise Exception('Choose either permutation method x or y.')
        # residual sum of squares of the null model, one copy per permutation
        RSS_0 = self.get_rss_h0().repeat(self.perm)  # shape: (p)
        # alternative model has one extra parameter (the marker)
        self.freedom_deg -= 1
        if self.device.type != "cpu":
            # free GPU memory of tensors no longer needed before the batch loop
            with torch.cuda.device(self.device):
                del self.delta
                del self.dataset.y
                del self.dataset.fixed
                torch.cuda.empty_cache()
        var_comp_time = time.time()
        print("Have variance components. Elapsed time: ", var_comp_time - start)
        test_stat = []
        for batch in range(int(np.ceil(self.dataset.n_snps / self.perm_batch_size))):
            # set bounds for SNP batch
            lower_bound, upper_bound = self._bounds(batch_size=self.perm_batch_size, batch=batch)
            # load and transform batch of SNPs
            print("\rCalculate perm test statistics for SNPs %d to %d" % (lower_bound, upper_bound), end='')
            if perm_method == 'y':
                US = self._s_matrix(lower_bound=lower_bound, upper_bound=upper_bound, save_meta=False)  # shape: (n,b)
                # transform data
                US = self.transform_input(X=US, U=self.U)
                # get 3D copy of S for permutations
                US = self.get_3d_copy(v=US, batch_size=self.perm)  # shape: (p,n,b)
            else:
                # load on CPU first: permuting creates p copies before moving through U'
                US = self._s_matrix(lower_bound=lower_bound, upper_bound=upper_bound, device=torch.device("cpu"),
                                    save_meta=False)  # shape: (n,b)
                US = self.permute(data=US)  # shape: (p,n,b)
                # transform data
                US = self.transform_input(X=US, U=self.U)  # shape: (p,n,b)
            # calculate residual sum of squares
            RSS = self.get_rss_perm(S=US, ZD=ZD, ZDZ=ZDZ, v_g=v_g)  # shape: (p,b)
            # calculate test statistics; accumulate on CPU to keep GPU memory bounded
            test_stat.append(self.get_f_score(rss0=torch.t(RSS_0.repeat(RSS.shape[1], 1)),
                                              rss1=RSS).to(torch.device("cpu")))  # shape: (p,b)
            if self.device.type != "cpu":
                with torch.cuda.device(self.device):
                    del RSS
                    del US
                    torch.cuda.empty_cache()
        test_stat = torch.cat(test_stat, dim=1).to(torch.device("cpu"))  # shape: (p,m)
        time_test_stats = time.time()
        print("\nHave perm test statistics. Elapsed time: ", time_test_stats - var_comp_time)
        if adj_p_value:
            # calculate permutation-based (adjusted) p-values
            self.perm_p_val = self.get_perm_p_value(perm_test_stats=test_stat)  # shape: (m)
            print("Have adjusted p-values")
        # calculate Westfall-Young permutation-based threshold
        self.min_p_value = self.get_min_p_value(test_stat=test_stat)  # shape: (p)
        print("Have minimal p-values. Elapsed time: ", time.time() - time_test_stats)
219 |
220 | def estimate_delta(self, gridlen: int = 100, logdelta_min: int = -10, logdelta_max: int = 10,
221 | reml: bool = True) -> torch.tensor:
222 | """
223 | Estimate ratio of variance components delta of LMM
224 | Get grid of evenly divided delta values on logarithmic scale and compute neg loglikelihood for each
225 |
226 | :param gridlen: length of grid, default=100
227 | :param logdelta_min: lower bound for delta (log value), default=-10
228 | :param logdelta_max: upper bound for delta (log value), default=10
229 | :param reml: if True use REML estimate, if False use ML, default=True
230 |
231 | :return: optimal delta
232 | """
233 | deltas = torch.exp(torch.linspace(start=logdelta_min, end=logdelta_max, steps=gridlen + 1, device=self.device))
234 | neglogs = self.negloglikelihood(delta=deltas, Uy=self.Uy, UZ=self.UZ, reml=reml)
235 | neglogs.to(self.device)
236 | delta_opt = self._minimize(Uy=self.Uy, UZ=self.UZ, deltas=deltas, neglogs=neglogs, gridlen=gridlen, reml=reml)
237 | return delta_opt
238 |
239 | def _minimize(self, Uy: torch.tensor, UZ: torch.tensor, deltas: torch.tensor, neglogs: torch.tensor,
240 | gridlen: int = 100, reml: bool = True) -> torch.tensor:
241 | """
242 | minimize negative loglikelihood function with brent search
243 |
244 | :param Uy: transformed phenotype vector U'y
245 | :param UZ: transformed vector of fixed effects U'Z
246 | :param deltas: tensor with possible delta values in ascending order
247 | :param neglogs: tensor with negative loglikelihood value for each delta
248 | :param gridlen: length of delta grid, default=100
249 | :param reml: if True use REML estimate, if False use ML, default=True
250 |
251 | :return: optimal delta
252 | """
253 | tmp = torch.argmin(neglogs)
254 | delta_opt = deltas[tmp]
255 | neglog_opt = neglogs[tmp]
256 | # use brent search for each triple in grid
257 | for i in range(gridlen - 1):
258 | if (neglogs[i + 1] < neglogs[i]) and (neglogs[i + 1] < neglogs[i + 2]):
259 | delta_tmp, neglog_tmp, niters = brent.brent_search(f=self.negloglikelihood, a=deltas[i],
260 | b=deltas[i + 2], x=deltas[i + 1], fx=neglogs[i + 1],
261 | Uy=Uy, UZ=UZ, reml=reml)
262 | if neglog_tmp < neglog_opt:
263 | delta_opt = delta_tmp
264 | neglog_opt = neglog_tmp
265 | return delta_opt
266 |
    def negloglikelihood(self, delta: torch.tensor, UZ: torch.tensor, Uy: torch.tensor, reml: bool = True) \
            -> torch.tensor:
        """
        compute negative loglikelihood for one delta value or several values in parallel

        :param delta: ratio of variance components; 0-dim tensor for a single value, 1-dim for a grid of candidates
        :param UZ: transformed fixed effects U'Z
        :param Uy: transformed phenotype U'y
        :param reml: if True use REML estimate, if False use ML, default=True

        :return: negative loglikelihood (scalar, or one value per delta candidate / permutation)
        """
        if delta.ndim == 0:
            # single delta: shift the eigenvalues of K directly
            D = self.D + delta
        else:
            # grid of deltas: batched copies of D + delta, evaluated in parallel
            D = self._d_delta(delta=delta, batch_size=len(delta))  # shape: (b,1,n)
        ZD = self._zd(UZ=UZ, D=D)
        ZDZ = self._zdz(UZ=UZ, ZD=ZD)
        # GLS effect size and corresponding variance component for this delta
        beta = self._beta(ZDZ=ZDZ, ZDy=torch.matmul(ZD, Uy))
        sigma = self._sigma(D=D, Uy=Uy, UZ=UZ, beta=beta, reml=reml)
        # log|D| = sum of logs of the shifted eigenvalues (per batch entry in the batched case)
        if D.ndim == 1:
            logdetD = torch.sum(torch.log(D))
        else:
            logdetD = torch.sum(torch.squeeze(torch.log(D)), 1)
        if not reml:
            return (self.dataset.n_samples*torch.log(2*torch.pi*sigma) + logdetD + self.dataset.n_samples) / 2
        else:
            # REML additionally corrects for the fixed effects via log|Z'Z| and log|Z'D^-1 Z|;
            # branch on ndim because UZ may be plain (n,c), batched (p,n,c) or doubly batched (b,p,n,c)
            if UZ.ndim == 2:
                logdetZ = torch.logdet(torch.matmul(torch.t(UZ), UZ))
            elif UZ.ndim == 3:
                logdetZ = torch.logdet(torch.matmul(torch.transpose(UZ, dim0=1, dim1=2), UZ))
            else:
                logdetZ = torch.logdet(torch.matmul(torch.transpose(UZ, dim0=2, dim1=3), UZ))
            logdetZDZ = torch.logdet(ZDZ)
            return (self.freedom_deg*torch.log(2*torch.pi*sigma) + logdetD + self.freedom_deg - logdetZ + logdetZDZ) / 2
302 |
303 | def compute_var_components(self, D: torch.tensor, UZ: torch.tensor, ZD: torch.tensor, ZDZ: torch.tensor,
304 | reml: bool = True) -> tuple:
305 | """
306 | Compute variance components v_g^2 and v_e^2 with Var(y) = v_g^2K + v_e^2I
307 |
308 | :param D: vector with eigenvalues of K
309 | :param UZ: transformed fixed effects U'Z
310 | :param ZD: precomputed matrix product of (U'Z)'D^-1
311 | :param ZDZ: precomputed matrix product of (U'Z)'D^-1(U'Z)
312 | :param reml: if True use REML estimate, if False use ML, default=True
313 |
314 | :return: v_g^2 and v_e^2
315 | """
316 | beta = self._beta(ZDZ=ZDZ, ZDy=torch.matmul(ZD, self.Uy))
317 | v_g = self._sigma(D=D, Uy=self.Uy, UZ=UZ, beta=beta, reml=reml)
318 | v_e = self.delta * v_g
319 | return v_g, v_e
320 |
321 | def get_rss_h0(self, sigma_opt: bool = True, reml: bool = True) -> torch.tensor:
322 | """
323 | Compute residual sum of squares of H0 (marker has no effect on phenotype),
324 | i.e. for fixed effects Z, covariance matrix V and phenotype y compute:
325 | b = (Z'V^{-1}Z)^{-1}Z'V^{-1}y
326 | rss = (y-Zb)'V^{-1}(y-Zb)
327 | note that for optimal sigma_g rss=n-c (REML) or rss=n (ML)
328 |
329 | :param sigma_opt: if True return degrees of freedom, default is True
330 | :param reml: if True use REML estimate, if False use ML, default=True
331 |
332 | :return: residual sum of squares
333 | """
334 | if sigma_opt:
335 | if reml:
336 | return torch.tensor(self.dataset.n_samples - self.dataset.fixed.shape[1], device=self.device)
337 | else:
338 | return torch.tensor(self.dataset.n_samples, device=self.device)
339 | else:
340 | raise NotImplementedError
341 |
    def get_rss_and_se(self, D: torch.tensor, S: torch.tensor, ZD: torch.tensor, ZDZ: torch.tensor) -> tuple:
        """
        Compute residual sum of squares of alternative hypothesis (marker has effect on phenotype),
        i.e. for a 3D tensor with batches of fixed effects X and 3D tensor with copies of phenotype y:
            beta = (X'D^{-1}X)^{-1}X'D^{-1}y
            rss = (y-Xbeta)'D^{-1}(y-Xbeta)
        Use block-wise computation for beta, i.e., for computation of beta use the fact that X=[Z,s] for fixed
        effects Z and SNP s.
        Note: S is reused in place for several intermediates to keep memory bounded.

        :param D: vector with eigenvalues of K + ratio of variance components delta; shape: (n)
        :param S: matrix containing several markers in batches; shape: (n,b)
        :param ZD: precomputed matrix product of (U'Z)'D^-1; shape: (c,n)
        :param ZDZ: precomputed matrix product of (U'Z)'D^-1(U'Z); shape: (c,c)

        :return: residual sum of squares, standard error and effect size in batches
        """
        batch_size = S.shape[1]
        # get (X'D^{-1}X)^{-1}; pinv tolerates near-singular designs (e.g. constant markers)
        SD, XDX = self._xdx(D=D, S=S, ZD=ZD, ZDZ=ZDZ)
        XDX = torch.linalg.pinv(XDX, hermitian=True)
        # compute Z'Dy
        ZDy = self.get_3d_copy(v=torch.matmul(ZD, self.Uy), batch_size=batch_size)  # shape: (b,c,1)
        # compute X'Dy (marker part; SD is reused as accumulator)
        SD = torch.matmul(SD, self.Uy).reshape(batch_size, 1, 1)  # shape: (b,1,1)
        # put together 3D tensor
        SD = torch.cat((ZDy, SD), dim=1)  # shape: (b,c+1,1)
        # compute beta
        beta = torch.matmul(XDX, SD)  # shape: (b,c+1,1)
        # compute rss: residuals y - X beta, weighted by D^{-1}, scaled by the genetic variance
        S = self._x_batch(X=S, fixed=self.UZ)  # shape (b,n,c+1)
        S = torch.matmul(S, beta)  # shape (b,n,1)
        S = self.get_3d_copy(v=self.Uy, batch_size=batch_size) - S  # shape (b,n,1)
        resD = torch.div(S, torch.unsqueeze(D, 1))
        S = torch.squeeze(torch.matmul(torch.transpose(resD, dim0=1, dim1=2), S)) / self.v_g
        # standard error of the marker coefficient: sqrt of its diagonal entry in v_g*(X'D^{-1}X)^{-1}
        diag = torch.diagonal(XDX, dim1=1, dim2=2)[:, -1]
        se = torch.sqrt(self.v_g * diag)
        return S, se, torch.squeeze(beta[:, -1])
380 |
381 | def get_f_score(self, rss0: torch.tensor, rss1: torch.tensor) -> torch.tensor:
382 | """
383 | Compute tensor of test statistics
384 |
385 | :param rss0: residual sum of squares of H0: marker has no effect on phenotype
386 | :param rss1: residual sum of squares of H1: marker has effect on phenotype
387 |
388 | :return: F1 score
389 | """
390 | return self.freedom_deg * (rss0 - rss1) / rss1
391 |
392 | def get_p_value(self, f_score: float) -> float:
393 | """
394 | Compute p-value using survival function of f distribution
395 |
396 | :param f_score: F1 score
397 |
398 | :return: p-value
399 | """
400 | return stats.f.sf(f_score, 1, self.freedom_deg)
401 |
402 | # functions for permutations
403 | def estimate_delta_perm(self, gridlen: int = 100, logdelta_min: int = -10, logdelta_max: int = 10,
404 | reml: bool = True) -> torch.tensor:
405 | """
406 | Estimate ratio of variance components delta of LMM for permutations
407 | Get grid of evenly divided delta values on logarithmic scale and compute neg loglikelihood for each
408 |
409 | :param gridlen: length of grid, default=100
410 | :param logdelta_min: lower bound for delta (log value), default=-10
411 | :param logdelta_max: upper bound for delta (log value), default=10
412 | :param reml: if True use REML estimate, if False use ML, default=True
413 |
414 | :return: tensor with optimal delta for each permutation
415 | """
416 | deltas = torch.exp(torch.linspace(start=logdelta_min, end=logdelta_max, steps=gridlen + 1, device=self.device))
417 | if self.UZ.ndim == 2:
418 | # for perm method y: same U'Z for each permutation
419 | neglogs = self.negloglikelihood(delta=deltas, Uy=self.get_4d_copy(v=self.Uy, batch_size=len(deltas)),
420 | UZ=self.UZ, reml=reml)
421 | else:
422 | # for perm method x: have different U'Z for each permutation
423 | neglogs = self.negloglikelihood(delta=deltas, Uy=self.get_4d_copy(v=self.Uy, batch_size=len(deltas)),
424 | UZ=self.get_4d_copy(v=self.UZ, batch_size=len(deltas)), reml=reml)
425 | neglogs.to(self.device)
426 | delta_opt = []
427 | if self.UZ.ndim == 2:
428 | # for perm method y: same U'Z for each permutation
429 | for i in range(self.perm):
430 | delta_opt.append(self._minimize(Uy=self.Uy[i, :, 0], UZ=self.UZ, deltas=deltas, neglogs=neglogs[i, :],
431 | gridlen=100, reml=True))
432 | else:
433 | # for perm method x: have different U'Z for each permutation
434 | for i in range(self.perm):
435 | delta_opt.append(self._minimize(Uy=self.Uy[i, :, 0], UZ=self.UZ[i, :, :], deltas=deltas,
436 | neglogs=neglogs[i, :], gridlen=100, reml=True))
437 | return torch.tensor(delta_opt, device=self.device)
438 |
    def get_rss_perm(self, S: torch.tensor, ZD: torch.tensor, ZDZ: torch.tensor, v_g: torch.tensor) -> torch.tensor:
        """
        Compute residual sum of squares of alternative hypothesis (marker has effect on phenotype) with permutations,
        i.e. for a 4D tensor with copies of batches of fixed effects Z and markers S and 4D tensor with copies of
        permutations of phenotype y:
            b = (X'X)^{-1}X'y
            rss = (y-Xb)'(y-Xb)
        Use block-wise computation for beta, i.e., for computation of beta use the fact that X=[Z,s] for fixed
        effects Z and SNP s.
        Note: S is reused in place for several intermediates to keep memory bounded.

        :param S: matrix containing batch of markers, shape: (p,n,b)
        :param ZD: 3D tensor containing matrix product (U'Z)'D^{-1} for each permutation, shape: (p,c,n)
        :param ZDZ: 3D tensor containing matrix product (U'Z)'D^{-1}(U'Z) for each permutation, shape: (p,c,c)
        :param v_g: tensor containing genetic variance component for each permutation, shape: (p)

        :return: residual sum of squares in batches
        """
        batch_size = S.shape[2]
        y_batch = self.get_4d_copy(v=self.Uy, batch_size=batch_size)  # shape: (p,b,n,1)
        beta = self._beta_perm(S=S, ZD=ZD, ZDZ=ZDZ, y_batch=y_batch, batch_size=batch_size)  # shape: (p,b,c+1,1)
        # compute residuals y - X beta (S becomes the design X, then the residuals)
        S = self._x_batch(X=S, fixed=self.UZ)  # shape: (p,b,n,c+1)
        S = y_batch - torch.matmul(S, beta)  # shape: (p,b,n,1)
        # compute residual sum of squares, weighting the residuals by D^{-1}
        rss = torch.div(torch.transpose(S, dim0=2, dim1=3), self.get_4d_copy(v=self.D, batch_size=batch_size))
        rss = torch.squeeze(torch.matmul(rss, S))  # shape: (p,b)
        # scale each permutation's rss by its genetic variance component
        return torch.t(torch.div(torch.t(rss), torch.unsqueeze(v_g, dim=0)))
466 |
467 | def get_perm_p_value(self, perm_test_stats: torch.tensor) -> torch.tensor:
468 | """
469 | Compute permutation-based p-values via
470 | p = R/(qm) with R being the number of permuted test statistics bigger than the observed test statistic
471 |
472 | :param perm_test_stats: matrix containing test-statistics for all permutations and SNPs, dim (p,m)
473 |
474 | :return: adjusted p-values
475 | """
476 | sorted_test_stats, ind = torch.sort(perm_test_stats.flatten())
477 | n = sorted_test_stats.shape[0]
478 | test_stats_ind = torch.searchsorted(sorted_test_stats.contiguous(), self.test_stat.contiguous(), right=True)
479 | adj_p_value = ((n - test_stats_ind) / n).type(torch.float64)
480 | return torch.where(adj_p_value == 0., 1 / n, adj_p_value)
481 |
482 | def get_min_p_value(self, test_stat: torch.tensor) -> torch.tensor:
483 | """
484 | Compute minimal p-values for each permutation:
485 | First search the maximal test statistic for each permutation, since the survival function is decreasing, this
486 | gives the minimal p-value
487 |
488 | :param test_stat: matrix containing test-statistics for all permutations and SNPs, dim (p,m)
489 |
490 | :return: vector containing the minimal p-value for each permutation
491 | """
492 | max_test_stats, _ = torch.max(test_stat, dim=1)
493 | min_p_val = []
494 | for test in max_test_stats:
495 | min_p_val.append(self.get_p_value(f_score=test))
496 | return torch.tensor(min_p_val)
497 |
498 | # functions to compute intermediate results
499 | @staticmethod
500 | def _zd(UZ: torch.tensor, D: torch.tensor) -> torch.tensor:
501 | """
502 | Compute (U'Z)'D^{-1} for fixed effects Z of shape (n,c) or (p,n,c)
503 |
504 | :param UZ: transformed fixed effects U'Z
505 | :param D: vector with eigenvalues of K + ratio of variance components delta
506 |
507 | :return: Z'D^{-1}
508 | """
509 | if UZ.ndim == 2:
510 | return torch.div(torch.t(UZ), D)
511 | elif UZ.ndim == 3:
512 | return torch.div(torch.transpose(UZ, dim0=1, dim1=2), D)
513 | elif UZ.ndim == 4:
514 | return torch.div(torch.transpose(UZ, dim0=2, dim1=3), D)
515 |
516 | @staticmethod
517 | def _zdz(UZ: torch.tensor, ZD: torch.tensor) -> torch.tensor:
518 | """
519 | Compute (U'Z)'D^{-1}(U'Z) for fixed effects Z of shape (c,c) or (p,c,c)
520 |
521 | :param UZ: transformed fixed effects U'Z
522 | :param ZD: precomputed (U'Z)'D^{-1}
523 |
524 | :return: (U'Z)'D^{-1}(U'Z)
525 | """
526 | return torch.matmul(ZD, UZ)
527 |
528 | @staticmethod
529 | def _beta(ZDZ: torch.tensor, ZDy: torch.tensor) -> torch.tensor:
530 | """
531 | compute effect size beta = ((U'Z)'D^-1(U'Z))^-1(U'Z)'D^-1(U'y)
532 |
533 | :param ZDZ: precomputed matrix product of (U'Z)'D^-1(U'Z)
534 | :param ZDy: precomputed matrix product of (U'Z)'D^-1(U'y)
535 |
536 | :return: beta
537 | """
538 | return torch.linalg.solve(ZDZ, ZDy)
539 |
    def _sigma(self, D: torch.tensor, Uy: torch.tensor, UZ: torch.tensor, beta: torch.tensor, reml: bool = True) \
            -> torch.tensor:
        """
        compute variance component v_g^2 = ((U'y)-(U'Z)beta)'D^-1((U'y)-(U'Z)beta)/(n-c)
        Handles plain (1D D), batched (3D) and doubly-batched (4D residual) inputs.

        :param D: vector with eigenvalues of K + ratio of variance components delta
        :param Uy: transformed phenotype U'y, shape (n)
        :param UZ: transformed fixed effects U'Z, shape (n,c)
        :param beta: effect size, shape (c)
        :param reml: if True use REML estimate (denominator n-c), if False use ML (denominator n), default=True

        :return: v_g^2
        """
        # broadcast Uy and beta to match a batched D (one entry per delta candidate / permutation)
        if D.ndim == 3:
            if Uy.ndim == 1:
                Uy = self.get_3d_copy(v=Uy, batch_size=D.shape[0])
            if beta.ndim == 2:
                beta = torch.unsqueeze(beta, 2)
        # element-wise squared residuals of the fit
        res = Uy - torch.matmul(UZ, beta)
        res = torch.multiply(res, res)
        # weight squared residuals by D^-1 and sum over the sample dimension
        if D.ndim == 1:
            res = torch.sum(torch.div(res, D))
        elif res.ndim == 3:
            res = torch.div(torch.transpose(res, dim0=1, dim1=2), D)
            res = torch.sum(torch.squeeze(res), 1)
        elif res.ndim == 4:
            res = torch.div(torch.transpose(res, dim0=2, dim1=3), D)
            res = torch.sum(torch.squeeze(res), 2)
        if not reml:
            return res / self.dataset.n_samples
        else:
            return res / self.freedom_deg
572 |
    def _xdx(self, D: torch.tensor, S: torch.tensor, ZD: torch.tensor, ZDZ: torch.tensor) -> tuple:
        """
        Compute X'D^{-1}X for X=([Z,s_i],...,[Z,s_{i+b-1}]) of shape (b,n,c+1) for fixed effects Z of shape (n,c)
        and SNPs s_j, assembled block-wise from the precomputed Z-parts and the marker parts
        (the inversion happens in the caller).
        For permutations compute 4D version

        :param D: vector with eigenvalues of K + ratio of variance components delta; shape: (n) or (p,1,n)
        :param S: matrix with batch of b SNPs (n,b) or (p,n,b)
        :param ZD: Z'D^{-1} for fixed effects Z and matrix of eigenvalues+delta D (c,n) or (p,c,n) for perm
        :param ZDZ: Z'D^{-1}Z for fixed effects Z and matrix of eigenvalues+delta D (c,c) or (p,c,c) for perm

        :return: S'D^{-1} and X'D^{-1}X

        :raises Exception: if ZD is neither 2D nor 3D
        """
        if ZD.ndim == 2:
            batch_size = S.shape[1]
            # compute Z'D^{-1}s_i for each SNP s_i in the batch
            ZDS = torch.unsqueeze(torch.t(torch.matmul(ZD, S)), dim=2)  # shape: (b,c,1)
            # compute s_i'D^{-1}s_i for all SNPs in batch
            SD = torch.unsqueeze(torch.div(torch.t(S), D), dim=1)  # shape: (b,1,n)
            XDX = torch.bmm(SD, torch.unsqueeze(torch.t(S), dim=2))  # shape: (b,1,1)
            # assemble the (c+1)x(c+1) block matrix [[Z'DZ, Z'DS], [S'DZ, S'DS]]
            XDX = torch.cat((torch.cat((self.get_3d_copy(v=ZDZ, batch_size=batch_size), ZDS), dim=2),
                             torch.cat((torch.transpose(ZDS, dim0=1, dim1=2), XDX), dim=2)), dim=1)  # shape: (b,c+1,c+1)
        elif ZD.ndim == 3:
            batch_size = S.shape[2]
            # get 4D copy of ZDZ for batch
            ZDZ_4d = self.get_4d_copy(v=ZDZ, batch_size=batch_size)  # shape: (p,b,c,c)
            # compute Z'D^{-1}S
            ZDS = torch.unsqueeze(torch.transpose(torch.matmul(ZD, S), dim0=1, dim1=2), 3)  # shape: (p,b,c,1)
            # compute S'D^{-1}
            St = torch.transpose(S, dim0=1, dim1=2)  # shape: (p,b,n)
            SD = torch.unsqueeze(torch.divide(St, self.D), dim=2)  # shape: (p,b,1,n)
            # compute S'D^{-1}S
            XDX = torch.matmul(SD, torch.unsqueeze(St, dim=3))
            # assemble the block matrix X'D^{-1}X per permutation and marker
            XDX = torch.concat((torch.transpose(ZDS, dim0=2, dim1=3), XDX), dim=3)
            XDX = torch.concat((torch.concat((ZDZ_4d, ZDS), dim=3), XDX), dim=2)  # shape: (p,b,c+1,c+1)
        else:
            raise Exception('Can only compute XDX for 2D or 3D version of ZD.')
        return SD, XDX
613 |
    def _beta_perm(self, S: torch.tensor, ZD: torch.tensor, ZDZ: torch.tensor, y_batch: torch.tensor, batch_size: int) \
            -> torch.tensor:
        """
        Compute betas for permutations in 4D tensor using block-wise computations

        :param S: matrix containing batch of markers, shape: (p,n,b)
        :param ZD: 3D tensor containing matrix product (U'Z)'D^{-1} for each permutation, shape: (p,c,n)
        :param ZDZ: 3D tensor containing matrix product (U'Z)'D^{-1}(U'Z) for each permutation, shape: (p,c,c)
        :param y_batch: 4D copy of permutations of phenotype vector, shape: (p,b,n,1)
        :param batch_size: number of markers

        :return: 4D tensor with beta values for all markers and permutations, shape: (p,b,c+1,1)
        """
        # get S'D^{-1} and X'D^{-1}X
        SD, XDX = self._xdx(D=self.D, S=S, ZD=ZD, ZDZ=ZDZ)  # shape: (p,b,1,n), (p,b,c+1,c+1)
        # get X'D^{-1}y: fixed-effects part first, then append the marker part
        XDy = self.get_4d_copy(v=torch.matmul(ZD, self.Uy), batch_size=batch_size)  # shape: (p,b,c,1)
        SD = torch.matmul(SD, y_batch)  # shape: (p,b,1,1)
        XDy = torch.concat((XDy, SD), dim=2)  # shape: (p,b,c+1,1)
        # get beta of shape (p,b,c+1,1) by solving X'D^{-1}X beta = X'D^{-1}y
        return self._beta(ZDZ=XDX, ZDy=XDy)
635 |
636 | # functions for data transformation
637 | @staticmethod
638 | def transform_input(X: torch.tensor, U: torch.tensor) -> torch.tensor:
639 | """
640 | compute U'X
641 |
642 | :param X: input vector/matrix
643 | :param U: input matrix
644 |
645 | :return: product with transpose
646 | """
647 | return torch.matmul(torch.t(U), X)
648 |
649 | def _d_delta(self, delta: torch.tensor, batch_size: int):
650 | """
651 | get 3D tensor with D + delta*I as batches for diagonal matrix with eigenvalues D and different variance
652 | component ratios delta. If delta is one value, return tensor with b copies of D+delta.
653 |
654 | :param delta: variance component ratio shape: (b) or (1)
655 | :param batch_size: number of needed copies of D
656 |
657 | :return: D + delta of shape (b,1,n)
658 | """
659 | if delta.ndim == 1:
660 | return torch.unsqueeze(self.D.repeat(batch_size, 1) + torch.unsqueeze(delta, 1), 1)
661 | else:
662 | return torch.unsqueeze((self.D + delta).repeat(batch_size, 1), 1)
663 |
664 | def _s_matrix(self, lower_bound: int, upper_bound: int, device=None, save_meta: bool = True) -> torch.tensor:
665 | """
666 | load batch of markers to specified device
667 |
668 | :param lower_bound: lower bound of marker batch
669 | :param upper_bound: upper bound of marker batch
670 | :param device: either cpu or cuda device
671 | :param save_meta: if genotype is loaded batch-wise, set to False for permutations to prevent saving of meta info
672 |
673 | :return: matrix with markers of shape (n,upper_bound-lower_bound)
674 | """
675 | if device is None:
676 | device = self.device
677 | if self.dataset.X is None:
678 | # load X batch-wise
679 | self.dataset.load_genotype_batch_wise(device=device, save_meta=save_meta, snp_lower_index=lower_bound,
680 | snp_upper_index=upper_bound) # shape: (n,b)
681 | S = self.dataset.X # shape: (n,b)
682 | self.dataset.reset_genotype()
683 | else:
684 | # get X_batch if X was completely loaded before
685 | S = self.dataset.X[:, lower_bound:upper_bound].to(device) # shape: (n,b)
686 | return S
687 |
688 | def _x_batch(self, X: torch.tensor, fixed: torch.tensor) -> torch.tensor:
689 | """
690 | Create 3D or 4D tensor where each matrix in the 3D tensor contains the same fixed effects and a different SNP,
691 | and the 4D tensor contains copies of the 3D tensors
692 |
693 | :param X: genotype matrix/tensor of shape (n,b) or (p,n,b)
694 | :param fixed: matrix/tensor of fixed effects of shape (n,c) or (p,n,c)
695 |
696 | :return: tensor of shape (b,n,c+1) or (p,b,n,c+1)
697 | """
698 | if X.ndim == 2:
699 | b = self.get_3d_copy(v=fixed, batch_size=X.shape[1])
700 | return torch.cat((b, torch.transpose(torch.unsqueeze(X, 0), 0, 2)), dim=2)
701 | elif X.ndim == 3:
702 | b = self.get_4d_copy(v=fixed, batch_size=X.shape[2])
703 | return torch.cat((b, torch.unsqueeze(torch.transpose(X, dim0=1, dim1=2), 3)), dim=3)
704 |
705 | @staticmethod
706 | def get_3d_copy(v: torch.tensor, batch_size: int) -> torch.tensor:
707 | """
708 | Create 3D tensor with copies of input tensor
709 |
710 | :param v: vector/matrix of shape (n) or (n,c)
711 | :param batch_size: batch size of new 3D tensor
712 |
713 | :return: tensor of copies of v with shape (batch_size,n,1) or (batch_size,n,c)
714 | """
715 | if v.ndim == 1:
716 | return torch.unsqueeze(v.expand(batch_size, v.shape[0]), 2)
717 | if v.ndim == 2:
718 | return v.expand(batch_size, v.shape[0], v.shape[1])
719 |
720 | @staticmethod
721 | def get_4d_copy(v: torch.tensor, batch_size: int) -> torch.tensor:
722 | """
723 | Create 4D tensor with copies of input tensor
724 |
725 | :param v: tensor of shape (p,n,c)
726 | :param batch_size: batch size of new 4D tensor
727 |
728 | :return: tensor of copies of v with shape (p,b,n,c)
729 | """
730 | return torch.transpose(v.expand(batch_size, v.shape[0], v.shape[1], v.shape[2]), dim0=0, dim1=1)
731 |
732 | # helper functions
733 | def _bounds(self, batch_size: int, batch: int) -> tuple:
734 | """
735 | compute upper and lower bound for natch-wise computations
736 |
737 | :param batch_size: number of markers within batch
738 | :param batch: number of batch
739 |
740 | :return: lower and upper bound
741 | """
742 | lower_bound = batch * batch_size
743 | upper_bound = (batch + 1) * batch_size
744 | if upper_bound > self.dataset.n_snps:
745 | upper_bound = self.dataset.n_snps
746 | return lower_bound, upper_bound
747 |
--------------------------------------------------------------------------------
/optimize/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/grimmlab/permGWAS/3f7a1c2e3e4c63281f5719425ff9ac405f8d9cfc/optimize/__init__.py
--------------------------------------------------------------------------------
/optimize/brent.py:
--------------------------------------------------------------------------------
1 | # Brent's method
2 |
def brent_search(f, a: float, b: float, x: float = None, fx: float = None, rel_tol: float = 1.48e-08,
                 abs_tol: float = 1.48e-08, max_iter: int = 500, **kwargs) -> tuple:
    """
    Find minimum of a function using Brent's method (see Numerical Recipes 3rd Edition: The Art of Scientific Computing)
    Given a function f with minimum in interval [a,b], find local minimum by combining parabolic
    interpolation with golden-section steps.

    :param f: function to be minimized
    :param a: lower bound of interval
    :param b: upper bound of interval
    :param x: starting point (initial guess of minimum)
    :param fx: function value of f
    :param rel_tol: relative tolerance, default=1.48e-08
    :param abs_tol: absolute tolerance, default=1.48e-08
    :param max_iter: maximal number of iterations, default=500
    :param kwargs: additional arguments of f

    :return: minimum x, function value of minimum f(x) and number of iterations

    :raises ValueError: if a > b or the starting point lies outside [a,b]
    """
    golden_ratio = 0.381966011250105097  # 2 - phi, the golden-section fraction

    if a > b:
        raise ValueError('Interval boundaries do not fit. a must be smaller or equal to b.')
    if x is None:
        x = a + golden_ratio * (b - a)
    if fx is None:
        fx = f(x, **kwargs)
    if not (a <= x <= b):
        raise ValueError('Starting value x needs to be within interval boundaries.')

    # track the three best points seen so far
    x2, fx2 = x, fx  # second best
    x3, fx3 = x, fx  # third best
    step = 0.0       # size/direction of the current step
    prev_step = 0.0  # step taken two iterations ago
    iteration = -1

    for iteration in range(max_iter):
        midpoint = 0.5 * (a + b)
        tol1 = rel_tol * abs(x) + abs_tol
        tol2 = 2.0 * tol1

        # converged when the bracket is small enough around x
        if abs(x - midpoint) <= tol2 - 0.5 * (b - a):
            break

        take_golden = True
        if abs(prev_step) > tol1:
            # attempt a parabolic interpolation step through (x,fx), (x2,fx2), (x3,fx3)
            r = (x - x2) * (fx - fx3)
            q = (x - x3) * (fx - fx2)
            p = (x - x3) * q - (x - x2) * r
            q = 2.0 * (q - r)
            if q > 0.0:
                p = -p
            q = abs(q)
            r = prev_step
            prev_step = step
            # accept the parabolic step only if it is small enough and stays inside (a,b)
            if (abs(p) < abs(0.5 * q * r)) and (p > q * (a - x)) and (p < q * (b - x)):
                take_golden = False
                step = p / q
                x_new = x + step
                # keep the step away from the interval boundaries
                if (x_new - a < tol2) or (b - x_new < tol2):
                    step = tol1 if x < midpoint else -tol1
        if take_golden:
            # golden-section step into the larger of the two sub-intervals
            prev_step = b - x if x < midpoint else a - x
            step = golden_ratio * prev_step

        # never evaluate f closer to x than tol1
        if abs(step) >= tol1:
            x_new = x + step
        elif step > 0.0:
            x_new = x + tol1
        else:
            x_new = x - tol1
        fx_new = f(x_new, **kwargs)

        if fx_new <= fx:
            # new best point: shrink the bracket toward it and demote the old best
            if x_new >= x:
                a = x
            else:
                b = x
            x3, fx3 = x2, fx2
            x2, fx2 = x, fx
            x, fx = x_new, fx_new
        else:
            # x stays best: shrink the bracket on the side of x_new
            if x_new < x:
                a = x_new
            else:
                b = x_new
            # update second/third best ranking
            if fx_new <= fx2 or x2 == x:
                x3, fx3 = x2, fx2
                x2, fx2 = x_new, fx_new
            elif fx_new <= fx3 or x3 == x or x3 == x2:
                x3, fx3 = x_new, fx_new

    return x, fx, iteration + 1
109 |
--------------------------------------------------------------------------------
/perform_gwas.py:
--------------------------------------------------------------------------------
1 | import pathlib
2 | import time
3 | import torch
4 |
5 | from preprocess import data_loader
6 | from utils import helper_functions
7 |
8 |
def run(genotype_file: pathlib.Path, phenotype_file: pathlib.Path, model: str, trait: str = 'phenotype_value',
        kinship_file: pathlib.Path = None, covariate_file: pathlib.Path = None, covariate_list: list = None,
        maf_threshold: int = 0, load_genotype: bool = False,
        out_dir: pathlib.Path = pathlib.Path.cwd().joinpath('results'), out_file: str = None,
        device: torch.device = torch.device('cpu'), perm: int = 0, perm_method: str = 'x',
        adj_p_value: bool = False, batch_size: int = 50000, perm_batch_size: int = 1000, manhattan: bool = False,
        qqplot: bool = False, not_add: bool = False):
    """
    Run the full GWAS pipeline for a single phenotype: load all data, compute test statistics,
    optionally run permutations, then store results, optional plots and summary statistics.

    :param genotype_file: full path to genotype file
    :param phenotype_file: full path to phenotype file
    :param model: name of the GWAS model class to use (e.g. 'lmm')
    :param trait: name of the phenotype column to analyze, default 'phenotype_value'
    :param kinship_file: optional full path to kinship file
    :param covariate_file: optional full path to covariates file
    :param covariate_list: optional list of covariate column names to use
    :param maf_threshold: minor allele frequency threshold as percentage value
    :param load_genotype: if True, load the complete genotype matrix up front instead of batch-wise
    :param out_dir: directory to store result files in, default ./results
    :param out_file: optional base name for result files
    :param device: torch device (cpu/gpu) used for computations
    :param perm: number of permutations to perform; 0 disables permutations
    :param perm_method: permutation strategy, 'x' or 'y'
    :param adj_p_value: if True, additionally compute adjusted permutation-based p-values
    :param batch_size: number of SNPs to process simultaneously
    :param perm_batch_size: number of SNPs to process simultaneously during permutations
    :param manhattan: if True, save a Manhattan plot
    :param qqplot: if True, save a QQ-plot
    :param not_add: if True, skip the additive-encoding check of the genotype
    """
    t_start = time.time()
    print('Start loading data now')

    # assemble genotype, phenotype, kinship and covariates, then move everything to the device
    data = data_loader.Dataset(genotype_file=genotype_file, phenotype_file=phenotype_file, trait=trait,
                               maf_threshold=maf_threshold, load_genotype=load_genotype, kinship_file=kinship_file,
                               covariate_file=covariate_file, covariate_list=covariate_list, not_add=not_add)
    data.to_device(device=device)
    t_loaded = time.time()
    print('Loaded data, elapsed time: %f s.' % (t_loaded - t_start))
    print('Start performing GWAS on phenotype %s for %d samples and %d SNPs.'
          % (trait, data.n_samples, data.n_snps))

    # resolve the model class by name and compute the test statistics
    model_cls = helper_functions.get_model_class_name(model_name=model)
    gwas = model_cls(dataset=data, batch_size=batch_size, device=device, perm=perm,
                     perm_batch_size=perm_batch_size)
    gwas.gwas()
    t_gwas = time.time()
    print('Done performing GWAS on phenotype %s for %d samples and %d SNPs.\n'
          'Elapsed time: %f s' % (trait, data.n_samples, len(data.positions), t_gwas - t_loaded))

    # optional permutation-based analysis
    if perm > 0:
        print('Start performing GWAS with %d permutations.' % perm)
        gwas.perm_gwas(perm_method=perm_method, adj_p_value=adj_p_value)
        t_perm = time.time()
        print('Done performing GWAS with %d permutations.\n'
              'Elapsed time: %f s' % (perm, t_perm - t_gwas))

    # persist p-values (and minimal p-values when permutations were run)
    print('Save results.')
    gwas.save_results(data_dir=out_dir, filename=out_file)
    runtime = time.time() - t_start
    print('Total time: ', runtime)

    # optional plots; refresh the runtime afterwards so the summary includes plotting time
    if manhattan:
        print('Save Manhattan plot with significance level of 5%.')
        gwas.manhattan_plot(data_dir=out_dir, filename=out_file, sig_level=5)
        runtime = time.time() - t_start
    if qqplot:
        print('Save QQ-plot.')
        gwas.qq_plot(data_dir=out_dir, filename=out_file)
        runtime = time.time() - t_start

    # summary statistics
    if not load_genotype:
        # reset number of SNPs in case of batch-wise loading
        data.n_snps = len(data.positions)
    helper_functions.get_summary_stats(out_dir=out_dir, out_file=out_file, genotype_file=genotype_file,
                                       phenotype_file=phenotype_file, trait=trait, samples=data.n_samples,
                                       snps=data.n_snps, model=model, maf_threshold=maf_threshold, perm=perm,
                                       v_g=gwas.v_g.item(), v_e=gwas.v_e.item(),
                                       min_p_val=gwas.min_p_value, time=runtime, kinship_file=kinship_file,
                                       covariate_file=covariate_file, covariate_list=covariate_list,
                                       perm_method=perm_method)
74 |
--------------------------------------------------------------------------------
/permGWAS.py:
--------------------------------------------------------------------------------
1 | # run the script here
2 | import argparse
3 | import pathlib
4 |
5 | from utils import check_functions
6 | import perform_gwas
7 |
8 |
if __name__ == "__main__":
    # Command line entry point of permGWAS2.0: parse all user arguments, validate them,
    # then run the GWAS pipeline once per requested phenotype.
    parser = argparse.ArgumentParser()
    # --- input data files ---
    parser.add_argument('-x', '--genotype_file', type=str, default=None,
                        help='Specify the full path to the genotype file, absolute and relative paths are accepted, '
                             'only accept .h5, .hdf5, .h5py, .csv, PLINK and binary PLINK files, '
                             'PLINK and binary PLINK: all required files must be in the same folder with same prefix. '
                             'See documentation for correct format.')
    parser.add_argument('-y', '--phenotype_file', type=str, default=None,
                        help='Specify the full path to the phenotype file, absolute and relative paths are '
                             'accepted, only accept .csv, .txt and .pheno files. See documentation for correct format.')
    parser.add_argument('-trait', '--trait', '--y_name', nargs='+', type=str, default=['phenotype_value'],
                        help='Specify the name of phenotype (column) to be used in phenotype file,'
                             'default is "phenotype_value". You can run permGWAS on several phenotypes one after '
                             'another if they are in the same phenotype_file. Juste name the phenotypes, '
                             'e.g. --trait pheno1 pheno2 if you want to use all available traits use --trait all')
    parser.add_argument('-k', '--kinship_file', '--k', '--kinship', type=str, default=None,
                        help='Specify the the full path to the kinship file, absolute and relative paths are accepted,'
                             'only accept .csv and .h5/.h5py/.hdf5 files. See documentation for correct format. '
                             'Optional, if not provided realized relationship kernel will be calculated')
    parser.add_argument('-cov', '--covariate_file', '--cov', '--cov_file', type=str, default=None,
                        help='Specify the full path to the covariates file, absolute and relative paths are accepted,'
                             'currently only accept .csv files. Optional, if not provided only intercept will be used '
                             'as fixed effect.')
    parser.add_argument('-cov_list', '--covariate_list', nargs='+', type=str, default=None,
                        help='Specify the covariates (column headers) to use from the covariates file. Optional, if '
                             'not provided, will use all available columns as covariates.')
    # --- preprocessing options ---
    parser.add_argument('-maf', '--maf_threshold', '--maf', type=int, choices=range(0, 31), default=0,
                        help='Specify minor allele frequency threshold as percentage value. '
                             'Optional, if not provided no maf filtering will be performed.')
    parser.add_argument('-load_genotype', action='store_true',
                        help='If used, genotype matrix will be completely loaded from file during preprocessing. '
                             'Otherwise load genotype batch-wise during computations of test statistics. '
                             'Batch-wise loading is only possible, if kinship file is provided. Default is False')
    parser.add_argument('-config', '--config_file', type=str, default=None,
                        help='Specify the full path to the yaml config file. Specify all required arguments to use in '
                             'this config file and just give the config file instead of all required parameters. '
                             'For more info regarding the required format see the documentation.')
    # --- model, output and device options ---
    parser.add_argument('-model', type=str, default='lmm',
                        help='Specify the model to use for GWAS. Currently only lmm (linear mixed model) is '
                             'implemented.')
    parser.add_argument('-out_dir', '--out_dir', type=str, default=pathlib.Path.cwd().joinpath('results'),
                        help='Specify the name of the directory result-files should be stored in,'
                             'absolute and relative paths are accepted. Optional, if not provided, files will be '
                             'stored in folder "results" in current directory,')
    parser.add_argument('-out_file', '--out_file', type=str, default=None,
                        help='Specify NAME of result files, will be stored as p_values_NAME and min_p_values_NAME,'
                             'optional, if not provided name of phenotype will be used. If you run permGWAS with '
                             'several phenotypes, will always use name of phenotype.')
    parser.add_argument('-disable_gpu', action='store_true',
                        help='If used, GPUs will be disabled and only CPUs will be used for computations.')
    parser.add_argument('-device', '--device', type=int, default=0,
                        help='Specify GPU device to be used, default is 0.')
    # --- permutation options ---
    parser.add_argument('-perm', '--perm', type=int, default=0,
                        help='Specify the number of permutations (integer value) to be performed, optional, if not '
                             'provided no permutations will be performed')
    parser.add_argument('-perm_method', type=str, default='x',
                        help='Specify the method to use for permutations: x or y,'
                             'for x permute fixed effects matrix including SNP of interest, which is equivalent to '
                             'permuting the phenotype and the covariance matrix; for y permute only the phenotype '
                             'vector as in permGWAS Version1. Default is x.')
    parser.add_argument('-adj_p_value', action='store_true',
                        help='If used, will additionally compute adjusted permutation-based p-values for each SNP.')
    # --- batch sizes and plots ---
    parser.add_argument('-batch', '--batch_size', '--batch', type=int, default=50000,
                        help='Specify number of SNPs to work on simultaneously, default is 50000')
    parser.add_argument('-batch_perm', '--perm_batch_size', '--batch_perm', type=int, default=1000,
                        help='Specify number of SNPs to work on simultaneously, default is 1000')
    parser.add_argument('-mplot', '--manhattan', '--plot', action='store_true',
                        help='optional, creates manhattan plot')
    parser.add_argument('-qqplot', '--qqplot', action='store_true',
                        help='optional, creates QQ-plot')
    parser.add_argument('-not_add', '--not_add', action='store_true',
                        help='optional, use if genotype has different encoding.')
    args = vars(parser.parse_args())
    # check config file
    args = check_functions.check_all_arguments(args=args)
    phenotypes = args["trait"]

    # run pipeline
    for trait in phenotypes:
        print('Working on phenotype ', trait)
        # run() expects a single trait name, not the list of all requested traits
        args["trait"] = trait
        args = check_functions.check_output_files(args=args)
        print('Checked if all specified files exist.')
        try:
            perform_gwas.run(**args)
            # reset out_file so the next phenotype falls back to its own default name
            args["out_file"] = None
        except Exception as exc:
            # one failing phenotype must not abort the remaining ones
            print("Failure when running permGWAS2.0")
            print(exc)
            continue
99 |
--------------------------------------------------------------------------------
/permGWAS_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/grimmlab/permGWAS/3f7a1c2e3e4c63281f5719425ff9ac405f8d9cfc/permGWAS_logo.png
--------------------------------------------------------------------------------
/postprocess/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/grimmlab/permGWAS/3f7a1c2e3e4c63281f5719425ff9ac405f8d9cfc/postprocess/__init__.py
--------------------------------------------------------------------------------
/postprocess/plot_functions.py:
--------------------------------------------------------------------------------
1 | import pathlib
2 | import numpy as np
3 | import pandas as pd
4 | import matplotlib.pyplot as plt
5 | import seaborn as sns
6 | import scipy.stats as stats
# module-wide matplotlib styling for all plots created here:
# draw grid/axes below the data and enlarge label, tick, legend and title fonts
plt.rc('axes', axisbelow=True)
plt.rcParams['axes.labelsize'] = 16
plt.rcParams['xtick.labelsize'] = 14
plt.rcParams['ytick.labelsize'] = 14
plt.rcParams['legend.fontsize'] = 16
plt.rcParams['axes.titlesize'] = 20
13 |
14 | from utils import helper_functions
15 |
16 |
def manhattan_plot(df: pd.DataFrame, data_dir: pathlib.Path, filename: str, min_p_values: np.array = None,
                   sig_level: int = 5):
    """
    Save Manhattan plot as manhattan_FILENAME.png to data_dir

    :param df: DataFrame containing chromosome (CHR) and position (POS) identifiers, and corresponding p_values
    :param data_dir: full path to save directory
    :param filename: name of file
    :param min_p_values: array containing minimal p_values to compute permutation-based threshold
    :param sig_level: significance level for Bonferroni and perm thresholds, default is 5

    :raises Exception: if required columns are missing or no SNP passes the p-value filter
    """
    if not {'CHR', 'POS', 'p_value'}.issubset(df.columns):
        raise Exception('Cannot create Manhattan plot; need CHR, POS and p_value in DataFrame.')
    n_snps = len(df)
    # only plot SNPs with p <= 0.01 to keep figure size manageable
    df = df[df['p_value'] <= 0.01].copy()
    if df.empty:
        # without this guard an empty frame crashes below with an opaque IndexError
        raise Exception('Cannot create Manhattan plot; no SNP has a p-value <= 0.01.')
    if isinstance(df['CHR'].values[0], str):
        # accept chromosome identifiers like 'Chr1' by stripping the prefix
        try:
            df['CHR'] = [int(x.replace('Chr', '')) for x in df['CHR']]
        except Exception as exc:
            print("Chromosome identifier might be wrong. Use the chromosome number.")
            print(exc)
    # compute a genome-wide cumulative position so chromosomes line up along one x-axis
    running_pos = 0
    cumul_pos = []
    for chrom, group_df in df.groupby('CHR'):
        cumul_pos.append(group_df['POS'] + running_pos)
        running_pos += group_df['POS'].max()
    df['cumul_pos'] = pd.concat(cumul_pos)

    fig, ax = plt.subplots(1, 1, figsize=(20, 5), constrained_layout=True)
    sns.scatterplot(ax=ax, data=df, x='cumul_pos', y='p_value', hue='CHR', palette='colorblind', linewidth=0, s=20,
                    legend=None)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    # log-scaled, inverted y-axis shows -log10(p) with the most significant SNPs on top
    ax.set_yscale("log")
    ax.invert_yaxis()
    ax.minorticks_off()
    ax.set_xlabel('Chromosome')
    ax.set_ylabel(r'$-log_{10}$(p-value)')
    # place one tick at the center of each chromosome
    ax.set_xticks(df.groupby('CHR')['cumul_pos'].median())
    ax.set_xticklabels(np.unique(df['CHR']))

    # significance thresholds: permutation-based (if available) and Bonferroni
    if min_p_values is not None:
        ax.axhline(helper_functions.compute_perm_threshold(min_p_values, sig_level), linewidth=1.5, color='blue',
                   label='permGWAS2')
    ax.axhline(helper_functions.compute_bonf_threshold(n_snps, sig_level), linewidth=1.5, color='red',
               label='Bonferroni')
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.13), fancybox=True, ncol=2, frameon=True)
    fig.savefig(data_dir.joinpath('manhattan_' + pathlib.Path(filename).with_suffix('.png').as_posix()))
    fig.clf()
66 |
67 |
def qq_plot(p_values: np.array, data_dir: pathlib.Path, filename: str):
    """
    Save QQ-plot as qq_plot_FILENAME.png to data_dir

    :param p_values: array containing p_values
    :param data_dir: full path to save directory
    :param filename: name of file
    """
    n_snps = len(p_values)
    observed_p = -np.log10(np.sort(p_values))
    # expected uniform quantiles i/(n+1), i = 1..n; built with an integer-step arange so the
    # length is guaranteed to match observed_p (the previous float-step arange mixed a 1/n
    # start with a 1/(n+1) step and could produce an off-by-one length)
    expected_p = -np.log10(np.arange(1, n_snps + 1) / float(n_snps + 1))
    # genomic inflation factor lambda: median observed chi^2 statistic divided by the median
    # of the chi^2 distribution with 1 degree of freedom (~0.456)
    inflation_factor = np.median(stats.chi2.isf(p_values, 1)) / 0.456

    plt.figure(figsize=(6, 6))
    plt.plot(expected_p, observed_p, '.', markersize=4, markeredgewidth=0, alpha=0.8)
    # diagonal reference line for the null expectation
    plt.plot(expected_p, expected_p, 'k--', linewidth=0.75)
    # raw string: '\l' is an invalid escape sequence in a normal string literal
    plt.text(3.5, 0.5, r"$\lambda=%.2f$" % inflation_factor)
    plt.xlabel('Expected $-log10(p-value)$')
    plt.ylabel('Observed $-log10(p-value)$')
    plt.savefig(data_dir.joinpath('qq_plot_' + pathlib.Path(filename).with_suffix('.png').as_posix()))
    plt.clf()
89 |
--------------------------------------------------------------------------------
/preprocess/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/grimmlab/permGWAS/3f7a1c2e3e4c63281f5719425ff9ac405f8d9cfc/preprocess/__init__.py
--------------------------------------------------------------------------------
/preprocess/data_loader.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import pandas as pd
4 | import h5py
5 | import pathlib
6 | from pandas_plink import read_plink1_bin
7 |
8 |
class Genotype:
    """
    Class for loading of genotype data.

    **Attributes**

        - genotype_file (*pathlib.Path*): full path to genotype file for data loading
        - X (*torch.tensor*): matrix containing genotype values
        - sample_ids (*numpy.array*): ids of genotype samples
        - chromosomes (*numpy.array*): chromosome identifier of SNPs
        - positions (*numpy.array*): position identifier of SNPs
        - maf (*torch.tensor*): vector containing minor allele frequencies
        - sample_index (*numpy.array*): indices of the samples to load from the genotype matrix
        - n_samples (*int*): number of samples
        - n_snps (*int*): number of SNPs
        - maf_threshold (*int*): threshold for minor allele frequency filtering

    **Functions**

        - load_genotype_ids(load_genotype): load sample_ids from .h5/.hdf5/.h5py file
        - load_genotype_data(): load and encode genotype data from file, calls the following functions:
            - load_genotype_hdf5_file(sample_index, snp_lower_index, snp_upper_index): load genotype data from
              .h5/.hdf5/.h5py files
            - load_genotype_csv_file(): load genotype data from .csv files
            - load_genotype_binary_plink_file(): load genotype data from binary PLINK files
            - load_genotype_plink_file(): load genotype data from PLINK files
            - encode_genotype(): check encoding of genotype, change to additive if necessary, create torch.tensor,
              calls the following functions:
                - check_encoding()
                - get_additive_encoding()
        - load_genotype_batch_wise(maf_threshold, snp_lower_index, snp_upper_index): batch-wise loading and filtering
          of genotype data
        - filter_monomorphic_snps(): remove monomorphic SNPs
        - get_minor_allele_freq(): compute minor allele frequencies
        - use_maf_filter(maf_threshold): filter for minor allele frequency
        - save_genotype_hdf5(filename): save genotype data as .h5 file
        - reset_genotype(): delete X for batch-wise loading
        - get_matched_data(data, row_index): filter samples of data

    :param genotype_file: full path to genotype file
    :param maf_threshold: threshold for minor allele frequency filtering
    :param not_add: use if genotype has different / not additive encoding
    """

    def __init__(self, genotype_file: pathlib.Path, maf_threshold: int = 0, not_add: bool = False):
        self.genotype_file = genotype_file
        self.maf_threshold = maf_threshold
        self.not_add = not_add
        # data attributes stay None until one of the load_* functions fills them
        self.sample_ids = None
        self.chromosomes = None
        self.positions = None
        self.X = None
        self.maf = None
        self.sample_index = None
        self.n_samples = None
        self.n_snps = None

    def load_genotype_ids(self, load_genotype: bool = False) -> None:
        """
        Load sample_ids from .h5/.hdf5/.h5py genotype file.

        :param load_genotype: if False (batch-wise loading), also store the total number of SNPs
        """
        if self.genotype_file.suffix not in ('.h5', '.hdf5', '.h5py'):
            raise Exception('Can only load genotype IDs from .h5/.hdf5/.h5py files.')
        with h5py.File(self.genotype_file, "r") as gt:
            self.sample_ids = gt['sample_ids'][:].astype(str)
            if not load_genotype:
                self.n_snps = len(gt['position_index'][:])

    def load_genotype_data(self):
        """
        Load and encode genotype data. Accepts PLINK files, binary PLINK files, .csv and .h5, .hdf5, .h5py files.
        For .h5/.hdf5/.h5py files only load needed samples defined in self.sample_index.
        After loading check encoding of genotype and change to additive if necessary.
        Return genotype matrix as torch.tensor, chromosomes, positions and sample_ids as np.arrays.
        """
        # dispatch on file suffix; hdf5 keeps previously loaded sample_ids, the others return them
        suffix = self.genotype_file.suffix
        if suffix in ('.h5', '.hdf5', '.h5py'):
            self.X, self.chromosomes, self.positions = self.load_genotype_hdf5_file()
        elif suffix == '.csv':
            self.X, self.sample_ids, self.chromosomes, self.positions = self.load_genotype_csv_file()
        elif suffix in ('.bed', '.bim', '.fam'):
            self.X, self.sample_ids, self.chromosomes, self.positions = self.load_genotype_binary_plink_file()
        elif suffix in ('.map', '.ped'):
            self.X, self.sample_ids, self.chromosomes, self.positions = self.load_genotype_plink_file()
        # check if genotype is in additive encoding, change encoding if not
        # change X from np.array to torch.tensor
        self.encode_genotype()
        self.n_samples = len(self.sample_ids)
        self.n_snps = len(self.positions)

    def load_genotype_batch_wise(self, device: torch.device = torch.device("cpu"), save_meta: bool = True,
                                 snp_lower_index: int = None, snp_upper_index: int = None):
        """
        Load and encode genotype data batch-wise. After loading filter for monomorphic snps and minor allele frequency.
        Only accept .h5/.hdf5/.h5py files.

        :param device: device (cpu/gpu) for computations
        :param save_meta: save chromosome and position identifiers if True
        :param snp_lower_index: lower bound of batch
        :param snp_upper_index: upper bound of batch
        """
        self.X, chromosomes, positions = self.load_genotype_hdf5_file(snp_lower_index=snp_lower_index,
                                                                      snp_upper_index=snp_upper_index)
        self.encode_genotype()
        chromosomes, positions = self.filter_monomorphic_snps(chromosomes=chromosomes, positions=positions)
        maf = self.get_minor_allele_freq()
        if self.maf_threshold != 0:
            maf, chromosomes, positions = self.use_maf_filter(maf=maf, chromosomes=chromosomes, positions=positions)
        self.X = self.X.to(device)

        if save_meta:
            # accumulate metadata across batches; first batch initializes, later batches append
            if self.chromosomes is None:
                self.chromosomes = chromosomes
                self.positions = positions
                self.maf = maf
            else:
                self.chromosomes = np.concatenate((self.chromosomes, chromosomes))
                self.positions = np.concatenate((self.positions, positions))
                self.maf = torch.cat((self.maf, maf))

    def load_genotype_hdf5_file(self, snp_lower_index: int = None, snp_upper_index: int = None) -> tuple:
        """
        Load genotype matrix from .h5/.hdf5/.h5py file.
        Only load needed samples and SNPs batch wise:
            will only load specified samples given in sample_index
            if snp_upper_bound/snp_lower_bound is given, will load SNPs batch-wise, else will load all SNPs
        H5, HDF5, H5PY files need to have the following structure:
            snps:           genotype matrix either in additive encoding or in raw nucleotide encoding (biallelic
                            notation (i.e. 'AA', 'AT', ...) or iupac notation (i.e. 'A', 'W', ...)) with samples as
                            rows and markers as columns
            sample_ids:     sample identifier in the same order as the rows of the genotype matrix
            chr_index:      chromosome identifier in the same order as the columns of the genotype matrix
            position_index: position number (integer) in the same order as the columns of the genotype matrix

        :param snp_lower_index: lower bound of batch
        :param snp_upper_index: upper bound of batch

        :return: Genotype values, chromosomes and positions and sample_ids if no sample_index is specified
        """
        with h5py.File(self.genotype_file, "r") as gt:
            chromosomes = gt['chr_index'][snp_lower_index:snp_upper_index].astype(str)
            positions = gt['position_index'][snp_lower_index:snp_upper_index].astype(int)
            if isinstance(self.sample_index, (np.ndarray, list)):
                # using sample indices directly does not work for h5py --> use workaround:
                # h5py fancy indexing requires sorted, unique indices, so read the unique sorted
                # rows first and then restore the requested order (with duplicates) via `inverse`
                indices, inverse = np.unique(self.sample_index, return_inverse=True)
                X = gt['snps'][indices, snp_lower_index:snp_upper_index]
                X = X[inverse, :]
                return X, chromosomes, positions
            else:
                raise Exception('sample_index needs to be a list in order to load certain genotype samples only.')

    def load_genotype_csv_file(self) -> (np.array, np.array, np.array, np.array):
        """
        Load .csv genotype file. File must have the following structure:
        First column must contain the sample ids, the column names should be the SNP ids as CHROMOSOME_POSITION.
        The values should be the genotype matrix either in additive encoding or in raw nucleotide encoding (biallelic
        notation (i.e. 'AA', 'AT', ...) or iupac notation (i.e. 'A', 'W', ...)).

        :return: Genotype values, sample_ids, chromosomes and positions
        """
        gt = pd.read_csv(self.genotype_file, index_col=0)
        # split 'CHROMOSOME_POSITION' column headers into chromosome and position arrays
        snp_ids = np.array(list(map(lambda a: a.split("_"), gt.columns.values)))
        chromosomes = snp_ids[:, 0]
        positions = snp_ids[:, 1].astype(int)
        sample_ids = np.asarray(gt.index, dtype=str)
        X = np.asarray(gt.values)
        return X, sample_ids, chromosomes, positions

    def load_genotype_binary_plink_file(self) -> (np.array, np.array, np.array, np.array):
        """
        Load binary PLINK file, .bim, .fam, .bed files with same prefix need to be in same folder.

        :return: Genotype values, sample_ids, chromosomes and positions
        """
        prefix = self.genotype_file.with_suffix('').as_posix()
        gt = read_plink1_bin(prefix + '.bed', prefix + '.bim', prefix + '.fam', ref="a0", verbose=False)
        sample_ids = np.array(gt['fid'], dtype=str).flatten()
        positions = np.array(gt['pos']).flatten()
        chromosomes = np.array(gt['chrom']).flatten()
        X = np.asarray(gt.values)
        return X, sample_ids, chromosomes, positions

    def load_genotype_plink_file(self) -> (np.array, np.array, np.array, np.array):
        """
        Load PLINK files, .map and .ped file with same prefix need to be in same folder.
        Accepts GENOTYPENAME.ped and GENOTYPENAME.map as input

        :return: Genotype values, sample_ids, chromosomes and positions
        """
        prefix = self.genotype_file.with_suffix('').as_posix()
        # .map file: one SNP per line, chromosome in the first field, position in the last
        with open(prefix + '.map', 'r') as f:
            chromosomes = []
            positions = []
            for line in f:
                tmp = line.strip().split(" ")
                chromosomes.append(tmp[0].strip())
                positions.append(int(float(tmp[-1].strip())))
        chromosomes = np.array(chromosomes)
        positions = np.array(positions)
        # map pairs of nucleotides (one allele per field in the .ped file) to single iupac codes
        iupac_map = {"AA": "A", "GG": "G", "TT": "T", "CC": "C", "AG": "R", "GA": "R", "RR": "R", "CT": "Y", "TC": "Y",
                     "YY": "Y", "GC": "S", "CG": "S", "SS": "S", "AT": "W", "TA": "W", "WW": "W", "GT": "K", "TG": "K",
                     "KK": "K", "AC": "M", "CA": "M", "MM": "M"}
        with open(prefix + '.ped', 'r') as f:
            sample_ids = []
            X = []
            for line in f:
                tmp = line.strip().split(" ")
                sample_ids.append(tmp[1].strip())
                snps = []
                # genotype calls start at field 6, two fields (one per allele) per SNP
                j = 6
                while j < len(tmp) - 1:
                    snps.append(iupac_map[tmp[j] + tmp[j + 1]])
                    j += 2
                X.append(snps)
        sample_ids = np.array(sample_ids, dtype=str)
        X = np.array(X)
        return X, sample_ids, chromosomes, positions

    def encode_genotype(self):
        """
        first check encoding of genotype, then change to additive if necessary, finally change X from np.array
        to torch.tensor
        """
        if self.not_add:
            print('Genotype might not be in additive encoding. Will not check encoding of genotype.')
            self.X = torch.tensor(self.X, dtype=torch.float64)
        else:
            enc_of_X = self.check_encoding()
            # if genotype in biallelic notation, will change to iupac notation and then encode additively
            if enc_of_X == 'biallelic':
                iupac_map = {"AA": "A", "GG": "G", "TT": "T", "CC": "C", "AG": "R", "GA": "R", "CT": "Y", "TC": "Y",
                             "GC": "S", "CG": "S", "AT": "W", "TA": "W", "GT": "K", "TG": "K", "AC": "M", "CA": "M"}
                self.X = np.vectorize(iupac_map.__getitem__)(self.X.astype(str))
                enc_of_X = 'iupac'
            if enc_of_X == 'iupac':
                self.X = torch.tensor(self.get_additive_encoding(), dtype=torch.float64)
            elif enc_of_X == 'additive':
                self.X = torch.tensor(self.X, dtype=torch.float64)
            else:
                raise Exception('Genotype in wrong encoding. Can only deal with additive, iupac and biallelic '
                                'encoding. If you want to use different encoding use flag -not_add.')

    def check_encoding(self) -> str:
        """
        Check the encoding of the genotype matrix.
        Only inspects the first element of X and assumes the whole matrix uses the same encoding.

        :return: encoding of the genotype matrix ('iupac', 'additive' or 'biallelic')
        """
        if self.X[0, 0].astype(str) in ['A', 'C', 'G', 'T', 'M', 'R', 'W', 'S', 'Y', 'K']:
            return 'iupac'
        elif self.X[0, 0] in [0, 1, 2]:
            return 'additive'
        elif self.X[0, 0] in ["AA", "GG", "TT", "CC", "AG", "GA", "CT", "TC", "GC", "CG", "AT", "TA", "GT", "TG",
                              "AC", "CA"]:
            return 'biallelic'
        else:
            raise Exception('Genotype in wrong encoding. Can only deal with additive, iupac and biallelic encoding. '
                            'Please check again.')

    def get_additive_encoding(self):
        """
        Function to compute additive encoding of genotype matrix with
            0: homozygous major allele
            1: heterozygous
            2: homozygous minor allele

        :return: genotype in additive encoding
        """
        alleles = []
        index_arr = []
        # valid pairs of homozygous alleles and, in the same order, the iupac code of their heterozygote
        pairs = [['A', 'C'], ['A', 'G'], ['A', 'T'], ['C', 'G'], ['C', 'T'], ['G', 'T']]
        heterozygous_nuc = ['M', 'R', 'W', 'S', 'Y', 'K']
        for i, col in enumerate(np.transpose(self.X)):
            # unique: sorted distinct alleles at this SNP; inv: per-sample index into unique;
            # counts: how often each allele occurs --> encode once per allele, then broadcast via inv
            unique, inv, counts = np.unique(col, return_counts=True, return_inverse=True)
            unique = unique.astype(str)
            # boolean marks the homozygous (A/C/G/T) entries of unique
            boolean = (unique == 'A') | (unique == 'T') | (unique == 'C') | (unique == 'G')
            # tmp holds the additive value assigned to each entry of unique (default 0 = homozygous major)
            tmp = np.zeros(3)
            if len(unique) > 3:
                raise Exception('More than two alleles encountered at snp ' + str(i))
            elif len(unique) == 3:
                # two homozygous alleles plus one heterozygous code
                hetero = unique[~boolean][0]
                homozygous = unique[boolean]
                # the heterozygous code must be the iupac code of exactly this homozygous pair
                for j, pair in enumerate(pairs):
                    if all(h in pair for h in homozygous) and hetero != heterozygous_nuc[j]:
                        raise Exception('More than two alleles encountered at snp ' + str(i))
                tmp[~boolean] = 1.0
                # the rarer homozygous allele is the minor allele
                tmp[np.argmin(counts[boolean])] = 2.0
            elif len(unique) == 2:
                if list(unique) in pairs:
                    # two homozygous alleles, no heterozygotes: rarer one gets 2
                    tmp[np.argmin(counts)] = 2.0
                else:
                    # one homozygous and one heterozygous allele: heterozygote gets 1
                    tmp[(~boolean).nonzero()] = 1.0
            else:
                # monomorphic SNP; a lone heterozygous code still encodes as 1
                if unique[0] in heterozygous_nuc:
                    tmp[0] = 1.0
            alleles.append(tmp)
            index_arr.append(inv)
        alleles = np.transpose(np.array(alleles))
        index_arr = np.transpose(np.array(index_arr))
        cols = np.arange(alleles.shape[1])
        # look up each sample's additive value per SNP via its allele index
        return alleles[index_arr, cols]

    def filter_monomorphic_snps(self, chromosomes: np.array = None, positions: np.array = None) -> (np.array, np.array):
        """
        Remove monomorphic SNPs, i.e., SNPs that are constant

        :param chromosomes: vector with chromosome identifiers
        :param positions: vector with position identifiers

        :return filtered chromosomes and positions (only when chromosomes is given; otherwise the
            instance attributes are filtered in place and None is returned)
        """
        # a column is monomorphic iff every entry equals the first row's entry
        tmp = self.X == self.X[0, :]
        self.X = self.X[:, ~tmp.all(0)]
        if chromosomes is None:
            self.chromosomes = self.chromosomes[~tmp.all(0)]
            self.positions = self.positions[~tmp.all(0)]
        else:
            return chromosomes[~tmp.all(0)], positions[~tmp.all(0)]

    def get_minor_allele_freq(self):
        """
        Function to calculate minor allele frequencies of each SNP.
        Assumes additive encoding (0/1/2), i.e. the allele count per SNP is the column sum
        and each sample carries two alleles.

        :return: vector containing frequencies
        """

        return (torch.sum(self.X, 0)) / (2 * self.X.shape[0])

    def use_maf_filter(self, maf: torch.tensor = None, chromosomes: np.array = None, positions: np.array = None) \
            -> (torch.tensor, np.array, np.array):
        """
        filter genotype by minor allele frequency

        :param maf: vector containing minor allele frequencies
        :param chromosomes: vector with chromosome identifiers
        :param positions: vector with position identifiers

        :return: tensor with filtered maf frequencies, chromosomes and positions
        """
        if maf is None:
            # in-place filtering of the instance attributes
            tmp = self.maf > (self.maf_threshold / 100)
            self.X = self.X[:, tmp]
            self.chromosomes = self.chromosomes[tmp]
            self.positions = self.positions[tmp]
            self.maf = self.maf[tmp]
        else:
            # for batch-wise loading
            tmp = maf > (self.maf_threshold / 100)
            self.X = self.X[:, tmp]
            return maf[tmp], chromosomes[tmp], positions[tmp]

    def save_genotype_hdf5(self, filename: pathlib.Path):
        """
        Save genotype data to .h5 file

        :param filename: Full path to new genotype file
        """
        if any(elem is None for elem in [self.X, self.sample_ids, self.chromosomes, self.positions]):
            raise Exception('Cannot save genotype file. Some values are None, please check again.')
        print('Save genotype data as ' + filename.as_posix() + '.\nThis might take some time.')
        with h5py.File(filename.with_suffix('.h5'), 'w') as f:
            # strings need to be stored as bytes in hdf5
            f.create_dataset('sample_ids', data=self.sample_ids.astype(bytes), chunks=True, compression="gzip")
            f.create_dataset('chr_index', data=self.chromosomes.astype(bytes), chunks=True, compression="gzip")
            f.create_dataset('position_index', data=self.positions.astype(int), chunks=True, compression="gzip")
            f.create_dataset('snps', data=self.X, chunks=True, compression="gzip", compression_opts=7)
        print('Done saving H5 file.')

    def reset_genotype(self):
        """
        Delete X for batchwise loading
        """
        self.X = None

    @staticmethod
    def get_matched_data(data, row_index: np.array):
        """
        Get rows of data specified in index array

        :param data: data to match, either np.array or torch.tensor
        :param row_index: row-index array for filtering / matching
        """
        if data.ndim == 2:
            return data[row_index, :]
        if data.ndim == 1:
            return data[row_index]
        else:
            raise Exception('Cannot match data, dimensions are wrong. Expected dimension 1 or 2 but got '
                            + str(data.ndim) + ' instead. Please check again.')
397 |
398 |
class Dataset(Genotype):
    """
    Class for loading and preparation of genotype, phenotype, kinship and covariates.

    **Attributes**

    - genotype_file (*pathlib.Path*): full path to genotype file for data loading
    - X (*torch.tensor*): matrix containing genotype values
    - sample_ids (*numpy.array*): ids of genotype samples
    - chromosomes (*numpy.array*): chromosome identifier of SNPs
    - positions (*numpy.array*): position identifier of SNPs
    - y (*torch.tensor*): tensor containing phenotypic values
    - K (*torch.tensor*): kinship matrix
    - fixed (*torch.tensor*): matrix containing fixed effects, i.e. vector of ones and covariates if available
    - maf (*torch.tensor*): vector containing minor allele frequencies
    - sample_index (*np.array*): vector containing sample indices for batch-wise loading of X
    - n_samples (*int*): number of samples
    - n_snps (*int*): number of SNPs
    - maf_threshold (*int*): threshold for minor allele frequency filtering

    **Functions**

    - load_and_prepare_data(): load and match data, calls the following functions:
        - see class Genotype for all genotype specific functions
        - load_phenotype(phenotype_file, trait): load phenotype from file
        - load_kinship(kinship_file): load kinship matrix from file
        - compute_rrk_kinship(): compute realized relationship kernel
        - normalize_kinship(): normalize kinship matrix using a Gower's centered matrix
        - load_covariates(covariates_file, column_list): load covariates from file
        - get_fixed_effects(): create fixed effects vector/matrix
        - match_data(data_ids1, data_ids2): match ids of two datasets
    - to_device(device): move tensors to device

    :param genotype_file: full path to genotype file
    :param phenotype_file: full path to phenotype file
    :param trait: name of phenotypic trait to use
    :param maf_threshold: minor allele frequency threshold to use for SNP filtering, default is 0 (no filtering)
    :param load_genotype: bool, if False load genotype batch-wise during computations, default is False
    :param kinship_file: full path to kinship file, optional, if missing, compute rrk kinship
    :param covariate_file: full path to covariate file, optional
    :param covariate_list: list of covariates to use, optional
    :param not_add: use if genotype has different / not additive encoding
    """

    def __init__(self, genotype_file: pathlib.Path, phenotype_file: pathlib.Path, trait: str, maf_threshold: int = 0,
                 load_genotype: bool = False, kinship_file: pathlib.Path = None, covariate_file: pathlib.Path = None,
                 covariate_list: list = None, not_add: bool = False):
        super().__init__(genotype_file=genotype_file, maf_threshold=maf_threshold, not_add=not_add)

        self.y = None
        self.K = None
        self.fixed = None
        self.load_and_prepare_data(phenotype_file=phenotype_file, trait=trait, load_genotype=load_genotype,
                                   kinship_file=kinship_file, covariate_file=covariate_file,
                                   covariate_list=covariate_list)

    def load_and_prepare_data(self, phenotype_file: pathlib.Path, trait: str, load_genotype: bool = False,
                              kinship_file: pathlib.Path = None, covariate_file: pathlib.Path = None,
                              covariate_list: list = None):
        """
        Load and match genotype, phenotype, kinship and covariates.
        1. Load phenotype from file.
        2. Load genotype and match with pheno:
            If load_genotype is False, only load geno sample_ids from file and match data
            Load genotype sample_ids, match with pheno and load geno data only for needed samples
        3. Filter genotype for monomorphic SNPs and minor allele frequency
        4. Load kinship from file and match with geno, or compute kinship from geno data
        5. if available load covariates from file

        :param phenotype_file: full path to phenotype file
        :param trait: name of phenotypic trait to use
        :param load_genotype: bool, if False load genotype batch-wise during computations, default is False
        :param kinship_file: full path to kinship file, optional, if missing, compute rrk kinship
        :param covariate_file: full path to covariate file, optional
        :param covariate_list: list of covariates to use, optional
        """
        # load phenotype
        y, y_ids = self.load_phenotype(phenotype_file=phenotype_file, trait=trait)
        # load genotype
        if not load_genotype:
            # only load and match sample ids of genotype, values will be loaded batch-wise during computations
            self.load_genotype_ids(load_genotype=False)
            pheno_index, self.sample_index = self.match_data(data_ids1=y_ids, data_ids2=self.sample_ids)
            if len(pheno_index) == 0:
                raise Exception("Samples of genotype and phenotype do not match.")
        else:
            if self.genotype_file.suffix in ('.h5', '.hdf5', '.h5py'):
                # load genotype sample ids, match data and only load genotype values for needed samples
                self.load_genotype_ids()
                pheno_index, self.sample_index = self.match_data(data_ids1=y_ids, data_ids2=self.sample_ids)
                if len(pheno_index) == 0:
                    raise Exception("Samples of genotype and phenotype do not match.")
                self.load_genotype_data()
            else:
                # non-h5 formats must be loaded completely before matching samples
                self.load_genotype_data()
                pheno_index, self.sample_index = self.match_data(data_ids1=y_ids, data_ids2=self.sample_ids)
                if len(pheno_index) == 0:
                    raise Exception("Samples of genotype and phenotype do not match.")
                self.X = self.get_matched_data(data=self.X, row_index=self.sample_index)
            self.filter_monomorphic_snps()
            self.maf = self.get_minor_allele_freq()
            if self.maf_threshold != 0:
                self.use_maf_filter()
            self.n_snps = len(self.positions)
        self.y = self.get_matched_data(data=y, row_index=pheno_index)
        self.sample_ids = self.get_matched_data(data=self.sample_ids, row_index=self.sample_index)
        self.n_samples = len(self.y)
        # kinship
        if kinship_file is None:
            # compute kinship matrix from the genotype data
            self.K = self.compute_rrk_kinship()
        else:
            # load kinship from file and reorder it to the matched sample order
            self.K, K_ids = self.load_kinship(kinship_file=kinship_file)
            _, K_index = self.match_data(data_ids1=self.sample_ids, data_ids2=K_ids)
            if len(K_index) == len(self.sample_ids):
                self.K = self.K[K_index, :][:, K_index]
            else:
                raise Exception("Sample ids of genotype and kinship matrix do not match. Please check again")
        self.normalize_kinship()
        # fixed effects
        if covariate_file is not None:
            # load covariates from file
            cov = self.load_covariates(covariate_file=covariate_file, covariate_list=covariate_list)
            cov_ids = np.asarray(cov.index, dtype=y_ids.dtype).flatten()
            _, cov_index = self.match_data(data_ids1=self.sample_ids, data_ids2=cov_ids)
            if len(cov_index) == len(self.sample_ids):
                # BUGFIX: select the matching rows without flattening first. The previous
                # code flattened the whole (samples, covariates) matrix before indexing,
                # which mixed values of different covariates as soon as more than one
                # covariate column was used. Row-indexing keeps each sample's covariates
                # together; get_fixed_effects() handles the resulting 2D tensor.
                self.fixed = torch.tensor(cov.values, dtype=torch.float64)[cov_index]
            else:
                raise Exception('Sample ids of covariates and phenotype do not match.')
        self.get_fixed_effects()

    def load_phenotype(self, phenotype_file: pathlib.Path, trait: str) -> (torch.Tensor, np.array):
        """
        Load phenotype from file. Accept .csv and single white space separated .txt and .pheno files.
        Phenotype data needs to contain sample identifiers as first column and phenotypic traits as remaining columns.
        The trait name should be the respective column name. Can contain more than one phenotype columns.
        Will drop NAN values during preparation and compute mean over replicates.

        :param phenotype_file: full path to phenotype file
        :param trait: name of phenotypic trait / column to use

        :return: tensor containing phenotypic traits and array containing respective sample_ids
        """

        suffix = phenotype_file.suffix
        # load CSV
        if suffix == ".csv":
            y = pd.read_csv(phenotype_file)
        # load PHENO or TXT
        elif suffix == ".txt":
            y = pd.read_csv(phenotype_file, sep=" ")
        elif suffix == ".pheno":
            y = pd.read_csv(phenotype_file, sep=" ")
            if {'FID', 'IID'}.issubset(set(y.columns)):
                # keep IID as sample identifier column, drop the redundant family id
                y.drop(columns='FID', inplace=True)
        else:
            raise NotImplementedError('Only accept CSV, PHENO and TXT phenotype files')
        # account for replicates: average all measurements per sample id
        y = y.sort_values(y.columns[0]).groupby(y.columns[0]).mean()
        if trait not in y.columns:
            raise Exception('Phenotype ' + trait + ' is not in phenotype file ' + phenotype_file.as_posix())
        else:
            y = y[[trait]].dropna()
        return torch.tensor(y.values, dtype=torch.float64).flatten(), np.asarray(y.index, dtype=str).flatten()

    def load_kinship(self, kinship_file: pathlib.Path) -> (torch.tensor, np.array):
        """
        load kinship matrix from file. Only take .csv or .h5/.hdf5/.h5py files.
        For .csv files sample ids have to be in first column, .h5/.hdf5/.h5py files need to contain the kinship matrix
        with key 'kinship' and the corresponding sample ids with key 'sample_ids'.

        :param kinship_file: full path to kinship file

        :return: torch.tensor containing kinship matrix and array with sample ids
        """
        # load .csv
        suffix = kinship_file.suffix
        if suffix == ".csv":
            kin = pd.read_csv(kinship_file, index_col=0)
            K = torch.tensor(kin.values)
            sample_ids = np.array(kin.index, dtype=str)
        # load .h5/.hdf5/.h5py
        elif suffix in (".h5", ".hdf5", ".h5py"):
            with h5py.File(kinship_file, "r") as f:
                K = torch.tensor(f['kinship'][:], dtype=torch.float64)
                sample_ids = f['sample_ids'][:].astype(str)
        else:
            raise NotImplementedError('Only accept .csv, .h5, .hdf5, .h5py kinship files')
        return K, sample_ids

    def compute_rrk_kinship(self) -> torch.tensor:
        """
        compute realized relationship kernel as kinship matrix

        :return: kinship matrix
        """
        if self.X is None:
            raise Exception('Cannot compute kinship matrix, no genotype matrix available.')
        # column-standardize the genotype before computing X X^T / n_snps
        X_stand = (self.X - self.X.mean(axis=0)) / self.X.std(axis=0)
        K = torch.matmul(X_stand, torch.t(X_stand)) / self.X.shape[1]
        # set negative values in K to zero
        return torch.where(K > 0, K, 0.)

    def normalize_kinship(self):
        """
        normalize kinship matrix using a Gower's centered matrix
        """
        n = self.K.shape[0]
        # P is the centering matrix I - 11^T / n
        P = (torch.eye(n, dtype=self.K.dtype, device=self.K.device) -
             torch.ones(n, n, dtype=self.K.dtype, device=self.K.device) / n)
        self.K = (n - 1) / torch.sum(torch.mul(P, self.K)) * self.K

    def load_covariates(self, covariate_file: pathlib.Path, covariate_list: list = None) -> pd.DataFrame:
        """
        Only take .csv files: sample ids have to be in first column, if column_list is available, will load all columns
        specified, else will load all available columns

        :param covariate_file: full path to covariates file
        :param covariate_list: list containing column names/headers of covariates to load

        :return: pandas DataFrame containing covariates with sample ids as index
        """
        if covariate_file.suffix == ".csv":
            covs = pd.read_csv(covariate_file)
            # average replicates per sample id and drop samples with missing values
            covs = covs.sort_values(covs.columns[0]).groupby(covs.columns[0]).mean().dropna()
            if covariate_list is not None:
                if set(covariate_list).issubset(set(covs.columns)):
                    covs = covs[covariate_list]
                else:
                    raise Exception('Specified covariates are not available in covariate file. Please check again.')
        else:
            raise NotImplementedError('Only accept .csv covariates files')
        return covs

    def get_fixed_effects(self):
        """
        Check for covariates and create fixed effects matrix with ones as first column and covariates as remaining
        columns if available --> dim: (n, c+1)
        """
        if self.fixed is None:
            self.fixed = torch.ones((len(self.y), 1), dtype=torch.float64)
        elif self.fixed.ndim == 1:
            # kept for backward compatibility with callers that set a 1D covariate vector
            self.fixed = torch.stack((torch.ones(len(self.y), dtype=torch.float64), self.fixed), dim=1)
        else:
            self.fixed = torch.cat((torch.ones((len(self.y), 1), dtype=torch.float64), self.fixed), dim=1)

    def to_device(self, device: torch.device):
        """
        move data to device

        :param device: cpu or cuda
        """
        self.y = self.y.to(device)
        self.K = self.K.to(device)
        self.fixed = self.fixed.to(device)

    @staticmethod
    def match_data(data_ids1: np.array, data_ids2: np.array) -> (np.array, np.array):
        """
        match two datasets

        :param data_ids1: ids of first dataset
        :param data_ids2: ids of second dataset

        :return: two arrays with indices of matched data, ordered by the first dataset
        """
        return (np.reshape(data_ids1, (data_ids1.shape[0], 1)) == data_ids2.astype(data_ids1.dtype)).nonzero()
667 |
--------------------------------------------------------------------------------
/supplementary_data/simulated_phenotypes_her30.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/grimmlab/permGWAS/3f7a1c2e3e4c63281f5719425ff9ac405f8d9cfc/supplementary_data/simulated_phenotypes_her30.h5
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/grimmlab/permGWAS/3f7a1c2e3e4c63281f5719425ff9ac405f8d9cfc/utils/__init__.py
--------------------------------------------------------------------------------
/utils/check_functions.py:
--------------------------------------------------------------------------------
1 | import pathlib
2 | import torch
3 | import pandas as pd
4 | from utils import helper_functions
5 | import models
6 |
7 |
def check_all_arguments(args: dict) -> dict:
    """
    Check user specified arguments for plausibility and turn all file paths to pathlib.Path objects.

    Performs the following steps:
    1. merge a yaml config file into args if one was given
    2. check that all specified input files exist
    3. resolve the trait argument ('all', single name or list of names)
    4. force full genotype loading when batch-wise loading is not possible
    5. select the computation device (GPU if available and not disabled)
    6. validate model, maf threshold, covariates and permutation settings

    :param args: dict with user specified arguments

    :return: checked and updated dict with arguments
    """
    if args["config_file"] is not None:
        # merge config file values into args, then drop the key so it is not passed on
        args = helper_functions.parse_config_file(args=args)
        del args["config_file"]
    # check if specified files exist and convert them to pathlib.Path objects
    args["genotype_file"] = check_file(filepath=args["genotype_file"])
    args["phenotype_file"] = check_file(filepath=args["phenotype_file"])
    args["kinship_file"] = check_file(filepath=args["kinship_file"])
    args["covariate_file"] = check_file(filepath=args["covariate_file"])
    if args["trait"] is None:
        # NOTE(review): the default trait stays a plain str here, while explicitly
        # given traits are wrapped in a list below — confirm downstream handles both
        args["trait"] = 'phenotype_value'
    elif (args["trait"] == 'all') or (args["trait"] == ['all']):
        # use every phenotype column found in the phenotype file
        print('Will perform computations on all available phenotypes.')
        args["out_file"] = None
        suffix = args["phenotype_file"].suffix
        if suffix == ".csv":
            df = pd.read_csv(args["phenotype_file"], index_col=0)
        # load TXT (single white space separated)
        elif suffix == ".txt":
            df = pd.read_csv(args["phenotype_file"], index_col=0, sep=" ")
        # load PHENO; drop plink-style FID/IID columns if present
        elif suffix == ".pheno":
            df = pd.read_csv(args["phenotype_file"], index_col=0, sep=" ")
            if 'FID' in df.columns:
                df.drop(columns='FID', inplace=True)
            if 'IID' in df.columns:
                df.drop(columns='IID', inplace=True)
        else:
            raise Exception('Only accept .txt, .pheno or .csv phenotype files.')
        args["trait"] = df.columns.tolist()
    elif isinstance(args["trait"], str):
        args["trait"] = [args["trait"]]
    elif isinstance(args["trait"], list):
        # multiple traits: per-trait output file names will be derived later
        args["out_file"] = None
    else:
        raise Exception('Something is wrong with the trait name. Please check again.')
    # sanity checks for fast loading and batch-wise loading:
    # batch-wise loading requires a precomputed kinship and an HDF5 genotype file
    if args["kinship_file"] is None:
        args["load_genotype"] = True
    if args["genotype_file"].suffix not in ('.h5', '.hdf5', '.h5py'):
        args["load_genotype"] = True
    # check gpu availability and select the computation device
    if torch.cuda.is_available() and not args["disable_gpu"]:
        dev = "cuda:" + str(args["device"])
        print('GPU is available. Perform computations on device ', dev)
    else:
        dev = "cpu"
        print('GPU is not available. Perform computations on device ', dev)
    del args["disable_gpu"]
    args["device"] = torch.device(dev)
    # check model; default to the linear mixed model
    if args["model"] is None:
        args["model"] = 'lmm'
    if args["model"] not in models.__all__:
        raise NotImplementedError('Specified model not implemented')

    # sanity checks for maf threshold and covariate list
    if args["maf_threshold"] is None:
        args["maf_threshold"] = 0
    if isinstance(args["covariate_list"], str):
        args["covariate_list"] = [args["covariate_list"]]
    # check permutation settings
    if args["perm"] is None:
        args["perm"] = 0
    if args["perm"] > 0:
        if args["perm_method"] not in ('x', 'y'):
            raise NotImplementedError(' Can only perform permutation methods x and y. Please check again.')
    if args["adj_p_value"] and args["perm"] == 0:
        raise Exception('Can not compute adjusted p-values with 0 permutations. Please check again.')
    return args
82 |
83 |
def check_output_files(args: dict) -> dict:
    """
    Ensure that output directory and output file are set.

    Falls back to '<trait>.csv' as file name and '<cwd>/results' as directory,
    then delegates to check_dir_paths for existence checks and renaming.

    :param args: dict with user specified arguments

    :return: updated dict with arguments
    """
    if args["out_dir"] is None:
        args["out_dir"] = pathlib.Path.cwd().joinpath('results')
    if args["out_file"] is None:
        args["out_file"] = args["trait"] + '.csv'
    args["out_dir"], args["out_file"] = check_dir_paths(args["out_dir"], args["out_file"])
    return args
92 |
93 |
def check_file(filepath: str):
    """
    Check that the specified file exists.

    :param filepath: full path to file, may be None

    :return: path to file as Path object, or None if no path was given

    :raises FileNotFoundError: if the path does not point to an existing file
    """
    if filepath is None:
        return None
    path = pathlib.Path(filepath)
    if not path.is_file():
        raise FileNotFoundError('There is no file ', path.as_posix())
    return path
110 |
111 |
def check_dir_paths(out_dir: str, out_file: str, prefix: str = 'p_values_') -> (pathlib.Path, pathlib.Path):
    """
    Check that the directory for result files exists; create it if it does not.
    If a result file with the same name already exists, rename the new file by
    appending (i) to its stem (or raise for .h5 files, which must not be overwritten).

    :param out_dir: directory to save result files
    :param out_file: result file
    :param prefix: prefix to use when checking for existing files, default is p_values_

    :return: directory path and (possibly renamed) file name
    """
    my_path = pathlib.Path(out_dir)
    # the prefix determines the expected file type
    if prefix in ('manhattan_', 'qq_plot_'):
        suffix = '.png'
    elif prefix == '':
        suffix = '.h5'
    else:
        suffix = '.csv'
    out_file = pathlib.Path(out_file).with_suffix(suffix).as_posix()
    if not my_path.is_dir():
        my_path.mkdir(parents=True, exist_ok=True)
        return my_path, out_file
    if not my_path.joinpath(prefix + out_file).exists():
        return my_path, out_file
    if suffix == '.h5':
        raise Exception('File %s already exists in chosen directory %s.' % (out_file, out_dir))
    # find the first free name of the form stem(i)suffix
    stem = pathlib.Path(out_file).with_suffix('').as_posix()
    i = 1
    new_file = stem + '(' + str(i) + ')' + suffix
    while my_path.joinpath(prefix + new_file).exists():
        i += 1
        new_file = stem + '(' + str(i) + ')' + suffix
    print('The file %s already exists in chosen directory %s. Changed filename to %s.'
          % (prefix + out_file, out_dir, prefix + new_file))
    return my_path, new_file
151 |
--------------------------------------------------------------------------------
/utils/helper_functions.py:
--------------------------------------------------------------------------------
1 | import yaml
2 | import pathlib
3 | import importlib
4 | import inspect
5 | import numpy as np
6 |
7 | import models
8 |
9 |
def parse_config_file(args: dict) -> dict:
    """
    Read yaml config file to update all user specified arguments

    :param args: dict with user specified arguments; must contain 'config_file'

    :return: updated dict with arguments

    :raises FileNotFoundError: if the config file does not exist
    :raises Exception: if the config file is not a yaml file
    """
    config_path = pathlib.Path(args["config_file"])
    if not config_path.is_file():
        raise FileNotFoundError('Specified config file does not exist. Please check again.')
    if config_path.suffix not in ('.yaml', '.yml'):
        raise Exception('Only accept yaml config files. Please check again.')
    # use a context manager so the file handle is closed again
    # (the previous bare open() call leaked the handle)
    with open(config_path) as config_file:
        config = yaml.safe_load(config_file)
    args.update(config)
    return args
26 |
27 |
def get_model_class_name(model_name: str = 'lmm'):
    """
    Resolve the user specified model name to the corresponding model class.

    Imports the module 'models.<model_name>' and returns the first class that
    is defined in that module.

    :param model_name: user input of model name
    :return: model class name
    """
    if model_name not in models.__all__:
        raise NotImplementedError('No class named ', model_name)
    module_name = 'models.' + model_name
    module = importlib.import_module(module_name)
    for _, cls in inspect.getmembers(module, inspect.isclass):
        # skip classes that were merely imported into the module
        if cls.__module__ == module_name:
            return cls
    raise NotImplementedError('No class named ', module_name)
44 |
45 |
def estimate_heritability(v_g: float, v_e: float) -> float:
    """
    Compute the narrow sense heritability, i.e. the fraction of the total
    phenotypic variance explained by the genetic variance component.

    :param v_g: genetic variance component
    :param v_e: residual variance component
    :return: narrow sense heritability
    """
    total_variance = v_g + v_e
    return v_g / total_variance
54 |
55 |
def compute_perm_threshold(min_p_val: np.array, sig_level: int) -> float:
    """
    Compute the permutation-based significance threshold as the sig_level-th
    percentile of the minimal p-values observed over all permutations.

    :param min_p_val: array with minimal p-values
    :param sig_level: significance level as percentage value
    :return: threshold
    """
    threshold = np.percentile(min_p_val, sig_level)
    return threshold
64 |
65 |
def compute_bonf_threshold(number_snps: int, sig_level: int) -> float:
    """
    Compute the Bonferroni-corrected significance threshold.

    :param number_snps: number of SNPs
    :param sig_level: significance level as percentage value
    :return: threshold
    """
    alpha = sig_level / 100
    return alpha / number_snps
74 |
75 |
def print_summary_stats(genotype_file: pathlib.Path, phenotype_file: pathlib.Path, trait: str, samples: int, snps: int,
                        model: str, maf_threshold: int, perm: int, v_g: float, v_e: float, h2: float, bonf1: float,
                        bonf5: float, perm1: float, perm5: float, time: float, kinship_file: pathlib.Path = None,
                        covariate_file: pathlib.Path = None, covariate_list: list = None, perm_method: str = None):
    """
    Print summary statistics of a GWAS run to stdout.

    :param genotype_file: full path to genotype file
    :param phenotype_file: full path to phenotype file
    :param trait: name of phenotypic trait
    :param samples: number of samples used
    :param snps: number of SNPs used
    :param model: model used for GWAS
    :param maf_threshold: threshold used for maf filtering
    :param perm: number of permutations
    :param v_g: genetic variance component
    :param v_e: residual variance component
    :param h2: narrow-sense heritability, only printed for model 'lmm'
    :param bonf1: Bonferroni threshold significance level 1%
    :param bonf5: Bonferroni threshold significance level 5%
    :param perm1: permutation-based threshold significance level 1%, only printed if not None
    :param perm5: permutation-based threshold significance level 5%
    :param time: total run time in seconds
    :param kinship_file: full path to kinship file, optional
    :param covariate_file: full path to covariate file, optional
    :param covariate_list: list containing covariates
    :param perm_method: method used for permutations
    """
    print('\n')
    print('+++++++++ Summary Statistics +++++++++')
    print('## Genotype file: ' + genotype_file.as_posix())
    print('## Phenotype file: ' + phenotype_file.as_posix())
    print('## Phenotype: ' + trait)
    if covariate_file is not None:
        print('## Covariate file: ' + covariate_file.as_posix())
        if covariate_list is not None:
            print('## Used covariates: ' + ",".join(covariate_list))
        else:
            print('## Used all available covariates')
    if kinship_file is not None:
        print('## Kinship file: ' + kinship_file.as_posix())
    print('## Number of individuals: ' + str(samples))
    print('## Number of SNPs: ' + str(snps))
    print('## Model: ' + model)
    print('## MAF threshold: ' + str(maf_threshold))
    print('## Number of permutations: ' + str(perm))
    if perm_method is not None:
        print('## permutation method: ' + perm_method)
    if model == 'lmm':
        # variance components and heritability are only estimated by the lmm model
        print('## v_g estimate in null model: ' + str(v_g))
        print('## v_e estimate in null model: ' + str(v_e))
        print('## Narrow-sense heritability estimate: ' + str(h2))
    print('## Bonferroni threshold (1% significance level): ' + str(bonf1))
    print('## Bonferroni threshold (5% significance level): ' + str(bonf5))
    if perm1 is not None:
        print('## Permutation-based threshold (1% significance level): ' + str(perm1))
        print('## Permutation-based threshold (5% significance level): ' + str(perm5))
    print('## Total time: %.2f s' %time)
    print('+++++++++++++++++++++++++++')
    print('\n')
135 |
136 |
def write_summary_stats(out_dir: pathlib.Path, out_file: str, genotype_file: pathlib.Path, phenotype_file: pathlib.Path,
                        trait: str, samples: int, snps: int, model: str, maf_threshold: int, perm: int, v_g: float,
                        v_e: float, h2: float, bonf1: float, bonf5: float, perm1: float, perm5: float, time: float,
                        kinship_file: pathlib.Path = None, covariate_file: pathlib.Path = None,
                        covariate_list: list = None, perm_method: str = None):
    """
    Save summary statistics of a GWAS run to a tab separated txt file
    ('summary_statistics_<out_file>.txt' within out_dir).

    :param out_dir: directory to save the summary file in
    :param out_file: base name of the result file; used to derive the summary file name
    :param genotype_file: full path to genotype file
    :param phenotype_file: full path to phenotype file
    :param trait: name of phenotypic trait
    :param samples: number of samples used
    :param snps: number of SNPs used
    :param model: model used for GWAS
    :param maf_threshold: threshold used for maf filtering
    :param perm: number of permutations
    :param v_g: genetic variance component
    :param v_e: residual variance component
    :param h2: narrow-sense heritability, only written for model 'lmm'
    :param bonf1: Bonferroni threshold significance level 1%
    :param bonf5: Bonferroni threshold significance level 5%
    :param perm1: permutation-based threshold significance level 1%, only written if not None
    :param perm5: permutation-based threshold significance level 5%
    :param time: total run time in seconds
    :param kinship_file: full path to kinship file, optional
    :param covariate_file: full path to covariate file, optional
    :param covariate_list: list containing covariates
    :param perm_method: method used for permutations
    """
    filename = out_dir.joinpath('summary_statistics_' + pathlib.Path(out_file).with_suffix('.txt').as_posix())
    with open(filename, 'w') as f:
        f.write('Summary Statistics:\n')
        f.write('## Genotype file:\t' + genotype_file.as_posix() + '\n')
        f.write('## Phenotype file:\t' + phenotype_file.as_posix() + '\n')
        f.write('## Phenotype:\t' + trait + '\n')
        if covariate_file is not None:
            f.write('## Covariate file:\t' + covariate_file.as_posix() + '\n')
            if covariate_list is not None:
                f.write('## Used covariates:\t' + ",".join(covariate_list) + '\n')
            else:
                f.write('## Used all available covariates' + '\n')
        if kinship_file is not None:
            f.write('## Kinship file:\t' + kinship_file.as_posix() + '\n')
        f.write('## Number of individuals:\t' + str(samples) + '\n')
        f.write('## Number of SNPs:\t' + str(snps) + '\n')
        f.write('## Model:\t' + model + '\n')
        f.write('## MAF threshold:\t' + str(maf_threshold) + '\n')
        f.write('## Number of permutations:\t' + str(perm) + '\n')
        if perm_method is not None:
            f.write('## permutation method:\t' + perm_method + '\n')
        if model == 'lmm':
            # variance components and heritability are only estimated by the lmm model
            f.write('## v_g estimate in null model:\t' + str(v_g) + '\n')
            f.write('## v_e estimate in null model:\t' + str(v_e) + '\n')
            f.write('## Narrow-sense heritability estimate:\t' + str(h2) + '\n')
        f.write('## Bonferroni threshold (1% significance level):\t' + str(bonf1) + '\n')
        f.write('## Bonferroni threshold (5% significance level):\t' + str(bonf5) + '\n')
        if perm1 is not None:
            f.write('## Permutation-based threshold (1% significance level):\t' + str(perm1) + '\n')
            f.write('## Permutation-based threshold (5% significance level):\t' + str(perm5) + '\n')
        f.write('## Total time:\t' + str(time) + ' s\n')
198 |
199 |
def get_summary_stats(out_dir: pathlib.Path, out_file: str, genotype_file: pathlib.Path, phenotype_file: pathlib.Path,
                      trait: str, samples: int, snps: int, model: str, maf_threshold: int, perm: int, v_g: float,
                      v_e: float, min_p_val: np.array, time: float, kinship_file: pathlib.Path = None,
                      covariate_file: pathlib.Path = None, covariate_list: list = None, perm_method: str = None):
    """
    Compute heritability, Bonferroni and permutation-based thresholds, then
    both print the summary statistics and write them to file.

    :param out_dir: directory to save the summary file in
    :param out_file: base name of the result file
    :param genotype_file: full path to genotype file
    :param phenotype_file: full path to phenotype file
    :param trait: name of phenotypic trait
    :param samples: number of samples used
    :param snps: number of SNPs used
    :param model: model used for GWAS
    :param maf_threshold: threshold used for maf filtering
    :param perm: number of permutations
    :param v_g: genetic variance component
    :param v_e: residual variance component
    :param min_p_val: minimal p-values; permutation thresholds are skipped if None
    :param time: total run time in seconds
    :param kinship_file: full path to kinship file, optional
    :param covariate_file: full path to covariate file, optional
    :param covariate_list: list containing covariates
    :param perm_method: method used for permutations
    """
    h2 = estimate_heritability(v_g=v_g, v_e=v_e) if model == 'lmm' else None
    bonf1 = compute_bonf_threshold(number_snps=snps, sig_level=1)
    bonf5 = compute_bonf_threshold(number_snps=snps, sig_level=5)
    if min_p_val is None:
        perm1, perm5 = None, None
    else:
        perm1 = compute_perm_threshold(min_p_val=min_p_val, sig_level=1)
        perm5 = compute_perm_threshold(min_p_val=min_p_val, sig_level=5)
    # both outputs receive the identical set of statistics
    stats = dict(genotype_file=genotype_file, phenotype_file=phenotype_file, trait=trait, samples=samples,
                 snps=snps, model=model, maf_threshold=maf_threshold, perm=perm, v_g=v_g, v_e=v_e, h2=h2,
                 bonf1=bonf1, bonf5=bonf5, perm1=perm1, perm5=perm5, time=time, kinship_file=kinship_file,
                 covariate_file=covariate_file, covariate_list=covariate_list, perm_method=perm_method)
    write_summary_stats(out_dir=out_dir, out_file=out_file, **stats)
    print_summary_stats(**stats)
--------------------------------------------------------------------------------