├── gitignore
├── .DS_Store
├── docs
│   ├── .DS_Store
│   ├── build
│   │   ├── .DS_Store
│   │   ├── objects.inv
│   │   ├── _static
│   │   │   ├── file.png
│   │   │   ├── plus.png
│   │   │   ├── minus.png
│   │   │   ├── css
│   │   │   │   ├── fonts
│   │   │   │   │   ├── lato-bold.woff
│   │   │   │   │   ├── lato-bold.woff2
│   │   │   │   │   ├── lato-normal.woff
│   │   │   │   │   ├── lato-normal.woff2
│   │   │   │   │   ├── Roboto-Slab-Bold.woff
│   │   │   │   │   ├── lato-bold-italic.woff
│   │   │   │   │   ├── Roboto-Slab-Bold.woff2
│   │   │   │   │   ├── fontawesome-webfont.eot
│   │   │   │   │   ├── fontawesome-webfont.ttf
│   │   │   │   │   ├── lato-bold-italic.woff2
│   │   │   │   │   ├── lato-normal-italic.woff
│   │   │   │   │   ├── Roboto-Slab-Regular.woff
│   │   │   │   │   ├── Roboto-Slab-Regular.woff2
│   │   │   │   │   ├── fontawesome-webfont.woff
│   │   │   │   │   ├── fontawesome-webfont.woff2
│   │   │   │   │   └── lato-normal-italic.woff2
│   │   │   │   └── badge_only.css
│   │   │   ├── documentation_options.js
│   │   │   ├── js
│   │   │   │   ├── badge_only.js
│   │   │   │   ├── html5shiv.min.js
│   │   │   │   ├── html5shiv-printshiv.min.js
│   │   │   │   └── theme.js
│   │   │   ├── pygments.css
│   │   │   ├── doctools.js
│   │   │   ├── language_data.js
│   │   │   └── sphinx_highlight.js
│   │   ├── .doctrees
│   │   │   ├── api.doctree
│   │   │   ├── genal.doctree
│   │   │   ├── index.doctree
│   │   │   ├── modules.doctree
│   │   │   ├── environment.pickle
│   │   │   └── introduction.doctree
│   │   ├── _images
│   │   │   └── MR_plot_SBP_AS.png
│   │   ├── .buildinfo
│   │   ├── _sources
│   │   │   ├── api.rst.txt
│   │   │   ├── genal.rst.txt
│   │   │   ├── modules.rst.txt
│   │   │   └── index.rst.txt
│   │   ├── search.html
│   │   ├── _modules
│   │   │   ├── index.html
│   │   │   └── genal
│   │   │       └── clump.html
│   │   ├── py-modindex.html
│   │   └── index.html
│   ├── source
│   │   ├── .DS_Store
│   │   ├── Images
│   │   │   ├── genal_logo.png
│   │   │   ├── MR_plot_SBP_AS.png
│   │   │   └── Genal_flowchart.png
│   │   ├── conf.py
│   │   ├── api.rst
│   │   ├── modules.rst
│   │   └── index.rst
│   ├── requirements.txt
│   ├── Makefile
│   └── make.bat
├── genal_logo.png
├── Genal_flowchart.png
├── .gitignore
├── genal
│   ├── __init__.py
│   ├── constants.py
│   ├── clump.py
│   ├── genes.py
│   ├── snp_query.py
│   ├── colocalization.py
│   ├── lift.py
│   ├── MRpresso.py
│   ├── proxy.py
│   ├── extract_prs.py
│   └── association.py
├── readthedocs.yaml
├── .readthedocs.yaml
└── pyproject.toml

/gitignore:
--------------------------------------------------------------------------------
1 | .gitignore
--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/.DS_Store
--------------------------------------------------------------------------------
/docs/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/.DS_Store
--------------------------------------------------------------------------------
/genal_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/genal_logo.png
--------------------------------------------------------------------------------
/Genal_flowchart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/Genal_flowchart.png
--------------------------------------------------------------------------------
/docs/build/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/.DS_Store
--------------------------------------------------------------------------------
/docs/build/objects.inv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/objects.inv
--------------------------------------------------------------------------------
/docs/source/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/source/.DS_Store
--------------------------------------------------------------------------------
/docs/build/_static/file.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/file.png
--------------------------------------------------------------------------------
/docs/build/_static/plus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/plus.png
--------------------------------------------------------------------------------
/docs/build/_static/minus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/minus.png
--------------------------------------------------------------------------------
/docs/build/.doctrees/api.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/.doctrees/api.doctree
--------------------------------------------------------------------------------
/docs/build/.doctrees/genal.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/.doctrees/genal.doctree
--------------------------------------------------------------------------------
/docs/build/.doctrees/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/.doctrees/index.doctree
--------------------------------------------------------------------------------
/docs/source/Images/genal_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/source/Images/genal_logo.png
--------------------------------------------------------------------------------
/docs/build/.doctrees/modules.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/.doctrees/modules.doctree
--------------------------------------------------------------------------------
/docs/build/_images/MR_plot_SBP_AS.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_images/MR_plot_SBP_AS.png
--------------------------------------------------------------------------------
/docs/source/Images/MR_plot_SBP_AS.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/source/Images/MR_plot_SBP_AS.png
--------------------------------------------------------------------------------
/docs/build/.doctrees/environment.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/.doctrees/environment.pickle
--------------------------------------------------------------------------------
/docs/source/Images/Genal_flowchart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/source/Images/Genal_flowchart.png
--------------------------------------------------------------------------------
/docs/build/.doctrees/introduction.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/.doctrees/introduction.doctree
--------------------------------------------------------------------------------
/docs/build/_static/css/fonts/lato-bold.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/lato-bold.woff
--------------------------------------------------------------------------------
/docs/build/_static/css/fonts/lato-bold.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/lato-bold.woff2
--------------------------------------------------------------------------------
/docs/build/_static/css/fonts/lato-normal.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/lato-normal.woff
--------------------------------------------------------------------------------
/docs/build/_static/css/fonts/lato-normal.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/lato-normal.woff2
--------------------------------------------------------------------------------
/docs/build/_static/css/fonts/Roboto-Slab-Bold.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/Roboto-Slab-Bold.woff
--------------------------------------------------------------------------------
/docs/build/_static/css/fonts/lato-bold-italic.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/lato-bold-italic.woff
--------------------------------------------------------------------------------
/docs/build/_static/css/fonts/Roboto-Slab-Bold.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/Roboto-Slab-Bold.woff2
--------------------------------------------------------------------------------
/docs/build/_static/css/fonts/fontawesome-webfont.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/fontawesome-webfont.eot
--------------------------------------------------------------------------------
/docs/build/_static/css/fonts/fontawesome-webfont.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/fontawesome-webfont.ttf
--------------------------------------------------------------------------------
/docs/build/_static/css/fonts/lato-bold-italic.woff2:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/lato-bold-italic.woff2 -------------------------------------------------------------------------------- /docs/build/_static/css/fonts/lato-normal-italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/lato-normal-italic.woff -------------------------------------------------------------------------------- /docs/build/_static/css/fonts/Roboto-Slab-Regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/Roboto-Slab-Regular.woff -------------------------------------------------------------------------------- /docs/build/_static/css/fonts/Roboto-Slab-Regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/Roboto-Slab-Regular.woff2 -------------------------------------------------------------------------------- /docs/build/_static/css/fonts/fontawesome-webfont.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/fontawesome-webfont.woff -------------------------------------------------------------------------------- /docs/build/_static/css/fonts/fontawesome-webfont.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/fontawesome-webfont.woff2 -------------------------------------------------------------------------------- /docs/build/_static/css/fonts/lato-normal-italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/lato-normal-italic.woff2 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | dist/ 3 | .ipynb_checkpoints/ 4 | ipynb_checkpoints/ 5 | genal/.ipynb_checkpoints/ 6 | test_data/ 7 | cursor/ 8 | tests/ 9 | tmp_GENAL/ 10 | REVIEW_REPORT.md 11 | TASKS.md -------------------------------------------------------------------------------- /docs/build/.buildinfo: -------------------------------------------------------------------------------- 1 | # Sphinx build info version 1 2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 
3 | config: 1a3c03fa317dbf0f46b6f7567774d6c5 4 | tags: 645f666f9bcd5a90fca523b33c5a78b7 5 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | sphinx_rtd_theme 3 | aiohttp==3.9.5 4 | nest_asyncio==1.5.5 5 | numpy>=1.24.4,<2.0 6 | pandas>=2.0.3 7 | plotnine==0.12.3 8 | psutil==5.9.1 9 | pyliftover==0.4 10 | scikit_learn>=1.3.0 11 | scipy>=1.11.4 12 | statsmodels==0.14.0 13 | tqdm==4.66.1 14 | wget==3.2 -------------------------------------------------------------------------------- /docs/build/_static/documentation_options.js: -------------------------------------------------------------------------------- 1 | const DOCUMENTATION_OPTIONS = { 2 | VERSION: 'v0.8', 3 | LANGUAGE: 'en', 4 | COLLAPSE_INDEX: false, 5 | BUILDER: 'html', 6 | FILE_SUFFIX: '.html', 7 | LINK_SUFFIX: '.html', 8 | HAS_SOURCE: true, 9 | SOURCELINK_SUFFIX: '.txt', 10 | NAVIGATION_WITH_KEYS: false, 11 | SHOW_SEARCH_SUMMARY: true, 12 | ENABLE_SEARCH_SHORTCUTS: true, 13 | }; -------------------------------------------------------------------------------- /genal/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from .tools import default_config, write_config, set_plink, install_plink, delete_tmp, get_reference_panel_path, get_plink_path 4 | from .geno_tools import Combine_Geno 5 | from .genes import filter_by_gene_func 6 | from .constants import CONFIG_DIR 7 | 8 | __version__ = "1.4.5" 9 | 10 | config_path = os.path.join(CONFIG_DIR, "config.json") 11 | 12 | if not os.path.exists(CONFIG_DIR): 13 | os.makedirs(CONFIG_DIR) 14 | 15 | 16 | if not os.path.exists(config_path): 17 | write_config(default_config()) 18 | print(f"Configuration file for genal placed at '{config_path}'") 19 | 20 | from .Geno import Geno 21 | -------------------------------------------------------------------------------- /readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.11" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | 18 | # We recommend specifying your dependencies to enable reproducible builds: 19 | # https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 20 | python: 21 | install: 22 | - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.11" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/source/conf.py 17 | 18 | # We recommend specifying your dependencies to enable reproducible builds: 19 | # https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 20 | python: 21 
| install: 22 | - requirements: docs/requirements.txt 23 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/build/_static/js/badge_only.js: -------------------------------------------------------------------------------- 1 | !function(e){var t={};function r(n){if(t[n])return t[n].exports;var o=t[n]={i:n,l:!1,exports:{}};return e[n].call(o.exports,o,o.exports,r),o.l=!0,o.exports}r.m=e,r.c=t,r.d=function(e,t,n){r.o(e,t)||Object.defineProperty(e,t,{enumerable:!0,get:n})},r.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},r.t=function(e,t){if(1&t&&(e=r(e)),8&t)return e;if(4&t&&"object"==typeof e&&e&&e.__esModule)return e;var n=Object.create(null);if(r.r(n),Object.defineProperty(n,"default",{enumerable:!0,value:e}),2&t&&"string"!=typeof e)for(var o in e)r.d(n,o,function(t){return e[t]}.bind(null,o));return n},r.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return r.d(t,"a",t),t},r.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r.p="",r(r.s=4)}({4:function(e,t,r){}}); -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 
21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["flit_core >=3.2,<4"] 3 | build-backend = "flit_core.buildapi" 4 | 5 | [project] 6 | name = "genal-python" # Updated name for PyPI 7 | version = "1.4.5" 8 | authors = [{name = "Cyprien Rivier", email = "riviercyprien@gmail.com"}] 9 | description = "A python toolkit for polygenic risk scoring and mendelian randomization." 10 | readme = "README.md" 11 | requires-python = ">=3.8" 12 | license = {file = "LICENSE"} 13 | classifiers = [ 14 | "Programming Language :: Python :: 3", 15 | "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", 16 | "Operating System :: OS Independent", 17 | ] 18 | 19 | # Dependencies section 20 | dependencies = [ 21 | "aiohttp>=3.7", 22 | "nest_asyncio>=1.5", 23 | "numpy>=1.17.3", 24 | "pandas>=1.0", 25 | "plotnine>=0.9", 26 | "psutil>=5.0", 27 | "requests>=2.0", 28 | "pyliftover>=0.4", 29 | "scikit_learn>=0.24", 30 | "scipy>=1.7,<1.13", 31 | "statsmodels>=0.13,<0.15", 32 | "tqdm>=4.38", 33 | "wget>=3.0", 34 | "fastparquet>=0.4", 35 | "pyarrow>=3.0" 36 | ] 37 | 38 | [tool.setuptools.package-dir] 39 | genal = "genal/" 40 | 41 | 42 | [project.urls] 43 | Home = "https://github.com/CypRiv/genal" 44 | 45 | [tool.flit.module] 46 | name = "genal" 47 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | import os 9 | import sys 10 | import sphinx_rtd_theme 11 | sys.path.insert(0, os.path.abspath('../../')) 12 | 13 | project = 'genal' 14 | copyright = '2023, Cyprien A. Rivier' 15 | author = 'Cyprien A. 
Rivier' 16 | release = 'v1.1' 17 | 18 | 19 | # -- General configuration --------------------------------------------------- 20 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 21 | 22 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.napoleon', 'sphinx.ext.viewcode' ] 23 | 24 | templates_path = ['_templates'] 25 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 26 | 27 | 28 | 29 | # -- Options for HTML output ------------------------------------------------- 30 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 31 | 32 | html_theme = 'sphinx_rtd_theme' 33 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 34 | html_static_path = ['_static'] 35 | -------------------------------------------------------------------------------- /genal/constants.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | STANDARD_COLUMNS = ["CHR", "POS", "SNP", "EA", "NEA", "BETA", "SE", "P"] 4 | BUILDS = ["37", "38"] 5 | POPULATIONS = ["EUR", "AFR", "EAS", "AMR", "SAS"] 6 | REF_PANELS = [f"{pop}_{build}" for pop in POPULATIONS for build in BUILDS] 7 | REF_PANEL_COLUMNS = ["CHR", "SNP", "POS", "A1", "A2"] 8 | BUCKET_URL = "https://storage.googleapis.com/genal_files/" 9 | REF_PANELS_URL = BUCKET_URL + "{panel}.tar.gz" 10 | REF_PARQUET_URL = BUCKET_URL + "reference_variants_{build}.parquet" 11 | CONFIG_DIR = os.path.expanduser("~/.genal/") 12 | CHECKS_DICT = { 13 | "CHR": False, 14 | "POS": False, 15 | "P": False, 16 | "EA": False, 17 | "NEA": False, 18 | "BETA": False, 19 | "SNP": False, 20 | "NA_removal": False, 21 | } 22 | MR_METHODS_NAMES = { 23 | "IVW": "Inverse-Variance Weighted", 24 | "IVW-RE": "Inverse Variance Weighted (Random Effects)", 25 | "IVW-FE": "Inverse Variance Weighted (Fixed Effects)", 26 | "UWR": "Unweighted Regression", 27 | "WM": "Weighted Median", 28 | "WM-pen": "Penalised Weighted Median", 29 | "Simple-median": "Simple Median", 30 | "Sign": "Sign concordance test", 31 | "Egger": ("MR Egger", "Egger Intercept"), 32 | "Egger-boot": ("MR Egger bootstrap", "Egger Intercept bootstrap"), 33 | "Simple-mode": "Simple mode", 34 | "Weighted-mode": "Weighted mode", 35 | } -------------------------------------------------------------------------------- /docs/build/_sources/api.rst.txt: -------------------------------------------------------------------------------- 1 | === 2 | API 3 | === 4 | 5 | genal.GENO class 6 | ----------------- 7 | 8 | .. autoclass:: genal.Geno 9 | :members: 10 | :undoc-members: 11 | :show-inheritance: 12 | 13 | genal.geno\_tools module 14 | ------------------------ 15 | 16 | .. automodule:: genal.geno_tools 17 | :members: 18 | :undoc-members: 19 | :show-inheritance: 20 | 21 | genal.tools module 22 | ------------------ 23 | 24 | .. automodule:: genal.tools 25 | :members: 26 | :undoc-members: 27 | :show-inheritance: 28 | 29 | genal.clump module 30 | ------------------ 31 | 32 | .. automodule:: genal.clump 33 | :members: 34 | :undoc-members: 35 | :show-inheritance: 36 | 37 | genal.proxy module 38 | ------------------ 39 | 40 | .. automodule:: genal.proxy 41 | :members: 42 | :undoc-members: 43 | :show-inheritance: 44 | 45 | genal.extract\_prs module 46 | ------------------------- 47 | 48 | .. automodule:: genal.extract_prs 49 | :members: 50 | :undoc-members: 51 | :show-inheritance: 52 | 53 | genal.association module 54 | ------------------------ 55 | 56 | .. 
automodule:: genal.association 57 | :members: 58 | :undoc-members: 59 | :show-inheritance: 60 | 61 | genal.MR\_tools module 62 | ---------------------- 63 | 64 | .. automodule:: genal.MR_tools 65 | :members: 66 | :undoc-members: 67 | :show-inheritance: 68 | 69 | genal.MR module 70 | --------------- 71 | 72 | .. automodule:: genal.MR 73 | :members: 74 | :undoc-members: 75 | :show-inheritance: 76 | 77 | genal.MRpresso module 78 | --------------------- 79 | 80 | .. automodule:: genal.MRpresso 81 | :members: 82 | :undoc-members: 83 | :show-inheritance: 84 | 85 | genal.lift module 86 | ----------------- 87 | 88 | .. automodule:: genal.lift 89 | :members: 90 | :undoc-members: 91 | :show-inheritance: 92 | 93 | genal.snp_query module 94 | ---------------------- 95 | 96 | .. automodule:: genal.snp_query 97 | :members: 98 | :undoc-members: 99 | :show-inheritance: 100 | 101 | -------------------------------------------------------------------------------- /docs/build/_sources/genal.rst.txt: -------------------------------------------------------------------------------- 1 | genal package 2 | ============= 3 | 4 | Submodules 5 | ---------- 6 | 7 | genal.GENO module 8 | ----------------- 9 | 10 | .. automodule:: genal.Geno 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | genal.MR module 16 | --------------- 17 | 18 | .. automodule:: genal.MR 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | genal.MR\_tools module 24 | ---------------------- 25 | 26 | .. automodule:: genal.MR_tools 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | genal.MRpresso module 32 | --------------------- 33 | 34 | .. automodule:: genal.MRpresso 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | genal.association module 40 | ------------------------ 41 | 42 | .. automodule:: genal.association 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | genal.clump module 48 | ------------------ 49 | 50 | .. automodule:: genal.clump 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | genal.extract\_prs module 56 | ------------------------- 57 | 58 | .. automodule:: genal.extract_prs 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | genal.geno\_tools module 64 | ------------------------ 65 | 66 | .. automodule:: genal.geno_tools 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | genal.lift module 72 | ----------------- 73 | 74 | .. automodule:: genal.lift 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | 79 | genal.proxy module 80 | ------------------ 81 | 82 | .. automodule:: genal.proxy 83 | :members: 84 | :undoc-members: 85 | :show-inheritance: 86 | 87 | genal.tools module 88 | ------------------ 89 | 90 | .. automodule:: genal.tools 91 | :members: 92 | :undoc-members: 93 | :show-inheritance: 94 | 95 | Module contents 96 | --------------- 97 | 98 | .. automodule:: genal 99 | :members: 100 | :undoc-members: 101 | :show-inheritance: 102 | -------------------------------------------------------------------------------- /docs/source/api.rst: -------------------------------------------------------------------------------- 1 | === 2 | API 3 | === 4 | 5 | genal.GENO class 6 | ----------------- 7 | 8 | .. 
autoclass:: genal.Geno 9 | :members: __init__, preprocess_data, get_reference_panel, clump, update_snpids, extract_snps, prs, set_phenotype, association_test, query_outcome, MR, MR_plot, MR_forest, MRpresso, filter_by_gene, colocalize, lift, query_gwas_catalog, standardize_betas, update_eaf, sort_group, copy, save 10 | :undoc-members: 11 | :show-inheritance: 12 | 13 | genal.geno\_tools module 14 | ------------------------ 15 | 16 | .. automodule:: genal.geno_tools 17 | :members: 18 | :undoc-members: 19 | :show-inheritance: 20 | 21 | genal.tools module 22 | ------------------ 23 | 24 | .. automodule:: genal.tools 25 | :members: 26 | :undoc-members: 27 | :show-inheritance: 28 | 29 | genal.clump module 30 | ------------------ 31 | 32 | .. automodule:: genal.clump 33 | :members: 34 | :undoc-members: 35 | :show-inheritance: 36 | 37 | genal.proxy module 38 | ------------------ 39 | 40 | .. automodule:: genal.proxy 41 | :members: 42 | :undoc-members: 43 | :show-inheritance: 44 | 45 | genal.extract\_prs module 46 | ------------------------- 47 | 48 | .. automodule:: genal.extract_prs 49 | :members: 50 | :undoc-members: 51 | :show-inheritance: 52 | 53 | genal.association module 54 | ------------------------ 55 | 56 | .. automodule:: genal.association 57 | :members: 58 | :undoc-members: 59 | :show-inheritance: 60 | 61 | genal.MR\_tools module 62 | ---------------------- 63 | 64 | .. automodule:: genal.MR_tools 65 | :members: 66 | :undoc-members: 67 | :show-inheritance: 68 | 69 | genal.MR module 70 | --------------- 71 | 72 | .. automodule:: genal.MR 73 | :members: 74 | :undoc-members: 75 | :show-inheritance: 76 | 77 | genal.MRpresso module 78 | --------------------- 79 | 80 | .. automodule:: genal.MRpresso 81 | :members: 82 | :undoc-members: 83 | :show-inheritance: 84 | 85 | genal.lift module 86 | ----------------- 87 | 88 | .. automodule:: genal.lift 89 | :members: 90 | :undoc-members: 91 | :show-inheritance: 92 | 93 | genal.snp_query module 94 | ---------------------- 95 | 96 | .. automodule:: genal.snp_query 97 | :members: 98 | :undoc-members: 99 | :show-inheritance: 100 | 101 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | ============== 2 | The Geno class 3 | ============== 4 | 5 | The main object of the package is the :class:`~genal.Geno` class that contains the SNP-level data and manipulates it through its methods. 6 | 7 | .. autoclass:: genal.Geno 8 | 9 | ============== 10 | Main functions 11 | ============== 12 | 13 | Preprocessing 14 | ============= 15 | 16 | The preprocessing of the SNP-level data is performed with the :func:`~genal.Geno.preprocess_data` method: 17 | 18 | .. automethod:: genal.Geno.preprocess_data 19 | 20 | 21 | Clumping 22 | ======== 23 | 24 | Clumping is performed with the :func:`~genal.Geno.clump` method: 25 | 26 | .. automethod:: genal.Geno.clump 27 | 28 | Polygenic Risk Scoring 29 | ====================== 30 | 31 | The computation of a polygenic risk score in a target population is performed with the :func:`~genal.Geno.prs` method: 32 | 33 | .. automethod:: genal.Geno.prs 34 | 35 | Querying outcome data 36 | ===================== 37 | 38 | Before running Mendelian Randomization, the extraction of the genetic instruments from the :class:`~genal.Geno` object containing the SNP-outcome association data is done with :func:`~genal.Geno.query_outcome` method: 39 | 40 | .. 
automethod:: genal.Geno.query_outcome 41 | 42 | Mendelian Randomization 43 | ======================= 44 | 45 | Various Mendelian Randomization methods are computed with the :func:`~genal.Geno.MR` method: 46 | 47 | .. automethod:: genal.Geno.MR 48 | 49 | MR-PRESSO 50 | ========= 51 | 52 | The MR-PRESSO algorithm to detect and correct horizontal pleiotropy is executed with :func:`~genal.Geno.MRpresso` method: 53 | 54 | .. automethod:: genal.Geno.MRpresso 55 | 56 | Phenotype assignment 57 | ==================== 58 | 59 | Before running SNP-association tests, assigning a dataframe with phenotypic data to the :class:`~genal.Geno` object is done with :func:`~genal.Geno.set_phenotype` method: 60 | 61 | .. automethod:: genal.Geno.set_phenotype 62 | 63 | SNP-association tests 64 | ===================== 65 | 66 | SNP-association testing is conducted with :func:`~genal.Geno.association_test` method: 67 | 68 | .. automethod:: genal.Geno.association_test 69 | 70 | Genetic lifting 71 | =============== 72 | 73 | Lifting the SNP data to another genetic build is done with :func:`~genal.Geno.lift` method: 74 | 75 | .. automethod:: genal.Geno.lift 76 | 77 | GWAS Catalog 78 | ============ 79 | 80 | Querying the GWAS Catalog to extract traits associated with the SNPs is done with :func:`~genal.Geno.query_gwas_catalog` method: 81 | 82 | .. automethod:: genal.Geno.query_gwas_catalog -------------------------------------------------------------------------------- /docs/build/_sources/modules.rst.txt: -------------------------------------------------------------------------------- 1 | ============== 2 | The Geno class 3 | ============== 4 | 5 | The main object of the package is the :class:`~genal.Geno` class that contains the SNP-level data and manipulates it through its methods. 6 | 7 | .. autoclass:: genal.Geno 8 | 9 | ============== 10 | Main functions 11 | ============== 12 | 13 | Preprocessing 14 | ============= 15 | 16 | The preprocessing of the SNP-level data is performed with the :func:`~genal.Geno.preprocess_data` method: 17 | 18 | .. automethod:: genal.Geno.preprocess_data 19 | 20 | 21 | Clumping 22 | ======== 23 | 24 | Clumping is performed with the :func:`~genal.Geno.clump` method: 25 | 26 | .. automethod:: genal.Geno.clump 27 | 28 | Polygenic Risk Scoring 29 | ====================== 30 | 31 | The computation of a polygenic risk score in a target population is performed with the :func:`~genal.Geno.prs` method: 32 | 33 | .. automethod:: genal.Geno.prs 34 | 35 | Querying outcome data 36 | ===================== 37 | 38 | Before running Mendelian Randomization, the extraction of the genetic instruments from the :class:`~genal.Geno` object containing the SNP-outcome association data is done with :func:`~genal.Geno.query_outcome` method: 39 | 40 | .. automethod:: genal.Geno.query_outcome 41 | 42 | Mendelian Randomization 43 | ======================= 44 | 45 | Various Mendelian Randomization methods are computed with the :func:`~genal.Geno.MR` method: 46 | 47 | .. automethod:: genal.Geno.MR 48 | 49 | MR-PRESSO 50 | ========= 51 | 52 | The MR-PRESSO algorithm to detect and correct horizontal pleiotropy is executed with :func:`~genal.Geno.MRpresso` method: 53 | 54 | .. automethod:: genal.Geno.MRpresso 55 | 56 | Phenotype assignment 57 | ==================== 58 | 59 | Before running SNP-association tests, assigning a dataframe with phenotypic data to the :class:`~genal.Geno` object is done with :func:`~genal.Geno.set_phenotype` method: 60 | 61 | .. 
automethod:: genal.Geno.set_phenotype 62 | 63 | SNP-association tests 64 | ===================== 65 | 66 | SNP-association testing is conducted with :func:`~genal.Geno.association_test` method: 67 | 68 | .. automethod:: genal.Geno.association_test 69 | 70 | Genetic lifting 71 | =============== 72 | 73 | Lifting the SNP data to another genetic build is done with :func:`~genal.Geno.lift` method: 74 | 75 | .. automethod:: genal.Geno.lift 76 | 77 | GWAS Catalog 78 | ============ 79 | 80 | Querying the GWAS Catalog to extract traits associated with the SNPs is done with :func:`~genal.Geno.query_gwas_catalog` method: 81 | 82 | .. automethod:: genal.Geno.query_gwas_catalog -------------------------------------------------------------------------------- /docs/build/_static/js/html5shiv.min.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @preserve HTML5 Shiv 3.7.3 | @afarkas @jdalton @jon_neal @rem | MIT/GPL2 Licensed 3 | */ 4 | !function(a,b){function c(a,b){var c=a.createElement("p"),d=a.getElementsByTagName("head")[0]||a.documentElement;return c.innerHTML="x",d.insertBefore(c.lastChild,d.firstChild)}function d(){var a=t.elements;return"string"==typeof a?a.split(" "):a}function e(a,b){var c=t.elements;"string"!=typeof c&&(c=c.join(" ")),"string"!=typeof a&&(a=a.join(" ")),t.elements=c+" "+a,j(b)}function f(a){var b=s[a[q]];return b||(b={},r++,a[q]=r,s[r]=b),b}function g(a,c,d){if(c||(c=b),l)return c.createElement(a);d||(d=f(c));var e;return e=d.cache[a]?d.cache[a].cloneNode():p.test(a)?(d.cache[a]=d.createElem(a)).cloneNode():d.createElem(a),!e.canHaveChildren||o.test(a)||e.tagUrn?e:d.frag.appendChild(e)}function h(a,c){if(a||(a=b),l)return a.createDocumentFragment();c=c||f(a);for(var e=c.frag.cloneNode(),g=0,h=d(),i=h.length;i>g;g++)e.createElement(h[g]);return e}function i(a,b){b.cache||(b.cache={},b.createElem=a.createElement,b.createFrag=a.createDocumentFragment,b.frag=b.createFrag()),a.createElement=function(c){return t.shivMethods?g(c,a,b):b.createElem(c)},a.createDocumentFragment=Function("h,f","return function(){var n=f.cloneNode(),c=n.createElement;h.shivMethods&&("+d().join().replace(/[\w\-:]+/g,function(a){return b.createElem(a),b.frag.createElement(a),'c("'+a+'")'})+");return n}")(t,b.frag)}function j(a){a||(a=b);var d=f(a);return!t.shivCSS||k||d.hasCSS||(d.hasCSS=!!c(a,"article,aside,dialog,figcaption,figure,footer,header,hgroup,main,nav,section{display:block}mark{background:#FF0;color:#000}template{display:none}")),l||i(a,d),a}var k,l,m="3.7.3-pre",n=a.html5||{},o=/^<|^(?:button|map|select|textarea|object|iframe|option|optgroup)$/i,p=/^(?:a|b|code|div|fieldset|h1|h2|h3|h4|h5|h6|i|label|li|ol|p|q|span|strong|style|table|tbody|td|th|tr|ul)$/i,q="_html5shiv",r=0,s={};!function(){try{var a=b.createElement("a");a.innerHTML="",k="hidden"in a,l=1==a.childNodes.length||function(){b.createElement("a");var a=b.createDocumentFragment();return"undefined"==typeof a.cloneNode||"undefined"==typeof a.createDocumentFragment||"undefined"==typeof a.createElement}()}catch(c){k=!0,l=!0}}();var t={elements:n.elements||"abbr article aside audio bdi canvas data datalist details dialog figcaption figure footer header hgroup main mark meter nav output picture progress section summary template time video",version:m,shivCSS:n.shivCSS!==!1,supportsUnknownElements:l,shivMethods:n.shivMethods!==!1,type:"default",shivDocument:j,createElement:g,createDocumentFragment:h,addElements:e};a.html5=t,j(b),"object"==typeof 
module&&module.exports&&(module.exports=t)}("undefined"!=typeof window?window:this,document); -------------------------------------------------------------------------------- /docs/build/_static/css/badge_only.css: -------------------------------------------------------------------------------- 1 | .clearfix{*zoom:1}.clearfix:after,.clearfix:before{display:table;content:""}.clearfix:after{clear:both}@font-face{font-family:FontAwesome;font-style:normal;font-weight:400;src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713?#iefix) format("embedded-opentype"),url(fonts/fontawesome-webfont.woff2?af7ae505a9eed503f8b8e6982036873e) format("woff2"),url(fonts/fontawesome-webfont.woff?fee66e712a8a08eef5805a46892932ad) format("woff"),url(fonts/fontawesome-webfont.ttf?b06871f281fee6b241d60582ae9369b9) format("truetype"),url(fonts/fontawesome-webfont.svg?912ec66d7572ff821749319396470bde#FontAwesome) format("svg")}.fa:before{font-family:FontAwesome;font-style:normal;font-weight:400;line-height:1}.fa:before,a .fa{text-decoration:inherit}.fa:before,a .fa,li .fa{display:inline-block}li .fa-large:before{width:1.875em}ul.fas{list-style-type:none;margin-left:2em;text-indent:-.8em}ul.fas li .fa{width:.8em}ul.fas li .fa-large:before{vertical-align:baseline}.fa-book:before,.icon-book:before{content:"\f02d"}.fa-caret-down:before,.icon-caret-down:before{content:"\f0d7"}.fa-caret-up:before,.icon-caret-up:before{content:"\f0d8"}.fa-caret-left:before,.icon-caret-left:before{content:"\f0d9"}.fa-caret-right:before,.icon-caret-right:before{content:"\f0da"}.rst-versions{position:fixed;bottom:0;left:0;width:300px;color:#fcfcfc;background:#1f1d1d;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;z-index:400}.rst-versions a{color:#2980b9;text-decoration:none}.rst-versions .rst-badge-small{display:none}.rst-versions .rst-current-version{padding:12px;background-color:#272525;display:block;text-align:right;font-size:90%;cursor:pointer;color:#27ae60}.rst-versions .rst-current-version:after{clear:both;content:"";display:block}.rst-versions .rst-current-version .fa{color:#fcfcfc}.rst-versions .rst-current-version .fa-book,.rst-versions .rst-current-version .icon-book{float:left}.rst-versions .rst-current-version.rst-out-of-date{background-color:#e74c3c;color:#fff}.rst-versions .rst-current-version.rst-active-old-version{background-color:#f1c40f;color:#000}.rst-versions.shift-up{height:auto;max-height:100%;overflow-y:scroll}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:grey;display:none}.rst-versions .rst-other-versions hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:1px solid #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px;max-height:90%}.rst-versions.rst-badge .fa-book,.rst-versions.rst-badge .icon-book{float:none;line-height:30px}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version .fa-book,.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge>.rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and (max-width:768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}} 
-------------------------------------------------------------------------------- /docs/build/_sources/index.rst.txt: -------------------------------------------------------------------------------- 1 | .. genal documentation master file, created by 2 | sphinx-quickstart on Thu Sep 14 14:04:16 2023. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | genal: A Python Toolkit for Genetic Risk Scoring and Mendelian Randomization 7 | ============================================================================ 8 | 9 | :Author: Cyprien Rivier 10 | :Date: |today| 11 | :Version: "0.8" 12 | 13 | Genal is a python module designed to make it easy to run genetic risk scores and mendelian randomization analyses. It integrates a collection of tools that facilitate the cleaning of single nucleotide polymorphism data (usually derived from Genome-Wide Association Studies) and enable the execution of key clinical population genetic workflows. The functionalities provided by genal include clumping, lifting, association testing, polygenic risk scoring, and Mendelian randomization analyses, all within a single Python module. 14 | 15 | The module prioritizes user-friendliness and intuitive operation, aiming to reduce the complexity of data analysis for researchers. Despite its focus on simplicity, Genal does not sacrifice the depth of customization or the precision of analysis. Researchers can expect to maintain analytical rigour while benefiting from the streamlined experience. 16 | 17 | Genal draws on concepts from well-established R packages such as TwoSampleMR, MR-Presso, MendelianRandomization, and gwasvcf, adapting their proven methodologies to the Python environment. This approach ensures that users have access to tried and tested techniques with the versatility of Python's data science tools. 18 | 19 | To install the latest release, type:: 20 | 21 | pip install genal-python 22 | 23 | Contents 24 | -------- 25 | 26 | .. toctree:: 27 | :maxdepth: 1 28 | 29 | Home 30 | introduction.rst 31 | modules.rst 32 | api.rst 33 | 34 | 35 | 36 | Indices and tables 37 | ================== 38 | 39 | * :ref:`genindex` 40 | * :ref:`modindex` 41 | * :ref:`search` 42 | 43 | Citation 44 | -------- 45 | 46 | If you use genal in your work, please cite the following paper: 47 | 48 | .. [Rivier.2024] *Genal: A Python Toolkit for Genetic Risk Scoring and Mendelian Randomization* 49 | Cyprien Rivier, Cyprien A. Rivier, Santiago Clocchiatti-Tuozzo, Shufan Huo, Victor Torres-Lopez, Daniela Renedo, Kevin N. Sheth, Guido J. Falcone, Julian N. Acosta. 50 | medRxiv. 2024 May `10.1101/2024.05.23.24307776 `_. 51 | 52 | References 53 | ---------- 54 | 55 | .. [Hemani.2018] *The MR-Base platform supports systematic causal inference across the human phenome.* 56 | Hemani G, Zheng J, Elsworth B, Wade KH, Baird D, Haberland V, Laurin C, Burgess S, Bowden J, Langdon R, Tan VY, Yarmolinsky J, Shihab HA, Timpson NJ, Evans DM, Relton C, Martin RM, Davey Smith G, Gaunt TR, Haycock PC, The MR-Base Collaboration 57 | eLife. 2018 May `10.7554/eLife.34408 `_. 58 | PMID: `29846171 `_. 59 | 60 | .. [Verbanck.2018] *Detection of widespread horizontal pleiotropy in causal relationships inferred from Mendelian randomization between complex traits and diseases.* 61 | Marie Verbanck, Chia-Yen Chen, Benjamin Neale, Ron Do. 62 | Nature Genetics 2018 May `10.1038/s41588-018-0099-7 `_. 63 | PMID: `29686387 `_. 64 | 65 | .. 
[Lyon.2020] *The variant call format provides efficient and robust storage of GWAS summary statistics.* 66 | Matthew Lyon, Shea J Andrews, Ben Elsworth, Tom R Gaunt, Gibran Hemani, Edoardo Marcora. 67 | bioRxiv 2020 May 30 `2020.05.29.115824v1 `_. 68 | PMID: `33441155 `_. 69 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. genal documentation master file, created by 2 | sphinx-quickstart on Thu Sep 14 14:04:16 2023. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | .. image:: Images/genal_logo.png 7 | :alt: genal_logo 8 | :width: 400px 9 | 10 | genal: A Python Toolkit for Genetic Risk Scoring and Mendelian Randomization 11 | ============================================================================ 12 | 13 | :Author: Cyprien A. Rivier 14 | :Date: |today| 15 | :Version: "1.2" 16 | 17 | Genal is a python module designed to make it easy to run genetic risk scores and mendelian randomization analyses. It integrates a collection of tools that facilitate the cleaning of single nucleotide polymorphism data (usually derived from Genome-Wide Association Studies) and enable the execution of key clinical population genetic workflows. The functionalities provided by genal include clumping, lifting, association testing, polygenic risk scoring, and Mendelian randomization analyses, all within a single Python module. 18 | 19 | The module prioritizes user-friendliness and intuitive operation, aiming to reduce the complexity of data analysis for researchers. Despite its focus on simplicity, Genal does not sacrifice the depth of customization or the precision of analysis. Researchers can expect to maintain analytical rigour while benefiting from the streamlined experience. 20 | 21 | Genal draws on concepts from well-established R packages such as TwoSampleMR, MR-Presso, MendelianRandomization, and gwasvcf, adapting their proven methodologies to the Python environment. This approach ensures that users have access to tried and tested techniques with the versatility of Python's data science tools. 22 | 23 | To install the latest release, type:: 24 | 25 | pip install genal-python 26 | 27 | Contents 28 | -------- 29 | 30 | .. toctree:: 31 | :maxdepth: 1 32 | 33 | Home 34 | introduction.rst 35 | modules.rst 36 | api.rst 37 | 38 | 39 | 40 | Indices and tables 41 | ================== 42 | 43 | * :ref:`genindex` 44 | * :ref:`modindex` 45 | * :ref:`search` 46 | 47 | Citation 48 | -------- 49 | 50 | If you use genal in your work, please cite the following paper: 51 | 52 | .. [Rivier.2024] *Genal: A Python Toolkit for Genetic Risk Scoring and Mendelian Randomization* 53 | Cyprien A. Rivier, Santiago Clocchiatti-Tuozzo, Shufan Huo, Victor Torres-Lopez, Daniela Renedo, Kevin N. Sheth, Guido J. Falcone, Julian N. Acosta. 54 | Bioinformatics Advances. 2024 December; `10.1093/bioadv/vbae207 `_. 55 | 56 | References 57 | ---------- 58 | 59 | .. [Hemani.2018] *The MR-Base platform supports systematic causal inference across the human phenome.* 60 | Hemani G, Zheng J, Elsworth B, Wade KH, Baird D, Haberland V, Laurin C, Burgess S, Bowden J, Langdon R, Tan VY, Yarmolinsky J, Shihab HA, Timpson NJ, Evans DM, Relton C, Martin RM, Davey Smith G, Gaunt TR, Haycock PC, The MR-Base Collaboration 61 | eLife. 2018 May `10.7554/eLife.34408 `_. 62 | PMID: `29846171 `_. 63 | 64 | .. 
[Verbanck.2018] *Detection of widespread horizontal pleiotropy in causal relationships inferred from Mendelian randomization between complex traits and diseases.* 65 | Marie Verbanck, Chia-Yen Chen, Benjamin Neale, Ron Do. 66 | Nature Genetics 2018 May `10.1038/s41588-018-0099-7 `_. 67 | PMID: `29686387 `_. 68 | 69 | .. [Lyon.2020] *The variant call format provides efficient and robust storage of GWAS summary statistics.* 70 | Matthew Lyon, Shea J Andrews, Ben Elsworth, Tom R Gaunt, Gibran Hemani, Edoardo Marcora. 71 | bioRxiv 2020 May 30 `2020.05.29.115824v1 `_. 72 | PMID: `33441155 `_. 73 | -------------------------------------------------------------------------------- /genal/clump.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import uuid 4 | import re 5 | 6 | from .tools import get_reference_panel_path, get_plink_path, run_plink_command 7 | 8 | def clump_data_plink2( 9 | data, 10 | reference_panel="EUR_37", 11 | kb=10000, 12 | r2=0.01, 13 | p1=5e-8, 14 | p2=0.01, 15 | name=None, 16 | ram=10000, 17 | ): 18 | """ 19 | Perform clumping on the given data using plink. Corresponds to the :meth:`Geno.clump` method. 20 | 21 | Args: 22 | data (pd.DataFrame): Input data with at least 'SNP' and 'P' columns. 23 | reference_panel (str, optional): The reference population to get linkage disequilibrium values and find proxies. 24 | Acceptable populations are "EUR", "SAS", "AFR", "EAS", "AMR" and available builds are 37 and 38 ("EUR_38" or "AFR_37" etc...) 25 | Also accepts or a path to a specific bed/bim/fam or pgen/pvar/psam panel. 26 | Default is "EUR_37". 27 | kb (int, optional): Clumping window in terms of thousands of SNPs. Default is 10000. 28 | r2 (float, optional): Linkage disequilibrium threshold, values between 0 and 1. Default is 0.01. 29 | p1 (float, optional): P-value threshold during clumping. SNPs above this value are not considered. Default is 5e-8. 30 | p2 (float, optional): P-value threshold post-clumping to further filter the clumped SNPs. If p2 < p1, it won't be considered. Default is 0.01. 31 | name (str, optional): Name used for the files created in the tmp_GENAL folder. 32 | ram (int, optional): Amount of RAM in MB to be used by plink. 33 | 34 | Returns: 35 | pd.DataFrame: Data after clumping, if any. 
36 | """ 37 | 38 | # Create unique ID for the name if none is passed 39 | if name is None: 40 | name = str(uuid.uuid4())[:8] 41 | 42 | # Save the relevant data columns to a temporary file 43 | to_clump_filename = os.path.join("tmp_GENAL", f"{name}_to_clump.txt") 44 | data[["SNP", "P"]].to_csv(to_clump_filename, index=False, sep="\t") 45 | 46 | # Get reference panel path and type 47 | ref_path, filetype = get_reference_panel_path(reference_panel) 48 | 49 | # Construct and execute the plink clumping command 50 | output_path = os.path.join("tmp_GENAL", name) 51 | 52 | # Base command differs based on filetype 53 | base_cmd = f"{get_plink_path()} --memory {ram}" 54 | if filetype == "bed": 55 | base_cmd += f" --bfile {ref_path}" 56 | else: # pgen 57 | base_cmd += f" --pfile {ref_path}" 58 | 59 | plink_command = f"{base_cmd} --rm-dup force-first --clump {to_clump_filename} --clump-kb {kb} \ 60 | --clump-r2 {r2} --clump-p1 {p1} --clump-p2 {p2} --out {output_path}" 61 | 62 | run_plink_command(plink_command) 63 | 64 | # Read log file to get the number of missing top variant IDs 65 | log_content = open(os.path.join("tmp_GENAL", f"{name}.log")).read() 66 | match = re.search(r"(\d+)\s+top\s+variant\s+ID", log_content) 67 | if match: 68 | missing_variants = int(match.group(1)) 69 | print(f"Warning: {missing_variants} top variant IDs missing") 70 | 71 | if "No significant --clump results." in log_content: 72 | print("No SNPs remaining after clumping.") 73 | return 74 | 75 | match = re.search(r"(\d+)\s+clump[s]?\s+formed\s+from\s+(\d+)\s+index", log_content) 76 | if match: 77 | print(f"{match.group(1)} clumps formed from {match.group(2)} top variants.") 78 | 79 | # Extract the list of clumped SNPs and get the relevant data subset 80 | clumped_filename = os.path.join("tmp_GENAL", f"{name}.clumps") 81 | if not os.path.exists(clumped_filename): 82 | raise FileNotFoundError(f"'{clumped_filename}' is missing.") 83 | plink_clumped = pd.read_csv(clumped_filename, sep="\s+", usecols=["ID"]) 84 | clumped_data = data[data["SNP"].isin(plink_clumped["ID"])] 85 | clumped_data.reset_index(drop=True, inplace=True) 86 | return clumped_data -------------------------------------------------------------------------------- /docs/build/_static/js/html5shiv-printshiv.min.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @preserve HTML5 Shiv 3.7.3-pre | @afarkas @jdalton @jon_neal @rem | MIT/GPL2 Licensed 3 | */ 4 | !function(a,b){function c(a,b){var c=a.createElement("p"),d=a.getElementsByTagName("head")[0]||a.documentElement;return c.innerHTML="x",d.insertBefore(c.lastChild,d.firstChild)}function d(){var a=y.elements;return"string"==typeof a?a.split(" "):a}function e(a,b){var c=y.elements;"string"!=typeof c&&(c=c.join(" ")),"string"!=typeof a&&(a=a.join(" ")),y.elements=c+" "+a,j(b)}function f(a){var b=x[a[v]];return b||(b={},w++,a[v]=w,x[w]=b),b}function g(a,c,d){if(c||(c=b),q)return c.createElement(a);d||(d=f(c));var e;return e=d.cache[a]?d.cache[a].cloneNode():u.test(a)?(d.cache[a]=d.createElem(a)).cloneNode():d.createElem(a),!e.canHaveChildren||t.test(a)||e.tagUrn?e:d.frag.appendChild(e)}function h(a,c){if(a||(a=b),q)return a.createDocumentFragment();c=c||f(a);for(var e=c.frag.cloneNode(),g=0,h=d(),i=h.length;i>g;g++)e.createElement(h[g]);return e}function i(a,b){b.cache||(b.cache={},b.createElem=a.createElement,b.createFrag=a.createDocumentFragment,b.frag=b.createFrag()),a.createElement=function(c){return 
y.shivMethods?g(c,a,b):b.createElem(c)},a.createDocumentFragment=Function("h,f","return function(){var n=f.cloneNode(),c=n.createElement;h.shivMethods&&("+d().join().replace(/[\w\-:]+/g,function(a){return b.createElem(a),b.frag.createElement(a),'c("'+a+'")'})+");return n}")(y,b.frag)}function j(a){a||(a=b);var d=f(a);return!y.shivCSS||p||d.hasCSS||(d.hasCSS=!!c(a,"article,aside,dialog,figcaption,figure,footer,header,hgroup,main,nav,section{display:block}mark{background:#FF0;color:#000}template{display:none}")),q||i(a,d),a}function k(a){for(var b,c=a.getElementsByTagName("*"),e=c.length,f=RegExp("^(?:"+d().join("|")+")$","i"),g=[];e--;)b=c[e],f.test(b.nodeName)&&g.push(b.applyElement(l(b)));return g}function l(a){for(var b,c=a.attributes,d=c.length,e=a.ownerDocument.createElement(A+":"+a.nodeName);d--;)b=c[d],b.specified&&e.setAttribute(b.nodeName,b.nodeValue);return e.style.cssText=a.style.cssText,e}function m(a){for(var b,c=a.split("{"),e=c.length,f=RegExp("(^|[\\s,>+~])("+d().join("|")+")(?=[[\\s,>+~#.:]|$)","gi"),g="$1"+A+"\\:$2";e--;)b=c[e]=c[e].split("}"),b[b.length-1]=b[b.length-1].replace(f,g),c[e]=b.join("}");return c.join("{")}function n(a){for(var b=a.length;b--;)a[b].removeNode()}function o(a){function b(){clearTimeout(g._removeSheetTimer),d&&d.removeNode(!0),d=null}var d,e,g=f(a),h=a.namespaces,i=a.parentWindow;return!B||a.printShived?a:("undefined"==typeof h[A]&&h.add(A),i.attachEvent("onbeforeprint",function(){b();for(var f,g,h,i=a.styleSheets,j=[],l=i.length,n=Array(l);l--;)n[l]=i[l];for(;h=n.pop();)if(!h.disabled&&z.test(h.media)){try{f=h.imports,g=f.length}catch(o){g=0}for(l=0;g>l;l++)n.push(f[l]);try{j.push(h.cssText)}catch(o){}}j=m(j.reverse().join("")),e=k(a),d=c(a,j)}),i.attachEvent("onafterprint",function(){n(e),clearTimeout(g._removeSheetTimer),g._removeSheetTimer=setTimeout(b,500)}),a.printShived=!0,a)}var p,q,r="3.7.3",s=a.html5||{},t=/^<|^(?:button|map|select|textarea|object|iframe|option|optgroup)$/i,u=/^(?:a|b|code|div|fieldset|h1|h2|h3|h4|h5|h6|i|label|li|ol|p|q|span|strong|style|table|tbody|td|th|tr|ul)$/i,v="_html5shiv",w=0,x={};!function(){try{var a=b.createElement("a");a.innerHTML="",p="hidden"in a,q=1==a.childNodes.length||function(){b.createElement("a");var a=b.createDocumentFragment();return"undefined"==typeof a.cloneNode||"undefined"==typeof a.createDocumentFragment||"undefined"==typeof a.createElement}()}catch(c){p=!0,q=!0}}();var y={elements:s.elements||"abbr article aside audio bdi canvas data datalist details dialog figcaption figure footer header hgroup main mark meter nav output picture progress section summary template time video",version:r,shivCSS:s.shivCSS!==!1,supportsUnknownElements:q,shivMethods:s.shivMethods!==!1,type:"default",shivDocument:j,createElement:g,createDocumentFragment:h,addElements:e};a.html5=y,j(b);var z=/^$|\b(?:all|print)\b/,A="html5shiv",B=!q&&function(){var c=b.documentElement;return!("undefined"==typeof b.namespaces||"undefined"==typeof b.parentWindow||"undefined"==typeof c.applyElement||"undefined"==typeof c.removeNode||"undefined"==typeof a.attachEvent)}();y.type+=" print",y.shivPrint=o,o(b),"object"==typeof module&&module.exports&&(module.exports=y)}("undefined"!=typeof window?window:this,document); -------------------------------------------------------------------------------- /docs/build/search.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Search — genal v0.8 documentation 7 | 8 | 9 | 10 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 |
26 | 55 | 56 |
60 | 61 |
62 |
63 |
64 |
    65 |
  • 66 | 67 |
  • 68 |
  • 69 |
70 |
71 |
72 |
73 |
74 | 75 | 82 | 83 | 84 |
85 | 86 |
87 | 88 |
89 |
90 |
91 | 92 |
93 | 94 |
95 |

© Copyright 2023, Cyprien A. Rivier.

96 |
97 | 98 | Built with Sphinx using a 99 | theme 100 | provided by Read the Docs. 101 | 102 | 103 |
104 |
105 |
106 |
107 |
108 | 113 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /docs/build/_static/js/theme.js: -------------------------------------------------------------------------------- 1 | !function(n){var e={};function t(i){if(e[i])return e[i].exports;var o=e[i]={i:i,l:!1,exports:{}};return n[i].call(o.exports,o,o.exports,t),o.l=!0,o.exports}t.m=n,t.c=e,t.d=function(n,e,i){t.o(n,e)||Object.defineProperty(n,e,{enumerable:!0,get:i})},t.r=function(n){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(n,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(n,"__esModule",{value:!0})},t.t=function(n,e){if(1&e&&(n=t(n)),8&e)return n;if(4&e&&"object"==typeof n&&n&&n.__esModule)return n;var i=Object.create(null);if(t.r(i),Object.defineProperty(i,"default",{enumerable:!0,value:n}),2&e&&"string"!=typeof n)for(var o in n)t.d(i,o,function(e){return n[e]}.bind(null,o));return i},t.n=function(n){var e=n&&n.__esModule?function(){return n.default}:function(){return n};return t.d(e,"a",e),e},t.o=function(n,e){return Object.prototype.hasOwnProperty.call(n,e)},t.p="",t(t.s=0)}([function(n,e,t){t(1),n.exports=t(3)},function(n,e,t){(function(){var e="undefined"!=typeof window?window.jQuery:t(2);n.exports.ThemeNav={navBar:null,win:null,winScroll:!1,winResize:!1,linkScroll:!1,winPosition:0,winHeight:null,docHeight:null,isRunning:!1,enable:function(n){var t=this;void 0===n&&(n=!0),t.isRunning||(t.isRunning=!0,e((function(e){t.init(e),t.reset(),t.win.on("hashchange",t.reset),n&&t.win.on("scroll",(function(){t.linkScroll||t.winScroll||(t.winScroll=!0,requestAnimationFrame((function(){t.onScroll()})))})),t.win.on("resize",(function(){t.winResize||(t.winResize=!0,requestAnimationFrame((function(){t.onResize()})))})),t.onResize()})))},enableSticky:function(){this.enable(!0)},init:function(n){n(document);var e=this;this.navBar=n("div.wy-side-scroll:first"),this.win=n(window),n(document).on("click","[data-toggle='wy-nav-top']",(function(){n("[data-toggle='wy-nav-shift']").toggleClass("shift"),n("[data-toggle='rst-versions']").toggleClass("shift")})).on("click",".wy-menu-vertical .current ul li a",(function(){var t=n(this);n("[data-toggle='wy-nav-shift']").removeClass("shift"),n("[data-toggle='rst-versions']").toggleClass("shift"),e.toggleCurrent(t),e.hashChange()})).on("click","[data-toggle='rst-current-version']",(function(){n("[data-toggle='rst-versions']").toggleClass("shift-up")})),n("table.docutils:not(.field-list,.footnote,.citation)").wrap("
"),n("table.docutils.footnote").wrap("
"),n("table.docutils.citation").wrap("
"),n(".wy-menu-vertical ul").not(".simple").siblings("a").each((function(){var t=n(this);expand=n(''),expand.on("click",(function(n){return e.toggleCurrent(t),n.stopPropagation(),!1})),t.prepend(expand)}))},reset:function(){var n=encodeURI(window.location.hash)||"#";try{var e=$(".wy-menu-vertical"),t=e.find('[href="'+n+'"]');if(0===t.length){var i=$('.document [id="'+n.substring(1)+'"]').closest("div.section");0===(t=e.find('[href="#'+i.attr("id")+'"]')).length&&(t=e.find('[href="#"]'))}if(t.length>0){$(".wy-menu-vertical .current").removeClass("current").attr("aria-expanded","false"),t.addClass("current").attr("aria-expanded","true"),t.closest("li.toctree-l1").parent().addClass("current").attr("aria-expanded","true");for(let n=1;n<=10;n++)t.closest("li.toctree-l"+n).addClass("current").attr("aria-expanded","true");t[0].scrollIntoView()}}catch(n){console.log("Error expanding nav for anchor",n)}},onScroll:function(){this.winScroll=!1;var n=this.win.scrollTop(),e=n+this.winHeight,t=this.navBar.scrollTop()+(n-this.winPosition);n<0||e>this.docHeight||(this.navBar.scrollTop(t),this.winPosition=n)},onResize:function(){this.winResize=!1,this.winHeight=this.win.height(),this.docHeight=$(document).height()},hashChange:function(){this.linkScroll=!0,this.win.one("hashchange",(function(){this.linkScroll=!1}))},toggleCurrent:function(n){var e=n.closest("li");e.siblings("li.current").removeClass("current").attr("aria-expanded","false"),e.siblings().find("li.current").removeClass("current").attr("aria-expanded","false");var t=e.find("> ul li");t.length&&(t.removeClass("current").attr("aria-expanded","false"),e.toggleClass("current").attr("aria-expanded",(function(n,e){return"true"==e?"false":"true"})))}},"undefined"!=typeof window&&(window.SphinxRtdTheme={Navigation:n.exports.ThemeNav,StickyNav:n.exports.ThemeNav}),function(){for(var n=0,e=["ms","moz","webkit","o"],t=0;t 2 | 3 | 4 | 5 | 6 | Overview: module code — genal v0.8 documentation 7 | 8 | 9 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 |
23 | 52 | 53 |
57 | 58 |
59 |
60 |
61 |
    62 |
  • 63 | 64 |
  • 65 |
  • 66 |
67 |
68 |
69 |
70 |
71 | 72 |

All modules for which code is available

73 | 86 | 87 |
88 |
89 |
90 | 91 |
92 | 93 |
94 |

© Copyright 2023, Cyprien A. Rivier.

95 |
96 | 97 | Built with Sphinx using a 98 | theme 99 | provided by Read the Docs. 100 | 101 | 102 |
103 |
104 |
105 |
106 |
107 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /docs/build/_static/pygments.css: -------------------------------------------------------------------------------- 1 | pre { line-height: 125%; } 2 | td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } 3 | span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } 4 | td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } 5 | span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } 6 | .highlight .hll { background-color: #ffffcc } 7 | .highlight { background: #f8f8f8; } 8 | .highlight .c { color: #3D7B7B; font-style: italic } /* Comment */ 9 | .highlight .err { border: 1px solid #FF0000 } /* Error */ 10 | .highlight .k { color: #008000; font-weight: bold } /* Keyword */ 11 | .highlight .o { color: #666666 } /* Operator */ 12 | .highlight .ch { color: #3D7B7B; font-style: italic } /* Comment.Hashbang */ 13 | .highlight .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */ 14 | .highlight .cp { color: #9C6500 } /* Comment.Preproc */ 15 | .highlight .cpf { color: #3D7B7B; font-style: italic } /* Comment.PreprocFile */ 16 | .highlight .c1 { color: #3D7B7B; font-style: italic } /* Comment.Single */ 17 | .highlight .cs { color: #3D7B7B; font-style: italic } /* Comment.Special */ 18 | .highlight .gd { color: #A00000 } /* Generic.Deleted */ 19 | .highlight .ge { font-style: italic } /* Generic.Emph */ 20 | .highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */ 21 | .highlight .gr { color: #E40000 } /* Generic.Error */ 22 | .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ 23 | .highlight .gi { color: #008400 } /* Generic.Inserted */ 24 | .highlight .go { color: #717171 } /* Generic.Output */ 25 | .highlight .gp { color: #000080; font-weight: bold } /* Generic.Prompt */ 26 | .highlight .gs { font-weight: bold } /* Generic.Strong */ 27 | .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ 28 | .highlight .gt { color: #0044DD } /* Generic.Traceback */ 29 | .highlight .kc { color: #008000; font-weight: bold } /* Keyword.Constant */ 30 | .highlight .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */ 31 | .highlight .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */ 32 | .highlight .kp { color: #008000 } /* Keyword.Pseudo */ 33 | .highlight .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */ 34 | .highlight .kt { color: #B00040 } /* Keyword.Type */ 35 | .highlight .m { color: #666666 } /* Literal.Number */ 36 | .highlight .s { color: #BA2121 } /* Literal.String */ 37 | .highlight .na { color: #687822 } /* Name.Attribute */ 38 | .highlight .nb { color: #008000 } /* Name.Builtin */ 39 | .highlight .nc { color: #0000FF; font-weight: bold } /* Name.Class */ 40 | .highlight .no { color: #880000 } /* Name.Constant */ 41 | .highlight .nd { color: #AA22FF } /* Name.Decorator */ 42 | .highlight .ni { color: #717171; font-weight: bold } /* Name.Entity */ 43 | .highlight .ne { color: #CB3F38; font-weight: bold } /* Name.Exception */ 44 | .highlight .nf { color: #0000FF } /* Name.Function */ 45 | .highlight .nl { color: #767600 } /* Name.Label */ 46 | .highlight .nn { color: #0000FF; font-weight: bold } /* Name.Namespace */ 47 | .highlight .nt { color: #008000; font-weight: bold } 
/* Name.Tag */ 48 | .highlight .nv { color: #19177C } /* Name.Variable */ 49 | .highlight .ow { color: #AA22FF; font-weight: bold } /* Operator.Word */ 50 | .highlight .w { color: #bbbbbb } /* Text.Whitespace */ 51 | .highlight .mb { color: #666666 } /* Literal.Number.Bin */ 52 | .highlight .mf { color: #666666 } /* Literal.Number.Float */ 53 | .highlight .mh { color: #666666 } /* Literal.Number.Hex */ 54 | .highlight .mi { color: #666666 } /* Literal.Number.Integer */ 55 | .highlight .mo { color: #666666 } /* Literal.Number.Oct */ 56 | .highlight .sa { color: #BA2121 } /* Literal.String.Affix */ 57 | .highlight .sb { color: #BA2121 } /* Literal.String.Backtick */ 58 | .highlight .sc { color: #BA2121 } /* Literal.String.Char */ 59 | .highlight .dl { color: #BA2121 } /* Literal.String.Delimiter */ 60 | .highlight .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */ 61 | .highlight .s2 { color: #BA2121 } /* Literal.String.Double */ 62 | .highlight .se { color: #AA5D1F; font-weight: bold } /* Literal.String.Escape */ 63 | .highlight .sh { color: #BA2121 } /* Literal.String.Heredoc */ 64 | .highlight .si { color: #A45A77; font-weight: bold } /* Literal.String.Interpol */ 65 | .highlight .sx { color: #008000 } /* Literal.String.Other */ 66 | .highlight .sr { color: #A45A77 } /* Literal.String.Regex */ 67 | .highlight .s1 { color: #BA2121 } /* Literal.String.Single */ 68 | .highlight .ss { color: #19177C } /* Literal.String.Symbol */ 69 | .highlight .bp { color: #008000 } /* Name.Builtin.Pseudo */ 70 | .highlight .fm { color: #0000FF } /* Name.Function.Magic */ 71 | .highlight .vc { color: #19177C } /* Name.Variable.Class */ 72 | .highlight .vg { color: #19177C } /* Name.Variable.Global */ 73 | .highlight .vi { color: #19177C } /* Name.Variable.Instance */ 74 | .highlight .vm { color: #19177C } /* Name.Variable.Magic */ 75 | .highlight .il { color: #666666 } /* Literal.Number.Integer.Long */ -------------------------------------------------------------------------------- /docs/build/_static/doctools.js: -------------------------------------------------------------------------------- 1 | /* 2 | * doctools.js 3 | * ~~~~~~~~~~~ 4 | * 5 | * Base JavaScript utilities for all Sphinx HTML documentation. 6 | * 7 | * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. 8 | * :license: BSD, see LICENSE for details. 9 | * 10 | */ 11 | "use strict"; 12 | 13 | const BLACKLISTED_KEY_CONTROL_ELEMENTS = new Set([ 14 | "TEXTAREA", 15 | "INPUT", 16 | "SELECT", 17 | "BUTTON", 18 | ]); 19 | 20 | const _ready = (callback) => { 21 | if (document.readyState !== "loading") { 22 | callback(); 23 | } else { 24 | document.addEventListener("DOMContentLoaded", callback); 25 | } 26 | }; 27 | 28 | /** 29 | * Small JavaScript module for the documentation. 30 | */ 31 | const Documentation = { 32 | init: () => { 33 | Documentation.initDomainIndexTable(); 34 | Documentation.initOnKeyListeners(); 35 | }, 36 | 37 | /** 38 | * i18n support 39 | */ 40 | TRANSLATIONS: {}, 41 | PLURAL_EXPR: (n) => (n === 1 ? 
0 : 1), 42 | LOCALE: "unknown", 43 | 44 | // gettext and ngettext don't access this so that the functions 45 | // can safely bound to a different name (_ = Documentation.gettext) 46 | gettext: (string) => { 47 | const translated = Documentation.TRANSLATIONS[string]; 48 | switch (typeof translated) { 49 | case "undefined": 50 | return string; // no translation 51 | case "string": 52 | return translated; // translation exists 53 | default: 54 | return translated[0]; // (singular, plural) translation tuple exists 55 | } 56 | }, 57 | 58 | ngettext: (singular, plural, n) => { 59 | const translated = Documentation.TRANSLATIONS[singular]; 60 | if (typeof translated !== "undefined") 61 | return translated[Documentation.PLURAL_EXPR(n)]; 62 | return n === 1 ? singular : plural; 63 | }, 64 | 65 | addTranslations: (catalog) => { 66 | Object.assign(Documentation.TRANSLATIONS, catalog.messages); 67 | Documentation.PLURAL_EXPR = new Function( 68 | "n", 69 | `return (${catalog.plural_expr})` 70 | ); 71 | Documentation.LOCALE = catalog.locale; 72 | }, 73 | 74 | /** 75 | * helper function to focus on search bar 76 | */ 77 | focusSearchBar: () => { 78 | document.querySelectorAll("input[name=q]")[0]?.focus(); 79 | }, 80 | 81 | /** 82 | * Initialise the domain index toggle buttons 83 | */ 84 | initDomainIndexTable: () => { 85 | const toggler = (el) => { 86 | const idNumber = el.id.substr(7); 87 | const toggledRows = document.querySelectorAll(`tr.cg-${idNumber}`); 88 | if (el.src.substr(-9) === "minus.png") { 89 | el.src = `${el.src.substr(0, el.src.length - 9)}plus.png`; 90 | toggledRows.forEach((el) => (el.style.display = "none")); 91 | } else { 92 | el.src = `${el.src.substr(0, el.src.length - 8)}minus.png`; 93 | toggledRows.forEach((el) => (el.style.display = "")); 94 | } 95 | }; 96 | 97 | const togglerElements = document.querySelectorAll("img.toggler"); 98 | togglerElements.forEach((el) => 99 | el.addEventListener("click", (event) => toggler(event.currentTarget)) 100 | ); 101 | togglerElements.forEach((el) => (el.style.display = "")); 102 | if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) togglerElements.forEach(toggler); 103 | }, 104 | 105 | initOnKeyListeners: () => { 106 | // only install a listener if it is really needed 107 | if ( 108 | !DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS && 109 | !DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS 110 | ) 111 | return; 112 | 113 | document.addEventListener("keydown", (event) => { 114 | // bail for input elements 115 | if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; 116 | // bail with special keys 117 | if (event.altKey || event.ctrlKey || event.metaKey) return; 118 | 119 | if (!event.shiftKey) { 120 | switch (event.key) { 121 | case "ArrowLeft": 122 | if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; 123 | 124 | const prevLink = document.querySelector('link[rel="prev"]'); 125 | if (prevLink && prevLink.href) { 126 | window.location.href = prevLink.href; 127 | event.preventDefault(); 128 | } 129 | break; 130 | case "ArrowRight": 131 | if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; 132 | 133 | const nextLink = document.querySelector('link[rel="next"]'); 134 | if (nextLink && nextLink.href) { 135 | window.location.href = nextLink.href; 136 | event.preventDefault(); 137 | } 138 | break; 139 | } 140 | } 141 | 142 | // some keyboard layouts may need Shift to get / 143 | switch (event.key) { 144 | case "/": 145 | if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) break; 146 | Documentation.focusSearchBar(); 147 | 
event.preventDefault(); 148 | } 149 | }); 150 | }, 151 | }; 152 | 153 | // quick alias for translations 154 | const _ = Documentation.gettext; 155 | 156 | _ready(Documentation.init); 157 | -------------------------------------------------------------------------------- /genal/genes.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os 4 | import wget 5 | 6 | from .constants import BUCKET_URL 7 | from .tools import read_config 8 | 9 | 10 | 11 | def filter_by_gene_func(data, gene_identifier, id_type="symbol", window_size=1000000, build="37"): 12 | """ 13 | Filtering the data to include only variants that are within a specified distance of a specific gene. 14 | Corresponds to the :meth:`Geno.filter_by_gene` method. 15 | Args: 16 | data (pd.DataFrame): Input data with at least 'CHR' and 'POS' columns. 17 | gene_identifier (str): Identifier for the gene/protein to filter variants around. 18 | id_type (str, optional): Type of identifier provided. Options are: 19 | - "symbol": Gene symbol (e.g., "APOE") 20 | - "HGNC": HGNC ID (e.g., "HGNC:613") 21 | - "name": Full gene name (e.g., "apolipoprotein E") 22 | - "Ensembl": Ensembl gene ID (e.g., "ENSG00000130203") 23 | - "NCBI": NCBI gene ID (e.g., "348") 24 | - "UCSC": UCSC gene ID (e.g., "uc001hbu.2") 25 | - "Vega": Vega gene ID (e.g., "OTTHUMG00000019505") 26 | Default is "symbol". 27 | window_size (int, optional): Size of the window around the gene in base pairs. Default is 1,000,000 (1Mb). 28 | build (str, optional): Genome build of the data. Default is "37". 29 | 30 | Returns: 31 | pd.DataFrame: Filtered DataFrame containing only variants within the specified window 32 | around the gene, with additional column 'Distance'. 33 | 34 | Notes: 35 | - Distance is calculated from the nearest gene boundary (start or end position) 36 | - Null distances indicate the variant is within the gene 37 | """ 38 | 39 | # Validate id_type 40 | valid_id_types = ["symbol", "HGNC_id", "name", "gene_id", "NCBI_id", "UCSC_id", "Vega_id"] 41 | if id_type in ["HGNC", "NCBI", "UCSC", "Vega"]: 42 | id_type = id_type + "_id" 43 | if id_type == "Ensembl": 44 | id_type = "gene_id" 45 | if id_type not in valid_id_types: 46 | raise ValueError(f"Invalid id_type. Must be one of: {', '.join(valid_id_types)}") 47 | 48 | # Validate build 49 | if int(build) not in [37, 38]: 50 | raise ValueError(f"Invalid build. 
Must be one of: 37, 38") 51 | 52 | # Download the gene info file if not already present in the reference folder 53 | config = read_config() 54 | ref_path = config["paths"]["ref_path"] 55 | gene_info_file = os.path.join(ref_path, "gene_id_mapping_filtered.parquet") 56 | if not os.path.exists(gene_info_file): 57 | # Download parquet file 58 | print(f"Downloading gene info file to {gene_info_file}...") 59 | url = BUCKET_URL + "gene_id_mapping_filtered.parquet" 60 | try: 61 | wget.download(url, gene_info_file) 62 | print("\nDownload complete.") 63 | except Exception as e: 64 | if os.path.exists(gene_info_file): 65 | os.remove(gene_info_file) 66 | raise RuntimeError(f"Failed to download gene info: {e}") 67 | 68 | df_gene_info = pd.read_parquet(gene_info_file, engine="pyarrow") 69 | 70 | # Find gene coordinates 71 | gene_data = df_gene_info[df_gene_info[id_type] == gene_identifier] 72 | 73 | if gene_data.empty: 74 | raise ValueError(f"Gene with {id_type}='{gene_identifier}' not found in gene info database.") 75 | 76 | if len(gene_data) > 1: 77 | print(f"Warning: Multiple entries found for {id_type}='{gene_identifier}'. Using the first entry.") 78 | gene_data = gene_data.iloc[0,:] 79 | 80 | print(f"Filtering variants within {window_size}bp window (+/- {window_size/2}bp on each side) based on genome build {build} around gene: {', '.join(f'{col}: {gene_data[col]}' for col in valid_id_types)}") 81 | 82 | # Extract gene location information 83 | chrom = gene_data['CHR'] 84 | # Convert to integer if possible 85 | if str(chrom).isdigit(): 86 | chrom = int(chrom) 87 | elif chrom=="X": 88 | chrom=23 89 | else: 90 | raise ValueError(f"Gene {gene_identifier} is located on chromosome {chrom}, which is not supported.") 91 | 92 | gene_start = int(gene_data[f'gene_start_{build}']) 93 | gene_end = int(gene_data[f'gene_end_{build}']) 94 | 95 | # Define the window boundaries 96 | window_start = max(0, gene_start - window_size/2) 97 | window_end = gene_end + window_size/2 98 | 99 | # Filter variants within the window 100 | filtered = data[ 101 | (data['CHR'] == chrom) & 102 | (data['POS'] >= window_start) & 103 | (data['POS'] <= window_end) 104 | ].copy() 105 | 106 | if not filtered.empty: 107 | # Calculate distance from gene: if inside the gene, distance is 0, if before, distance is negative, if after, distance is positive 108 | filtered.loc[:, 'Distance'] = np.nan 109 | 110 | # Create boolean masks 111 | mask_inside = filtered['POS'].between(gene_start, gene_end) 112 | mask_before = filtered['POS'] < gene_start 113 | mask_after = filtered['POS'] > gene_end 114 | 115 | filtered.loc[mask_inside, 'Distance'] = 0 116 | filtered.loc[mask_before, 'Distance'] = filtered['POS'] - gene_start 117 | filtered.loc[mask_after, 'Distance'] = filtered['POS'] - gene_end 118 | 119 | filtered["Distance"] = filtered["Distance"].astype("Int64") 120 | 121 | print(f"Found {len(filtered)} variants.") 122 | else: 123 | print(f"No variants found in a {window_size}bp window around {gene_identifier}") 124 | 125 | return filtered -------------------------------------------------------------------------------- /docs/build/_static/language_data.js: -------------------------------------------------------------------------------- 1 | /* 2 | * language_data.js 3 | * ~~~~~~~~~~~~~~~~ 4 | * 5 | * This script contains the language-specific data used by searchtools.js, 6 | * namely the list of stopwords, stemmer, scorer and splitter. 7 | * 8 | * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. 
9 | * :license: BSD, see LICENSE for details. 10 | * 11 | */ 12 | 13 | var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; 14 | 15 | 16 | /* Non-minified version is copied as a separate JS file, if available */ 17 | 18 | /** 19 | * Porter Stemmer 20 | */ 21 | var Stemmer = function() { 22 | 23 | var step2list = { 24 | ational: 'ate', 25 | tional: 'tion', 26 | enci: 'ence', 27 | anci: 'ance', 28 | izer: 'ize', 29 | bli: 'ble', 30 | alli: 'al', 31 | entli: 'ent', 32 | eli: 'e', 33 | ousli: 'ous', 34 | ization: 'ize', 35 | ation: 'ate', 36 | ator: 'ate', 37 | alism: 'al', 38 | iveness: 'ive', 39 | fulness: 'ful', 40 | ousness: 'ous', 41 | aliti: 'al', 42 | iviti: 'ive', 43 | biliti: 'ble', 44 | logi: 'log' 45 | }; 46 | 47 | var step3list = { 48 | icate: 'ic', 49 | ative: '', 50 | alize: 'al', 51 | iciti: 'ic', 52 | ical: 'ic', 53 | ful: '', 54 | ness: '' 55 | }; 56 | 57 | var c = "[^aeiou]"; // consonant 58 | var v = "[aeiouy]"; // vowel 59 | var C = c + "[^aeiouy]*"; // consonant sequence 60 | var V = v + "[aeiou]*"; // vowel sequence 61 | 62 | var mgr0 = "^(" + C + ")?" + V + C; // [C]VC... is m>0 63 | var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1 64 | var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1 65 | var s_v = "^(" + C + ")?" + v; // vowel in stem 66 | 67 | this.stemWord = function (w) { 68 | var stem; 69 | var suffix; 70 | var firstch; 71 | var origword = w; 72 | 73 | if (w.length < 3) 74 | return w; 75 | 76 | var re; 77 | var re2; 78 | var re3; 79 | var re4; 80 | 81 | firstch = w.substr(0,1); 82 | if (firstch == "y") 83 | w = firstch.toUpperCase() + w.substr(1); 84 | 85 | // Step 1a 86 | re = /^(.+?)(ss|i)es$/; 87 | re2 = /^(.+?)([^s])s$/; 88 | 89 | if (re.test(w)) 90 | w = w.replace(re,"$1$2"); 91 | else if (re2.test(w)) 92 | w = w.replace(re2,"$1$2"); 93 | 94 | // Step 1b 95 | re = /^(.+?)eed$/; 96 | re2 = /^(.+?)(ed|ing)$/; 97 | if (re.test(w)) { 98 | var fp = re.exec(w); 99 | re = new RegExp(mgr0); 100 | if (re.test(fp[1])) { 101 | re = /.$/; 102 | w = w.replace(re,""); 103 | } 104 | } 105 | else if (re2.test(w)) { 106 | var fp = re2.exec(w); 107 | stem = fp[1]; 108 | re2 = new RegExp(s_v); 109 | if (re2.test(stem)) { 110 | w = stem; 111 | re2 = /(at|bl|iz)$/; 112 | re3 = new RegExp("([^aeiouylsz])\\1$"); 113 | re4 = new RegExp("^" + C + v + "[^aeiouwxy]$"); 114 | if (re2.test(w)) 115 | w = w + "e"; 116 | else if (re3.test(w)) { 117 | re = /.$/; 118 | w = w.replace(re,""); 119 | } 120 | else if (re4.test(w)) 121 | w = w + "e"; 122 | } 123 | } 124 | 125 | // Step 1c 126 | re = /^(.+?)y$/; 127 | if (re.test(w)) { 128 | var fp = re.exec(w); 129 | stem = fp[1]; 130 | re = new RegExp(s_v); 131 | if (re.test(stem)) 132 | w = stem + "i"; 133 | } 134 | 135 | // Step 2 136 | re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/; 137 | if (re.test(w)) { 138 | var fp = re.exec(w); 139 | stem = fp[1]; 140 | suffix = fp[2]; 141 | re = new RegExp(mgr0); 142 | if (re.test(stem)) 143 | w = stem + step2list[suffix]; 144 | } 145 | 146 | // Step 3 147 | re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/; 148 | if (re.test(w)) { 149 | var fp = re.exec(w); 150 | stem = fp[1]; 151 | suffix = fp[2]; 152 | re = new RegExp(mgr0); 153 | if (re.test(stem)) 154 | w = stem + 
step3list[suffix]; 155 | } 156 | 157 | // Step 4 158 | re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/; 159 | re2 = /^(.+?)(s|t)(ion)$/; 160 | if (re.test(w)) { 161 | var fp = re.exec(w); 162 | stem = fp[1]; 163 | re = new RegExp(mgr1); 164 | if (re.test(stem)) 165 | w = stem; 166 | } 167 | else if (re2.test(w)) { 168 | var fp = re2.exec(w); 169 | stem = fp[1] + fp[2]; 170 | re2 = new RegExp(mgr1); 171 | if (re2.test(stem)) 172 | w = stem; 173 | } 174 | 175 | // Step 5 176 | re = /^(.+?)e$/; 177 | if (re.test(w)) { 178 | var fp = re.exec(w); 179 | stem = fp[1]; 180 | re = new RegExp(mgr1); 181 | re2 = new RegExp(meq1); 182 | re3 = new RegExp("^" + C + v + "[^aeiouwxy]$"); 183 | if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) 184 | w = stem; 185 | } 186 | re = /ll$/; 187 | re2 = new RegExp(mgr1); 188 | if (re.test(w) && re2.test(w)) { 189 | re = /.$/; 190 | w = w.replace(re,""); 191 | } 192 | 193 | // and turn initial Y back to y 194 | if (firstch == "y") 195 | w = firstch.toLowerCase() + w.substr(1); 196 | return w; 197 | } 198 | } 199 | 200 | -------------------------------------------------------------------------------- /docs/build/_static/sphinx_highlight.js: -------------------------------------------------------------------------------- 1 | /* Highlighting utilities for Sphinx HTML documentation. */ 2 | "use strict"; 3 | 4 | const SPHINX_HIGHLIGHT_ENABLED = true 5 | 6 | /** 7 | * highlight a given string on a node by wrapping it in 8 | * span elements with the given class name. 9 | */ 10 | const _highlight = (node, addItems, text, className) => { 11 | if (node.nodeType === Node.TEXT_NODE) { 12 | const val = node.nodeValue; 13 | const parent = node.parentNode; 14 | const pos = val.toLowerCase().indexOf(text); 15 | if ( 16 | pos >= 0 && 17 | !parent.classList.contains(className) && 18 | !parent.classList.contains("nohighlight") 19 | ) { 20 | let span; 21 | 22 | const closestNode = parent.closest("body, svg, foreignObject"); 23 | const isInSVG = closestNode && closestNode.matches("svg"); 24 | if (isInSVG) { 25 | span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); 26 | } else { 27 | span = document.createElement("span"); 28 | span.classList.add(className); 29 | } 30 | 31 | span.appendChild(document.createTextNode(val.substr(pos, text.length))); 32 | const rest = document.createTextNode(val.substr(pos + text.length)); 33 | parent.insertBefore( 34 | span, 35 | parent.insertBefore( 36 | rest, 37 | node.nextSibling 38 | ) 39 | ); 40 | node.nodeValue = val.substr(0, pos); 41 | /* There may be more occurrences of search term in this node. So call this 42 | * function recursively on the remaining fragment. 
43 | */ 44 | _highlight(rest, addItems, text, className); 45 | 46 | if (isInSVG) { 47 | const rect = document.createElementNS( 48 | "http://www.w3.org/2000/svg", 49 | "rect" 50 | ); 51 | const bbox = parent.getBBox(); 52 | rect.x.baseVal.value = bbox.x; 53 | rect.y.baseVal.value = bbox.y; 54 | rect.width.baseVal.value = bbox.width; 55 | rect.height.baseVal.value = bbox.height; 56 | rect.setAttribute("class", className); 57 | addItems.push({ parent: parent, target: rect }); 58 | } 59 | } 60 | } else if (node.matches && !node.matches("button, select, textarea")) { 61 | node.childNodes.forEach((el) => _highlight(el, addItems, text, className)); 62 | } 63 | }; 64 | const _highlightText = (thisNode, text, className) => { 65 | let addItems = []; 66 | _highlight(thisNode, addItems, text, className); 67 | addItems.forEach((obj) => 68 | obj.parent.insertAdjacentElement("beforebegin", obj.target) 69 | ); 70 | }; 71 | 72 | /** 73 | * Small JavaScript module for the documentation. 74 | */ 75 | const SphinxHighlight = { 76 | 77 | /** 78 | * highlight the search words provided in localstorage in the text 79 | */ 80 | highlightSearchWords: () => { 81 | if (!SPHINX_HIGHLIGHT_ENABLED) return; // bail if no highlight 82 | 83 | // get and clear terms from localstorage 84 | const url = new URL(window.location); 85 | const highlight = 86 | localStorage.getItem("sphinx_highlight_terms") 87 | || url.searchParams.get("highlight") 88 | || ""; 89 | localStorage.removeItem("sphinx_highlight_terms") 90 | url.searchParams.delete("highlight"); 91 | window.history.replaceState({}, "", url); 92 | 93 | // get individual terms from highlight string 94 | const terms = highlight.toLowerCase().split(/\s+/).filter(x => x); 95 | if (terms.length === 0) return; // nothing to do 96 | 97 | // There should never be more than one element matching "div.body" 98 | const divBody = document.querySelectorAll("div.body"); 99 | const body = divBody.length ? divBody[0] : document.querySelector("body"); 100 | window.setTimeout(() => { 101 | terms.forEach((term) => _highlightText(body, term, "highlighted")); 102 | }, 10); 103 | 104 | const searchBox = document.getElementById("searchbox"); 105 | if (searchBox === null) return; 106 | searchBox.appendChild( 107 | document 108 | .createRange() 109 | .createContextualFragment( 110 | '" 114 | ) 115 | ); 116 | }, 117 | 118 | /** 119 | * helper function to hide the search marks again 120 | */ 121 | hideSearchWords: () => { 122 | document 123 | .querySelectorAll("#searchbox .highlight-link") 124 | .forEach((el) => el.remove()); 125 | document 126 | .querySelectorAll("span.highlighted") 127 | .forEach((el) => el.classList.remove("highlighted")); 128 | localStorage.removeItem("sphinx_highlight_terms") 129 | }, 130 | 131 | initEscapeListener: () => { 132 | // only install a listener if it is really needed 133 | if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) return; 134 | 135 | document.addEventListener("keydown", (event) => { 136 | // bail for input elements 137 | if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; 138 | // bail with special keys 139 | if (event.shiftKey || event.altKey || event.ctrlKey || event.metaKey) return; 140 | if (DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS && (event.key === "Escape")) { 141 | SphinxHighlight.hideSearchWords(); 142 | event.preventDefault(); 143 | } 144 | }); 145 | }, 146 | }; 147 | 148 | _ready(() => { 149 | /* Do not call highlightSearchWords() when we are on the search page. 
150 | * It will highlight words from the *previous* search query. 151 | */ 152 | if (typeof Search === "undefined") SphinxHighlight.highlightSearchWords(); 153 | SphinxHighlight.initEscapeListener(); 154 | }); 155 | -------------------------------------------------------------------------------- /docs/build/py-modindex.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Python Module Index — genal v0.8 documentation 7 | 8 | 9 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 |
26 | 55 | 56 |
60 | 61 |
62 |
63 |
64 |
    65 |
  • 66 | 67 |
  • 68 |
  • 69 |
70 |
71 |
72 |
73 |
74 | 75 | 76 |

Python Module Index

77 | 78 |
79 | g 80 |
81 | 82 | 83 | 84 | 86 | 87 | 89 | 92 | 93 | 94 | 97 | 98 | 99 | 102 | 103 | 104 | 107 | 108 | 109 | 112 | 113 | 114 | 117 | 118 | 119 | 122 | 123 | 124 | 127 | 128 | 129 | 132 | 133 | 134 | 137 | 138 | 139 | 142 | 143 | 144 | 147 |
 
85 | g
90 | genal 91 |
    95 | genal.association 96 |
    100 | genal.clump 101 |
    105 | genal.extract_prs 106 |
    110 | genal.geno_tools 111 |
    115 | genal.lift 116 |
    120 | genal.MR 121 |
    125 | genal.MR_tools 126 |
    130 | genal.MRpresso 131 |
    135 | genal.proxy 136 |
    140 | genal.snp_query 141 |
    145 | genal.tools 146 |
148 | 149 | 150 |
151 |
152 |
153 | 154 |
155 | 156 |
157 |

© Copyright 2023, Cyprien A. Rivier.

158 |
159 | 160 | Built with Sphinx using a 161 | theme 162 | provided by Read the Docs. 163 | 164 | 165 |
166 |
167 |
168 |
169 |
170 | 175 | 176 | 177 | -------------------------------------------------------------------------------- /genal/snp_query.py: -------------------------------------------------------------------------------- 1 | import aiohttp 2 | import asyncio 3 | import numpy as np 4 | import nest_asyncio 5 | from tqdm.auto import tqdm 6 | 7 | # Using nest_asyncio to allow execution in notebooks 8 | nest_asyncio.apply() 9 | 10 | # Main function to start the event loop and run the asynchronous query 11 | def async_query_gwas_catalog(snps, p_threshold=5e-8, return_p=False, return_study=False, 12 | max_associations=None, timeout=100): 13 | try: 14 | loop = asyncio.get_event_loop() 15 | except RuntimeError: 16 | loop = asyncio.new_event_loop() 17 | asyncio.set_event_loop(loop) 18 | results_global, errors, timeouts = loop.run_until_complete( 19 | query_gwas_catalog_coroutine( 20 | snps, p_threshold, return_p, return_study, max_associations, timeout 21 | ) 22 | ) 23 | return results_global, errors, timeouts 24 | 25 | 26 | # Function to query GWAS Catalog API for SNP associations 27 | async def query_gwas_catalog_coroutine(snps, p_threshold=5e-8, return_p=False, return_study=False, 28 | max_associations=None, timeout=100): 29 | """ 30 | Query the GWAS Catalog API for SNP associations. 31 | 32 | Parameters: 33 | snps (list): List of SNPs to query. 34 | p_threshold (float): P-value threshold for filtering associations. 35 | return_p (bool): Whether to return the P-value of the association. 36 | return_study (bool): Whether to return the study ID of the association. 37 | max_associations (int): Maximum number of associations to return for each SNP. 38 | timeout (int): Timeout for each query in seconds. 39 | 40 | Returns: 41 | results_global (dict): Dictionary storing the SNP (keys) and results for each SNP: a list of single strings or tuples 42 | errors (list): List storing SNP for which the GWAS Catalog could not be queried 43 | timeouts (list): List storing SNP for which the timeout was reached 44 | """ 45 | 46 | results_global = {} # Dictionary storing the SNP (keys) and results for each SNP: a list of single strings or tuples 47 | errors = [] # List storing SNP for which the GWAS Catalog could not be queried 48 | timeouts = [] # List storing SNP for which the timeout was reached 49 | 50 | async def fetch(session, url, timeout_duration=timeout): 51 | try: 52 | # Wrap the entire fetch operation with asyncio.wait_for for timeout 53 | response = await asyncio.wait_for(session.get(url), timeout=timeout_duration) 54 | async with response: 55 | if response.status == 200: 56 | return await response.json() 57 | return None 58 | except asyncio.TimeoutError: 59 | return "TIMEOUT" 60 | except aiohttp.ClientError: 61 | return "ERROR" 62 | 63 | async def process_snp(session, snp): 64 | #print(f"Processing SNP {snp}") 65 | 66 | results_snp = [] # List storing the results for each association found for this SNP 67 | 68 | base_url = f"https://www.ebi.ac.uk/gwas/rest/api/singleNucleotidePolymorphisms/{snp}/associations?projection=associationBySnp" 69 | base_data = await fetch(session, base_url, timeout_duration=timeout) 70 | 71 | if base_data == "TIMEOUT": 72 | timeouts.append(snp) 73 | elif base_data == "ERROR" or base_data is None: 74 | errors.append(snp) 75 | else: 76 | i = 0 77 | # Process each association found for this SNP 78 | for assoc in base_data.get('_embedded', {}).get('associations', []): 79 | 80 | # If there are already max_associations, stop the loop 81 | if max_associations and i >= max_associations: 82 | break 
83 | i += 1 84 | 85 | pvalue = assoc.get("pvalue", np.nan) 86 | # If the pvalue of the association does not pass the threshold, the association is not processed further nor reported 87 | if pvalue < p_threshold: 88 | efo_traits = assoc.get("efoTraits", []) 89 | if efo_traits: 90 | trait = efo_traits[0].get("trait", "") 91 | else: 92 | trait = "" 93 | 94 | # If the return_study flag is active: query the page containing the GWAS Catalog study ID 95 | if return_study: 96 | study_url = assoc.get("_links", {}).get("study", {}).get("href", "") 97 | if study_url: 98 | study_data = await fetch(session, study_url, timeout_duration=timeout) 99 | if study_data == "TIMEOUT": 100 | study_id = "TIMEOUT" 101 | elif study_data == "ERROR" or study_data is None: 102 | study_id = "Error" 103 | else: 104 | study_id = study_data.get("accessionId", "Not found") 105 | else: 106 | study_id = "Not available" 107 | else: 108 | study_id = None 109 | 110 | # Return a tuple or a string depending on the return flags 111 | if return_p and return_study: 112 | result_assoc = (trait, "{:.4g}".format(pvalue), study_id) 113 | elif return_p: 114 | result_assoc = (trait, "{:.4g}".format(pvalue)) 115 | elif return_study: 116 | result_assoc = (trait, study_id) 117 | else: 118 | result_assoc = trait 119 | results_snp.append(result_assoc) 120 | 121 | else: 122 | continue 123 | 124 | # Clean the associations depending on the flag 125 | # If the P-value and Study ID are not returned, display each trait only once 126 | if not return_p and not return_study: 127 | results_snp = list(set(results_snp)) 128 | # If the P-value must be returned, return each trait once with the lowest p-value 129 | elif return_p and not return_study: 130 | min_trait = {} 131 | for trait, pvalue in results_snp: 132 | if trait not in min_trait or pvalue < min_trait[trait]: 133 | min_trait[trait] = pvalue 134 | results_snp = [(trait, min_trait[trait]) for trait in min_trait] 135 | 136 | results_global[snp] = results_snp 137 | 138 | async with aiohttp.ClientSession() as session: 139 | tasks = [process_snp(session, snp) for snp in snps] 140 | # Initialize tqdm progress bar 141 | with tqdm(total=len(tasks), desc="Processing SNPs") as pbar: 142 | for coro in asyncio.as_completed(tasks): 143 | await coro 144 | pbar.update(1) 145 | 146 | return results_global, errors, timeouts -------------------------------------------------------------------------------- /genal/colocalization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from numpy import exp, log 4 | from genal.geno_tools import check_beta_column, check_allele_column, check_snp_column, check_int_column 5 | 6 | # Currently does not support multi-allelic SNPs 7 | 8 | def coloc_abf_func(data1, data2, trait1_type="quant", trait2_type="quant", 9 | sdY1=None, sdY2=None, n1=None, n2=None, 10 | p1=1e-4, p2=1e-4, p12=1e-5, merge_on_snp=False): 11 | """ 12 | Perform colocalization analysis between two GWAS datasets using approximate Bayes factors. 13 | Corresponds to the :meth:`Geno.colocalize` method. 
14 | 15 | Args: 16 | data1: DataFrame containing GWAS results for trait 1 17 | data2: DataFrame containing GWAS results for trait 2 18 | trait1_type: Type of trait 1 ("quant" for quantitative traits or "cc" for case-control traits), default is "quant" 19 | trait2_type: Type of trait 2 ("quant" for quantitative traits or "cc" for case-control traits), default is "quant" 20 | sdY1: Standard deviation of trait 1 (required for quantitative traits) 21 | sdY2: Standard deviation of trait 2 (required for quantitative traits) 22 | n1: Sample size for trait 1 (used to estimate sdY if not provided) 23 | n2: Sample size for trait 2 (used to estimate sdY if not provided) 24 | p1: Prior probability SNP associated with trait 1, default is 1e-4 25 | p2: Prior probability SNP associated with trait 2, default is 1e-4 26 | p12: Prior probability SNP associated with both traits, default is 1e-5 27 | merge_on_snp: If True, merge the datasets on SNP column. If False, first attempt to merge on CHR and POS columns. 28 | 29 | """ 30 | 31 | # Ensure that the BETA columns are preprocessed 32 | check_beta_column(data1, 'BETA', 'Fill') 33 | check_beta_column(data2, 'BETA', 'Fill') 34 | 35 | # Adjust EAF column names before merging in case one of the datasets does not have it 36 | if 'EAF' in data1.columns: 37 | data1.rename(columns={'EAF': 'EAF_1'}, inplace=True) 38 | if 'EAF' in data2.columns: 39 | data2.rename(columns={'EAF': 'EAF_2'}, inplace=True) 40 | 41 | # First determine if we can merge on position, otherwise try SNP 42 | if all(col in data1.columns for col in ['CHR', 'POS']) and \ 43 | all(col in data2.columns for col in ['CHR', 'POS']) and not merge_on_snp: 44 | 45 | print("Merging datasets using genomic positions (CHR, POS)") 46 | 47 | # Ensure that the CHR and POS columns are preprocessed 48 | check_int_column(data1, "CHR") 49 | check_int_column(data2, "CHR") 50 | check_int_column(data1, "POS") 51 | check_int_column(data2, "POS") 52 | 53 | # Merge using position 54 | merged_data = pd.merge( 55 | data1, 56 | data2, 57 | on=['CHR', 'POS'], 58 | how='left', 59 | suffixes=('_1', '_2') 60 | ) 61 | 62 | elif 'SNP' in data1.columns and 'SNP' in data2.columns: 63 | print("Position columns (CHR, POS) not present in both datasets. 
Merging datasets using SNP IDs.") 64 | 65 | # Ensure that the SNP column is preprocessed 66 | check_snp_column(data1) 67 | check_snp_column(data2) 68 | 69 | # Merge using SNP 70 | merged_data = pd.merge( 71 | data1, 72 | data2, 73 | on='SNP', 74 | suffixes=('_1', '_2') 75 | ) 76 | 77 | else: 78 | raise ValueError("At least CHR/POS or SNP columns must be present in both datasets for colocalization analysis") 79 | 80 | # After merging, check if we can align alleles 81 | if all(col in merged_data.columns for col in ['EA_1', 'NEA_1', 'EA_2', 'NEA_2']): 82 | print("Aligning effect alleles between datasets") 83 | 84 | # Ensure allele columns are preprocessed 85 | check_allele_column(data1, "EA", keep_indel=False) 86 | check_allele_column(data1, "NEA", keep_indel=False) 87 | check_allele_column(data2, "EA", keep_indel=False) 88 | check_allele_column(data2, "NEA", keep_indel=False) 89 | 90 | # Adjust BETA from trait 2 to correspond to the same effect allele as trait 1 91 | conditions = [ 92 | merged_data["EA_1"] == merged_data["EA_2"], 93 | merged_data["EA_1"] == merged_data["NEA_2"], 94 | True, 95 | ] 96 | choices = [ 97 | merged_data["BETA_2"], 98 | -merged_data["BETA_2"], 99 | np.nan, 100 | ] 101 | merged_data["BETA_2"] = np.select(conditions, choices) 102 | else: 103 | print("Allele columns (EA, NEA) not present in both datasets. " 104 | "This might lead to incorrect results if the effect estimates (BETA) were not obtained with the same reference allele in both datasets.") 105 | 106 | # Clean up columns 107 | merged_data.drop(columns=["EA_2", "NEA_2", "SNP_2", "CHR_2", "POS_2"], inplace=True, errors='ignore') 108 | merged_data.rename(columns={"SNP_1": "SNP", "CHR_1": "CHR", "POS_1": "POS"}, inplace=True, errors='ignore') 109 | 110 | # Drop any rows with duplicate values 111 | if "SNP" in merged_data.columns: 112 | merged_data.drop_duplicates(subset=['SNP'], keep='first', inplace=True) 113 | if "CHR" in merged_data.columns and "POS" in merged_data.columns: 114 | merged_data.drop_duplicates(subset=["CHR", "POS"], keep='first', inplace=True) 115 | 116 | # Drop any rows with missing values 117 | merged_data = merged_data.dropna() 118 | if merged_data.empty: 119 | raise ValueError("No overlapping variants found between the datasets") 120 | 121 | print(f"Using {len(merged_data)} overlapping variants for colocalization analysis") 122 | 123 | # Estimate sdY if not provided for quantitative traits 124 | if trait1_type == "quant" and sdY1 is None: 125 | if 'EAF_1' not in merged_data.columns or n1 is None: 126 | print("Neither sdY1 nor EAF and n1 are provided for trait 1. Assuming sdY1 = 1.") 127 | sdY1 = 1 128 | else: 129 | sdY1 = sdY_est(merged_data['SE_1']**2, merged_data['EAF_1'], n1) 130 | print(f"Using EAF and n1 to estimate sdY1: {sdY1:.2f}") 131 | 132 | if trait2_type == "quant" and sdY2 is None: 133 | if 'EAF_2' not in merged_data.columns or n2 is None: 134 | print("Neither sdY2 nor EAF and n2 are provided for trait 2. 
Assuming sdY2 = 1.") 135 | sdY2 = 1 136 | else: 137 | sdY2 = sdY_est(merged_data['SE_2']**2, merged_data['EAF_2'], n2) 138 | print(f"Using EAF and n2 to estimate sdY2: {sdY2:.2f}") 139 | 140 | # Calculate Bayes factors for each dataset 141 | lABF_1 = approx_bf_estimates(merged_data['BETA_1'], merged_data['SE_1']**2, 142 | trait_type=trait1_type, sdY=sdY1) 143 | lABF_2 = approx_bf_estimates(merged_data['BETA_2'], merged_data['SE_2']**2, 144 | trait_type=trait2_type, sdY=sdY2) 145 | 146 | # Adjust priors based on number of SNPs 147 | n_snps = len(merged_data) 148 | if n_snps * p1 >= 1: 149 | p1 = 1 / (n_snps + 1) 150 | if n_snps * p2 >= 1: 151 | p2 = 1 / (n_snps + 1) 152 | if n_snps * p12 >= 1: 153 | p12 = 1 / (n_snps + 1) 154 | 155 | # Calculate posterior probabilities 156 | pp = combine_abf(lABF_1, lABF_2, p1, p2, p12) 157 | 158 | # Add SNP-specific results 159 | results_df = merged_data.copy() 160 | results_df['lABF_1'] = lABF_1 161 | results_df['lABF_2'] = lABF_2 162 | results_df['internal.sum.lABF'] = lABF_1 + lABF_2 163 | 164 | # Calculate SNP-specific PP for H4 165 | my_denom_log_abf = logsum(results_df['internal.sum.lABF']) 166 | results_df['SNP.PP.H4'] = np.exp(results_df['internal.sum.lABF'] - my_denom_log_abf) 167 | 168 | return { 169 | 'nsnps': n_snps, 170 | **pp 171 | } 172 | 173 | def approx_bf_estimates(beta, varbeta, trait_type="quant", sdY=1, effect_priors={'quant': 0.15, 'cc': 0.2}): 174 | """ 175 | Calculate approximate Bayes factors using regression estimates. 176 | 177 | Args: 178 | beta: effect size estimate 179 | varbeta: variance of the effect size estimate 180 | trait_type: either "quant" for quantitative trait or "cc" for case-control 181 | sdY: standard deviation of the trait (for quantitative traits) 182 | effect_priors: dictionary with prior effect sizes for quantitative and case-control traits 183 | 184 | Returns: 185 | array: log approximate Bayes factors 186 | """ 187 | z = beta / np.sqrt(varbeta) 188 | 189 | # Set prior standard deviation based on trait type 190 | if trait_type == "quant": 191 | sd_prior = effect_priors['quant'] * sdY 192 | else: # case-control 193 | sd_prior = effect_priors['cc'] 194 | 195 | r = sd_prior**2 / (sd_prior**2 + varbeta) 196 | lABF = 0.5 * (np.log(1 - r) + (r * z**2)) 197 | return lABF 198 | 199 | def logsum(x): 200 | """Calculate log of sum of exponentials""" 201 | my_max = np.max(x) 202 | return my_max + np.log(np.sum(np.exp(x - my_max))) 203 | 204 | def logdiff(x, y): 205 | """Calculate log of difference of exponentials""" 206 | my_max = max(x, y) 207 | return my_max + np.log(exp(x - my_max) - np.exp(y - my_max)) 208 | 209 | def combine_abf(l1, l2, p1, p2, p12): 210 | """Calculate posterior probabilities for different hypotheses""" 211 | lsum = l1 + l2 212 | 213 | lH0_abf = 0 214 | lH1_abf = np.log(p1) + logsum(l1) 215 | lH2_abf = np.log(p2) + logsum(l2) 216 | lH3_abf = np.log(p1) + np.log(p2) + logdiff(logsum(l1) + logsum(l2), logsum(lsum)) 217 | lH4_abf = np.log(p12) + logsum(lsum) 218 | 219 | all_abf = np.array([lH0_abf, lH1_abf, lH2_abf, lH3_abf, lH4_abf]) 220 | denom_log_abf = logsum(all_abf) 221 | pp_abf = np.exp(all_abf - denom_log_abf) 222 | 223 | return { 224 | 'PP.H0.abf': pp_abf[0], 225 | 'PP.H1.abf': pp_abf[1], 226 | 'PP.H2.abf': pp_abf[2], 227 | 'PP.H3.abf': pp_abf[3], 228 | 'PP.H4.abf': pp_abf[4] 229 | } 230 | 231 | def sdY_est(vbeta, maf, n): 232 | """ 233 | Estimate trait standard deviation given vectors of variance of coefficients, MAF and sample size. 
234 | 235 | Args: 236 | vbeta: vector of variance of coefficients 237 | maf: vector of MAF (same length as vbeta) 238 | n: sample size 239 | 240 | Returns: 241 | float: estimated standard deviation of Y 242 | """ 243 | oneover = 1/vbeta 244 | nvx = 2 * n * maf * (1-maf) 245 | # Fit linear regression through origin 246 | coef = np.sum(nvx * oneover) / np.sum(oneover**2) 247 | if coef < 0: 248 | raise ValueError("Estimated sdY is negative - this can happen with small datasets, or those with errors. A reasonable estimate of sdY is required to continue.") 249 | return np.sqrt(coef) -------------------------------------------------------------------------------- /docs/build/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | genal: A Python Toolkit for Genetic Risk Scoring and Mendelian Randomization — genal v0.8 documentation 8 | 9 | 10 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 |
25 | 54 | 55 |
59 | 60 |
61 |
62 |
63 |
    64 |
  • 65 | 66 |
  • 67 | View page source 68 |
  • 69 |
70 |
71 |
72 |
73 |
74 | 75 |
76 |

genal: A Python Toolkit for Genetic Risk Scoring and Mendelian Randomization

77 |
78 |
Author:
79 |

Cyprien Rivier

80 |
81 |
Date:
82 |

Aug 13, 2024

83 |
84 |
Version:
85 |

“0.8”

86 |
87 |
88 |

Genal is a Python module designed to make it easy to compute genetic risk scores and run Mendelian randomization analyses. It integrates a collection of tools that facilitate the cleaning of single nucleotide polymorphism data (usually derived from Genome-Wide Association Studies) and enable the execution of key clinical population genetics workflows. The functionalities provided by genal include clumping, lifting (a short sketch follows this paragraph), association testing, polygenic risk scoring, and Mendelian randomization analyses, all within a single Python module.

89 |
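The lifting step mentioned above is also exposed as a module-level helper, lift_data, defined in genal/lift.py further down in this listing (it backs the Geno.lift method). The following is a minimal sketch, assuming the genal reference folder is configured and network access is available for the chain-file download on first use; the positions and identifiers are made up for illustration.

import pandas as pd
from genal.lift import lift_data

# Toy summary-statistics rows; lift_data only requires CHR and POS columns.
snps = pd.DataFrame({
    "CHR": [1, 2, 7],
    "POS": [754182, 27730940, 92383888],   # illustrative hg19 positions
    "SNP": ["var_a", "var_b", "var_c"],    # illustrative identifiers
})

# Lift from hg19 to hg38 with the pure-Python path (pyliftover). Passing
# liftover_path pointing to the UCSC liftOver executable uses it instead,
# which is faster for large datasets.
lifted = lift_data(snps, start="hg19", end="hg38")
print(lifted[["CHR", "POS", "SNP"]].head())

Positions that cannot be lifted are dropped and reported, so the returned frame may contain fewer rows than the input.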

The module prioritizes user-friendliness and intuitive operation, aiming to reduce the complexity of data analysis for researchers. Despite its focus on simplicity, Genal does not sacrifice depth of customization or precision of analysis; the gene-region filtering sketch below illustrates one of the helpers whose identifier type, window size, and genome build can all be adjusted. Researchers can expect to maintain analytical rigour while benefiting from a streamlined experience.

90 |
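As a concrete illustration of that flexibility, the gene-region filter defined in genal/genes.py earlier in this listing (backing the Geno.filter_by_gene method) restricts a variant table to a window around a gene. A minimal sketch, assuming a configured reference folder (the gene-coordinate file is downloaded on first use) and made-up positions:

import pandas as pd
from genal.genes import filter_by_gene_func

# Toy variant table; filter_by_gene_func expects at least CHR and POS columns.
variants = pd.DataFrame({
    "CHR": [19, 19, 19, 1],
    "POS": [45_410_000, 45_200_000, 46_900_000, 1_000_000],  # illustrative positions
    "SNP": ["var1", "var2", "var3", "var4"],
})

# Keep variants within a 1 Mb window (+/- 500 kb) around APOE, genome build 37.
near_apoe = filter_by_gene_func(variants, "APOE", id_type="symbol",
                                window_size=1_000_000, build="37")
print(near_apoe[["SNP", "POS", "Distance"]])

The returned frame carries a Distance column: 0 inside the gene, negative upstream of the gene start, positive downstream of the gene end.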

Genal draws on concepts from well-established R packages such as TwoSampleMR, MR-Presso, MendelianRandomization, and gwasvcf, adapting their proven methodologies to the Python environment. This approach ensures that users have access to tried and tested techniques with the versatility of Python’s data science tools.

91 |

To install the latest release, type:

92 |
pip install genal-python
 93 | 
94 |
95 |
96 |
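Once installed, the same functionality is available as plain functions operating on pandas DataFrames. The sketch below calls coloc_abf_func from genal/colocalization.py, shown earlier in this listing (it backs the Geno.colocalize method), on two made-up summary-statistics tables; with only five variants the posterior probabilities are meaningless and serve only to show the returned keys (nsnps and PP.H0.abf through PP.H4.abf).

import pandas as pd
from genal.colocalization import coloc_abf_func

# Two toy GWAS tables sharing SNP identifiers. BETA and SE are required;
# EA/NEA let the effect alleles be aligned between the two datasets.
trait1 = pd.DataFrame({
    "SNP":  ["var1", "var2", "var3", "var4", "var5"],
    "EA":   ["A", "C", "G", "T", "A"],
    "NEA":  ["G", "T", "A", "C", "G"],
    "BETA": [0.10, 0.02, -0.05, 0.30, 0.01],
    "SE":   [0.02, 0.02, 0.02, 0.03, 0.02],
})
trait2 = trait1.copy()
trait2["BETA"] = [0.08, 0.01, -0.04, 0.25, 0.02]

# sdY1/sdY2 are supplied directly here; alternatively EAF columns plus n1/n2
# can be provided and the trait standard deviations are estimated from them.
result = coloc_abf_func(trait1, trait2, trait1_type="quant", trait2_type="quant",
                        sdY1=1, sdY2=1)
print(result["nsnps"], result["PP.H4.abf"])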

Contents

97 |
98 | 106 |
107 |
108 |
109 |
110 |

Indices and tables

111 | 116 |
117 |

Citation

118 |

If you use genal in your work, please cite the following paper:

119 |
120 |
121 | [Rivier.2024] 122 |

Genal: A Python Toolkit for Genetic Risk Scoring and Mendelian Randomization 123 | Cyprien A. Rivier, Santiago Clocchiatti-Tuozzo, Shufan Huo, Victor Torres-Lopez, Daniela Renedo, Kevin N. Sheth, Guido J. Falcone, Julian N. Acosta. 124 | medRxiv. 2024 May 10.1101/2024.05.23.24307776.

125 |
126 |
127 |
128 |
129 |

References

130 |
131 |
132 | [Hemani.2018] 133 |

The MR-Base platform supports systematic causal inference across the human phenome. 134 | Hemani G, Zheng J, Elsworth B, Wade KH, Baird D, Haberland V, Laurin C, Burgess S, Bowden J, Langdon R, Tan VY, Yarmolinsky J, Shihab HA, Timpson NJ, Evans DM, Relton C, Martin RM, Davey Smith G, Gaunt TR, Haycock PC, The MR-Base Collaboration 135 | eLife. 2018 May 10.7554/eLife.34408. 136 | PMID: 29846171.

137 |
138 |
139 | [Verbanck.2018] 140 |

Detection of widespread horizontal pleiotropy in causal relationships inferred from Mendelian randomization between complex traits and diseases. 141 | Marie Verbanck, Chia-Yen Chen, Benjamin Neale, Ron Do. 142 | Nature Genetics 2018 May 10.1038/s41588-018-0099-7. 143 | PMID: 29686387.

144 |
145 |
146 | [Lyon.2020] 147 |

The variant call format provides efficient and robust storage of GWAS summary statistics. 148 | Matthew Lyon, Shea J Andrews, Ben Elsworth, Tom R Gaunt, Gibran Hemani, Edoardo Marcora. 149 | bioRxiv 2020 May 30 2020.05.29.115824v1. 150 | PMID: 33441155.

151 |
152 |
153 |
154 |
155 | 156 | 157 |
158 |
159 |
162 | 163 |
164 | 165 |
166 |

© Copyright 2023, Cyprien A. Rivier.

167 |
168 | 169 | Built with Sphinx using a 170 | theme 171 | provided by Read the Docs. 172 | 173 | 174 |
175 |
176 |
177 |
178 |
179 | 184 | 185 | 186 | -------------------------------------------------------------------------------- /genal/lift.py: -------------------------------------------------------------------------------- 1 | from pyliftover import LiftOver 2 | import os, subprocess 3 | import numpy as np 4 | import wget 5 | import gzip 6 | import shutil 7 | import uuid 8 | import pandas as pd 9 | from concurrent.futures import ThreadPoolExecutor, as_completed 10 | 11 | from .tools import read_config, create_tmp 12 | 13 | 14 | def lift_data( 15 | data, 16 | start="hg19", 17 | end="hg38", 18 | extraction_file=False, 19 | chain_file=None, 20 | name=None, 21 | liftover_path=None, 22 | object_id="tmp_id", 23 | ): 24 | """ 25 | Perform a liftover from one genetic build to another. If the chain file required for the liftover is not present, it will be downloaded. It"s also possible to manually provide the path to the chain file. 26 | If the dataset is large, it is suggested to use an alternate method (e.g., `lift_data_liftover`). 27 | 28 | Args: 29 | data (pd.DataFrame): The input data containing at least "CHR" and "POS" columns. 30 | start (str, optional): The current build of the data. Defaults to "hg19". 31 | end (str, optional): The target build for liftover. Defaults to "hg38". 32 | extraction_file (bool, optional): If True, also prints a CHR POS SNP space-delimited file for extraction. Defaults to False. 33 | chain_file (str, optional): Path to a local chain file for the lift. Overrides the start and end arguments if provided. 34 | name (str, optional): Specify a filename or filepath (without extension) for saving. If not provided, the data is not saved. 35 | liftover_path (str, optional): Specify the path to the USCS liftover executable. If not provided, the lift will be done in python (slower for large amount of SNPs). 36 | object_id (str, optional): Specify the object id for tmp file writing (internal use only) 37 | 38 | Raises: 39 | ValueError: If required columns are missing or if provided chain file path is incorrect. 40 | 41 | Returns: 42 | pd.DataFrame: Lifted data. 43 | 44 | Notes: 45 | Function for the :meth:`Geno.lift` method. 46 | """ 47 | 48 | # Prepare chain file and get its path 49 | chain_path = prepare_chain_file(chain_file, start, end) 50 | 51 | # Prepare the data for lifting: handle missing values in CHR, POS columns 52 | nrows = data.shape[0] 53 | data.dropna(subset=["CHR", "POS"], inplace=True) 54 | data.reset_index(drop=True, inplace=True) 55 | n_na = nrows - data.shape[0] 56 | if n_na: 57 | print( 58 | f"Excluded {n_na} SNPs ({n_na/nrows*100:.3f}%) with NaN values in CHR or POS columns." 59 | ) 60 | 61 | # Perform liftover with the liftover executable or in python 62 | if liftover_path: 63 | data = lift_coordinates_liftover(data, object_id, chain_path, liftover_path) 64 | else: 65 | data = lift_coordinates_python(data, chain_path) 66 | 67 | # Handle post-liftover operations 68 | data = post_lift_operations(data, name, extraction_file) 69 | 70 | return data 71 | 72 | 73 | def prepare_chain_file(chain_file, start, end): 74 | """Handle chain file loading, downloading if necessary. Return its path.""" 75 | if chain_file is not None: # If a local chain file is provided 76 | if not os.path.isfile(chain_file): 77 | raise ValueError("The provided path does not lead to a valid file.") 78 | print( 79 | "You provided a path to a local chain path which will be used for the lift." 
80 | ) 81 | chain_path = chain_file 82 | else: # Use the specified start and end builds to identify chain file 83 | # Construct chain filename 84 | chain_name = f"{start.lower()}To{end.capitalize()}.over.chain" 85 | config = read_config() 86 | ref_path = config["paths"]["ref_path"] 87 | chains_folder_path = os.path.join(ref_path, "chain_files") 88 | 89 | # Ensure directory for chain files exists 90 | if not os.path.exists(chains_folder_path): 91 | try: 92 | os.makedirs(chains_folder_path) 93 | except OSError: 94 | raise OSError( 95 | "Unable to create the 'tmp_GENAL' directory. Check permissions." 96 | ) 97 | 98 | # Check for the chain file locally or download it if necessary 99 | chain_path = os.path.join(chains_folder_path, chain_name) 100 | if not os.path.isfile(chain_path): 101 | print( 102 | f"The chain file to lift from {start} to {end} was not found. Attempting to download it..." 103 | ) 104 | # Download the chain file 105 | url = f"https://hgdownload.soe.ucsc.edu/goldenPath/{start.lower()}/liftOver/{chain_name}.gz" 106 | try: 107 | wget.download(url, out=chains_folder_path) 108 | # Decompress the downloaded file 109 | print(f"The download was successful. Unzipping...") 110 | with gzip.open(f"{chain_path}.gz", "rb") as f_in, open( 111 | chain_path, "wb" 112 | ) as f_out: 113 | shutil.copyfileobj(f_in, f_out) 114 | except Exception as e: 115 | print(f"The download was unsuccessful: {e}") 116 | print( 117 | "Consider downloading the chain file manually from the UCSC website and providing its path via the chain_file argument." 118 | ) 119 | raise FileNotFoundError("Chain file not found.") 120 | 121 | return chain_path 122 | 123 | 124 | def lift_coordinates_liftover(data, object_id, chain_path, liftover_path): 125 | """Lift data using the liftover executable and a chain file.""" 126 | # Add the executable part if not there 127 | if not os.path.isfile(liftover_path): 128 | liftover_path = os.path.join(liftover_path, "liftOver") 129 | # Check that it is indeed the path to liftOver executable 130 | try: 131 | process = subprocess.run( 132 | [liftover_path], 133 | stdout=subprocess.PIPE, 134 | stderr=subprocess.PIPE, 135 | timeout=5, 136 | text=True, 137 | ) 138 | if not process.stderr.startswith("liftOver"): 139 | raise TypeError( 140 | "The path provided is an executable, but not the liftOver executable. Check the path." 141 | ) 142 | except Exception as e: 143 | raise TypeError(e) 144 | print("Lifting coordinates using liftOver.") 145 | 146 | # Write data in correct format for liftOver 147 | create_tmp() 148 | data["CHR_liftover"] = "chr" + data.CHR.astype(str) 149 | to_lift_filename = os.path.join("tmp_GENAL", f"{object_id}.prelift") 150 | lifted_filename = os.path.join("tmp_GENAL", f"{object_id}.postlift") 151 | unmapped_filename = os.path.join("tmp_GENAL", f"{object_id}_unMapped") 152 | data[["CHR_liftover", "POS", "POS"]].to_csv( 153 | to_lift_filename, sep=" ", index=False, header=False 154 | ) 155 | 156 | # Call the liftOver software 157 | command = f"{liftover_path} {to_lift_filename} \ 158 | {chain_path} {lifted_filename} {unmapped_filename}" 159 | try: 160 | output = subprocess.run( 161 | command, shell=True, capture_output=True, text=True, check=True 162 | ) 163 | except Exception as e: 164 | print(f"Error running liftOver: {e}") 165 | raise ValueError("Error running liftOver. Check error message for more details.") 166 | 167 | ## Read the output, print the number of unlifted SNPs and remove them from the prelift data. 
168 | df_post = pd.read_csv(lifted_filename, sep="\t", header=None) 169 | unMapped = open(unmapped_filename, "r") 170 | Lines = unMapped.readlines() 171 | if len(Lines) > 0: 172 | print(f"{int(len(Lines)/2)} SNPs could not be lifted.") 173 | else: 174 | print(f"All SNPs have been lifted.") 175 | indices = list() 176 | for i in range(1, len(Lines), 2): 177 | c = Lines[i].strip() 178 | (chrom, pos, pos) = c.split("\t") 179 | indices.append(str(chrom) + ":" + str(pos)) 180 | drop_indices = data[(data.CHR_liftover.astype(str) + ":" + data.POS.astype(str)).isin(indices)].index 181 | data.drop(index=drop_indices, inplace=True) 182 | data.reset_index(drop=True, inplace=True) 183 | 184 | # Check the length of files 185 | if len(data) != len(df_post): 186 | raise ValueError( 187 | "There was a problem lifting with liftOver. Try lifting in python (liftover_path = None)." 188 | ) 189 | 190 | ## Merge prelift and postlift data. Unknown chr from the output of liftOver are assigned the value 99. SNPs mapped to unknown chr are deleted from the final data and their number printed. 191 | data["POS"] = df_post[1].astype(int) 192 | data["CHR"] = ( 193 | df_post[0] 194 | .str.split("chr", expand=True)[1] 195 | .str.split("_", expand=True)[0] 196 | .replace({"X": 99, "Y": 99, "Un": 99}) 197 | .astype(int) 198 | ) 199 | nrow_before = data.shape[0] 200 | drop_chr_indices = data[data.CHR == 99].index 201 | data.drop(index=drop_chr_indices, inplace=True) 202 | nrow_diff = nrow_before - data.shape[0] 203 | if nrow_diff > 0: 204 | print( 205 | f"{nrow_diff} SNPs were lifted to an unknown chromosome and deleted from the final files." 206 | ) 207 | data.drop(columns=["CHR_liftover"], inplace=True) 208 | return data 209 | 210 | 211 | def lift_coordinates_python(data, chain_path): 212 | """Perform liftover on data using the chain passed.""" 213 | lo = LiftOver(chain_path) 214 | 215 | # Print message 216 | print("Lifting coordinates in python...") 217 | nrows = data.shape[0] 218 | if nrows > 500000: 219 | print("Your data is large, this can take a few minutes...") 220 | 221 | # Create a list of tuples and lift 222 | coordinates = list(zip(data["CHR"], data["POS"])) 223 | 224 | # Perform the lift 225 | def convert_coordinate(args): 226 | return lo.convert_coordinate(f"chr{args[0]}", args[1], "-") 227 | 228 | results = list(ThreadPoolExecutor().map(convert_coordinate, coordinates)) 229 | 230 | data["POS"] = [res[0][1] if res else np.nan for res in results] 231 | data["CHR"] = [res[0][0].split("chr")[1] if res else np.nan for res in results] 232 | nrows = data.shape[0] 233 | data.dropna(subset=["POS", "CHR"], inplace=True) 234 | data["POS"] = data["POS"].astype("Int32") 235 | data["CHR"] = data["CHR"].astype("Int32") 236 | data.reset_index(drop=True, inplace=True) 237 | n_na = nrows - data.shape[0] 238 | if n_na: 239 | print(f"{n_na} SNPs ({n_na/nrows*100:.3f}%) could not be lifted.") 240 | else: 241 | print("All SNPs have been lifted.") 242 | return data 243 | 244 | 245 | def post_lift_operations(data, name, extraction_file): 246 | """Handle post-liftover operations like reporting, and saving results.""" 247 | if name: 248 | filename = os.path.splitext(name)[0] + ".txt" 249 | data.to_csv(f"{filename}", sep="\t", header=True, index=False) 250 | print(f"Lifted list of SNPs saved to {filename}") 251 | if extraction_file: 252 | if not ("SNP" in data.columns): 253 | data["SNP"] = data["CHR"].astype(str) + ":" + data["POS"].astype(str) 254 | data[["CHR", "POS", "SNP"]].to_csv( 255 | f"{name + '_lifted'}_extraction.txt", sep=" ", 
header=False, index=False 256 | ) 257 | print(f"Extraction file saved to {name+ '_lifted'}_extraction.txt") 258 | return data 259 | -------------------------------------------------------------------------------- /genal/MRpresso.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import statsmodels.formula.api as smf 4 | from concurrent.futures import ProcessPoolExecutor 5 | from sklearn.linear_model import LinearRegression 6 | from tqdm import tqdm 7 | from numpy.random import default_rng 8 | from functools import partial 9 | 10 | ##todo: implement the multivariable option, for the moment we assume only 1 BETA_e column 11 | 12 | 13 | # MR-PRESSO main function 14 | def mr_presso( 15 | data, 16 | BETA_e_columns=["BETA_e"], 17 | n_iterations=1000, 18 | outlier_test=True, 19 | distortion_test=True, 20 | significance_p=0.05, 21 | cpus=5, 22 | ): 23 | """ 24 | Perform the MR-PRESSO algorithm for detection of horizontal pleiotropy. 25 | 26 | Args: 27 | data (pd.DataFrame): DataFrame with at least 4 columns: BETA_o (outcome), SE_o, BETA_e (exposure), SE_e. 28 | BETA_e_columns (list): List of exposure beta columns. 29 | n_iterations (int): Number of steps performed (random data generation). 30 | outlier_test (bool): If True, identifies outlier SNPs responsible for horizontal pleiotropy. 31 | distortion_test (bool): If True, tests significant distortion in the causal estimates. 32 | significance_p (float): Statistical significance threshold for the detection of horizontal pleiotropy. 33 | cpus (int): Number of CPUs to use for parallel processing. 34 | 35 | Returns: 36 | mod_table (pd.DataFrame): DataFrame with the original and outlier-corrected inverse variance-weighted MR results. 37 | GlobalTest (dict): Dictionary with p-value of the global MR-PRESSO test. 38 | OutlierTest (pd.DataFrame): DataFrame with p-value for each SNP for the outlier test. 39 | BiasTest (dict): Dictionary with results of the distortion test. 
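        subset_data (pd.DataFrame or None): Data with the outlier SNPs removed (None if the outlier/distortion tests were not run or no outliers were found).

    Example (illustrative; assumes `harmonized` is a DataFrame with BETA_e, SE_e, BETA_o, SE_o columns):
        mod_table, global_test, outlier_test, bias_test, subset_data = mr_presso(harmonized, n_iterations=5000)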
40 | """ 41 | # Transforming the data 42 | data = data[["BETA_o", *BETA_e_columns, "SE_o", "SE_e"]].dropna() 43 | data[["BETA_o", *BETA_e_columns]] = data[["BETA_o", *BETA_e_columns]].multiply( 44 | np.sign(data[BETA_e_columns[0]]), axis=0 45 | ) 46 | data["Weights"] = 1 / (data["SE_o"] ** 2) 47 | 48 | if len(data) <= len(BETA_e_columns) + 2: 49 | raise Exception("Not enough instrumental variables (variants)") 50 | if len(data) >= n_iterations: 51 | raise Exception( 52 | "Not enough elements to compute empirical P-values, increase n_iterations" 53 | ) 54 | 55 | print(f"Running the MR-PRESSO algorithm with N = {n_iterations} iterations.") 56 | # 1- Computing the observed residual sum of squares (RSS) 57 | print("Computing the observed residual sum of squares...") 58 | RSSobs = getRSS_LOO(data, BETA_e_columns, outlier_test) 59 | 60 | # 2- Computing the distribution of expected residual sum of squares (RSS) 61 | print("Computing the global MR-PRESSO p-value...") 62 | partial_parallel_RSS_LOO = partial( 63 | parallel_RSS_LOO, data=data, BETA_e_columns=BETA_e_columns 64 | ) # Wrapper function freezing the parallel_RSS_LOO call 65 | with ProcessPoolExecutor(max_workers=cpus) as executor: 66 | results = list( 67 | tqdm( 68 | executor.map(partial_parallel_RSS_LOO, range(n_iterations)), 69 | total=n_iterations, 70 | desc="Generating random data", 71 | ncols=100, 72 | ) 73 | ) 74 | 75 | RSSexp = [res[0] for res in results] 76 | Random_data_e = np.vstack([r[1] for r in results]) 77 | Random_data_o = np.vstack([r[2] for r in results]) 78 | 79 | global_p = np.sum([r > (RSSobs[0] if outlier_test else RSSobs) for r in RSSexp]) / n_iterations # RSSobs is a tuple only when outlier_test=True 80 | 81 | if outlier_test: 82 | GlobalTest = {"RSSobs": RSSobs[0], "global_test_p": global_p} 83 | else: 84 | GlobalTest = {"RSSobs": RSSobs, "global_test_p": global_p} 85 | 86 | # 3- Computing the single IV outlier test 87 | if global_p < significance_p and outlier_test: 88 | print("Global p-value is below the significance threshold. Running the Outlier test.") 89 | 90 | if len(BETA_e_columns) == 1: 91 | Dif = data["BETA_o"].values - data["BETA_e"].values * RSSobs[1] 92 | Exp = Random_data_o - (Random_data_e * RSSobs[1]) 93 | else: 94 | raise ValueError("Outlier test not implemented for multivariable MR.") 95 | 96 | abs_diffs = np.abs(Exp.T) > np.abs(Dif)[:, np.newaxis] 97 | pvals = np.sum(abs_diffs, axis=1) / Exp.shape[0] 98 | 99 | OutlierTest = pd.DataFrame({"RSSobs": Dif**2, "Pvalue": pvals}) 100 | 101 | OutlierTest.index = data.index 102 | OutlierTest["Pvalue"] = np.minimum( 103 | OutlierTest["Pvalue"] * len(data), 1 104 | ) # Bonferroni correction 105 | if data.shape[0] / n_iterations > significance_p: 106 | print( 107 | f"Warning: the Outlier test is unstable. The {significance_p} significance threshold cannot be obtained with {n_iterations} distributions. Increase n_iterations." 108 | ) 109 | 110 | else: 111 | outlier_test = False 112 | OutlierTest = pd.DataFrame() 113 | 114 | # 4- Computing the test of the distortion of the causal estimate 115 | formula = f"BETA_o ~ -1 + {' + '.join(BETA_e_columns)}" 116 | mod_all = smf.wls(formula, data=data, weights=data["Weights"]).fit() 117 | 118 | BiasTest = {} 119 | subset_data = None 120 | 121 | if distortion_test and outlier_test: 122 | ## Is there an error in the MRPRESSO code? The outlier indices are supposed to be excluded from the expected bias computation (as per the paper).
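        # Idea of the Distortion test (descriptive note): compare the observed relative change
        # in the causal estimate after excluding the outlier SNPs with an empirical distribution
        # of changes obtained from randomly drawn SNP subsets, as described in the MR-PRESSO paper.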
123 | def get_random_bias(BETA_e_columns, data, ref_outlier): 124 | indices = np.concatenate( 125 | [ 126 | ref_outlier, 127 | np.random.choice( 128 | list(set(range(len(data))) - set(ref_outlier)), 129 | len(data) - len(ref_outlier), 130 | ), 131 | ] 132 | ) 133 | subset_data = data.iloc[indices[: -len(ref_outlier)]] 134 | mod_random = smf.wls( 135 | f"BETA_o ~ -1 + {' + '.join(BETA_e_columns)}", 136 | data=subset_data, 137 | weights=subset_data["Weights"], 138 | ).fit() 139 | return mod_random.params[BETA_e_columns] 140 | 141 | ref_outlier = OutlierTest.loc[OutlierTest["Pvalue"] <= significance_p].index 142 | 143 | if len(ref_outlier) > 0: 144 | if len(ref_outlier) < len(data): 145 | print(f"{len(ref_outlier)}/{len(data)} ({len(ref_outlier)/len(data)*100:.2f}%) outliers found. Running the Distortion test.") 146 | BiasExp = [ 147 | get_random_bias(BETA_e_columns, data, ref_outlier) 148 | for _ in range(n_iterations) 149 | ] 150 | BiasExp = pd.concat(BiasExp, axis=1).transpose() 151 | 152 | subset_data = data.drop(ref_outlier) 153 | mod_no_outliers = smf.wls( 154 | f"BETA_o ~ -1 + {' + '.join(BETA_e_columns)}", 155 | data=subset_data, 156 | weights=subset_data["Weights"], 157 | ).fit() 158 | 159 | BiasObs = ( 160 | mod_all.params[BETA_e_columns] 161 | - mod_no_outliers.params[BETA_e_columns] 162 | ) / abs(mod_no_outliers.params[BETA_e_columns]) 163 | BiasExp = (mod_all.params[BETA_e_columns] - BiasExp) / abs(BiasExp) 164 | 165 | p_value = np.sum(np.abs(BiasExp) > np.abs(BiasObs)) / n_iterations 166 | 167 | BiasTest = { 168 | "outliers_indices": list(ref_outlier), 169 | "distortion_test_coefficient": 100 * BiasObs.values[0], 170 | "distortion_test_p": p_value.iloc[0], 171 | } 172 | else: 173 | print("All SNPs considered as outliers. Skipping the Distortion test.") 174 | BiasTest = { 175 | "outliers_indices": "All SNPs considered as outliers", 176 | "distortion_test_coefficient": np.nan, 177 | "distortion_test_p": np.nan, 178 | } 179 | else: 180 | print("No significant outliers found. 
Skipping the Distortion test.") 181 | BiasTest = { 182 | "outliers_indices": "No significant outliers", 183 | "distortion_test_coefficient": np.nan, 184 | "distortion_test_p": np.nan, 185 | } 186 | 187 | # 5- Format 188 | row_original = { 189 | "exposure": BETA_e_columns[0], 190 | "method": "Raw", 191 | "nSNP": len(data), 192 | "b": mod_all.params["BETA_e"], 193 | "se": mod_all.bse["BETA_e"], 194 | "pval": mod_all.pvalues["BETA_e"], 195 | } 196 | if "mod_no_outliers" in locals(): 197 | row_corrected = { 198 | "exposure": BETA_e_columns[0], 199 | "method": "Outlier-corrected", 200 | "nSNP": len(data) - len(ref_outlier), 201 | "b": mod_no_outliers.params["BETA_e"], 202 | "se": mod_no_outliers.bse["BETA_e"], 203 | "pval": mod_no_outliers.pvalues["BETA_e"], 204 | } 205 | else: 206 | row_corrected = { 207 | "exposure": BETA_e_columns[0], 208 | "method": "Outlier-corrected", 209 | "nSNP": np.nan, 210 | "b": np.nan, 211 | "se": np.nan, 212 | "pval": np.nan, 213 | } 214 | 215 | mod_table = pd.DataFrame([row_original, row_corrected]) 216 | 217 | return mod_table, GlobalTest, OutlierTest, BiasTest, subset_data 218 | 219 | 220 | ## MR-PRESSO helper functions 221 | # Define the matrix power operator 222 | def power_eigen(x, n): 223 | values, vectors = np.linalg.eig(x) 224 | return vectors.dot(np.diag(values**n)).dot(vectors.T) 225 | 226 | 227 | # Function to compute the residual sum of squares in a LOO framework 228 | def getRSS_LOO(data, BETA_e_columns, returnIV): 229 | dataW = data[["BETA_o"] + BETA_e_columns].multiply(np.sqrt(data["Weights"]), axis=0) 230 | X = dataW[BETA_e_columns].values 231 | Y = dataW["BETA_o"].values 232 | 233 | # Matrix operations after LOO 234 | def loo_calculation(i): 235 | X_loo = np.delete(X, i, axis=0) 236 | Y_loo = np.delete(Y, i, axis=0) 237 | return power_eigen(X_loo.T.dot(X_loo), -1).dot(X_loo.T).dot(Y_loo) 238 | 239 | CausalEstimate_LOO = np.array([loo_calculation(i) for i in range(len(dataW))]) 240 | 241 | if len(BETA_e_columns) == 1: 242 | CausalEstimate_LOO = CausalEstimate_LOO.reshape(-1) 243 | RSS = np.nansum((Y - CausalEstimate_LOO * X.reshape(-1)) ** 2) 244 | else: 245 | raise ValueError("Needs to do the getRSS_LOO for multi exposure.") 246 | # RSS = np.nansum((Y - np.sum(CausalEstimate_LOO.T * X, axis=1)) ** 2) 247 | 248 | if returnIV: 249 | return (RSS, CausalEstimate_LOO) 250 | return RSS 251 | 252 | 253 | # Generate random data based on normal distributions 254 | def getRandomData(data, BETA_e_columns=["BETA_e"]): 255 | rng = default_rng() 256 | 257 | models = [] 258 | for i in range(len(data)): 259 | lm = LinearRegression(fit_intercept=False) 260 | data_i = data.drop(i) 261 | lm.fit( 262 | data_i[BETA_e_columns], data_i["BETA_o"], sample_weight=data_i["Weights"] 263 | ) 264 | models.append(lm) 265 | 266 | random_data_dict = {} 267 | for col, sd_col in zip(BETA_e_columns, ["SE_e"]): 268 | random_data_dict[col] = rng.normal(data[col], data[sd_col]) 269 | 270 | random_data_dict["BETA_o"] = [ 271 | rng.normal( 272 | model.predict(data.iloc[[i]][BETA_e_columns]), data.iloc[i]["SE_o"] 273 | ).item() 274 | for i, model in enumerate(models) 275 | ] 276 | random_data_dict["Weights"] = data["Weights"].values 277 | 278 | random_data_df = pd.DataFrame(random_data_dict) 279 | return random_data_df 280 | 281 | 282 | # Function for the parallel executor in step 2: generate random data and compute the expected residual sum of squares 283 | def parallel_RSS_LOO(i, data, BETA_e_columns): 284 | random_data = getRandomData(data, BETA_e_columns) 285 | 286 | rss_exp = 
getRSS_LOO(random_data, BETA_e_columns, False) 287 | return (rss_exp, random_data["BETA_e"].values, random_data["BETA_o"].values) 288 | -------------------------------------------------------------------------------- /genal/proxy.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os 4 | import subprocess 5 | import re 6 | import uuid 7 | 8 | from .tools import get_reference_panel_path, get_plink_path, run_plink_command 9 | 10 | ## TO DO: accept lists of CHR/POS instead of SNP names for these functions 11 | 12 | 13 | def query_outcome_proxy(df, ld, snps_to_extract, snps_df=[]): 14 | """ 15 | Extract the best proxies from a dataframe, as well as specific SNPs. 16 | 17 | Given a dataframe `df` (originating from Geno.data) and a dataframe of potential proxies 18 | (output from `find_proxies`), this function extracts the best proxies from `df` as well as 19 | the SNPs specified in `snps_to_extract`. 20 | This is suited for querying outcome data. 21 | 22 | Args: 23 | df (pd.DataFrame): Dataframe of SNP information with the usual Geno columns 24 | (SNP, BETA, SE, EAF, EA, NEA). EAF is not necessary. 25 | ld (pd.DataFrame): Dataframe of proxies (output from `find_proxies`). 26 | snps_to_extract (list): List of SNPs to extract in addition to the proxies. 27 | snps_df (list, optional): List of SNPs to choose the proxy from. Should be the list of 28 | SNPs in df. Can be provided to avoid recomputing it. Defaults to an empty list. 29 | 30 | Returns: 31 | pd.DataFrame: Dataframe with queried SNPs and their proxies. 32 | """ 33 | # If ld is None 34 | if not isinstance(ld, pd.DataFrame): 35 | raise ValueError("ld is None (The SNPs to be proxied were not found in the reference panel)") 36 | 37 | # If snps_df is empty, populate it with SNPs from df 38 | if not snps_df: 39 | snps_df = df.SNP.values 40 | 41 | # Filter proxies that are present in df 42 | ld = ld[ld.SNP_B.isin(snps_df)] 43 | 44 | # Remove original SNPs 45 | ld = ld[ld["SNP_A"] != ld["SNP_B"]] 46 | 47 | # Sort by r and select the best proxy for each SNP 48 | ld = ld.reindex(ld["R"].abs().sort_values(ascending=False).index) 49 | ld = ld.groupby("SNP_A").first().reset_index(drop=False) 50 | 51 | # Determine SNPs to query 52 | snps_to_query = set(snps_to_extract) | set(ld.SNP_B.values) 53 | df_queried = df[df.SNP.isin(snps_to_query)] 54 | 55 | # Merge dataframes and identify proxies 56 | output = df_queried.merge(ld, how="left", left_on="SNP", right_on="SNP_B") 57 | output["proxy"] = output["SNP_B"].notnull() 58 | 59 | # In the plink output, the alleles taken as reference for proxying are "MAJ_A" and "MAJ_B" (major alleles in the reference panel) 60 | # We want to use as effect allele for the original SNP its minor allele in the reference panel 61 | # So, we flip BETA if the proxied SNP's effect allele is the major allele in the reference panel 62 | conditions = [ 63 | output["EA"] == output["MAJ_B"], 64 | output["EA"] == output["NONMAJ_B"], 65 | ~output["proxy"], 66 | True, 67 | ] 68 | choices = [ 69 | -output["BETA"], # if EA == MAJ_B, flip the sign of BETA 70 | output["BETA"], # if EA == NONMAJ_B, BETA does not change 71 | output["BETA"], # if SNP_B is NaN (The original SNP was not proxied), BETA does not change 72 | np.nan, # if the original SNP was proxied but "EA" is neither "MAJ_A" nor "NONMAJ_A", BETA is NaN 73 | ] 74 | output["BETA"] = np.select(conditions, choices) 75 | 76 | # Flip BETA if the sign of R is negative: indicates that the positive 
correlation corresponds to MAJ_A with NONMAJ_B 77 | sign_r = np.sign(output["R"]) # Sign of R 78 | output["BETA"] = np.where(sign_r == -1, -output["BETA"], output["BETA"]) 79 | 80 | # Delete SNPs with mismatched alleles 81 | nrow = output.shape[0] 82 | output = output.dropna(subset=["BETA"]) 83 | if output.shape[0] < nrow: 84 | print( 85 | f"Deleted {nrow-output.shape[0]} base SNPs that did not have matching alleles in reference data." 86 | ) 87 | print(f"Found proxies for {output['proxy'].sum()} SNPs.") 88 | 89 | # Replace the proxied SNPs with the position and alleles of the original SNPs 90 | output["SNP"] = np.where(output["proxy"], output["SNP_A"], output["SNP"]) 91 | output["POS"] = np.where(output["proxy"], output["BP_A"], output["POS"]) 92 | output["CHR"] = np.where(output["proxy"], output["CHR_A"], output["CHR"]) 93 | output["EA"] = np.where(output["proxy"], output["NONMAJ_A"], output["EA"]) 94 | output["NEA"] = np.where(output["proxy"], output["MAJ_A"], output["NEA"]) 95 | if "EAF" in output.columns: 96 | output["EAF"] = np.where(output["proxy"], output["NONMAJ_FREQ_A"], output["EAF"]) 97 | 98 | # Drop columns related to ld 99 | output = output.drop(columns=ld.columns) 100 | 101 | return output 102 | 103 | 104 | def apply_proxies(df, ld, searchspace=None): 105 | """ 106 | Given a dataframe (coming from GENO.data attribute) and a dataframe of proxies 107 | (output from find_proxies), replace the SNPs in df with their best proxies, if they exist. 108 | This function is suited for exposure data (before running a PRS for instance). 109 | 110 | Args: 111 | df (DataFrame): Dataframe of SNP information with the usual GENO columns (SNP, BETA, SE, EAF, EA, NEA). EAF is not necessary. 112 | ld (DataFrame): Dataframe of proxies (output from find_proxies). 113 | searchspace (list, optional): List of SNPs to restrict the list of potential proxies. By default, includes all the proxies found. Using a searchspace can be done either at the find_proxies step or at this step, but it is much faster to use it at this step. 114 | 115 | Returns: 116 | DataFrame: A DataFrame with SNPs replaced by their best proxies, if they exist. 
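    Example (illustrative; assumes `exposure_df` follows the GENO column format and `ld` is the output of find_proxies):
        exposure_proxied = apply_proxies(exposure_df, ld, searchspace=genotyped_snp_list)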
117 | """ 118 | # If ld is None 119 | if not isinstance(ld, pd.DataFrame): 120 | raise ValueError("ld is None (The SNPs to be proxied were not found in the reference panel)") 121 | 122 | # Check mandatory columns 123 | mandatory_cols = ["EA", "SNP", "BETA"] 124 | for col in mandatory_cols: 125 | if col not in df.columns: 126 | raise ValueError(f"The column {col} is not found in the data!") 127 | 128 | # Filter by searchspace if provided 129 | if searchspace: 130 | print("Filtering the potential proxies with the searchspace provided.") 131 | ld = ld[ld.SNP_B.isin(searchspace)] 132 | 133 | # Remove original SNPs and sort by r 134 | ld = ld[ld["SNP_A"] != ld["SNP_B"]] 135 | ld = ld.reindex(ld["R"].abs().sort_values(ascending=False).index) 136 | 137 | # Select the best proxy for each SNP 138 | ld = ld.groupby("SNP_A").first().reset_index(drop=False) 139 | 140 | # Merge the dataframes 141 | output = df.merge(ld, how="left", left_on="SNP", right_on="SNP_A") 142 | output["proxy"] = pd.notnull(output["SNP_B"]) 143 | 144 | # In the plink output, the alleles taken as reference for proxying are "MAJ_A" and "MAJ_B" (major alleles in the reference panel) 145 | # We want to use as effect allele for the proxy SNP its minor allele in the reference panel 146 | # So, we flip BETA if the original SNP's effect allele is the major allele in the reference panel 147 | conditions = [ 148 | output["EA"] == output["MAJ_A"], 149 | output["EA"] == output["NONMAJ_A"], 150 | ~output["proxy"], 151 | True, 152 | ] 153 | choices = [ 154 | -output["BETA"], # if EA == MAJ_A, flip the sign of BETA 155 | output["BETA"], # if EA == NONMAJ_A, BETA does not change 156 | output["BETA"], # if SNP_B is NaN (The original SNP was not proxied), BETA does not change 157 | np.nan, # if the original SNP was proxied but "EA" is neither "MAJ_A" nor "NONMAJ_A", BETA is NaN 158 | ] 159 | output["BETA"] = np.select(conditions, choices) 160 | 161 | # Flip BETA if the sign of R is negative: indicates that the positive correlation corresponds to MAJ_A with NONMAJ_B 162 | sign_r = np.sign(output["R"]) # Sign of R 163 | output["BETA"] = np.where(sign_r == -1, -output["BETA"], output["BETA"]) 164 | 165 | # Delete SNPs with mismatched alleles 166 | nrow = output.shape[0] 167 | output = output.dropna(subset=["BETA"]) 168 | if output.shape[0] < nrow: 169 | print( 170 | f"Deleted {nrow-output.shape[0]} base SNPs that did not have matching alleles in reference data." 
171 | ) 172 | print(f"Found proxies for {output['proxy'].sum()} SNPs.") 173 | 174 | # Replace the original SNPs with their proxy (if proxied) 175 | # As said above, we use as effect allele the minor allele in the reference panel 176 | output["SNP"] = np.where(output["proxy"], output["SNP_B"], output["SNP"]) 177 | output["EA"] = np.where(output["proxy"], output["NONMAJ_B"], output["EA"]) 178 | if "POS" in output.columns: 179 | output["POS"] = np.where(output["proxy"], output["BP_B"], output["POS"]) 180 | if "CHR" in output.columns: 181 | output["CHR"] = np.where(output["proxy"], output["CHR_B"], output["CHR"]) 182 | if "NEA" in output.columns: 183 | output["NEA"] = np.where(output["proxy"], output["MAJ_B"], output["NEA"]) 184 | if "EAF" in output.columns: 185 | output["EAF"] = np.where(output["proxy"], output["NONMAJ_FREQ_B"], output["EAF"]) 186 | 187 | # Drop ld columns 188 | output.drop(columns=ld.columns, inplace=True) 189 | 190 | return output 191 | 192 | 193 | def find_proxies( 194 | snp_list, 195 | searchspace=None, 196 | reference_panel="EUR_37", 197 | kb=5000, 198 | r2=0.8, 199 | window_snps=1000000, 200 | threads=1, 201 | name=None 202 | ): 203 | """ 204 | Given a list of SNPs, return a table of proxies using PLINK 2.0. 205 | 206 | Args: 207 | snp_list (list): List of rsids. 208 | searchspace (list, optional): List of SNPs to include in the search. By default, includes the whole reference panel. 209 | reference_panel (str, optional): The reference population to get linkage disequilibrium values and find proxies. 210 | Acceptable populations are "EUR", "SAS", "AFR", "EAS", "AMR" and available builds are 37 and 38 ("EUR_38" or "AFR_37" etc.) 211 | Alternatively, accepts a path to a specific bed/bim/fam or pgen/pvar/psam panel. 212 | Default is "EUR_37". 213 | kb (int, optional): Width (in kb) of the genomic window to look for proxies. Defaults to 5000. 214 | r2 (float, optional): Minimum linkage disequilibrium value with the main SNP for a proxy to be included. Defaults to 0.8. 215 | window_snps (int, optional): Compute the LD value for SNPs that are not more than x SNPs apart from the main SNP. Defaults to 1000000 (equivalent to infinity). 216 | threads (int, optional): Number of threads to use. Defaults to 1. 217 | 218 | Returns: 219 | DataFrame: A DataFrame containing the proxies. Only biallelic SNPs are returned.
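    Example (illustrative; `rsids` stands for any list of rsIDs):
        ld = find_proxies(rsids, reference_panel="EUR_38", kb=5000, r2=0.8, threads=4)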
220 | """ 221 | # Ensure tmp_GENAL directory exists 222 | os.makedirs("tmp_GENAL/", exist_ok=True) 223 | 224 | # Generate a default name if none is provided 225 | if name is None: 226 | name = str(uuid.uuid4())[:8] 227 | 228 | # Convert snp_list to numpy array 229 | snp_list = np.array(list(snp_list)) 230 | 231 | # Check if searchspace is provided 232 | if searchspace is None: 233 | extract_arg = "" 234 | else: 235 | print("Searching proxies in the provided searchspace.") 236 | with open(f"tmp_GENAL/{name}_searchspace.txt", "w") as file: 237 | for s in list(searchspace) + list(snp_list): # concatenate the searchspace and the SNPs to proxy 238 | file.write(str(s) + "\n") 239 | extract_arg = f"--extract tmp_GENAL/{name}_searchspace.txt" 240 | 241 | # Save snp_list to a file 242 | np.savetxt(f"tmp_GENAL/{name}_snps_to_proxy.txt", snp_list, fmt="%s", delimiter=" ") 243 | 244 | # Get reference panel path and type 245 | ref_path, filetype = get_reference_panel_path(reference_panel) 246 | 247 | # Construct base command based on filetype 248 | base_cmd = f"{get_plink_path()}" 249 | if filetype == "bed": 250 | base_cmd += f" --bfile {ref_path}" 251 | else: # pgen 252 | base_cmd += f" --pfile {ref_path}" 253 | 260 | 261 | # Construct and execute the plink2 command 262 | command = ( 263 | f"{base_cmd} {extract_arg} " 264 | f"--r-unphased 'cols=chrom,pos,id,maj,nonmaj,freq' " 265 | f"--ld-snp-list tmp_GENAL/{name}_snps_to_proxy.txt " 266 | f"--ld-window-kb {kb} " 267 | f"--ld-window-r2 {r2} " 268 | f"--ld-window {window_snps} " 269 | f"--threads {threads} " 270 | f"--out tmp_GENAL/{name}_proxy.targets" 271 | ) 272 | 273 | run_plink_command(command) 274 | 275 | # Read log file to return amount of SNPs to be proxied present in the ref panel 276 | log_path = os.path.join("tmp_GENAL", f"{name}_proxy.targets.log") 277 | log_content = open(log_path).read() 278 | match = re.search(r'(\d+) variants? remaining', log_content) 279 | if match: 280 | n_present = int(match.group(1)) 281 | if n_present == 0: 282 | print("None of the SNPs to be proxied are present in the reference panel.") 283 | return None 284 | else: 285 | print(f"{n_present} SNPs to be proxied are present in the reference panel.") 286 | 287 | # Read and process the output 288 | try: 289 | ld = pd.read_csv(f"tmp_GENAL/{name}_proxy.targets.vcor", sep="\s+") 290 | except FileNotFoundError: 291 | print("No proxies found that meet the specified criteria.") 292 | return None 293 | 294 | # Rename columns to match the expected format 295 | ld.rename(columns={ 296 | 'ID_A': 'SNP_A', 297 | 'ID_B': 'SNP_B', 298 | '#CHROM_A': 'CHR_A', 299 | 'CHROM_B': 'CHR_B', 300 | 'POS_A': 'BP_A', 301 | 'POS_B': 'BP_B', 302 | 'UNPHASED_R': 'R', 303 | }, inplace=True) 304 | 305 | # Create PHASE column for compatibility 306 | #ld['PHASE'] = ld['A1'] + ld['B1'] + ld['A2'] + ld['B2'] 307 | 308 | # Filter out multiallelic SNPs 309 | #ld = ld[ld["PHASE"].str.len() == 4] 310 | #ld = ld.reset_index(drop=True) 311 | 312 | # Convert integer columns to Int64 type 313 | for int_col in ["CHR_A", "CHR_B", "BP_A", "BP_B"]: 314 | ld[int_col] = ld[int_col].astype("Int64") 315 | 316 | return ld 317 | -------------------------------------------------------------------------------- /genal/extract_prs.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os, subprocess, re, uuid
3 | from functools import partial 4 | from concurrent.futures import ProcessPoolExecutor 5 | 6 | from .tools import check_bfiles, check_pfiles, setup_genetic_path, get_plink_path 7 | 8 | 9 | ### ____________________ 10 | ### PRS functions 11 | ### ____________________ 12 | 13 | def prs_func(data, weighted=True, path=None, ram=20000, cpus=4, name=None): 14 | """ 15 | Compute a PRS (Polygenic Risk Score) using provided SNP-level data. Corresponds to the :meth:`Geno.prs` method 16 | """ 17 | # Get path and filetype 18 | path, filetype = setup_genetic_path(path) 19 | 20 | # Generate a default name if none is provided 21 | if name is None: 22 | name = str(uuid.uuid4())[:8] 23 | 24 | # Call extract_snps 25 | extracted_path = extract_snps_func(data.SNP, name, path, ram=ram, cpus=cpus) 26 | 27 | if extracted_path == "FAILED": 28 | raise ValueError("No SNPs were extracted from the genetic data and the PRS can't be computed.") 29 | 30 | # Additional check to ensure there are no duplicates in the data (need to think more about this, should be done upstream) 31 | data.drop_duplicates(subset=["SNP"], keep="first", inplace=True) 32 | if "CHR" in data.columns and "POS" in data.columns: 33 | data.drop_duplicates(subset=["CHR", "POS"], keep="first", inplace=True) 34 | 35 | # Write processed data to file and run plink on it 36 | data = data[["SNP", "EA", "BETA"]] 37 | data_path = os.path.join("tmp_GENAL", f"{name}_to_prs.txt") 38 | output_path = os.path.join("tmp_GENAL", f"{name}_prs") 39 | 40 | # Set BETA values to 1 if unweighted PRS is required 41 | if not weighted: 42 | data["BETA"] = 1 43 | print(f"Computing an unweighted PRS using {extracted_path} data.") 44 | else: 45 | print(f"Computing a weighted PRS using {extracted_path} data.") 46 | 47 | data.to_csv(data_path, sep="\t", index=False, header=True) 48 | 49 | # We can use --pfile since extract_snps now creates pgen files 50 | plink_command = f"{get_plink_path()} --memory {ram} --pfile {extracted_path} --threads {cpus} \ 51 | --score {data_path} 1 2 3 header --out {output_path} --allow-no-sex" 52 | 53 | # Check for empty dataframe 54 | n_snps = data.shape[0] 55 | if n_snps == 0: 56 | raise ValueError( 57 | "No SNPs remain for the polygenic risk score (PRS) calculation." 58 | ) 59 | 60 | try: 61 | output = subprocess.run( 62 | plink_command, shell=True, capture_output=True, text=True, check=True 63 | ) 64 | except subprocess.CalledProcessError as e: 65 | print(f"Error running PLINK command: {e}") 66 | print(f"PLINK stdout: {e.stdout}") 67 | print(f"PLINK stderr: {e.stderr}") 68 | raise ValueError("PLINK command failed. Check the error messages above for details.") 69 | 70 | # Read and process PRS results 71 | prs_file = output_path + ".sscore" 72 | log_file = output_path + ".log" 73 | if os.path.isfile(prs_file): #If the profile file exists: PRS was successful 74 | #Extracts the number of SNPs used for the PRS computation 75 | log_content = open(log_file).read() 76 | match = re.search(r'--score: (\d+) variant[s] processed', log_content) 77 | if match: 78 | n_predictors = int(match.group(1)) 79 | print( 80 | f"The PRS computation was successful and used {n_predictors}/{n_snps} ({n_predictors/n_snps*100:.3f}%) SNPs." 
81 | ) 82 | else: 83 | print("Could not extract the number of SNPs used for the PRS computation.") 84 | #Return scores 85 | df_score = pd.read_csv(prs_file, sep="\s+") 86 | df_score.rename(columns={"#FID": "FID"}, inplace=True) 87 | return df_score 88 | else: 89 | print(output.stdout) 90 | raise ValueError( 91 | f"The PRS computation was not successful. Check the {output_path + '.log'} file." 92 | ) 93 | 94 | 95 | ### _____________________ 96 | ### Extract SNPs functions 97 | ### _____________________ 98 | 99 | # We are currently excluding all multiallelic variants by forcing first on all duplicates. 100 | # Could be improved by keeping the relevant version of the multiallelic SNPs based on allele matching 101 | def extract_snps_func(snp_list, name=None, path=None, ram=20000, cpus=4): 102 | """ 103 | Extracts a list of SNPs from the given path. This function corresponds to the following Geno method: :meth:`Geno.extract_snps`. 104 | 105 | Args: 106 | snp_list (pd.Series): Series of SNPs to extract. 107 | name (str): Name prefix for the output files. 108 | path (str, optional): Path to the dataset. Defaults to the path from the configuration. 109 | 110 | Returns: 111 | str: path to the genetic files containing the extracted SNPs 112 | 113 | Raises: 114 | TypeError: Raises an error when no valid path is saved or when there's an incorrect format in the provided path. 115 | """ 116 | # Check if snp_list is empty Series 117 | if snp_list.empty: 118 | print("The provided SNP list is empty.") 119 | return "FAILED" 120 | 121 | # Generate a default name if none is provided 122 | if name is None: 123 | name = str(uuid.uuid4())[:8] 124 | 125 | # Get path and filetype 126 | path, filetype = setup_genetic_path(path) 127 | 128 | # Prepare the SNP list 129 | snp_list = snp_list.dropna() 130 | snp_list_name = f"{name}_list.txt" 131 | snp_list_path = os.path.join("tmp_GENAL", snp_list_name) 132 | snp_list.to_csv(snp_list_path, sep=" ", index=False, header=None) 133 | nrow = len(snp_list) 134 | 135 | # Check if the data is split by chromosome 136 | filetype_split = "split" if "$" in path else "combined" 137 | 138 | output_path = os.path.join("tmp_GENAL", f"{name}_allchr") 139 | if filetype_split == "split": 140 | ram_estimate_per_cpu = nrow/(1.5*10**2) 141 | n_cpus = max(1, int(ram // ram_estimate_per_cpu)) 142 | workers = min(n_cpus, cpus) 143 | merge_command, bedlist_path = extract_snps_from_split_data( 144 | name, path, output_path, snp_list_path, filetype, workers=workers 145 | ) 146 | handle_multiallelic_variants(name, merge_command, bedlist_path) 147 | else: 148 | extract_snps_from_combined_data(name, path, output_path, snp_list_path, filetype) 149 | 150 | #Check that at least 1 variant has been extracted. 
If not, return "FAILED" to warn downstream functions (prs, association_test) 151 | log_path = output_path + ".log" 152 | with open(log_path, 'r') as log_file: 153 | if " 0 variants remaining" in log_file.read(): 154 | print("None of the provided SNPs were found in the genetic data.") 155 | return "FAILED" 156 | else: 157 | if check_pfiles(output_path): 158 | print(f"Created pgen/pvar/psam fileset with extracted SNPs: {output_path}") 159 | else: 160 | print(f"Could not extract the SNPs from the provided genetic data: check plink .log file") 161 | # Report SNPs not found 162 | report_snps_not_found(nrow, name) 163 | 164 | return output_path 165 | 166 | 167 | def extract_command_parallel(task_id, name, path, snp_list_path, filetype): 168 | """ 169 | Helper function to run SNP extraction in parallel for different chromosomes. 170 | Args: 171 | task_id (int): Identifier for the task/chromosome. 172 | name (str): Name prefix for the output files. 173 | path (str): Path to the data set. 174 | snp_list_path (str): Path to the list of SNPs to extract. 175 | filetype (str): Type of genetic files ("bed" or "pgen") 176 | Returns: 177 | int: Returns the task_id if no valid files are found. 178 | """ 179 | input_path = path.replace("$", str(task_id)) 180 | 181 | # Check if files exist based on filetype 182 | if filetype == "bed" and not check_bfiles(input_path): 183 | return task_id 184 | elif filetype == "pgen" and not check_pfiles(input_path): 185 | return task_id 186 | 187 | output_path = os.path.join("tmp_GENAL", f"{name}_extract_chr{task_id}") 188 | 189 | # Build command based on filetype 190 | base_cmd = f"{get_plink_path()}" 191 | if filetype == "bed": 192 | base_cmd += f" --bfile {input_path}" 193 | else: # pgen 194 | base_cmd += f" --pfile {input_path}" 195 | 196 | command = f"{base_cmd} --extract {snp_list_path} --rm-dup force-first --make-pgen --out {output_path}" 197 | 198 | subprocess.run( 199 | command, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL 200 | ) 201 | 202 | 203 | def create_bedlist(bedlist_path, output_name, not_found): 204 | """ 205 | Creates a bedlist file for SNP extraction. 206 | Args: 207 | bedlist_path (str): Path to save the bedlist file. 208 | output_name (str): Base name for the output files. 209 | not_found (List[int]): List of chromosome numbers for which no files were found. 
210 | """ 211 | with open(bedlist_path, "w+") as bedlist_file: 212 | found = [] 213 | for i in range(1, 23): 214 | if i in not_found: 215 | print(f"bed/bim/fam or pgen/pvar/psam files not found for chr{i}.") 216 | elif check_pfiles(f"{output_name}_chr{i}"): 217 | bedlist_file.write(f"{output_name}_chr{i}\n") 218 | found.append(i) 219 | print(f"SNPs extracted for chr{i}.") 220 | else: 221 | print(f"No SNPs extracted for chr{i}.") 222 | return found 223 | 224 | 225 | def extract_snps_from_split_data(name, path, output_path, snp_list_path, filetype, workers=4): 226 | """Extract SNPs from data split by chromosome.""" 227 | print("Extracting SNPs for each chromosome...") 228 | num_tasks = 22 229 | partial_extract_command_parallel = partial( 230 | extract_command_parallel, 231 | name=name, 232 | path=path, 233 | snp_list_path=snp_list_path, 234 | filetype=filetype 235 | ) # Wrapper function 236 | with ProcessPoolExecutor(max_workers=workers) as executor: 237 | not_found = list( 238 | executor.map(partial_extract_command_parallel, range(1, num_tasks + 1)) 239 | ) 240 | 241 | # Merge extracted SNPs from each chromosome 242 | bedlist_name = f"{name}_bedlist.txt" 243 | bedlist_path = os.path.join("tmp_GENAL", bedlist_name) 244 | found = create_bedlist( 245 | bedlist_path, os.path.join("tmp_GENAL", f"{name}_extract"), not_found 246 | ) 247 | if len(found) == 0: 248 | raise Warning("No SNPs were extracted from any chromosome.") 249 | 250 | # If only one chromosome was extracted, no need to merge, simply rename the files 251 | if len(found) == 1: 252 | chr_path = os.path.join("tmp_GENAL", f"{name}_extract_chr{found[0]}") 253 | for ext in [".pgen", ".pvar", ".psam", ".log"]: 254 | os.rename(f"{chr_path}{ext}", f"{output_path}{ext}") 255 | return None, bedlist_path 256 | 257 | print("Merging SNPs extracted from each chromosome...") 258 | merge_command = f"{get_plink_path()} --pmerge-list {bedlist_path} pfile --out {output_path}" 259 | try: 260 | subprocess.run( 261 | merge_command, shell=True, capture_output=True, text=True, check=True 262 | ) 263 | except subprocess.CalledProcessError as e: 264 | print(f"Error running PLINK command: {e}") 265 | print(f"PLINK stdout: {e.stdout}") 266 | print(f"PLINK stderr: {e.stderr}") 267 | raise ValueError("PLINK command failed. 
Check the error messages above for details.") 268 | 269 | return merge_command, bedlist_path 270 | 271 | 272 | def extract_snps_from_combined_data(name, path, output_path, snp_list_path, filetype): 273 | """Extract SNPs from combined data.""" 274 | print("Extracting SNPs...") 275 | 276 | # Build command based on filetype 277 | base_cmd = f"{get_plink_path()}" 278 | if filetype == "bed": 279 | base_cmd += f" --bfile {path}" 280 | else: # pgen 281 | base_cmd += f" --pfile {path}" 282 | 283 | extract_command = f"{base_cmd} --extract {snp_list_path} --rm-dup force-first --make-pgen --out {output_path}" 284 | 285 | subprocess.run( 286 | extract_command, 287 | shell=True, 288 | stdout=subprocess.DEVNULL, 289 | stderr=subprocess.DEVNULL, 290 | ) 291 | 292 | 293 | def report_snps_not_found(nrow, name): 294 | """Report the number of SNPs not found in the data.""" 295 | 296 | def count_lines(filepath): 297 | with open(filepath, "r") as file: 298 | return sum(1 for line in file) 299 | 300 | file_path = os.path.join("tmp_GENAL", f"{name}_allchr.pvar") 301 | extracted_snps_count = count_lines(file_path)-1 #pvar files include column names 302 | delta_nrow = nrow - extracted_snps_count 303 | if delta_nrow > 0: 304 | print( 305 | f"{delta_nrow}({delta_nrow/nrow*100:.3f}%) SNPs were not extracted from the genetic data." 306 | ) 307 | 308 | # TODO: Check if this function is still needed with plink2 309 | def handle_multiallelic_variants(name, merge_command, bedlist_path): 310 | """Handle multiallelic variants detected during merging.""" 311 | 312 | if merge_command is None: 313 | return 314 | 315 | def remove_multiallelic(): 316 | missnp_path = os.path.join( 317 | "tmp_GENAL", f"{name}_allchr.vmiss" 318 | ) 319 | if not os.path.exists(missnp_path): 320 | return 0 321 | 322 | snps_to_exclude = pd.read_csv(missnp_path, header=None) 323 | for i in range(1, 23): 324 | pvar_path = os.path.join("tmp_GENAL", f"{name}_extract_chr{i}.pvar") 325 | if not os.path.isfile(pvar_path): 326 | continue 327 | pvar = pd.read_csv(pvar_path, sep="\t", header=None) 328 | # If no SNPs would be left for this chr: remove corresponding bedlist line 329 | n_to_exclude = len(set(pvar[2]).intersection(set(snps_to_exclude[0]))) 330 | if n_to_exclude == len(set(pvar[2])): 331 | print(f"No SNPs remaining for chromosome {i}.") 332 | tmp_filename = os.path.join("tmp_GENAL", "tmp_multiallelic") 333 | with open(bedlist_path, "r") as file, open( 334 | tmp_filename, "w" 335 | ) as temp_file: 336 | output_name = os.path.join("tmp_GENAL", f"{name}_extract") 337 | line_to_exclude = f"{output_name}_chr{i}\n" 338 | for line in file: 339 | if line != line_to_exclude: 340 | temp_file.write(line) 341 | # Replace the original file with the temporary file 342 | os.replace(tmp_filename, bedlist_path) 343 | 344 | # If there is at least one multiallelic SNP for this chr 345 | elif n_to_exclude > 0: 346 | pfile_path = os.path.join("tmp_GENAL", f"{name}_extract_chr{i}") 347 | command = f"{get_plink_path()} --pfile {pfile_path} --exclude {missnp_path} --make-pgen --out {pfile_path}" 348 | subprocess.run( 349 | command, 350 | shell=True, 351 | stdout=subprocess.DEVNULL, 352 | stderr=subprocess.DEVNULL, 353 | ) 354 | return len(snps_to_exclude) 355 | 356 | log_content = open(os.path.join("tmp_GENAL", f"{name}_allchr.log")).read() 357 | if "Error: Multiple" in log_content: 358 | print("Multiallelic variants detected in the genetic files: removing them before merging.") 359 | n_multiallelic = remove_multiallelic() 360 | print(f"Reattempting the merge after 
exclusion of {n_multiallelic} multiallelic variants.") 361 | subprocess.run( 362 | merge_command, 363 | shell=True, 364 | stdout=subprocess.DEVNULL, 365 | stderr=subprocess.DEVNULL, 366 | ) 367 | 368 | -------------------------------------------------------------------------------- /docs/build/_modules/genal/clump.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | genal.clump — genal v0.0 documentation 7 | 8 | 9 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 |

Source code for genal.clump

 75 | import os
 76 | import subprocess
 77 | import pandas as pd
 78 | import uuid
 79 | 
 80 | from .tools import read_config, get_plink19_path, get_reference_panel_path, create_tmp
 81 | 
 82 | 
 83 | 
84 | [docs] 85 | def clump_data( 86 | data, 87 | reference_panel="eur", 88 | kb=250, 89 | r2=0.1, 90 | p1=5e-8, 91 | p2=0.01, 92 | name="", 93 | ram=10000, 94 | ): 95 | """ 96 | Perform clumping on the given data using plink. Corresponds to the :meth:`Geno.clump` method. 97 | 98 | Args: 99 | data (pd.DataFrame): Input data with at least 'SNP' and 'P' columns. 100 | reference_panel (str): The reference population for linkage disequilibrium values. Accepts values "eur", "sas", "afr", "eas", "amr". Alternatively, a path leading to a specific bed/bim/fam reference panel can be provided. Default is "eur". 101 | kb (int, optional): Clumping window in terms of thousands of SNPs. Default is 250. 102 | r2 (float, optional): Linkage disequilibrium threshold, values between 0 and 1. Default is 0.1. 103 | p1 (float, optional): P-value threshold during clumping. SNPs above this value are not considered. Default is 5e-8. 104 | p2 (float, optional): P-value threshold post-clumping to further filter the clumped SNPs. If p2 < p1, it won't be considered. Default is 0.01. 105 | name (str, optional): Name used for the files created in the tmp_GENAL folder. 106 | ram (int, optional): Amount of RAM in MB to be used by plink. 107 | 108 | Returns: 109 | pd.DataFrame: Data after clumping, if any. 110 | """ 111 | plink19_path = get_plink19_path() 112 | 113 | # Create unique ID for the name if none is passed 114 | if not name: 115 | name = str(uuid.uuid4())[:8] 116 | 117 | # Save the relevant data columns to a temporary file 118 | to_clump_filename = os.path.join("tmp_GENAL", f"{name}_to_clump.txt") 119 | data[["SNP", "P"]].to_csv(to_clump_filename, index=False, sep="\t") 120 | 121 | # Construct and execute the plink clumping command 122 | output_path = os.path.join("tmp_GENAL", name) 123 | plink_command = f"{plink19_path} --memory {ram} --bfile {get_reference_panel_path(reference_panel)} \ 124 | --clump {to_clump_filename} --clump-kb {kb} --clump-r2 {r2} --clump-p1 {p1} \ 125 | --clump-p2 {p2} --out {output_path}" 126 | output = subprocess.run( 127 | plink_command, shell=True, capture_output=True, text=True, check=True 128 | ) 129 | 130 | # Check and print the outputs for relevant information 131 | if output.returncode != 0: 132 | raise RuntimeError( 133 | f"PLINK execution failed with the following error: {output.stderr}" 134 | ) 135 | if "more top variant IDs missing" in output.stderr: 136 | missing_variants = output.stderr.split("more top variant IDs missing")[0].split( 137 | "\n" 138 | )[-1] 139 | print(f"Warning: {missing_variants} top variant IDs missing") 140 | if "No significant --clump results." in output.stderr: 141 | print("No SNPs remaining after clumping.") 142 | return 143 | print(output.stdout.split("--clump: ")[1].split("\n")[0]) 144 | 145 | # Extract the list of clumped SNPs and get the relevant data subset 146 | clumped_filename = os.path.join("tmp_GENAL", f"{name}.clumped") 147 | if not os.path.exists(clumped_filename): 148 | raise FileNotFoundError(f"'{clumped_filename}' is missing.") 149 | plink_clumped = pd.read_csv(clumped_filename, sep="\s+", usecols=["SNP"]) 150 | clumped_data = data[data["SNP"].isin(plink_clumped["SNP"])] 151 | clumped_data.reset_index(drop=True, inplace=True) 152 | return clumped_data
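# Illustrative usage sketch: assuming `gwas_df` is a GWAS summary-statistics DataFrame with
# "SNP" and "P" columns and the reference panels are configured, clumping could be called as:
#     clumped = clump_data(gwas_df, reference_panel="eur", kb=250, r2=0.1, p1=5e-8, p2=0.01)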
© Copyright 2023, Cyprien A. Rivier. Built with Sphinx using a theme provided by Read the Docs.
176 | 181 | 182 | 183 | -------------------------------------------------------------------------------- /genal/association.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from pandas.api.types import is_numeric_dtype 4 | import scipy.stats as st 5 | import os, subprocess 6 | 7 | from .extract_prs import check_pfiles 8 | from .tools import get_plink_path, run_plink_command 9 | 10 | 11 | def association_test_func_plink2(data, covar_list, standardize, name, data_pheno, pheno_type): 12 | """ 13 | Conduct single-SNP association tests against a phenotype. 14 | 15 | This function performs a series of operations: 16 | 1. Checks for necessary preliminary steps. 17 | 2. Updates the PSAM file with the phenotype data. 18 | 3. Creates a covariate file if required. 19 | 4. Runs a PLINK association test. 20 | 5. Processes the results and returns them. 21 | 22 | Args: 23 | data (pd.DataFrame): Genetic data with the standard Geno columns. 24 | covar_list (list): List of column names in the data_pheno DataFrame to use as covariates. 25 | standardize (bool): Flag indicating if the phenotype needs standardization. 26 | name (str): Prefix for the filenames used during the process. 27 | data_pheno (pd.DataFrame): Phenotype data with at least an IID and PHENO columns. 28 | pheno_type (str): Type of phenotype ('binary' or 'quant'). 29 | 30 | Returns: 31 | pd.DataFrame: Processed results of the association test. 32 | 33 | This function corresponds to the following Geno method: :meth:`Geno.association_test`. 34 | """ 35 | 36 | # Check necessary files are available 37 | genetic_path = os.path.join("tmp_GENAL", f"{name}_allchr") 38 | print(genetic_path) 39 | if not check_pfiles(genetic_path): 40 | raise FileNotFoundError( 41 | "Run the extract_snps() method before performing association tests." 42 | ) 43 | if data.shape[0] == 0: 44 | raise ValueError( 45 | "No SNPs for the association tests. Check the .data or .data_clumped dataframes." 46 | ) 47 | 48 | # Update phenotype in the PSAM file 49 | psam = _prepare_psam_file(genetic_path, data_pheno, pheno_type, standardize) 50 | 51 | # Prepare covariate file if covariates are provided 52 | covar_list, covar_filename = _handle_covariates(covar_list, data_pheno, name) 53 | 54 | # Execute PLINK association test 55 | output = _run_plink2_assoc_test( 56 | genetic_path, name, covar_filename, covar_list, pheno_type 57 | ) 58 | 59 | # Process and return results 60 | return _process_results_plink2(output, data, pheno_type) 61 | 62 | def _run_plink2_assoc_test( 63 | genetic_path, name, covar_filename, covar_list, pheno_type 64 | ): 65 | """Helper function to execute the PLINK 2.0 association test.""" 66 | 67 | print( 68 | f"Running {'linear' if pheno_type == 'quant' else 'logistic'} association tests on {genetic_path} data " 69 | f"{f'with adjustment for: {covar_list}' if len(covar_list) > 0 else 'without covariates. This is not recommended'}." 
70 | ) 71 | 72 | output = os.path.join("tmp_GENAL", name) 73 | 74 | # Build PLINK 2.0 command - we can use --pfile since extract_snps now creates pgen files 75 | command = [ 76 | get_plink_path(), 77 | "--pfile", genetic_path, 78 | "--glm", 79 | *(["allow-no-covars"] if len(covar_list) == 0 else []), 80 | "no-x-sex", 81 | "--no-input-missing-phenotype", 82 | "--pheno-name", "PHENO1" 83 | ] 84 | 85 | if len(covar_list) > 0: 86 | command.extend([ 87 | "--covar", covar_filename, 88 | "--covar-name", ",".join(covar_list) 89 | ]) 90 | 91 | command.extend(["--out", output]) 92 | 93 | try: 94 | subprocess.run(command, capture_output=True, text=True, check=True) 95 | except Exception as e: 96 | #Handle the case where the association fails because of numerical instability in the covariates 97 | if "scales vary too widely" in str(e): 98 | print("The association test failed because of numerical instability in the covariates. Rescaling the covariates.") 99 | command.extend(["--covar-variance-standardize"]) 100 | run_plink_command(command) 101 | 102 | else: 103 | print(f"Error running PLINK command: {e}") 104 | print(f"PLINK stdout: {e.stdout}") 105 | print(f"PLINK stderr: {e.stderr}") 106 | raise ValueError("PLINK command failed. Check the error messages above for details.") 107 | 108 | return output 109 | 110 | def _process_results_plink2(output, data, pheno_type): 111 | """Helper function to process results after the PLINK association test.""" 112 | # Path to PLINK results 113 | method = "logistic.hybrid" if pheno_type == "binary" else "linear" 114 | results_path = output + f".PHENO1.glm." + method 115 | assoc = pd.read_csv(results_path, delimiter="\s+") 116 | 117 | # Filter to keep only coefficients corresponding to our phenotype 118 | assoc = assoc[assoc["TEST"] == "ADD"] 119 | 120 | # If logistic regression, log-transform the odds ratio 121 | assoc["BETA"] = np.log(assoc.OR) if pheno_type == "binary" else assoc.BETA 122 | 123 | n_na = assoc["BETA"].isna().sum() 124 | 125 | # Rename columns 126 | assoc.rename(columns={"#CHROM": "CHR", "LOG(OR)_SE": "SE"}, errors="ignore", inplace=True) 127 | 128 | # Merge results with the clumped data 129 | data = data.drop(axis=1, columns=["BETA", "SE", "P"], errors="ignore").merge( 130 | assoc[["CHR","POS", "BETA", "SE", "A1", "P"]], how="inner", on=["CHR", "POS"] 131 | ) 132 | 133 | # Adjust beta values based on allele match 134 | data["BETA"] = np.where( 135 | data.EA == data.A1, data.BETA, np.where(data.NEA == data.A1, -data.BETA, np.nan) 136 | ) 137 | 138 | # Drop unnecessary columns 139 | data = data.drop( 140 | axis=1, columns=["A1"], errors="ignore" 141 | ) 142 | 143 | # Remove rows with mismatches in allele columns and notify the user 144 | nrow_previous = data.shape[0] 145 | data = data.dropna(subset="BETA") 146 | delta_nrow = nrow_previous - data.shape[0] - n_na 147 | if (delta_nrow > 0) or (n_na > 0): 148 | print( 149 | f"{f'{n_na}({n_na/nrow_previous*100:.3f}%) SNP-trait tests returned NA value and ' if n_na>0 else ''}{delta_nrow}({delta_nrow/nrow_previous*100:.3f}%) SNPs removed due to allele discrepancies between the main data and the genetic data." 
150 | ) 151 | return data 152 | 153 | 154 | def _prepare_psam_file(genetic_path, data_pheno, pheno_type, standardize): 155 | """Helper function to prepare the PSAM file with phenotype data.""" 156 | # Read the PSAM file 157 | psam = pd.read_csv(genetic_path + ".psam", delimiter="\t") 158 | 159 | # Ensure IID column types match before merging to prevent errors 160 | if '#IID' in psam.columns and 'IID' in data_pheno.columns: 161 | psam['#IID'] = psam['#IID'].astype(data_pheno['IID'].dtype) 162 | elif 'IID' in psam.columns and 'IID' in data_pheno.columns: 163 | psam['IID'] = psam['IID'].astype(data_pheno['IID'].dtype) 164 | 165 | # Merge phenotype data with the PSAM dataframe depending on column present 166 | if "#FID" in psam.columns: 167 | data_pheno_trait = data_pheno[["FID", "IID", "PHENO"]].rename(columns={"FID": "#FID", "PHENO": "PHENO1"}).copy() 168 | psam = psam.merge(data_pheno_trait, how="left", on=["#FID", "IID"], indicator=True) 169 | else: 170 | data_pheno_trait = data_pheno[["IID", "PHENO"]].rename(columns={"IID": "#IID", "PHENO": "PHENO1"}).copy() 171 | psam = psam.merge(data_pheno_trait, how="left", on=["#IID"], indicator=True) 172 | 173 | # Verify that the merge was successful 174 | if (psam["_merge"] == "both").sum() == 0: 175 | raise ValueError( 176 | "The IDs in the phenotype dataframe are inconsistent with those in the genetic dataset. Call set_phenotype() method again, specifying the correct column names for the genetic IDs (IID and FID)." 177 | ) 178 | psam.drop(axis=1, columns=["_merge"], inplace=True, errors="ignore") 179 | 180 | # Count the number of individuals with a valid phenotype trait 181 | n_non_na = psam.shape[0] - psam.PHENO1.isna().sum() 182 | print( 183 | f"{n_non_na} individuals are present in the genetic data and have a valid phenotype trait." 184 | ) 185 | 186 | # Update phenotype values based on its type 187 | if pheno_type == "binary": 188 | psam["PHENO1"] = psam["PHENO1"] + 1 189 | psam["PHENO1"] = psam["PHENO1"].astype("Int64") 190 | psam["PHENO1"] = psam["PHENO1"].astype(str).replace('', 'NA') 191 | if (pheno_type == "quant") & (standardize == True): 192 | # Standardizing for quantitative phenotypes 193 | print( 194 | "Standardizing the phenotype to approximate a normal distribution. Use standardize = False if you do not want to standardize." 195 | ) 196 | psam["PHENO1"] = (psam["PHENO1"] - psam["PHENO1"].mean(skipna=True)) / psam["PHENO1"].std(skipna=True) 197 | psam["PHENO1"] = psam["PHENO1"].fillna('NA') 198 | 199 | # Make sure the SEX column is not empty without modifying existing values 200 | psam["SEX"] = psam["SEX"].replace('', 'NA').fillna('NA') 201 | 202 | psam.to_csv(genetic_path + ".psam", sep="\t", index=False) 203 | return psam 204 | 205 | 206 | def _handle_covariates(covar_list, data_pheno, name): 207 | """Helper function to prepare the covariate file.""" 208 | if len(covar_list) > 0: 209 | # Ensure all covariates are present in phenotype data 210 | for col in covar_list: 211 | if col not in data_pheno.columns: 212 | raise TypeError( 213 | f"The {col} column is not found in the .phenotype dataframe." 214 | ) 215 | # Select required columns and rename columns 216 | data_cov = data_pheno[["FID", "IID"] + covar_list].copy() 217 | 218 | # Ensure the covariates are numeric and not trivial (lead to association fail) 219 | for col in covar_list: 220 | if data_pheno[col].nunique() == 1: 221 | print( 222 | f"The {col} covariate contains only one value and is removed from the tests." 
223 | ) 224 | data_cov.drop(axis=1, columns=[col], inplace=True) 225 | covar_list.remove(col) 226 | if not pd.api.types.is_numeric_dtype(data_pheno[col]): 227 | print( 228 | f"The {col} covariate is not numeric and is removed from the tests." 229 | ) 230 | data_cov.drop(axis=1, columns=[col], inplace=True, errors="ignore") 231 | covar_list.remove(col) 232 | 233 | # Remove rows with NA values and print their number 234 | nrows = data_cov.shape[0] 235 | data_cov.dropna(inplace=True) 236 | removed_rows = nrows - data_cov.shape[0] 237 | if removed_rows > 0: 238 | print( 239 | f"{removed_rows}({removed_rows/nrows*100:.3f}%) individuals have NA values in the covariates columns and will be excluded from the association tests." 240 | ) 241 | 242 | # Define the covariate filename 243 | covar_filename = os.path.join("tmp_GENAL", f"{name}_covar.cov") 244 | # Ensure FID and IID are in integer format and write the covariate file 245 | data_cov["IID"] = data_cov["IID"].astype("Int64") 246 | data_cov["FID"] = data_cov["FID"].astype("Int64") 247 | data_cov.to_csv(covar_filename, sep=" ", header=True, index=False) 248 | covar = True 249 | else: 250 | covar = False 251 | covar_filename = None 252 | return covar_list, covar_filename 253 | 254 | 255 | ### __________________________ 256 | ### Set phenotype functions 257 | ### __________________________ 258 | 259 | def set_phenotype_func(data_original, PHENO, PHENO_type, IID, FID=None, alternate_control=False): 260 | """ 261 | Set a phenotype dataframe containing individual IDs and phenotype columns formatted for single-SNP association testing. 262 | 263 | Args: 264 | data (pd.DataFrame): Contains at least an individual IDs column and one phenotype column. 265 | IID (str): Name of the individual IDs column in data. 266 | PHENO (str): Name of the phenotype column in data. 267 | PHENO_type (str, optional): Type of the phenotype column. Either "quant" for quantitative (continuous) or "binary". 268 | The function tries to infer the type if not provided. 269 | FID (str, optional): Name of the family ID column in data. If not provided, FID will be set to IID values. 270 | alternate_control (bool): Assumes that for a binary trait, the controls are coded with the most frequent value. 271 | Use True to reverse the assumption. 272 | 273 | Returns: 274 | pd.DataFrame: The modified data. 275 | str: The inferred or provided PHENO_type. 
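    Example (illustrative; assumes `pheno_df` has an "eid" ID column and a binary "stroke" column):
        phenotype, pheno_type = set_phenotype_func(pheno_df, PHENO="stroke", PHENO_type="binary", IID="eid")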
    """
    data = data_original.copy()
    _validate_columns_existence(data, PHENO, IID, FID)

    data = _standardize_column_names(data, PHENO, IID, FID)
    PHENO_type = _determine_phenotype_type(data, PHENO_type)
    data = _validate_and_process_phenotype(data, PHENO, PHENO_type, alternate_control)
    _report_na_values(data)

    print("The phenotype data is stored in the .phenotype attribute.")
    return data, PHENO_type


def _validate_columns_existence(data, PHENO, IID, FID):
    """Checks if columns exist and raises errors if not."""
    # Check if PHENO is a string
    if not isinstance(PHENO, str):
        raise ValueError("The PHENO argument must be a string containing the name of the phenotype column.")
    # Check if IID is a string
    if not isinstance(IID, str):
        raise ValueError("The IID argument must be a string containing the name of the individual IDs column.")

    for column in [PHENO, IID]:
        # Raise an error if the column name is not provided
        if column is None:
            raise ValueError(f"Please provide a name for the {column} variable.")
        # Raise an error if the column does not exist in the data
        if column not in data.columns:
            raise ValueError(
                f"The column '{column}' is not present in the dataset. This column is required!"
            )

    # Handle FID column
    if FID is not None and FID not in data.columns:
        raise ValueError(f"The column '{FID}' is not present in the provided dataset.")

    # If IID or FID is numerical, convert to integer
    if is_numeric_dtype(data[IID]):
        data[IID] = data[IID].astype("Int64")
    if FID is not None and is_numeric_dtype(data[FID]):
        data[FID] = data[FID].astype("Int64")

    if data.shape[0] == 0:
        raise ValueError("The phenotype dataframe is empty.")


def _standardize_column_names(data, PHENO, IID, FID):
    """Standardizes the column names to 'IID' and 'PHENO'."""
    # Drop redundant columns if they exist and rename the target columns to standard names
    if PHENO != "PHENO":
        data.drop(axis=1, columns=["PHENO"], errors="ignore", inplace=True)
    if IID != "IID":
        data.drop(axis=1, columns=["IID"], errors="ignore", inplace=True)
    data.rename(columns={IID: "IID", PHENO: "PHENO"}, inplace=True)

    if FID is not None:
        if FID != "FID":
            data.drop(axis=1, columns=["FID"], errors="ignore", inplace=True)
        data.rename(columns={FID: "FID"}, inplace=True)
    else:
        data["FID"] = data["IID"]
        print(
            "The FID column was not provided. The FIDs are assumed to be the same as the IIDs."
        )

    return data


def _determine_phenotype_type(data, PHENO_type):
    """Guesses or validates the phenotype type."""
    # If the phenotype type is not given, deduce it from the unique values in the column
    if PHENO_type is None:
        if len(np.unique(data.PHENO.dropna())) == 2:
            print(
                "Detected a binary phenotype in the 'PHENO' column. Specify 'PHENO_type=\"quant\"' if this is incorrect."
            )
            return "binary"
        else:
            print(
                "Detected a quantitative phenotype in the 'PHENO' column. Specify 'PHENO_type=\"binary\"' if this is incorrect."
            )
            return "quant"
    return PHENO_type


def _validate_and_process_phenotype(data, PHENO, PHENO_type, alternate_control):
    """Validates the phenotype and processes it accordingly."""
    # Process the phenotype based on its type
    if PHENO_type == "binary":
        _process_binary_phenotype(data, PHENO, alternate_control)
    elif PHENO_type == "quant":
        _validate_quantitative_phenotype(data, PHENO)
    else:
        raise ValueError("Accepted values for 'PHENO_type' are 'binary' or 'quant'.")
    return data


def _process_binary_phenotype(data, PHENO, alternate_control):
    """Processes a binary phenotype."""
    # Ensure that the phenotype is binary
    if len(np.unique(data.PHENO.dropna())) != 2:
        raise ValueError(
            f"The '{PHENO}' column is not binary: it must contain exactly two distinct values."
        )

    if alternate_control:
        code_control = data.PHENO.value_counts().index[1]
        code_case = data.PHENO.value_counts().index[0]
    else:
        code_control = data.PHENO.value_counts().index[0]
        code_case = data.PHENO.value_counts().index[1]

    print(
        f"Identified {code_control} as the control code in 'PHENO'. {'Set alternate_control=True to reverse this interpretation.' if not alternate_control else ''}"
    )

    # Update the control and case codings
    data.replace({"PHENO": {code_control: 0, code_case: 1}}, inplace=True)

    # Print the number and percentage of cases
    n_case = int(data.PHENO.sum())
    print(
        f"There are {n_case} ({n_case/data.shape[0]*100:.3f}%) cases in the 'PHENO' column."
    )


def _validate_quantitative_phenotype(data, PHENO):
    """Validates a quantitative phenotype."""
    # Ensure that the phenotype is numeric
    if not is_numeric_dtype(data.PHENO):
        raise ValueError(
            f"The '{PHENO}' column must contain numeric values for a quantitative phenotype."
        )


def _report_na_values(data):
    """Reports the number of NA values in the 'IID' and 'PHENO' columns."""
    nrows = data.shape[0]
    n_nan_id = data.IID.isna().sum()
    n_nan_pheno = data.PHENO.isna().sum()

    # Report NA values in the IID and PHENO columns, if any exist
    if n_nan_id > 0:
        print(
            f"Detected {n_nan_id} NA values in the 'IID' column, accounting for {n_nan_id/nrows*100:.3f}% of entries. These will be omitted during analyses."
        )
    if n_nan_pheno > 0:
        print(
            f"Detected {n_nan_pheno} NA values in the 'PHENO' column, accounting for {n_nan_pheno/nrows*100:.3f}% of entries. These will be omitted during analyses."
        )
--------------------------------------------------------------------------------
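Note on the binary coding used above: _process_binary_phenotype recodes the two observed
values to 0 (control) and 1 (case), treating the most frequent value as the control unless
alternate_control=True, and _prepare_psam_file then shifts this to the 1/2 case-control
coding conventionally expected by PLINK in the .psam file. A minimal, self-contained sketch
of that behaviour on a toy pandas Series (illustrative only, not part of the package; the
variable names are made up):

import pandas as pd

# Toy binary phenotype: 2 is the most frequent value, so it is treated as the control code
pheno = pd.Series([2, 2, 2, 1, 1])

# Same logic as _process_binary_phenotype with alternate_control=False
code_control = pheno.value_counts().index[0]   # most frequent value -> control
code_case = pheno.value_counts().index[1]      # the other value -> case
recoded = pheno.replace({code_control: 0, code_case: 1})

# _prepare_psam_file then shifts 0/1 to the 1/2 coding (1 = control, 2 = case)
plink_coded = recoded + 1
print(plink_coded.tolist())   # [1, 1, 1, 2, 2]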