├── gitignore
├── .DS_Store
├── docs
│   ├── .DS_Store
│   ├── build
│   │   ├── .DS_Store
│   │   ├── objects.inv
│   │   ├── _static
│   │   │   ├── file.png
│   │   │   ├── plus.png
│   │   │   ├── minus.png
│   │   │   ├── css
│   │   │   │   ├── fonts
│   │   │   │   │   ├── lato-bold.woff
│   │   │   │   │   ├── lato-bold.woff2
│   │   │   │   │   ├── lato-normal.woff
│   │   │   │   │   ├── lato-normal.woff2
│   │   │   │   │   ├── Roboto-Slab-Bold.woff
│   │   │   │   │   ├── lato-bold-italic.woff
│   │   │   │   │   ├── Roboto-Slab-Bold.woff2
│   │   │   │   │   ├── fontawesome-webfont.eot
│   │   │   │   │   ├── fontawesome-webfont.ttf
│   │   │   │   │   ├── lato-bold-italic.woff2
│   │   │   │   │   ├── lato-normal-italic.woff
│   │   │   │   │   ├── Roboto-Slab-Regular.woff
│   │   │   │   │   ├── Roboto-Slab-Regular.woff2
│   │   │   │   │   ├── fontawesome-webfont.woff
│   │   │   │   │   ├── fontawesome-webfont.woff2
│   │   │   │   │   └── lato-normal-italic.woff2
│   │   │   │   └── badge_only.css
│   │   │   ├── documentation_options.js
│   │   │   ├── js
│   │   │   │   ├── badge_only.js
│   │   │   │   ├── html5shiv.min.js
│   │   │   │   ├── html5shiv-printshiv.min.js
│   │   │   │   └── theme.js
│   │   │   ├── pygments.css
│   │   │   ├── doctools.js
│   │   │   ├── language_data.js
│   │   │   └── sphinx_highlight.js
│   │   ├── .doctrees
│   │   │   ├── api.doctree
│   │   │   ├── genal.doctree
│   │   │   ├── index.doctree
│   │   │   ├── modules.doctree
│   │   │   ├── environment.pickle
│   │   │   └── introduction.doctree
│   │   ├── _images
│   │   │   └── MR_plot_SBP_AS.png
│   │   ├── .buildinfo
│   │   ├── _sources
│   │   │   ├── api.rst.txt
│   │   │   ├── genal.rst.txt
│   │   │   ├── modules.rst.txt
│   │   │   └── index.rst.txt
│   │   ├── search.html
│   │   ├── _modules
│   │   │   ├── index.html
│   │   │   └── genal
│   │   │       └── clump.html
│   │   ├── py-modindex.html
│   │   └── index.html
│   ├── source
│   │   ├── .DS_Store
│   │   ├── Images
│   │   │   ├── genal_logo.png
│   │   │   ├── MR_plot_SBP_AS.png
│   │   │   └── Genal_flowchart.png
│   │   ├── conf.py
│   │   ├── api.rst
│   │   ├── modules.rst
│   │   └── index.rst
│   ├── requirements.txt
│   ├── Makefile
│   └── make.bat
├── genal_logo.png
├── Genal_flowchart.png
├── .gitignore
├── genal
│   ├── __init__.py
│   ├── constants.py
│   ├── clump.py
│   ├── genes.py
│   ├── snp_query.py
│   ├── colocalization.py
│   ├── lift.py
│   ├── MRpresso.py
│   ├── proxy.py
│   ├── extract_prs.py
│   └── association.py
├── readthedocs.yaml
├── .readthedocs.yaml
└── pyproject.toml

/gitignore:
--------------------------------------------------------------------------------
1 | .gitignore
--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/.DS_Store
--------------------------------------------------------------------------------
/docs/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/.DS_Store
--------------------------------------------------------------------------------
/genal_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/genal_logo.png
--------------------------------------------------------------------------------
/Genal_flowchart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/Genal_flowchart.png
--------------------------------------------------------------------------------
/docs/build/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/.DS_Store
--------------------------------------------------------------------------------
/docs/build/objects.inv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/objects.inv
--------------------------------------------------------------------------------
/docs/source/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/source/.DS_Store
--------------------------------------------------------------------------------
/docs/build/_static/file.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/file.png
--------------------------------------------------------------------------------
/docs/build/_static/plus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/plus.png
--------------------------------------------------------------------------------
/docs/build/_static/minus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/minus.png
--------------------------------------------------------------------------------
/docs/build/.doctrees/api.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/.doctrees/api.doctree
--------------------------------------------------------------------------------
/docs/build/.doctrees/genal.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/.doctrees/genal.doctree
--------------------------------------------------------------------------------
/docs/build/.doctrees/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/.doctrees/index.doctree
--------------------------------------------------------------------------------
/docs/source/Images/genal_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/source/Images/genal_logo.png
--------------------------------------------------------------------------------
/docs/build/.doctrees/modules.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/.doctrees/modules.doctree
--------------------------------------------------------------------------------
/docs/build/_images/MR_plot_SBP_AS.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_images/MR_plot_SBP_AS.png
--------------------------------------------------------------------------------
/docs/source/Images/MR_plot_SBP_AS.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/source/Images/MR_plot_SBP_AS.png
--------------------------------------------------------------------------------
/docs/build/.doctrees/environment.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/.doctrees/environment.pickle
--------------------------------------------------------------------------------
/docs/source/Images/Genal_flowchart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/source/Images/Genal_flowchart.png
--------------------------------------------------------------------------------
/docs/build/.doctrees/introduction.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/.doctrees/introduction.doctree
--------------------------------------------------------------------------------
/docs/build/_static/css/fonts/lato-bold.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/lato-bold.woff
--------------------------------------------------------------------------------
/docs/build/_static/css/fonts/lato-bold.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/lato-bold.woff2
--------------------------------------------------------------------------------
/docs/build/_static/css/fonts/lato-normal.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/lato-normal.woff
--------------------------------------------------------------------------------
/docs/build/_static/css/fonts/lato-normal.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/lato-normal.woff2
--------------------------------------------------------------------------------
/docs/build/_static/css/fonts/Roboto-Slab-Bold.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/Roboto-Slab-Bold.woff
--------------------------------------------------------------------------------
/docs/build/_static/css/fonts/lato-bold-italic.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/lato-bold-italic.woff
--------------------------------------------------------------------------------
/docs/build/_static/css/fonts/Roboto-Slab-Bold.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/Roboto-Slab-Bold.woff2
--------------------------------------------------------------------------------
/docs/build/_static/css/fonts/fontawesome-webfont.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/fontawesome-webfont.eot
--------------------------------------------------------------------------------
/docs/build/_static/css/fonts/fontawesome-webfont.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/fontawesome-webfont.ttf
--------------------------------------------------------------------------------
/docs/build/_static/css/fonts/lato-bold-italic.woff2:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/lato-bold-italic.woff2 -------------------------------------------------------------------------------- /docs/build/_static/css/fonts/lato-normal-italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/lato-normal-italic.woff -------------------------------------------------------------------------------- /docs/build/_static/css/fonts/Roboto-Slab-Regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/Roboto-Slab-Regular.woff -------------------------------------------------------------------------------- /docs/build/_static/css/fonts/Roboto-Slab-Regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/Roboto-Slab-Regular.woff2 -------------------------------------------------------------------------------- /docs/build/_static/css/fonts/fontawesome-webfont.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/fontawesome-webfont.woff -------------------------------------------------------------------------------- /docs/build/_static/css/fonts/fontawesome-webfont.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/fontawesome-webfont.woff2 -------------------------------------------------------------------------------- /docs/build/_static/css/fonts/lato-normal-italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CypRiv/genal/HEAD/docs/build/_static/css/fonts/lato-normal-italic.woff2 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | dist/ 3 | .ipynb_checkpoints/ 4 | ipynb_checkpoints/ 5 | genal/.ipynb_checkpoints/ 6 | test_data/ 7 | cursor/ 8 | tests/ 9 | tmp_GENAL/ 10 | REVIEW_REPORT.md 11 | TASKS.md -------------------------------------------------------------------------------- /docs/build/.buildinfo: -------------------------------------------------------------------------------- 1 | # Sphinx build info version 1 2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 
3 | config: 1a3c03fa317dbf0f46b6f7567774d6c5 4 | tags: 645f666f9bcd5a90fca523b33c5a78b7 5 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | sphinx_rtd_theme 3 | aiohttp==3.9.5 4 | nest_asyncio==1.5.5 5 | numpy>=1.24.4,<2.0 6 | pandas>=2.0.3 7 | plotnine==0.12.3 8 | psutil==5.9.1 9 | pyliftover==0.4 10 | scikit_learn>=1.3.0 11 | scipy>=1.11.4 12 | statsmodels==0.14.0 13 | tqdm==4.66.1 14 | wget==3.2 -------------------------------------------------------------------------------- /docs/build/_static/documentation_options.js: -------------------------------------------------------------------------------- 1 | const DOCUMENTATION_OPTIONS = { 2 | VERSION: 'v0.8', 3 | LANGUAGE: 'en', 4 | COLLAPSE_INDEX: false, 5 | BUILDER: 'html', 6 | FILE_SUFFIX: '.html', 7 | LINK_SUFFIX: '.html', 8 | HAS_SOURCE: true, 9 | SOURCELINK_SUFFIX: '.txt', 10 | NAVIGATION_WITH_KEYS: false, 11 | SHOW_SEARCH_SUMMARY: true, 12 | ENABLE_SEARCH_SHORTCUTS: true, 13 | }; -------------------------------------------------------------------------------- /genal/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from .tools import default_config, write_config, set_plink, install_plink, delete_tmp, get_reference_panel_path, get_plink_path 4 | from .geno_tools import Combine_Geno 5 | from .genes import filter_by_gene_func 6 | from .constants import CONFIG_DIR 7 | 8 | __version__ = "1.4.5" 9 | 10 | config_path = os.path.join(CONFIG_DIR, "config.json") 11 | 12 | if not os.path.exists(CONFIG_DIR): 13 | os.makedirs(CONFIG_DIR) 14 | 15 | 16 | if not os.path.exists(config_path): 17 | write_config(default_config()) 18 | print(f"Configuration file for genal placed at '{config_path}'") 19 | 20 | from .Geno import Geno 21 | -------------------------------------------------------------------------------- /readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.11" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | 18 | # We recommend specifying your dependencies to enable reproducible builds: 19 | # https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 20 | python: 21 | install: 22 | - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.11" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/source/conf.py 17 | 18 | # We recommend specifying your dependencies to enable reproducible builds: 19 | # https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 20 | python: 21 
| install: 22 | - requirements: docs/requirements.txt 23 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/build/_static/js/badge_only.js: -------------------------------------------------------------------------------- 1 | !function(e){var t={};function r(n){if(t[n])return t[n].exports;var o=t[n]={i:n,l:!1,exports:{}};return e[n].call(o.exports,o,o.exports,r),o.l=!0,o.exports}r.m=e,r.c=t,r.d=function(e,t,n){r.o(e,t)||Object.defineProperty(e,t,{enumerable:!0,get:n})},r.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},r.t=function(e,t){if(1&t&&(e=r(e)),8&t)return e;if(4&t&&"object"==typeof e&&e&&e.__esModule)return e;var n=Object.create(null);if(r.r(n),Object.defineProperty(n,"default",{enumerable:!0,value:e}),2&t&&"string"!=typeof e)for(var o in e)r.d(n,o,function(t){return e[t]}.bind(null,o));return n},r.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return r.d(t,"a",t),t},r.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r.p="",r(r.s=4)}({4:function(e,t,r){}}); -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 
21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["flit_core >=3.2,<4"] 3 | build-backend = "flit_core.buildapi" 4 | 5 | [project] 6 | name = "genal-python" # Updated name for PyPI 7 | version = "1.4.5" 8 | authors = [{name = "Cyprien Rivier", email = "riviercyprien@gmail.com"}] 9 | description = "A python toolkit for polygenic risk scoring and mendelian randomization." 10 | readme = "README.md" 11 | requires-python = ">=3.8" 12 | license = {file = "LICENSE"} 13 | classifiers = [ 14 | "Programming Language :: Python :: 3", 15 | "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", 16 | "Operating System :: OS Independent", 17 | ] 18 | 19 | # Dependencies section 20 | dependencies = [ 21 | "aiohttp>=3.7", 22 | "nest_asyncio>=1.5", 23 | "numpy>=1.17.3", 24 | "pandas>=1.0", 25 | "plotnine>=0.9", 26 | "psutil>=5.0", 27 | "requests>=2.0", 28 | "pyliftover>=0.4", 29 | "scikit_learn>=0.24", 30 | "scipy>=1.7,<1.13", 31 | "statsmodels>=0.13,<0.15", 32 | "tqdm>=4.38", 33 | "wget>=3.0", 34 | "fastparquet>=0.4", 35 | "pyarrow>=3.0" 36 | ] 37 | 38 | [tool.setuptools.package-dir] 39 | genal = "genal/" 40 | 41 | 42 | [project.urls] 43 | Home = "https://github.com/CypRiv/genal" 44 | 45 | [tool.flit.module] 46 | name = "genal" 47 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | import os 9 | import sys 10 | import sphinx_rtd_theme 11 | sys.path.insert(0, os.path.abspath('../../')) 12 | 13 | project = 'genal' 14 | copyright = '2023, Cyprien A. Rivier' 15 | author = 'Cyprien A. 
Rivier' 16 | release = 'v1.1' 17 | 18 | 19 | # -- General configuration --------------------------------------------------- 20 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 21 | 22 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.napoleon', 'sphinx.ext.viewcode' ] 23 | 24 | templates_path = ['_templates'] 25 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 26 | 27 | 28 | 29 | # -- Options for HTML output ------------------------------------------------- 30 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 31 | 32 | html_theme = 'sphinx_rtd_theme' 33 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 34 | html_static_path = ['_static'] 35 | -------------------------------------------------------------------------------- /genal/constants.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | STANDARD_COLUMNS = ["CHR", "POS", "SNP", "EA", "NEA", "BETA", "SE", "P"] 4 | BUILDS = ["37", "38"] 5 | POPULATIONS = ["EUR", "AFR", "EAS", "AMR", "SAS"] 6 | REF_PANELS = [f"{pop}_{build}" for pop in POPULATIONS for build in BUILDS] 7 | REF_PANEL_COLUMNS = ["CHR", "SNP", "POS", "A1", "A2"] 8 | BUCKET_URL = "https://storage.googleapis.com/genal_files/" 9 | REF_PANELS_URL = BUCKET_URL + "{panel}.tar.gz" 10 | REF_PARQUET_URL = BUCKET_URL + "reference_variants_{build}.parquet" 11 | CONFIG_DIR = os.path.expanduser("~/.genal/") 12 | CHECKS_DICT = { 13 | "CHR": False, 14 | "POS": False, 15 | "P": False, 16 | "EA": False, 17 | "NEA": False, 18 | "BETA": False, 19 | "SNP": False, 20 | "NA_removal": False, 21 | } 22 | MR_METHODS_NAMES = { 23 | "IVW": "Inverse-Variance Weighted", 24 | "IVW-RE": "Inverse Variance Weighted (Random Effects)", 25 | "IVW-FE": "Inverse Variance Weighted (Fixed Effects)", 26 | "UWR": "Unweighted Regression", 27 | "WM": "Weighted Median", 28 | "WM-pen": "Penalised Weighted Median", 29 | "Simple-median": "Simple Median", 30 | "Sign": "Sign concordance test", 31 | "Egger": ("MR Egger", "Egger Intercept"), 32 | "Egger-boot": ("MR Egger bootstrap", "Egger Intercept bootstrap"), 33 | "Simple-mode": "Simple mode", 34 | "Weighted-mode": "Weighted mode", 35 | } -------------------------------------------------------------------------------- /docs/build/_sources/api.rst.txt: -------------------------------------------------------------------------------- 1 | === 2 | API 3 | === 4 | 5 | genal.GENO class 6 | ----------------- 7 | 8 | .. autoclass:: genal.Geno 9 | :members: 10 | :undoc-members: 11 | :show-inheritance: 12 | 13 | genal.geno\_tools module 14 | ------------------------ 15 | 16 | .. automodule:: genal.geno_tools 17 | :members: 18 | :undoc-members: 19 | :show-inheritance: 20 | 21 | genal.tools module 22 | ------------------ 23 | 24 | .. automodule:: genal.tools 25 | :members: 26 | :undoc-members: 27 | :show-inheritance: 28 | 29 | genal.clump module 30 | ------------------ 31 | 32 | .. automodule:: genal.clump 33 | :members: 34 | :undoc-members: 35 | :show-inheritance: 36 | 37 | genal.proxy module 38 | ------------------ 39 | 40 | .. automodule:: genal.proxy 41 | :members: 42 | :undoc-members: 43 | :show-inheritance: 44 | 45 | genal.extract\_prs module 46 | ------------------------- 47 | 48 | .. automodule:: genal.extract_prs 49 | :members: 50 | :undoc-members: 51 | :show-inheritance: 52 | 53 | genal.association module 54 | ------------------------ 55 | 56 | .. 
automodule:: genal.association 57 | :members: 58 | :undoc-members: 59 | :show-inheritance: 60 | 61 | genal.MR\_tools module 62 | ---------------------- 63 | 64 | .. automodule:: genal.MR_tools 65 | :members: 66 | :undoc-members: 67 | :show-inheritance: 68 | 69 | genal.MR module 70 | --------------- 71 | 72 | .. automodule:: genal.MR 73 | :members: 74 | :undoc-members: 75 | :show-inheritance: 76 | 77 | genal.MRpresso module 78 | --------------------- 79 | 80 | .. automodule:: genal.MRpresso 81 | :members: 82 | :undoc-members: 83 | :show-inheritance: 84 | 85 | genal.lift module 86 | ----------------- 87 | 88 | .. automodule:: genal.lift 89 | :members: 90 | :undoc-members: 91 | :show-inheritance: 92 | 93 | genal.snp_query module 94 | ---------------------- 95 | 96 | .. automodule:: genal.snp_query 97 | :members: 98 | :undoc-members: 99 | :show-inheritance: 100 | 101 | -------------------------------------------------------------------------------- /docs/build/_sources/genal.rst.txt: -------------------------------------------------------------------------------- 1 | genal package 2 | ============= 3 | 4 | Submodules 5 | ---------- 6 | 7 | genal.GENO module 8 | ----------------- 9 | 10 | .. automodule:: genal.Geno 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | genal.MR module 16 | --------------- 17 | 18 | .. automodule:: genal.MR 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | genal.MR\_tools module 24 | ---------------------- 25 | 26 | .. automodule:: genal.MR_tools 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | genal.MRpresso module 32 | --------------------- 33 | 34 | .. automodule:: genal.MRpresso 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | genal.association module 40 | ------------------------ 41 | 42 | .. automodule:: genal.association 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | genal.clump module 48 | ------------------ 49 | 50 | .. automodule:: genal.clump 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | genal.extract\_prs module 56 | ------------------------- 57 | 58 | .. automodule:: genal.extract_prs 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | genal.geno\_tools module 64 | ------------------------ 65 | 66 | .. automodule:: genal.geno_tools 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | genal.lift module 72 | ----------------- 73 | 74 | .. automodule:: genal.lift 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | 79 | genal.proxy module 80 | ------------------ 81 | 82 | .. automodule:: genal.proxy 83 | :members: 84 | :undoc-members: 85 | :show-inheritance: 86 | 87 | genal.tools module 88 | ------------------ 89 | 90 | .. automodule:: genal.tools 91 | :members: 92 | :undoc-members: 93 | :show-inheritance: 94 | 95 | Module contents 96 | --------------- 97 | 98 | .. automodule:: genal 99 | :members: 100 | :undoc-members: 101 | :show-inheritance: 102 | -------------------------------------------------------------------------------- /docs/source/api.rst: -------------------------------------------------------------------------------- 1 | === 2 | API 3 | === 4 | 5 | genal.GENO class 6 | ----------------- 7 | 8 | .. 
autoclass:: genal.Geno 9 | :members: __init__, preprocess_data, get_reference_panel, clump, update_snpids, extract_snps, prs, set_phenotype, association_test, query_outcome, MR, MR_plot, MR_forest, MRpresso, filter_by_gene, colocalize, lift, query_gwas_catalog, standardize_betas, update_eaf, sort_group, copy, save 10 | :undoc-members: 11 | :show-inheritance: 12 | 13 | genal.geno\_tools module 14 | ------------------------ 15 | 16 | .. automodule:: genal.geno_tools 17 | :members: 18 | :undoc-members: 19 | :show-inheritance: 20 | 21 | genal.tools module 22 | ------------------ 23 | 24 | .. automodule:: genal.tools 25 | :members: 26 | :undoc-members: 27 | :show-inheritance: 28 | 29 | genal.clump module 30 | ------------------ 31 | 32 | .. automodule:: genal.clump 33 | :members: 34 | :undoc-members: 35 | :show-inheritance: 36 | 37 | genal.proxy module 38 | ------------------ 39 | 40 | .. automodule:: genal.proxy 41 | :members: 42 | :undoc-members: 43 | :show-inheritance: 44 | 45 | genal.extract\_prs module 46 | ------------------------- 47 | 48 | .. automodule:: genal.extract_prs 49 | :members: 50 | :undoc-members: 51 | :show-inheritance: 52 | 53 | genal.association module 54 | ------------------------ 55 | 56 | .. automodule:: genal.association 57 | :members: 58 | :undoc-members: 59 | :show-inheritance: 60 | 61 | genal.MR\_tools module 62 | ---------------------- 63 | 64 | .. automodule:: genal.MR_tools 65 | :members: 66 | :undoc-members: 67 | :show-inheritance: 68 | 69 | genal.MR module 70 | --------------- 71 | 72 | .. automodule:: genal.MR 73 | :members: 74 | :undoc-members: 75 | :show-inheritance: 76 | 77 | genal.MRpresso module 78 | --------------------- 79 | 80 | .. automodule:: genal.MRpresso 81 | :members: 82 | :undoc-members: 83 | :show-inheritance: 84 | 85 | genal.lift module 86 | ----------------- 87 | 88 | .. automodule:: genal.lift 89 | :members: 90 | :undoc-members: 91 | :show-inheritance: 92 | 93 | genal.snp_query module 94 | ---------------------- 95 | 96 | .. automodule:: genal.snp_query 97 | :members: 98 | :undoc-members: 99 | :show-inheritance: 100 | 101 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | ============== 2 | The Geno class 3 | ============== 4 | 5 | The main object of the package is the :class:`~genal.Geno` class that contains the SNP-level data and manipulates it through its methods. 6 | 7 | .. autoclass:: genal.Geno 8 | 9 | ============== 10 | Main functions 11 | ============== 12 | 13 | Preprocessing 14 | ============= 15 | 16 | The preprocessing of the SNP-level data is performed with the :func:`~genal.Geno.preprocess_data` method: 17 | 18 | .. automethod:: genal.Geno.preprocess_data 19 | 20 | 21 | Clumping 22 | ======== 23 | 24 | Clumping is performed with the :func:`~genal.Geno.clump` method: 25 | 26 | .. automethod:: genal.Geno.clump 27 | 28 | Polygenic Risk Scoring 29 | ====================== 30 | 31 | The computation of a polygenic risk score in a target population is performed with the :func:`~genal.Geno.prs` method: 32 | 33 | .. automethod:: genal.Geno.prs 34 | 35 | Querying outcome data 36 | ===================== 37 | 38 | Before running Mendelian Randomization, the extraction of the genetic instruments from the :class:`~genal.Geno` object containing the SNP-outcome association data is done with :func:`~genal.Geno.query_outcome` method: 39 | 40 | .. 
automethod:: genal.Geno.query_outcome 41 | 42 | Mendelian Randomization 43 | ======================= 44 | 45 | Various Mendelian Randomization methods are computed with the :func:`~genal.Geno.MR` method: 46 | 47 | .. automethod:: genal.Geno.MR 48 | 49 | MR-PRESSO 50 | ========= 51 | 52 | The MR-PRESSO algorithm to detect and correct horizontal pleiotropy is executed with :func:`~genal.Geno.MRpresso` method: 53 | 54 | .. automethod:: genal.Geno.MRpresso 55 | 56 | Phenotype assignment 57 | ==================== 58 | 59 | Before running SNP-association tests, assigning a dataframe with phenotypic data to the :class:`~genal.Geno` object is done with :func:`~genal.Geno.set_phenotype` method: 60 | 61 | .. automethod:: genal.Geno.set_phenotype 62 | 63 | SNP-association tests 64 | ===================== 65 | 66 | SNP-association testing is conducted with :func:`~genal.Geno.association_test` method: 67 | 68 | .. automethod:: genal.Geno.association_test 69 | 70 | Genetic lifting 71 | =============== 72 | 73 | Lifting the SNP data to another genetic build is done with :func:`~genal.Geno.lift` method: 74 | 75 | .. automethod:: genal.Geno.lift 76 | 77 | GWAS Catalog 78 | ============ 79 | 80 | Querying the GWAS Catalog to extract traits associated with the SNPs is done with :func:`~genal.Geno.query_gwas_catalog` method: 81 | 82 | .. automethod:: genal.Geno.query_gwas_catalog -------------------------------------------------------------------------------- /docs/build/_sources/modules.rst.txt: -------------------------------------------------------------------------------- 1 | ============== 2 | The Geno class 3 | ============== 4 | 5 | The main object of the package is the :class:`~genal.Geno` class that contains the SNP-level data and manipulates it through its methods. 6 | 7 | .. autoclass:: genal.Geno 8 | 9 | ============== 10 | Main functions 11 | ============== 12 | 13 | Preprocessing 14 | ============= 15 | 16 | The preprocessing of the SNP-level data is performed with the :func:`~genal.Geno.preprocess_data` method: 17 | 18 | .. automethod:: genal.Geno.preprocess_data 19 | 20 | 21 | Clumping 22 | ======== 23 | 24 | Clumping is performed with the :func:`~genal.Geno.clump` method: 25 | 26 | .. automethod:: genal.Geno.clump 27 | 28 | Polygenic Risk Scoring 29 | ====================== 30 | 31 | The computation of a polygenic risk score in a target population is performed with the :func:`~genal.Geno.prs` method: 32 | 33 | .. automethod:: genal.Geno.prs 34 | 35 | Querying outcome data 36 | ===================== 37 | 38 | Before running Mendelian Randomization, the extraction of the genetic instruments from the :class:`~genal.Geno` object containing the SNP-outcome association data is done with :func:`~genal.Geno.query_outcome` method: 39 | 40 | .. automethod:: genal.Geno.query_outcome 41 | 42 | Mendelian Randomization 43 | ======================= 44 | 45 | Various Mendelian Randomization methods are computed with the :func:`~genal.Geno.MR` method: 46 | 47 | .. automethod:: genal.Geno.MR 48 | 49 | MR-PRESSO 50 | ========= 51 | 52 | The MR-PRESSO algorithm to detect and correct horizontal pleiotropy is executed with :func:`~genal.Geno.MRpresso` method: 53 | 54 | .. automethod:: genal.Geno.MRpresso 55 | 56 | Phenotype assignment 57 | ==================== 58 | 59 | Before running SNP-association tests, assigning a dataframe with phenotypic data to the :class:`~genal.Geno` object is done with :func:`~genal.Geno.set_phenotype` method: 60 | 61 | .. 
automethod:: genal.Geno.set_phenotype 62 | 63 | SNP-association tests 64 | ===================== 65 | 66 | SNP-association testing is conducted with :func:`~genal.Geno.association_test` method: 67 | 68 | .. automethod:: genal.Geno.association_test 69 | 70 | Genetic lifting 71 | =============== 72 | 73 | Lifting the SNP data to another genetic build is done with :func:`~genal.Geno.lift` method: 74 | 75 | .. automethod:: genal.Geno.lift 76 | 77 | GWAS Catalog 78 | ============ 79 | 80 | Querying the GWAS Catalog to extract traits associated with the SNPs is done with :func:`~genal.Geno.query_gwas_catalog` method: 81 | 82 | .. automethod:: genal.Geno.query_gwas_catalog -------------------------------------------------------------------------------- /docs/build/_static/js/html5shiv.min.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @preserve HTML5 Shiv 3.7.3 | @afarkas @jdalton @jon_neal @rem | MIT/GPL2 Licensed 3 | */ 4 | !function(a,b){function c(a,b){var c=a.createElement("p"),d=a.getElementsByTagName("head")[0]||a.documentElement;return c.innerHTML="x",d.insertBefore(c.lastChild,d.firstChild)}function d(){var a=t.elements;return"string"==typeof a?a.split(" "):a}function e(a,b){var c=t.elements;"string"!=typeof c&&(c=c.join(" ")),"string"!=typeof a&&(a=a.join(" ")),t.elements=c+" "+a,j(b)}function f(a){var b=s[a[q]];return b||(b={},r++,a[q]=r,s[r]=b),b}function g(a,c,d){if(c||(c=b),l)return c.createElement(a);d||(d=f(c));var e;return e=d.cache[a]?d.cache[a].cloneNode():p.test(a)?(d.cache[a]=d.createElem(a)).cloneNode():d.createElem(a),!e.canHaveChildren||o.test(a)||e.tagUrn?e:d.frag.appendChild(e)}function h(a,c){if(a||(a=b),l)return a.createDocumentFragment();c=c||f(a);for(var e=c.frag.cloneNode(),g=0,h=d(),i=h.length;i>g;g++)e.createElement(h[g]);return e}function i(a,b){b.cache||(b.cache={},b.createElem=a.createElement,b.createFrag=a.createDocumentFragment,b.frag=b.createFrag()),a.createElement=function(c){return t.shivMethods?g(c,a,b):b.createElem(c)},a.createDocumentFragment=Function("h,f","return function(){var n=f.cloneNode(),c=n.createElement;h.shivMethods&&("+d().join().replace(/[\w\-:]+/g,function(a){return b.createElem(a),b.frag.createElement(a),'c("'+a+'")'})+");return n}")(t,b.frag)}function j(a){a||(a=b);var d=f(a);return!t.shivCSS||k||d.hasCSS||(d.hasCSS=!!c(a,"article,aside,dialog,figcaption,figure,footer,header,hgroup,main,nav,section{display:block}mark{background:#FF0;color:#000}template{display:none}")),l||i(a,d),a}var k,l,m="3.7.3-pre",n=a.html5||{},o=/^<|^(?:button|map|select|textarea|object|iframe|option|optgroup)$/i,p=/^(?:a|b|code|div|fieldset|h1|h2|h3|h4|h5|h6|i|label|li|ol|p|q|span|strong|style|table|tbody|td|th|tr|ul)$/i,q="_html5shiv",r=0,s={};!function(){try{var a=b.createElement("a");a.innerHTML="",k="hidden"in a,l=1==a.childNodes.length||function(){b.createElement("a");var a=b.createDocumentFragment();return"undefined"==typeof a.cloneNode||"undefined"==typeof a.createDocumentFragment||"undefined"==typeof a.createElement}()}catch(c){k=!0,l=!0}}();var t={elements:n.elements||"abbr article aside audio bdi canvas data datalist details dialog figcaption figure footer header hgroup main mark meter nav output picture progress section summary template time video",version:m,shivCSS:n.shivCSS!==!1,supportsUnknownElements:l,shivMethods:n.shivMethods!==!1,type:"default",shivDocument:j,createElement:g,createDocumentFragment:h,addElements:e};a.html5=t,j(b),"object"==typeof 
module&&module.exports&&(module.exports=t)}("undefined"!=typeof window?window:this,document); -------------------------------------------------------------------------------- /docs/build/_static/css/badge_only.css: -------------------------------------------------------------------------------- 1 | .clearfix{*zoom:1}.clearfix:after,.clearfix:before{display:table;content:""}.clearfix:after{clear:both}@font-face{font-family:FontAwesome;font-style:normal;font-weight:400;src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713?#iefix) format("embedded-opentype"),url(fonts/fontawesome-webfont.woff2?af7ae505a9eed503f8b8e6982036873e) format("woff2"),url(fonts/fontawesome-webfont.woff?fee66e712a8a08eef5805a46892932ad) format("woff"),url(fonts/fontawesome-webfont.ttf?b06871f281fee6b241d60582ae9369b9) format("truetype"),url(fonts/fontawesome-webfont.svg?912ec66d7572ff821749319396470bde#FontAwesome) format("svg")}.fa:before{font-family:FontAwesome;font-style:normal;font-weight:400;line-height:1}.fa:before,a .fa{text-decoration:inherit}.fa:before,a .fa,li .fa{display:inline-block}li .fa-large:before{width:1.875em}ul.fas{list-style-type:none;margin-left:2em;text-indent:-.8em}ul.fas li .fa{width:.8em}ul.fas li .fa-large:before{vertical-align:baseline}.fa-book:before,.icon-book:before{content:"\f02d"}.fa-caret-down:before,.icon-caret-down:before{content:"\f0d7"}.fa-caret-up:before,.icon-caret-up:before{content:"\f0d8"}.fa-caret-left:before,.icon-caret-left:before{content:"\f0d9"}.fa-caret-right:before,.icon-caret-right:before{content:"\f0da"}.rst-versions{position:fixed;bottom:0;left:0;width:300px;color:#fcfcfc;background:#1f1d1d;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;z-index:400}.rst-versions a{color:#2980b9;text-decoration:none}.rst-versions .rst-badge-small{display:none}.rst-versions .rst-current-version{padding:12px;background-color:#272525;display:block;text-align:right;font-size:90%;cursor:pointer;color:#27ae60}.rst-versions .rst-current-version:after{clear:both;content:"";display:block}.rst-versions .rst-current-version .fa{color:#fcfcfc}.rst-versions .rst-current-version .fa-book,.rst-versions .rst-current-version .icon-book{float:left}.rst-versions .rst-current-version.rst-out-of-date{background-color:#e74c3c;color:#fff}.rst-versions .rst-current-version.rst-active-old-version{background-color:#f1c40f;color:#000}.rst-versions.shift-up{height:auto;max-height:100%;overflow-y:scroll}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:grey;display:none}.rst-versions .rst-other-versions hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:1px solid #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px;max-height:90%}.rst-versions.rst-badge .fa-book,.rst-versions.rst-badge .icon-book{float:none;line-height:30px}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version .fa-book,.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge>.rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and (max-width:768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}} 
-------------------------------------------------------------------------------- /docs/build/_sources/index.rst.txt: -------------------------------------------------------------------------------- 1 | .. genal documentation master file, created by 2 | sphinx-quickstart on Thu Sep 14 14:04:16 2023. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | genal: A Python Toolkit for Genetic Risk Scoring and Mendelian Randomization 7 | ============================================================================ 8 | 9 | :Author: Cyprien Rivier 10 | :Date: |today| 11 | :Version: "0.8" 12 | 13 | Genal is a python module designed to make it easy to run genetic risk scores and mendelian randomization analyses. It integrates a collection of tools that facilitate the cleaning of single nucleotide polymorphism data (usually derived from Genome-Wide Association Studies) and enable the execution of key clinical population genetic workflows. The functionalities provided by genal include clumping, lifting, association testing, polygenic risk scoring, and Mendelian randomization analyses, all within a single Python module. 14 | 15 | The module prioritizes user-friendliness and intuitive operation, aiming to reduce the complexity of data analysis for researchers. Despite its focus on simplicity, Genal does not sacrifice the depth of customization or the precision of analysis. Researchers can expect to maintain analytical rigour while benefiting from the streamlined experience. 16 | 17 | Genal draws on concepts from well-established R packages such as TwoSampleMR, MR-Presso, MendelianRandomization, and gwasvcf, adapting their proven methodologies to the Python environment. This approach ensures that users have access to tried and tested techniques with the versatility of Python's data science tools. 18 | 19 | To install the latest release, type:: 20 | 21 | pip install genal-python 22 | 23 | Contents 24 | -------- 25 | 26 | .. toctree:: 27 | :maxdepth: 1 28 | 29 | Home 30 | introduction.rst 31 | modules.rst 32 | api.rst 33 | 34 | 35 | 36 | Indices and tables 37 | ================== 38 | 39 | * :ref:`genindex` 40 | * :ref:`modindex` 41 | * :ref:`search` 42 | 43 | Citation 44 | -------- 45 | 46 | If you use genal in your work, please cite the following paper: 47 | 48 | .. [Rivier.2024] *Genal: A Python Toolkit for Genetic Risk Scoring and Mendelian Randomization* 49 | Cyprien Rivier, Cyprien A. Rivier, Santiago Clocchiatti-Tuozzo, Shufan Huo, Victor Torres-Lopez, Daniela Renedo, Kevin N. Sheth, Guido J. Falcone, Julian N. Acosta. 50 | medRxiv. 2024 May `10.1101/2024.05.23.24307776 `_. 51 | 52 | References 53 | ---------- 54 | 55 | .. [Hemani.2018] *The MR-Base platform supports systematic causal inference across the human phenome.* 56 | Hemani G, Zheng J, Elsworth B, Wade KH, Baird D, Haberland V, Laurin C, Burgess S, Bowden J, Langdon R, Tan VY, Yarmolinsky J, Shihab HA, Timpson NJ, Evans DM, Relton C, Martin RM, Davey Smith G, Gaunt TR, Haycock PC, The MR-Base Collaboration 57 | eLife. 2018 May `10.7554/eLife.34408 `_. 58 | PMID: `29846171 `_. 59 | 60 | .. [Verbanck.2018] *Detection of widespread horizontal pleiotropy in causal relationships inferred from Mendelian randomization between complex traits and diseases.* 61 | Marie Verbanck, Chia-Yen Chen, Benjamin Neale, Ron Do. 62 | Nature Genetics 2018 May `10.1038/s41588-018-0099-7 `_. 63 | PMID: `29686387 `_. 64 | 65 | .. 
[Lyon.2020] *The variant call format provides efficient and robust storage of GWAS summary statistics.* 66 | Matthew Lyon, Shea J Andrews, Ben Elsworth, Tom R Gaunt, Gibran Hemani, Edoardo Marcora. 67 | bioRxiv 2020 May 30 `2020.05.29.115824v1 `_. 68 | PMID: `33441155 `_. 69 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. genal documentation master file, created by 2 | sphinx-quickstart on Thu Sep 14 14:04:16 2023. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | .. image:: Images/genal_logo.png 7 | :alt: genal_logo 8 | :width: 400px 9 | 10 | genal: A Python Toolkit for Genetic Risk Scoring and Mendelian Randomization 11 | ============================================================================ 12 | 13 | :Author: Cyprien A. Rivier 14 | :Date: |today| 15 | :Version: "1.2" 16 | 17 | Genal is a python module designed to make it easy to run genetic risk scores and mendelian randomization analyses. It integrates a collection of tools that facilitate the cleaning of single nucleotide polymorphism data (usually derived from Genome-Wide Association Studies) and enable the execution of key clinical population genetic workflows. The functionalities provided by genal include clumping, lifting, association testing, polygenic risk scoring, and Mendelian randomization analyses, all within a single Python module. 18 | 19 | The module prioritizes user-friendliness and intuitive operation, aiming to reduce the complexity of data analysis for researchers. Despite its focus on simplicity, Genal does not sacrifice the depth of customization or the precision of analysis. Researchers can expect to maintain analytical rigour while benefiting from the streamlined experience. 20 | 21 | Genal draws on concepts from well-established R packages such as TwoSampleMR, MR-Presso, MendelianRandomization, and gwasvcf, adapting their proven methodologies to the Python environment. This approach ensures that users have access to tried and tested techniques with the versatility of Python's data science tools. 22 | 23 | To install the latest release, type:: 24 | 25 | pip install genal-python 26 | 27 | Contents 28 | -------- 29 | 30 | .. toctree:: 31 | :maxdepth: 1 32 | 33 | Home 34 | introduction.rst 35 | modules.rst 36 | api.rst 37 | 38 | 39 | 40 | Indices and tables 41 | ================== 42 | 43 | * :ref:`genindex` 44 | * :ref:`modindex` 45 | * :ref:`search` 46 | 47 | Citation 48 | -------- 49 | 50 | If you use genal in your work, please cite the following paper: 51 | 52 | .. [Rivier.2024] *Genal: A Python Toolkit for Genetic Risk Scoring and Mendelian Randomization* 53 | Cyprien A. Rivier, Santiago Clocchiatti-Tuozzo, Shufan Huo, Victor Torres-Lopez, Daniela Renedo, Kevin N. Sheth, Guido J. Falcone, Julian N. Acosta. 54 | Bioinformatics Advances. 2024 December; `10.1093/bioadv/vbae207 `_. 55 | 56 | References 57 | ---------- 58 | 59 | .. [Hemani.2018] *The MR-Base platform supports systematic causal inference across the human phenome.* 60 | Hemani G, Zheng J, Elsworth B, Wade KH, Baird D, Haberland V, Laurin C, Burgess S, Bowden J, Langdon R, Tan VY, Yarmolinsky J, Shihab HA, Timpson NJ, Evans DM, Relton C, Martin RM, Davey Smith G, Gaunt TR, Haycock PC, The MR-Base Collaboration 61 | eLife. 2018 May `10.7554/eLife.34408 `_. 62 | PMID: `29846171 `_. 63 | 64 | .. 
[Verbanck.2018] *Detection of widespread horizontal pleiotropy in causal relationships inferred from Mendelian randomization between complex traits and diseases.* 65 | Marie Verbanck, Chia-Yen Chen, Benjamin Neale, Ron Do. 66 | Nature Genetics 2018 May `10.1038/s41588-018-0099-7 `_. 67 | PMID: `29686387 `_. 68 | 69 | .. [Lyon.2020] *The variant call format provides efficient and robust storage of GWAS summary statistics.* 70 | Matthew Lyon, Shea J Andrews, Ben Elsworth, Tom R Gaunt, Gibran Hemani, Edoardo Marcora. 71 | bioRxiv 2020 May 30 `2020.05.29.115824v1 `_. 72 | PMID: `33441155 `_. 73 | -------------------------------------------------------------------------------- /genal/clump.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import uuid 4 | import re 5 | 6 | from .tools import get_reference_panel_path, get_plink_path, run_plink_command 7 | 8 | def clump_data_plink2( 9 | data, 10 | reference_panel="EUR_37", 11 | kb=10000, 12 | r2=0.01, 13 | p1=5e-8, 14 | p2=0.01, 15 | name=None, 16 | ram=10000, 17 | ): 18 | """ 19 | Perform clumping on the given data using plink. Corresponds to the :meth:`Geno.clump` method. 20 | 21 | Args: 22 | data (pd.DataFrame): Input data with at least 'SNP' and 'P' columns. 23 | reference_panel (str, optional): The reference population to get linkage disequilibrium values and find proxies. 24 | Acceptable populations are "EUR", "SAS", "AFR", "EAS", "AMR" and available builds are 37 and 38 ("EUR_38" or "AFR_37" etc...) 25 | Also accepts or a path to a specific bed/bim/fam or pgen/pvar/psam panel. 26 | Default is "EUR_37". 27 | kb (int, optional): Clumping window in terms of thousands of SNPs. Default is 10000. 28 | r2 (float, optional): Linkage disequilibrium threshold, values between 0 and 1. Default is 0.01. 29 | p1 (float, optional): P-value threshold during clumping. SNPs above this value are not considered. Default is 5e-8. 30 | p2 (float, optional): P-value threshold post-clumping to further filter the clumped SNPs. If p2 < p1, it won't be considered. Default is 0.01. 31 | name (str, optional): Name used for the files created in the tmp_GENAL folder. 32 | ram (int, optional): Amount of RAM in MB to be used by plink. 33 | 34 | Returns: 35 | pd.DataFrame: Data after clumping, if any. 
36 | """ 37 | 38 | # Create unique ID for the name if none is passed 39 | if name is None: 40 | name = str(uuid.uuid4())[:8] 41 | 42 | # Save the relevant data columns to a temporary file 43 | to_clump_filename = os.path.join("tmp_GENAL", f"{name}_to_clump.txt") 44 | data[["SNP", "P"]].to_csv(to_clump_filename, index=False, sep="\t") 45 | 46 | # Get reference panel path and type 47 | ref_path, filetype = get_reference_panel_path(reference_panel) 48 | 49 | # Construct and execute the plink clumping command 50 | output_path = os.path.join("tmp_GENAL", name) 51 | 52 | # Base command differs based on filetype 53 | base_cmd = f"{get_plink_path()} --memory {ram}" 54 | if filetype == "bed": 55 | base_cmd += f" --bfile {ref_path}" 56 | else: # pgen 57 | base_cmd += f" --pfile {ref_path}" 58 | 59 | plink_command = f"{base_cmd} --rm-dup force-first --clump {to_clump_filename} --clump-kb {kb} \ 60 | --clump-r2 {r2} --clump-p1 {p1} --clump-p2 {p2} --out {output_path}" 61 | 62 | run_plink_command(plink_command) 63 | 64 | # Read log file to get the number of missing top variant IDs 65 | log_content = open(os.path.join("tmp_GENAL", f"{name}.log")).read() 66 | match = re.search(r"(\d+)\s+top\s+variant\s+ID", log_content) 67 | if match: 68 | missing_variants = int(match.group(1)) 69 | print(f"Warning: {missing_variants} top variant IDs missing") 70 | 71 | if "No significant --clump results." in log_content: 72 | print("No SNPs remaining after clumping.") 73 | return 74 | 75 | match = re.search(r"(\d+)\s+clump[s]?\s+formed\s+from\s+(\d+)\s+index", log_content) 76 | if match: 77 | print(f"{match.group(1)} clumps formed from {match.group(2)} top variants.") 78 | 79 | # Extract the list of clumped SNPs and get the relevant data subset 80 | clumped_filename = os.path.join("tmp_GENAL", f"{name}.clumps") 81 | if not os.path.exists(clumped_filename): 82 | raise FileNotFoundError(f"'{clumped_filename}' is missing.") 83 | plink_clumped = pd.read_csv(clumped_filename, sep="\s+", usecols=["ID"]) 84 | clumped_data = data[data["SNP"].isin(plink_clumped["ID"])] 85 | clumped_data.reset_index(drop=True, inplace=True) 86 | return clumped_data -------------------------------------------------------------------------------- /docs/build/_static/js/html5shiv-printshiv.min.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @preserve HTML5 Shiv 3.7.3-pre | @afarkas @jdalton @jon_neal @rem | MIT/GPL2 Licensed 3 | */ 4 | !function(a,b){function c(a,b){var c=a.createElement("p"),d=a.getElementsByTagName("head")[0]||a.documentElement;return c.innerHTML="x",d.insertBefore(c.lastChild,d.firstChild)}function d(){var a=y.elements;return"string"==typeof a?a.split(" "):a}function e(a,b){var c=y.elements;"string"!=typeof c&&(c=c.join(" ")),"string"!=typeof a&&(a=a.join(" ")),y.elements=c+" "+a,j(b)}function f(a){var b=x[a[v]];return b||(b={},w++,a[v]=w,x[w]=b),b}function g(a,c,d){if(c||(c=b),q)return c.createElement(a);d||(d=f(c));var e;return e=d.cache[a]?d.cache[a].cloneNode():u.test(a)?(d.cache[a]=d.createElem(a)).cloneNode():d.createElem(a),!e.canHaveChildren||t.test(a)||e.tagUrn?e:d.frag.appendChild(e)}function h(a,c){if(a||(a=b),q)return a.createDocumentFragment();c=c||f(a);for(var e=c.frag.cloneNode(),g=0,h=d(),i=h.length;i>g;g++)e.createElement(h[g]);return e}function i(a,b){b.cache||(b.cache={},b.createElem=a.createElement,b.createFrag=a.createDocumentFragment,b.frag=b.createFrag()),a.createElement=function(c){return 
y.shivMethods?g(c,a,b):b.createElem(c)},a.createDocumentFragment=Function("h,f","return function(){var n=f.cloneNode(),c=n.createElement;h.shivMethods&&("+d().join().replace(/[\w\-:]+/g,function(a){return b.createElem(a),b.frag.createElement(a),'c("'+a+'")'})+");return n}")(y,b.frag)}function j(a){a||(a=b);var d=f(a);return!y.shivCSS||p||d.hasCSS||(d.hasCSS=!!c(a,"article,aside,dialog,figcaption,figure,footer,header,hgroup,main,nav,section{display:block}mark{background:#FF0;color:#000}template{display:none}")),q||i(a,d),a}function k(a){for(var b,c=a.getElementsByTagName("*"),e=c.length,f=RegExp("^(?:"+d().join("|")+")$","i"),g=[];e--;)b=c[e],f.test(b.nodeName)&&g.push(b.applyElement(l(b)));return g}function l(a){for(var b,c=a.attributes,d=c.length,e=a.ownerDocument.createElement(A+":"+a.nodeName);d--;)b=c[d],b.specified&&e.setAttribute(b.nodeName,b.nodeValue);return e.style.cssText=a.style.cssText,e}function m(a){for(var b,c=a.split("{"),e=c.length,f=RegExp("(^|[\\s,>+~])("+d().join("|")+")(?=[[\\s,>+~#.:]|$)","gi"),g="$1"+A+"\\:$2";e--;)b=c[e]=c[e].split("}"),b[b.length-1]=b[b.length-1].replace(f,g),c[e]=b.join("}");return c.join("{")}function n(a){for(var b=a.length;b--;)a[b].removeNode()}function o(a){function b(){clearTimeout(g._removeSheetTimer),d&&d.removeNode(!0),d=null}var d,e,g=f(a),h=a.namespaces,i=a.parentWindow;return!B||a.printShived?a:("undefined"==typeof h[A]&&h.add(A),i.attachEvent("onbeforeprint",function(){b();for(var f,g,h,i=a.styleSheets,j=[],l=i.length,n=Array(l);l--;)n[l]=i[l];for(;h=n.pop();)if(!h.disabled&&z.test(h.media)){try{f=h.imports,g=f.length}catch(o){g=0}for(l=0;g>l;l++)n.push(f[l]);try{j.push(h.cssText)}catch(o){}}j=m(j.reverse().join("")),e=k(a),d=c(a,j)}),i.attachEvent("onafterprint",function(){n(e),clearTimeout(g._removeSheetTimer),g._removeSheetTimer=setTimeout(b,500)}),a.printShived=!0,a)}var p,q,r="3.7.3",s=a.html5||{},t=/^<|^(?:button|map|select|textarea|object|iframe|option|optgroup)$/i,u=/^(?:a|b|code|div|fieldset|h1|h2|h3|h4|h5|h6|i|label|li|ol|p|q|span|strong|style|table|tbody|td|th|tr|ul)$/i,v="_html5shiv",w=0,x={};!function(){try{var a=b.createElement("a");a.innerHTML="",p="hidden"in a,q=1==a.childNodes.length||function(){b.createElement("a");var a=b.createDocumentFragment();return"undefined"==typeof a.cloneNode||"undefined"==typeof a.createDocumentFragment||"undefined"==typeof a.createElement}()}catch(c){p=!0,q=!0}}();var y={elements:s.elements||"abbr article aside audio bdi canvas data datalist details dialog figcaption figure footer header hgroup main mark meter nav output picture progress section summary template time video",version:r,shivCSS:s.shivCSS!==!1,supportsUnknownElements:q,shivMethods:s.shivMethods!==!1,type:"default",shivDocument:j,createElement:g,createDocumentFragment:h,addElements:e};a.html5=y,j(b);var z=/^$|\b(?:all|print)\b/,A="html5shiv",B=!q&&function(){var c=b.documentElement;return!("undefined"==typeof b.namespaces||"undefined"==typeof b.parentWindow||"undefined"==typeof c.applyElement||"undefined"==typeof c.removeNode||"undefined"==typeof a.attachEvent)}();y.type+=" print",y.shivPrint=o,o(b),"object"==typeof module&&module.exports&&(module.exports=y)}("undefined"!=typeof window?window:this,document); -------------------------------------------------------------------------------- /docs/build/search.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Search — genal v0.8 documentation 7 | 8 | 9 | 10 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 |
26 | 55 | 56 |
60 | 61 |
62 |
63 |
64 |
    65 |
  • 66 | 67 |
  • 68 |
  • 69 |
70 |
71 |
72 |
73 |
74 | 75 | 82 | 83 | 84 |
85 | 86 |
87 | 88 |
89 |
90 |
91 | 92 |
93 | 94 |
95 |

© Copyright 2023, Cyprien A. Rivier.

96 |
97 | 98 | Built with Sphinx using a 99 | theme 100 | provided by Read the Docs. 101 | 102 | 103 |
104 |
105 |
106 |
107 |
108 | 113 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /docs/build/_static/js/theme.js: -------------------------------------------------------------------------------- 1 | !function(n){var e={};function t(i){if(e[i])return e[i].exports;var o=e[i]={i:i,l:!1,exports:{}};return n[i].call(o.exports,o,o.exports,t),o.l=!0,o.exports}t.m=n,t.c=e,t.d=function(n,e,i){t.o(n,e)||Object.defineProperty(n,e,{enumerable:!0,get:i})},t.r=function(n){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(n,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(n,"__esModule",{value:!0})},t.t=function(n,e){if(1&e&&(n=t(n)),8&e)return n;if(4&e&&"object"==typeof n&&n&&n.__esModule)return n;var i=Object.create(null);if(t.r(i),Object.defineProperty(i,"default",{enumerable:!0,value:n}),2&e&&"string"!=typeof n)for(var o in n)t.d(i,o,function(e){return n[e]}.bind(null,o));return i},t.n=function(n){var e=n&&n.__esModule?function(){return n.default}:function(){return n};return t.d(e,"a",e),e},t.o=function(n,e){return Object.prototype.hasOwnProperty.call(n,e)},t.p="",t(t.s=0)}([function(n,e,t){t(1),n.exports=t(3)},function(n,e,t){(function(){var e="undefined"!=typeof window?window.jQuery:t(2);n.exports.ThemeNav={navBar:null,win:null,winScroll:!1,winResize:!1,linkScroll:!1,winPosition:0,winHeight:null,docHeight:null,isRunning:!1,enable:function(n){var t=this;void 0===n&&(n=!0),t.isRunning||(t.isRunning=!0,e((function(e){t.init(e),t.reset(),t.win.on("hashchange",t.reset),n&&t.win.on("scroll",(function(){t.linkScroll||t.winScroll||(t.winScroll=!0,requestAnimationFrame((function(){t.onScroll()})))})),t.win.on("resize",(function(){t.winResize||(t.winResize=!0,requestAnimationFrame((function(){t.onResize()})))})),t.onResize()})))},enableSticky:function(){this.enable(!0)},init:function(n){n(document);var e=this;this.navBar=n("div.wy-side-scroll:first"),this.win=n(window),n(document).on("click","[data-toggle='wy-nav-top']",(function(){n("[data-toggle='wy-nav-shift']").toggleClass("shift"),n("[data-toggle='rst-versions']").toggleClass("shift")})).on("click",".wy-menu-vertical .current ul li a",(function(){var t=n(this);n("[data-toggle='wy-nav-shift']").removeClass("shift"),n("[data-toggle='rst-versions']").toggleClass("shift"),e.toggleCurrent(t),e.hashChange()})).on("click","[data-toggle='rst-current-version']",(function(){n("[data-toggle='rst-versions']").toggleClass("shift-up")})),n("table.docutils:not(.field-list,.footnote,.citation)").wrap("
"),n("table.docutils.footnote").wrap("
"),n("table.docutils.citation").wrap("
"),n(".wy-menu-vertical ul").not(".simple").siblings("a").each((function(){var t=n(this);expand=n(''),expand.on("click",(function(n){return e.toggleCurrent(t),n.stopPropagation(),!1})),t.prepend(expand)}))},reset:function(){var n=encodeURI(window.location.hash)||"#";try{var e=$(".wy-menu-vertical"),t=e.find('[href="'+n+'"]');if(0===t.length){var i=$('.document [id="'+n.substring(1)+'"]').closest("div.section");0===(t=e.find('[href="#'+i.attr("id")+'"]')).length&&(t=e.find('[href="#"]'))}if(t.length>0){$(".wy-menu-vertical .current").removeClass("current").attr("aria-expanded","false"),t.addClass("current").attr("aria-expanded","true"),t.closest("li.toctree-l1").parent().addClass("current").attr("aria-expanded","true");for(let n=1;n<=10;n++)t.closest("li.toctree-l"+n).addClass("current").attr("aria-expanded","true");t[0].scrollIntoView()}}catch(n){console.log("Error expanding nav for anchor",n)}},onScroll:function(){this.winScroll=!1;var n=this.win.scrollTop(),e=n+this.winHeight,t=this.navBar.scrollTop()+(n-this.winPosition);n<0||e>this.docHeight||(this.navBar.scrollTop(t),this.winPosition=n)},onResize:function(){this.winResize=!1,this.winHeight=this.win.height(),this.docHeight=$(document).height()},hashChange:function(){this.linkScroll=!0,this.win.one("hashchange",(function(){this.linkScroll=!1}))},toggleCurrent:function(n){var e=n.closest("li");e.siblings("li.current").removeClass("current").attr("aria-expanded","false"),e.siblings().find("li.current").removeClass("current").attr("aria-expanded","false");var t=e.find("> ul li");t.length&&(t.removeClass("current").attr("aria-expanded","false"),e.toggleClass("current").attr("aria-expanded",(function(n,e){return"true"==e?"false":"true"})))}},"undefined"!=typeof window&&(window.SphinxRtdTheme={Navigation:n.exports.ThemeNav,StickyNav:n.exports.ThemeNav}),function(){for(var n=0,e=["ms","moz","webkit","o"],t=0;t 2 | 3 | 4 | 5 | 6 | Overview: module code — genal v0.8 documentation 7 | 8 | 9 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 |
23 | 52 | 53 |
57 | 58 |
59 |
60 |
61 |
    62 |
  • 63 | 64 |
  • 65 |
  • 66 |
67 |
68 |
69 |
70 |
71 | 72 |

All modules for which code is available

73 | 86 | 87 |
88 |
89 |
90 | 91 |
92 | 93 |
94 |

© Copyright 2023, Cyprien A. Rivier.

95 |
96 | 97 | Built with Sphinx using a 98 | theme 99 | provided by Read the Docs. 100 | 101 | 102 |
103 |
104 |
105 |
106 |
107 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /docs/build/_static/pygments.css: -------------------------------------------------------------------------------- 1 | pre { line-height: 125%; } 2 | td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } 3 | span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } 4 | td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } 5 | span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } 6 | .highlight .hll { background-color: #ffffcc } 7 | .highlight { background: #f8f8f8; } 8 | .highlight .c { color: #3D7B7B; font-style: italic } /* Comment */ 9 | .highlight .err { border: 1px solid #FF0000 } /* Error */ 10 | .highlight .k { color: #008000; font-weight: bold } /* Keyword */ 11 | .highlight .o { color: #666666 } /* Operator */ 12 | .highlight .ch { color: #3D7B7B; font-style: italic } /* Comment.Hashbang */ 13 | .highlight .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */ 14 | .highlight .cp { color: #9C6500 } /* Comment.Preproc */ 15 | .highlight .cpf { color: #3D7B7B; font-style: italic } /* Comment.PreprocFile */ 16 | .highlight .c1 { color: #3D7B7B; font-style: italic } /* Comment.Single */ 17 | .highlight .cs { color: #3D7B7B; font-style: italic } /* Comment.Special */ 18 | .highlight .gd { color: #A00000 } /* Generic.Deleted */ 19 | .highlight .ge { font-style: italic } /* Generic.Emph */ 20 | .highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */ 21 | .highlight .gr { color: #E40000 } /* Generic.Error */ 22 | .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ 23 | .highlight .gi { color: #008400 } /* Generic.Inserted */ 24 | .highlight .go { color: #717171 } /* Generic.Output */ 25 | .highlight .gp { color: #000080; font-weight: bold } /* Generic.Prompt */ 26 | .highlight .gs { font-weight: bold } /* Generic.Strong */ 27 | .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ 28 | .highlight .gt { color: #0044DD } /* Generic.Traceback */ 29 | .highlight .kc { color: #008000; font-weight: bold } /* Keyword.Constant */ 30 | .highlight .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */ 31 | .highlight .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */ 32 | .highlight .kp { color: #008000 } /* Keyword.Pseudo */ 33 | .highlight .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */ 34 | .highlight .kt { color: #B00040 } /* Keyword.Type */ 35 | .highlight .m { color: #666666 } /* Literal.Number */ 36 | .highlight .s { color: #BA2121 } /* Literal.String */ 37 | .highlight .na { color: #687822 } /* Name.Attribute */ 38 | .highlight .nb { color: #008000 } /* Name.Builtin */ 39 | .highlight .nc { color: #0000FF; font-weight: bold } /* Name.Class */ 40 | .highlight .no { color: #880000 } /* Name.Constant */ 41 | .highlight .nd { color: #AA22FF } /* Name.Decorator */ 42 | .highlight .ni { color: #717171; font-weight: bold } /* Name.Entity */ 43 | .highlight .ne { color: #CB3F38; font-weight: bold } /* Name.Exception */ 44 | .highlight .nf { color: #0000FF } /* Name.Function */ 45 | .highlight .nl { color: #767600 } /* Name.Label */ 46 | .highlight .nn { color: #0000FF; font-weight: bold } /* Name.Namespace */ 47 | .highlight .nt { color: #008000; font-weight: bold } 
/* Name.Tag */ 48 | .highlight .nv { color: #19177C } /* Name.Variable */ 49 | .highlight .ow { color: #AA22FF; font-weight: bold } /* Operator.Word */ 50 | .highlight .w { color: #bbbbbb } /* Text.Whitespace */ 51 | .highlight .mb { color: #666666 } /* Literal.Number.Bin */ 52 | .highlight .mf { color: #666666 } /* Literal.Number.Float */ 53 | .highlight .mh { color: #666666 } /* Literal.Number.Hex */ 54 | .highlight .mi { color: #666666 } /* Literal.Number.Integer */ 55 | .highlight .mo { color: #666666 } /* Literal.Number.Oct */ 56 | .highlight .sa { color: #BA2121 } /* Literal.String.Affix */ 57 | .highlight .sb { color: #BA2121 } /* Literal.String.Backtick */ 58 | .highlight .sc { color: #BA2121 } /* Literal.String.Char */ 59 | .highlight .dl { color: #BA2121 } /* Literal.String.Delimiter */ 60 | .highlight .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */ 61 | .highlight .s2 { color: #BA2121 } /* Literal.String.Double */ 62 | .highlight .se { color: #AA5D1F; font-weight: bold } /* Literal.String.Escape */ 63 | .highlight .sh { color: #BA2121 } /* Literal.String.Heredoc */ 64 | .highlight .si { color: #A45A77; font-weight: bold } /* Literal.String.Interpol */ 65 | .highlight .sx { color: #008000 } /* Literal.String.Other */ 66 | .highlight .sr { color: #A45A77 } /* Literal.String.Regex */ 67 | .highlight .s1 { color: #BA2121 } /* Literal.String.Single */ 68 | .highlight .ss { color: #19177C } /* Literal.String.Symbol */ 69 | .highlight .bp { color: #008000 } /* Name.Builtin.Pseudo */ 70 | .highlight .fm { color: #0000FF } /* Name.Function.Magic */ 71 | .highlight .vc { color: #19177C } /* Name.Variable.Class */ 72 | .highlight .vg { color: #19177C } /* Name.Variable.Global */ 73 | .highlight .vi { color: #19177C } /* Name.Variable.Instance */ 74 | .highlight .vm { color: #19177C } /* Name.Variable.Magic */ 75 | .highlight .il { color: #666666 } /* Literal.Number.Integer.Long */ -------------------------------------------------------------------------------- /docs/build/_static/doctools.js: -------------------------------------------------------------------------------- 1 | /* 2 | * doctools.js 3 | * ~~~~~~~~~~~ 4 | * 5 | * Base JavaScript utilities for all Sphinx HTML documentation. 6 | * 7 | * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. 8 | * :license: BSD, see LICENSE for details. 9 | * 10 | */ 11 | "use strict"; 12 | 13 | const BLACKLISTED_KEY_CONTROL_ELEMENTS = new Set([ 14 | "TEXTAREA", 15 | "INPUT", 16 | "SELECT", 17 | "BUTTON", 18 | ]); 19 | 20 | const _ready = (callback) => { 21 | if (document.readyState !== "loading") { 22 | callback(); 23 | } else { 24 | document.addEventListener("DOMContentLoaded", callback); 25 | } 26 | }; 27 | 28 | /** 29 | * Small JavaScript module for the documentation. 30 | */ 31 | const Documentation = { 32 | init: () => { 33 | Documentation.initDomainIndexTable(); 34 | Documentation.initOnKeyListeners(); 35 | }, 36 | 37 | /** 38 | * i18n support 39 | */ 40 | TRANSLATIONS: {}, 41 | PLURAL_EXPR: (n) => (n === 1 ? 
0 : 1), 42 | LOCALE: "unknown", 43 | 44 | // gettext and ngettext don't access this so that the functions 45 | // can safely bound to a different name (_ = Documentation.gettext) 46 | gettext: (string) => { 47 | const translated = Documentation.TRANSLATIONS[string]; 48 | switch (typeof translated) { 49 | case "undefined": 50 | return string; // no translation 51 | case "string": 52 | return translated; // translation exists 53 | default: 54 | return translated[0]; // (singular, plural) translation tuple exists 55 | } 56 | }, 57 | 58 | ngettext: (singular, plural, n) => { 59 | const translated = Documentation.TRANSLATIONS[singular]; 60 | if (typeof translated !== "undefined") 61 | return translated[Documentation.PLURAL_EXPR(n)]; 62 | return n === 1 ? singular : plural; 63 | }, 64 | 65 | addTranslations: (catalog) => { 66 | Object.assign(Documentation.TRANSLATIONS, catalog.messages); 67 | Documentation.PLURAL_EXPR = new Function( 68 | "n", 69 | `return (${catalog.plural_expr})` 70 | ); 71 | Documentation.LOCALE = catalog.locale; 72 | }, 73 | 74 | /** 75 | * helper function to focus on search bar 76 | */ 77 | focusSearchBar: () => { 78 | document.querySelectorAll("input[name=q]")[0]?.focus(); 79 | }, 80 | 81 | /** 82 | * Initialise the domain index toggle buttons 83 | */ 84 | initDomainIndexTable: () => { 85 | const toggler = (el) => { 86 | const idNumber = el.id.substr(7); 87 | const toggledRows = document.querySelectorAll(`tr.cg-${idNumber}`); 88 | if (el.src.substr(-9) === "minus.png") { 89 | el.src = `${el.src.substr(0, el.src.length - 9)}plus.png`; 90 | toggledRows.forEach((el) => (el.style.display = "none")); 91 | } else { 92 | el.src = `${el.src.substr(0, el.src.length - 8)}minus.png`; 93 | toggledRows.forEach((el) => (el.style.display = "")); 94 | } 95 | }; 96 | 97 | const togglerElements = document.querySelectorAll("img.toggler"); 98 | togglerElements.forEach((el) => 99 | el.addEventListener("click", (event) => toggler(event.currentTarget)) 100 | ); 101 | togglerElements.forEach((el) => (el.style.display = "")); 102 | if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) togglerElements.forEach(toggler); 103 | }, 104 | 105 | initOnKeyListeners: () => { 106 | // only install a listener if it is really needed 107 | if ( 108 | !DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS && 109 | !DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS 110 | ) 111 | return; 112 | 113 | document.addEventListener("keydown", (event) => { 114 | // bail for input elements 115 | if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; 116 | // bail with special keys 117 | if (event.altKey || event.ctrlKey || event.metaKey) return; 118 | 119 | if (!event.shiftKey) { 120 | switch (event.key) { 121 | case "ArrowLeft": 122 | if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; 123 | 124 | const prevLink = document.querySelector('link[rel="prev"]'); 125 | if (prevLink && prevLink.href) { 126 | window.location.href = prevLink.href; 127 | event.preventDefault(); 128 | } 129 | break; 130 | case "ArrowRight": 131 | if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; 132 | 133 | const nextLink = document.querySelector('link[rel="next"]'); 134 | if (nextLink && nextLink.href) { 135 | window.location.href = nextLink.href; 136 | event.preventDefault(); 137 | } 138 | break; 139 | } 140 | } 141 | 142 | // some keyboard layouts may need Shift to get / 143 | switch (event.key) { 144 | case "/": 145 | if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) break; 146 | Documentation.focusSearchBar(); 147 | 
event.preventDefault(); 148 | } 149 | }); 150 | }, 151 | }; 152 | 153 | // quick alias for translations 154 | const _ = Documentation.gettext; 155 | 156 | _ready(Documentation.init); 157 | -------------------------------------------------------------------------------- /genal/genes.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os 4 | import wget 5 | 6 | from .constants import BUCKET_URL 7 | from .tools import read_config 8 | 9 | 10 | 11 | def filter_by_gene_func(data, gene_identifier, id_type="symbol", window_size=1000000, build="37"): 12 | """ 13 | Filtering the data to include only variants that are within a specified distance of a specific gene. 14 | Corresponds to the :meth:`Geno.filter_by_gene` method. 15 | Args: 16 | data (pd.DataFrame): Input data with at least 'CHR' and 'POS' columns. 17 | gene_identifier (str): Identifier for the gene/protein to filter variants around. 18 | id_type (str, optional): Type of identifier provided. Options are: 19 | - "symbol": Gene symbol (e.g., "APOE") 20 | - "HGNC": HGNC ID (e.g., "HGNC:613") 21 | - "name": Full gene name (e.g., "apolipoprotein E") 22 | - "Ensembl": Ensembl gene ID (e.g., "ENSG00000130203") 23 | - "NCBI": NCBI gene ID (e.g., "348") 24 | - "UCSC": UCSC gene ID (e.g., "uc001hbu.2") 25 | - "Vega": Vega gene ID (e.g., "OTTHUMG00000019505") 26 | Default is "symbol". 27 | window_size (int, optional): Size of the window around the gene in base pairs. Default is 1,000,000 (1Mb). 28 | build (str, optional): Genome build of the data. Default is "37". 29 | 30 | Returns: 31 | pd.DataFrame: Filtered DataFrame containing only variants within the specified window 32 | around the gene, with additional column 'Distance'. 33 | 34 | Notes: 35 | - Distance is calculated from the nearest gene boundary (start or end position) 36 | - Null distances indicate the variant is within the gene 37 | """ 38 | 39 | # Validate id_type 40 | valid_id_types = ["symbol", "HGNC_id", "name", "gene_id", "NCBI_id", "UCSC_id", "Vega_id"] 41 | if id_type in ["HGNC", "NCBI", "UCSC", "Vega"]: 42 | id_type = id_type + "_id" 43 | if id_type == "Ensembl": 44 | id_type = "gene_id" 45 | if id_type not in valid_id_types: 46 | raise ValueError(f"Invalid id_type. Must be one of: {', '.join(valid_id_types)}") 47 | 48 | # Validate build 49 | if int(build) not in [37, 38]: 50 | raise ValueError(f"Invalid build. 
Must be one of: 37, 38") 51 | 52 | # Download the gene info file if not already present in the reference folder 53 | config = read_config() 54 | ref_path = config["paths"]["ref_path"] 55 | gene_info_file = os.path.join(ref_path, "gene_id_mapping_filtered.parquet") 56 | if not os.path.exists(gene_info_file): 57 | # Download parquet file 58 | print(f"Downloading gene info file to {gene_info_file}...") 59 | url = BUCKET_URL + "gene_id_mapping_filtered.parquet" 60 | try: 61 | wget.download(url, gene_info_file) 62 | print("\nDownload complete.") 63 | except Exception as e: 64 | if os.path.exists(gene_info_file): 65 | os.remove(gene_info_file) 66 | raise RuntimeError(f"Failed to download gene info: {e}") 67 | 68 | df_gene_info = pd.read_parquet(gene_info_file, engine="pyarrow") 69 | 70 | # Find gene coordinates 71 | gene_data = df_gene_info[df_gene_info[id_type] == gene_identifier] 72 | 73 | if gene_data.empty: 74 | raise ValueError(f"Gene with {id_type}='{gene_identifier}' not found in gene info database.") 75 | 76 | if len(gene_data) > 1: 77 | print(f"Warning: Multiple entries found for {id_type}='{gene_identifier}'. Using the first entry.") 78 | gene_data = gene_data.iloc[0,:] 79 | 80 | print(f"Filtering variants within {window_size}bp window (+/- {window_size/2}bp on each side) based on genome build {build} around gene: {', '.join(f'{col}: {gene_data[col]}' for col in valid_id_types)}") 81 | 82 | # Extract gene location information 83 | chrom = gene_data['CHR'] 84 | # Convert to integer if possible 85 | if str(chrom).isdigit(): 86 | chrom = int(chrom) 87 | elif chrom=="X": 88 | chrom=23 89 | else: 90 | raise ValueError(f"Gene {gene_identifier} is located on chromosome {chrom}, which is not supported.") 91 | 92 | gene_start = int(gene_data[f'gene_start_{build}']) 93 | gene_end = int(gene_data[f'gene_end_{build}']) 94 | 95 | # Define the window boundaries 96 | window_start = max(0, gene_start - window_size/2) 97 | window_end = gene_end + window_size/2 98 | 99 | # Filter variants within the window 100 | filtered = data[ 101 | (data['CHR'] == chrom) & 102 | (data['POS'] >= window_start) & 103 | (data['POS'] <= window_end) 104 | ].copy() 105 | 106 | if not filtered.empty: 107 | # Calculate distance from gene: if inside the gene, distance is 0, if before, distance is negative, if after, distance is positive 108 | filtered.loc[:, 'Distance'] = np.nan 109 | 110 | # Create boolean masks 111 | mask_inside = filtered['POS'].between(gene_start, gene_end) 112 | mask_before = filtered['POS'] < gene_start 113 | mask_after = filtered['POS'] > gene_end 114 | 115 | filtered.loc[mask_inside, 'Distance'] = 0 116 | filtered.loc[mask_before, 'Distance'] = filtered['POS'] - gene_start 117 | filtered.loc[mask_after, 'Distance'] = filtered['POS'] - gene_end 118 | 119 | filtered["Distance"] = filtered["Distance"].astype("Int64") 120 | 121 | print(f"Found {len(filtered)} variants.") 122 | else: 123 | print(f"No variants found in a {window_size}bp window around {gene_identifier}") 124 | 125 | return filtered -------------------------------------------------------------------------------- /docs/build/_static/language_data.js: -------------------------------------------------------------------------------- 1 | /* 2 | * language_data.js 3 | * ~~~~~~~~~~~~~~~~ 4 | * 5 | * This script contains the language-specific data used by searchtools.js, 6 | * namely the list of stopwords, stemmer, scorer and splitter. 7 | * 8 | * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. 
9 | * :license: BSD, see LICENSE for details. 10 | * 11 | */ 12 | 13 | var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; 14 | 15 | 16 | /* Non-minified version is copied as a separate JS file, if available */ 17 | 18 | /** 19 | * Porter Stemmer 20 | */ 21 | var Stemmer = function() { 22 | 23 | var step2list = { 24 | ational: 'ate', 25 | tional: 'tion', 26 | enci: 'ence', 27 | anci: 'ance', 28 | izer: 'ize', 29 | bli: 'ble', 30 | alli: 'al', 31 | entli: 'ent', 32 | eli: 'e', 33 | ousli: 'ous', 34 | ization: 'ize', 35 | ation: 'ate', 36 | ator: 'ate', 37 | alism: 'al', 38 | iveness: 'ive', 39 | fulness: 'ful', 40 | ousness: 'ous', 41 | aliti: 'al', 42 | iviti: 'ive', 43 | biliti: 'ble', 44 | logi: 'log' 45 | }; 46 | 47 | var step3list = { 48 | icate: 'ic', 49 | ative: '', 50 | alize: 'al', 51 | iciti: 'ic', 52 | ical: 'ic', 53 | ful: '', 54 | ness: '' 55 | }; 56 | 57 | var c = "[^aeiou]"; // consonant 58 | var v = "[aeiouy]"; // vowel 59 | var C = c + "[^aeiouy]*"; // consonant sequence 60 | var V = v + "[aeiou]*"; // vowel sequence 61 | 62 | var mgr0 = "^(" + C + ")?" + V + C; // [C]VC... is m>0 63 | var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1 64 | var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1 65 | var s_v = "^(" + C + ")?" + v; // vowel in stem 66 | 67 | this.stemWord = function (w) { 68 | var stem; 69 | var suffix; 70 | var firstch; 71 | var origword = w; 72 | 73 | if (w.length < 3) 74 | return w; 75 | 76 | var re; 77 | var re2; 78 | var re3; 79 | var re4; 80 | 81 | firstch = w.substr(0,1); 82 | if (firstch == "y") 83 | w = firstch.toUpperCase() + w.substr(1); 84 | 85 | // Step 1a 86 | re = /^(.+?)(ss|i)es$/; 87 | re2 = /^(.+?)([^s])s$/; 88 | 89 | if (re.test(w)) 90 | w = w.replace(re,"$1$2"); 91 | else if (re2.test(w)) 92 | w = w.replace(re2,"$1$2"); 93 | 94 | // Step 1b 95 | re = /^(.+?)eed$/; 96 | re2 = /^(.+?)(ed|ing)$/; 97 | if (re.test(w)) { 98 | var fp = re.exec(w); 99 | re = new RegExp(mgr0); 100 | if (re.test(fp[1])) { 101 | re = /.$/; 102 | w = w.replace(re,""); 103 | } 104 | } 105 | else if (re2.test(w)) { 106 | var fp = re2.exec(w); 107 | stem = fp[1]; 108 | re2 = new RegExp(s_v); 109 | if (re2.test(stem)) { 110 | w = stem; 111 | re2 = /(at|bl|iz)$/; 112 | re3 = new RegExp("([^aeiouylsz])\\1$"); 113 | re4 = new RegExp("^" + C + v + "[^aeiouwxy]$"); 114 | if (re2.test(w)) 115 | w = w + "e"; 116 | else if (re3.test(w)) { 117 | re = /.$/; 118 | w = w.replace(re,""); 119 | } 120 | else if (re4.test(w)) 121 | w = w + "e"; 122 | } 123 | } 124 | 125 | // Step 1c 126 | re = /^(.+?)y$/; 127 | if (re.test(w)) { 128 | var fp = re.exec(w); 129 | stem = fp[1]; 130 | re = new RegExp(s_v); 131 | if (re.test(stem)) 132 | w = stem + "i"; 133 | } 134 | 135 | // Step 2 136 | re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/; 137 | if (re.test(w)) { 138 | var fp = re.exec(w); 139 | stem = fp[1]; 140 | suffix = fp[2]; 141 | re = new RegExp(mgr0); 142 | if (re.test(stem)) 143 | w = stem + step2list[suffix]; 144 | } 145 | 146 | // Step 3 147 | re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/; 148 | if (re.test(w)) { 149 | var fp = re.exec(w); 150 | stem = fp[1]; 151 | suffix = fp[2]; 152 | re = new RegExp(mgr0); 153 | if (re.test(stem)) 154 | w = stem + 
step3list[suffix]; 155 | } 156 | 157 | // Step 4 158 | re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/; 159 | re2 = /^(.+?)(s|t)(ion)$/; 160 | if (re.test(w)) { 161 | var fp = re.exec(w); 162 | stem = fp[1]; 163 | re = new RegExp(mgr1); 164 | if (re.test(stem)) 165 | w = stem; 166 | } 167 | else if (re2.test(w)) { 168 | var fp = re2.exec(w); 169 | stem = fp[1] + fp[2]; 170 | re2 = new RegExp(mgr1); 171 | if (re2.test(stem)) 172 | w = stem; 173 | } 174 | 175 | // Step 5 176 | re = /^(.+?)e$/; 177 | if (re.test(w)) { 178 | var fp = re.exec(w); 179 | stem = fp[1]; 180 | re = new RegExp(mgr1); 181 | re2 = new RegExp(meq1); 182 | re3 = new RegExp("^" + C + v + "[^aeiouwxy]$"); 183 | if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) 184 | w = stem; 185 | } 186 | re = /ll$/; 187 | re2 = new RegExp(mgr1); 188 | if (re.test(w) && re2.test(w)) { 189 | re = /.$/; 190 | w = w.replace(re,""); 191 | } 192 | 193 | // and turn initial Y back to y 194 | if (firstch == "y") 195 | w = firstch.toLowerCase() + w.substr(1); 196 | return w; 197 | } 198 | } 199 | 200 | -------------------------------------------------------------------------------- /docs/build/_static/sphinx_highlight.js: -------------------------------------------------------------------------------- 1 | /* Highlighting utilities for Sphinx HTML documentation. */ 2 | "use strict"; 3 | 4 | const SPHINX_HIGHLIGHT_ENABLED = true 5 | 6 | /** 7 | * highlight a given string on a node by wrapping it in 8 | * span elements with the given class name. 9 | */ 10 | const _highlight = (node, addItems, text, className) => { 11 | if (node.nodeType === Node.TEXT_NODE) { 12 | const val = node.nodeValue; 13 | const parent = node.parentNode; 14 | const pos = val.toLowerCase().indexOf(text); 15 | if ( 16 | pos >= 0 && 17 | !parent.classList.contains(className) && 18 | !parent.classList.contains("nohighlight") 19 | ) { 20 | let span; 21 | 22 | const closestNode = parent.closest("body, svg, foreignObject"); 23 | const isInSVG = closestNode && closestNode.matches("svg"); 24 | if (isInSVG) { 25 | span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); 26 | } else { 27 | span = document.createElement("span"); 28 | span.classList.add(className); 29 | } 30 | 31 | span.appendChild(document.createTextNode(val.substr(pos, text.length))); 32 | const rest = document.createTextNode(val.substr(pos + text.length)); 33 | parent.insertBefore( 34 | span, 35 | parent.insertBefore( 36 | rest, 37 | node.nextSibling 38 | ) 39 | ); 40 | node.nodeValue = val.substr(0, pos); 41 | /* There may be more occurrences of search term in this node. So call this 42 | * function recursively on the remaining fragment. 
43 | */ 44 | _highlight(rest, addItems, text, className); 45 | 46 | if (isInSVG) { 47 | const rect = document.createElementNS( 48 | "http://www.w3.org/2000/svg", 49 | "rect" 50 | ); 51 | const bbox = parent.getBBox(); 52 | rect.x.baseVal.value = bbox.x; 53 | rect.y.baseVal.value = bbox.y; 54 | rect.width.baseVal.value = bbox.width; 55 | rect.height.baseVal.value = bbox.height; 56 | rect.setAttribute("class", className); 57 | addItems.push({ parent: parent, target: rect }); 58 | } 59 | } 60 | } else if (node.matches && !node.matches("button, select, textarea")) { 61 | node.childNodes.forEach((el) => _highlight(el, addItems, text, className)); 62 | } 63 | }; 64 | const _highlightText = (thisNode, text, className) => { 65 | let addItems = []; 66 | _highlight(thisNode, addItems, text, className); 67 | addItems.forEach((obj) => 68 | obj.parent.insertAdjacentElement("beforebegin", obj.target) 69 | ); 70 | }; 71 | 72 | /** 73 | * Small JavaScript module for the documentation. 74 | */ 75 | const SphinxHighlight = { 76 | 77 | /** 78 | * highlight the search words provided in localstorage in the text 79 | */ 80 | highlightSearchWords: () => { 81 | if (!SPHINX_HIGHLIGHT_ENABLED) return; // bail if no highlight 82 | 83 | // get and clear terms from localstorage 84 | const url = new URL(window.location); 85 | const highlight = 86 | localStorage.getItem("sphinx_highlight_terms") 87 | || url.searchParams.get("highlight") 88 | || ""; 89 | localStorage.removeItem("sphinx_highlight_terms") 90 | url.searchParams.delete("highlight"); 91 | window.history.replaceState({}, "", url); 92 | 93 | // get individual terms from highlight string 94 | const terms = highlight.toLowerCase().split(/\s+/).filter(x => x); 95 | if (terms.length === 0) return; // nothing to do 96 | 97 | // There should never be more than one element matching "div.body" 98 | const divBody = document.querySelectorAll("div.body"); 99 | const body = divBody.length ? divBody[0] : document.querySelector("body"); 100 | window.setTimeout(() => { 101 | terms.forEach((term) => _highlightText(body, term, "highlighted")); 102 | }, 10); 103 | 104 | const searchBox = document.getElementById("searchbox"); 105 | if (searchBox === null) return; 106 | searchBox.appendChild( 107 | document 108 | .createRange() 109 | .createContextualFragment( 110 | '" 114 | ) 115 | ); 116 | }, 117 | 118 | /** 119 | * helper function to hide the search marks again 120 | */ 121 | hideSearchWords: () => { 122 | document 123 | .querySelectorAll("#searchbox .highlight-link") 124 | .forEach((el) => el.remove()); 125 | document 126 | .querySelectorAll("span.highlighted") 127 | .forEach((el) => el.classList.remove("highlighted")); 128 | localStorage.removeItem("sphinx_highlight_terms") 129 | }, 130 | 131 | initEscapeListener: () => { 132 | // only install a listener if it is really needed 133 | if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) return; 134 | 135 | document.addEventListener("keydown", (event) => { 136 | // bail for input elements 137 | if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; 138 | // bail with special keys 139 | if (event.shiftKey || event.altKey || event.ctrlKey || event.metaKey) return; 140 | if (DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS && (event.key === "Escape")) { 141 | SphinxHighlight.hideSearchWords(); 142 | event.preventDefault(); 143 | } 144 | }); 145 | }, 146 | }; 147 | 148 | _ready(() => { 149 | /* Do not call highlightSearchWords() when we are on the search page. 
150 | * It will highlight words from the *previous* search query. 151 | */ 152 | if (typeof Search === "undefined") SphinxHighlight.highlightSearchWords(); 153 | SphinxHighlight.initEscapeListener(); 154 | }); 155 | -------------------------------------------------------------------------------- /docs/build/py-modindex.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Python Module Index — genal v0.8 documentation 7 | 8 | 9 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 |
26 | 55 | 56 |
60 | 61 |
62 |
63 |
64 |
    65 |
  • 66 | 67 |
  • 68 |
  • 69 |
70 |
71 |
72 |
73 |
74 | 75 | 76 |

Python Module Index

77 | 78 |
79 | g 80 |
81 | 82 | 83 | 84 | 86 | 87 | 89 | 92 | 93 | 94 | 97 | 98 | 99 | 102 | 103 | 104 | 107 | 108 | 109 | 112 | 113 | 114 | 117 | 118 | 119 | 122 | 123 | 124 | 127 | 128 | 129 | 132 | 133 | 134 | 137 | 138 | 139 | 142 | 143 | 144 | 147 |
 
85 | g
90 | genal 91 |
    95 | genal.association 96 |
    100 | genal.clump 101 |
    105 | genal.extract_prs 106 |
    110 | genal.geno_tools 111 |
    115 | genal.lift 116 |
    120 | genal.MR 121 |
    125 | genal.MR_tools 126 |
    130 | genal.MRpresso 131 |
    135 | genal.proxy 136 |
    140 | genal.snp_query 141 |
    145 | genal.tools 146 |
148 | 149 | 150 |
151 |
152 |
153 | 154 |
155 | 156 |
157 |

© Copyright 2023, Cyprien A. Rivier.

158 |
159 | 160 | Built with Sphinx using a 161 | theme 162 | provided by Read the Docs. 163 | 164 | 165 |
166 |
167 |
168 |
169 |
170 | 175 | 176 | 177 | -------------------------------------------------------------------------------- /genal/snp_query.py: -------------------------------------------------------------------------------- 1 | import aiohttp 2 | import asyncio 3 | import numpy as np 4 | import nest_asyncio 5 | from tqdm.auto import tqdm 6 | 7 | # Using nest_asyncio to allow execution in notebooks 8 | nest_asyncio.apply() 9 | 10 | # Main function to start the event loop and run the asynchronous query 11 | def async_query_gwas_catalog(snps, p_threshold=5e-8, return_p=False, return_study=False, 12 | max_associations=None, timeout=100): 13 | try: 14 | loop = asyncio.get_event_loop() 15 | except RuntimeError: 16 | loop = asyncio.new_event_loop() 17 | asyncio.set_event_loop(loop) 18 | results_global, errors, timeouts = loop.run_until_complete( 19 | query_gwas_catalog_coroutine( 20 | snps, p_threshold, return_p, return_study, max_associations, timeout 21 | ) 22 | ) 23 | return results_global, errors, timeouts 24 | 25 | 26 | # Function to query GWAS Catalog API for SNP associations 27 | async def query_gwas_catalog_coroutine(snps, p_threshold=5e-8, return_p=False, return_study=False, 28 | max_associations=None, timeout=100): 29 | """ 30 | Query the GWAS Catalog API for SNP associations. 31 | 32 | Parameters: 33 | snps (list): List of SNPs to query. 34 | p_threshold (float): P-value threshold for filtering associations. 35 | return_p (bool): Whether to return the P-value of the association. 36 | return_study (bool): Whether to return the study ID of the association. 37 | max_associations (int): Maximum number of associations to return for each SNP. 38 | timeout (int): Timeout for each query in seconds. 39 | 40 | Returns: 41 | results_global (dict): Dictionary storing the SNP (keys) and results for each SNP: a list of single strings or tuples 42 | errors (list): List storing SNP for which the GWAS Catalog could not be queried 43 | timeouts (list): List storing SNP for which the timeout was reached 44 | """ 45 | 46 | results_global = {} # Dictionary storing the SNP (keys) and results for each SNP: a list of single strings or tuples 47 | errors = [] # List storing SNP for which the GWAS Catalog could not be queried 48 | timeouts = [] # List storing SNP for which the timeout was reached 49 | 50 | async def fetch(session, url, timeout_duration=timeout): 51 | try: 52 | # Wrap the entire fetch operation with asyncio.wait_for for timeout 53 | response = await asyncio.wait_for(session.get(url), timeout=timeout_duration) 54 | async with response: 55 | if response.status == 200: 56 | return await response.json() 57 | return None 58 | except asyncio.TimeoutError: 59 | return "TIMEOUT" 60 | except aiohttp.ClientError: 61 | return "ERROR" 62 | 63 | async def process_snp(session, snp): 64 | #print(f"Processing SNP {snp}") 65 | 66 | results_snp = [] # List storing the results for each association found for this SNP 67 | 68 | base_url = f"https://www.ebi.ac.uk/gwas/rest/api/singleNucleotidePolymorphisms/{snp}/associations?projection=associationBySnp" 69 | base_data = await fetch(session, base_url, timeout_duration=timeout) 70 | 71 | if base_data == "TIMEOUT": 72 | timeouts.append(snp) 73 | elif base_data == "ERROR" or base_data is None: 74 | errors.append(snp) 75 | else: 76 | i = 0 77 | # Process each association found for this SNP 78 | for assoc in base_data.get('_embedded', {}).get('associations', []): 79 | 80 | # If there are already max_associations, stop the loop 81 | if max_associations and i >= max_associations: 82 | break 
83 | i += 1 84 | 85 | pvalue = assoc.get("pvalue", np.nan) 86 | # If the pvalue of the association does not pass the threshold, the association is not processed further nor reported 87 | if pvalue < p_threshold: 88 | efo_traits = assoc.get("efoTraits", []) 89 | if efo_traits: 90 | trait = efo_traits[0].get("trait", "") 91 | else: 92 | trait = "" 93 | 94 | # If the return_study flag is active: query the page containing the GWAS Catalog study ID 95 | if return_study: 96 | study_url = assoc.get("_links", {}).get("study", {}).get("href", "") 97 | if study_url: 98 | study_data = await fetch(session, study_url, timeout_duration=timeout) 99 | if study_data == "TIMEOUT": 100 | study_id = "TIMEOUT" 101 | elif study_data == "ERROR" or study_data is None: 102 | study_id = "Error" 103 | else: 104 | study_id = study_data.get("accessionId", "Not found") 105 | else: 106 | study_id = "Not available" 107 | else: 108 | study_id = None 109 | 110 | # Return a tuple or a string depending on the return flags 111 | if return_p and return_study: 112 | result_assoc = (trait, "{:.4g}".format(pvalue), study_id) 113 | elif return_p: 114 | result_assoc = (trait, "{:.4g}".format(pvalue)) 115 | elif return_study: 116 | result_assoc = (trait, study_id) 117 | else: 118 | result_assoc = trait 119 | results_snp.append(result_assoc) 120 | 121 | else: 122 | continue 123 | 124 | # Clean the associations depending on the flag 125 | # If the P-value and Study ID are not returned, display each trait only once 126 | if not return_p and not return_study: 127 | results_snp = list(set(results_snp)) 128 | # If the P-value must be returned, return each trait once with the lowest p-value 129 | elif return_p and not return_study: 130 | min_trait = {} 131 | for trait, pvalue in results_snp: 132 | if trait not in min_trait or pvalue < min_trait[trait]: 133 | min_trait[trait] = pvalue 134 | results_snp = [(trait, min_trait[trait]) for trait in min_trait] 135 | 136 | results_global[snp] = results_snp 137 | 138 | async with aiohttp.ClientSession() as session: 139 | tasks = [process_snp(session, snp) for snp in snps] 140 | # Initialize tqdm progress bar 141 | with tqdm(total=len(tasks), desc="Processing SNPs") as pbar: 142 | for coro in asyncio.as_completed(tasks): 143 | await coro 144 | pbar.update(1) 145 | 146 | return results_global, errors, timeouts -------------------------------------------------------------------------------- /genal/colocalization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from numpy import exp, log 4 | from genal.geno_tools import check_beta_column, check_allele_column, check_snp_column, check_int_column 5 | 6 | # Currently does not support multi-allelic SNPs 7 | 8 | def coloc_abf_func(data1, data2, trait1_type="quant", trait2_type="quant", 9 | sdY1=None, sdY2=None, n1=None, n2=None, 10 | p1=1e-4, p2=1e-4, p12=1e-5, merge_on_snp=False): 11 | """ 12 | Perform colocalization analysis between two GWAS datasets using approximate Bayes factors. 13 | Corresponds to the :meth:`Geno.colocalize` method. 
14 | 15 | Args: 16 | data1: DataFrame containing GWAS results for trait 1 17 | data2: DataFrame containing GWAS results for trait 2 18 | trait1_type: Type of trait 1 ("quant" for quantitative traits or "cc" for case-control traits), default is "quant" 19 | trait2_type: Type of trait 2 ("quant" for quantitative traits or "cc" for case-control traits), default is "quant" 20 | sdY1: Standard deviation of trait 1 (required for quantitative traits) 21 | sdY2: Standard deviation of trait 2 (required for quantitative traits) 22 | n1: Sample size for trait 1 (used to estimate sdY if not provided) 23 | n2: Sample size for trait 2 (used to estimate sdY if not provided) 24 | p1: Prior probability SNP associated with trait 1, default is 1e-4 25 | p2: Prior probability SNP associated with trait 2, default is 1e-4 26 | p12: Prior probability SNP associated with both traits, default is 1e-5 27 | merge_on_snp: If True, merge the datasets on SNP column. If False, first attempt to merge on CHR and POS columns. 28 | 29 | """ 30 | 31 | # Ensure that the BETA columns are preprocessed 32 | check_beta_column(data1, 'BETA', 'Fill') 33 | check_beta_column(data2, 'BETA', 'Fill') 34 | 35 | # Adjust EAF column names before merging in case one of the datasets does not have it 36 | if 'EAF' in data1.columns: 37 | data1.rename(columns={'EAF': 'EAF_1'}, inplace=True) 38 | if 'EAF' in data2.columns: 39 | data2.rename(columns={'EAF': 'EAF_2'}, inplace=True) 40 | 41 | # First determine if we can merge on position, otherwise try SNP 42 | if all(col in data1.columns for col in ['CHR', 'POS']) and \ 43 | all(col in data2.columns for col in ['CHR', 'POS']) and not merge_on_snp: 44 | 45 | print("Merging datasets using genomic positions (CHR, POS)") 46 | 47 | # Ensure that the CHR and POS columns are preprocessed 48 | check_int_column(data1, "CHR") 49 | check_int_column(data2, "CHR") 50 | check_int_column(data1, "POS") 51 | check_int_column(data2, "POS") 52 | 53 | # Merge using position 54 | merged_data = pd.merge( 55 | data1, 56 | data2, 57 | on=['CHR', 'POS'], 58 | how='left', 59 | suffixes=('_1', '_2') 60 | ) 61 | 62 | elif 'SNP' in data1.columns and 'SNP' in data2.columns: 63 | print("Position columns (CHR, POS) not present in both datasets. 
Merging datasets using SNP IDs.") 64 | 65 | # Ensure that the SNP column is preprocessed 66 | check_snp_column(data1) 67 | check_snp_column(data2) 68 | 69 | # Merge using SNP 70 | merged_data = pd.merge( 71 | data1, 72 | data2, 73 | on='SNP', 74 | suffixes=('_1', '_2') 75 | ) 76 | 77 | else: 78 | raise ValueError("At least CHR/POS or SNP columns must be present in both datasets for colocalization analysis") 79 | 80 | # After merging, check if we can align alleles 81 | if all(col in merged_data.columns for col in ['EA_1', 'NEA_1', 'EA_2', 'NEA_2']): 82 | print("Aligning effect alleles between datasets") 83 | 84 | # Ensure allele columns are preprocessed 85 | check_allele_column(data1, "EA", keep_indel=False) 86 | check_allele_column(data1, "NEA", keep_indel=False) 87 | check_allele_column(data2, "EA", keep_indel=False) 88 | check_allele_column(data2, "NEA", keep_indel=False) 89 | 90 | # Adjust BETA from trait 2 to correspond to the same effect allele as trait 1 91 | conditions = [ 92 | merged_data["EA_1"] == merged_data["EA_2"], 93 | merged_data["EA_1"] == merged_data["NEA_2"], 94 | True, 95 | ] 96 | choices = [ 97 | merged_data["BETA_2"], 98 | -merged_data["BETA_2"], 99 | np.nan, 100 | ] 101 | merged_data["BETA_2"] = np.select(conditions, choices) 102 | else: 103 | print("Allele columns (EA, NEA) not present in both datasets. " 104 | "This might lead to incorrect results if the effect estimates (BETA) were not obtained with the same reference allele in both datasets.") 105 | 106 | # Clean up columns 107 | merged_data.drop(columns=["EA_2", "NEA_2", "SNP_2", "CHR_2", "POS_2"], inplace=True, errors='ignore') 108 | merged_data.rename(columns={"SNP_1": "SNP", "CHR_1": "CHR", "POS_1": "POS"}, inplace=True, errors='ignore') 109 | 110 | # Drop any rows with duplicate values 111 | if "SNP" in merged_data.columns: 112 | merged_data.drop_duplicates(subset=['SNP'], keep='first', inplace=True) 113 | if "CHR" in merged_data.columns and "POS" in merged_data.columns: 114 | merged_data.drop_duplicates(subset=["CHR", "POS"], keep='first', inplace=True) 115 | 116 | # Drop any rows with missing values 117 | merged_data = merged_data.dropna() 118 | if merged_data.empty: 119 | raise ValueError("No overlapping variants found between the datasets") 120 | 121 | print(f"Using {len(merged_data)} overlapping variants for colocalization analysis") 122 | 123 | # Estimate sdY if not provided for quantitative traits 124 | if trait1_type == "quant" and sdY1 is None: 125 | if 'EAF_1' not in merged_data.columns or n1 is None: 126 | print("Neither sdY1 nor EAF and n1 are provided for trait 1. Assuming sdY1 = 1.") 127 | sdY1 = 1 128 | else: 129 | sdY1 = sdY_est(merged_data['SE_1']**2, merged_data['EAF_1'], n1) 130 | print(f"Using EAF and n1 to estimate sdY1: {sdY1:.2f}") 131 | 132 | if trait2_type == "quant" and sdY2 is None: 133 | if 'EAF_2' not in merged_data.columns or n2 is None: 134 | print("Neither sdY2 nor EAF and n2 are provided for trait 2. 
Assuming sdY2 = 1.") 135 | sdY2 = 1 136 | else: 137 | sdY2 = sdY_est(merged_data['SE_2']**2, merged_data['EAF_2'], n2) 138 | print(f"Using EAF and n2 to estimate sdY2: {sdY2:.2f}") 139 | 140 | # Calculate Bayes factors for each dataset 141 | lABF_1 = approx_bf_estimates(merged_data['BETA_1'], merged_data['SE_1']**2, 142 | trait_type=trait1_type, sdY=sdY1) 143 | lABF_2 = approx_bf_estimates(merged_data['BETA_2'], merged_data['SE_2']**2, 144 | trait_type=trait2_type, sdY=sdY2) 145 | 146 | # Adjust priors based on number of SNPs 147 | n_snps = len(merged_data) 148 | if n_snps * p1 >= 1: 149 | p1 = 1 / (n_snps + 1) 150 | if n_snps * p2 >= 1: 151 | p2 = 1 / (n_snps + 1) 152 | if n_snps * p12 >= 1: 153 | p12 = 1 / (n_snps + 1) 154 | 155 | # Calculate posterior probabilities 156 | pp = combine_abf(lABF_1, lABF_2, p1, p2, p12) 157 | 158 | # Add SNP-specific results 159 | results_df = merged_data.copy() 160 | results_df['lABF_1'] = lABF_1 161 | results_df['lABF_2'] = lABF_2 162 | results_df['internal.sum.lABF'] = lABF_1 + lABF_2 163 | 164 | # Calculate SNP-specific PP for H4 165 | my_denom_log_abf = logsum(results_df['internal.sum.lABF']) 166 | results_df['SNP.PP.H4'] = np.exp(results_df['internal.sum.lABF'] - my_denom_log_abf) 167 | 168 | return { 169 | 'nsnps': n_snps, 170 | **pp 171 | } 172 | 173 | def approx_bf_estimates(beta, varbeta, trait_type="quant", sdY=1, effect_priors={'quant': 0.15, 'cc': 0.2}): 174 | """ 175 | Calculate approximate Bayes factors using regression estimates. 176 | 177 | Args: 178 | beta: effect size estimate 179 | varbeta: variance of the effect size estimate 180 | trait_type: either "quant" for quantitative trait or "cc" for case-control 181 | sdY: standard deviation of the trait (for quantitative traits) 182 | effect_priors: dictionary with prior effect sizes for quantitative and case-control traits 183 | 184 | Returns: 185 | array: log approximate Bayes factors 186 | """ 187 | z = beta / np.sqrt(varbeta) 188 | 189 | # Set prior standard deviation based on trait type 190 | if trait_type == "quant": 191 | sd_prior = effect_priors['quant'] * sdY 192 | else: # case-control 193 | sd_prior = effect_priors['cc'] 194 | 195 | r = sd_prior**2 / (sd_prior**2 + varbeta) 196 | lABF = 0.5 * (np.log(1 - r) + (r * z**2)) 197 | return lABF 198 | 199 | def logsum(x): 200 | """Calculate log of sum of exponentials""" 201 | my_max = np.max(x) 202 | return my_max + np.log(np.sum(np.exp(x - my_max))) 203 | 204 | def logdiff(x, y): 205 | """Calculate log of difference of exponentials""" 206 | my_max = max(x, y) 207 | return my_max + np.log(exp(x - my_max) - np.exp(y - my_max)) 208 | 209 | def combine_abf(l1, l2, p1, p2, p12): 210 | """Calculate posterior probabilities for different hypotheses""" 211 | lsum = l1 + l2 212 | 213 | lH0_abf = 0 214 | lH1_abf = np.log(p1) + logsum(l1) 215 | lH2_abf = np.log(p2) + logsum(l2) 216 | lH3_abf = np.log(p1) + np.log(p2) + logdiff(logsum(l1) + logsum(l2), logsum(lsum)) 217 | lH4_abf = np.log(p12) + logsum(lsum) 218 | 219 | all_abf = np.array([lH0_abf, lH1_abf, lH2_abf, lH3_abf, lH4_abf]) 220 | denom_log_abf = logsum(all_abf) 221 | pp_abf = np.exp(all_abf - denom_log_abf) 222 | 223 | return { 224 | 'PP.H0.abf': pp_abf[0], 225 | 'PP.H1.abf': pp_abf[1], 226 | 'PP.H2.abf': pp_abf[2], 227 | 'PP.H3.abf': pp_abf[3], 228 | 'PP.H4.abf': pp_abf[4] 229 | } 230 | 231 | def sdY_est(vbeta, maf, n): 232 | """ 233 | Estimate trait standard deviation given vectors of variance of coefficients, MAF and sample size. 
234 | 235 | Args: 236 | vbeta: vector of variance of coefficients 237 | maf: vector of MAF (same length as vbeta) 238 | n: sample size 239 | 240 | Returns: 241 | float: estimated standard deviation of Y 242 | """ 243 | oneover = 1/vbeta 244 | nvx = 2 * n * maf * (1-maf) 245 | # Fit linear regression through origin 246 | coef = np.sum(nvx * oneover) / np.sum(oneover**2) 247 | if coef < 0: 248 | raise ValueError("Estimated sdY is negative - this can happen with small datasets, or those with errors. A reasonable estimate of sdY is required to continue.") 249 | return np.sqrt(coef) -------------------------------------------------------------------------------- /docs/build/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | genal: A Python Toolkit for Genetic Risk Scoring and Mendelian Randomization — genal v0.8 documentation 8 | 9 | 10 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 |
25 | 54 | 55 |
59 | 60 |
61 |
62 |
63 |
    64 |
  • 65 | 66 |
  • 67 | View page source 68 |
  • 69 |
70 |
71 |
72 |
73 |
74 | 75 |
76 |

genal: A Python Toolkit for Genetic Risk Scoring and Mendelian Randomization

77 |
78 |
Author:
79 |

Cyprien Rivier

80 |
81 |
Date:
82 |

Aug 13, 2024

83 |
84 |
Version:
85 |

“0.8”

86 |
87 |
88 |

Genal is a Python module designed to make it easy to compute genetic risk scores and run Mendelian randomization analyses. It integrates a collection of tools that facilitate the cleaning of single nucleotide polymorphism data (usually derived from Genome-Wide Association Studies) and enable the execution of key clinical population genetics workflows. The functionalities provided by genal include clumping, lifting (a short sketch follows this paragraph), association testing, polygenic risk scoring, and Mendelian randomization analyses, all within a single Python module.

89 |
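The lifting step mentioned above is also exposed as a module-level helper, lift_data, defined in genal/lift.py further down in this listing (it backs the Geno.lift method). The following is a minimal sketch, assuming the genal reference folder is configured and network access is available for the chain-file download on first use; the positions and identifiers are made up for illustration.

import pandas as pd
from genal.lift import lift_data

# Toy summary-statistics rows; lift_data only requires CHR and POS columns.
snps = pd.DataFrame({
    "CHR": [1, 2, 7],
    "POS": [754182, 27730940, 92383888],   # illustrative hg19 positions
    "SNP": ["var_a", "var_b", "var_c"],    # illustrative identifiers
})

# Lift from hg19 to hg38 with the pure-Python path (pyliftover). Passing
# liftover_path pointing to the UCSC liftOver executable uses it instead,
# which is faster for large datasets.
lifted = lift_data(snps, start="hg19", end="hg38")
print(lifted[["CHR", "POS", "SNP"]].head())

Positions that cannot be lifted are dropped and reported, so the returned frame may contain fewer rows than the input.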

The module prioritizes user-friendliness and intuitive operation, aiming to reduce the complexity of data analysis for researchers. Despite its focus on simplicity, Genal does not sacrifice depth of customization or precision of analysis; the gene-region filtering sketch below illustrates one of the helpers whose identifier type, window size, and genome build can all be adjusted. Researchers can expect to maintain analytical rigour while benefiting from a streamlined experience.

90 |
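As a concrete illustration of that flexibility, the gene-region filter defined in genal/genes.py earlier in this listing (backing the Geno.filter_by_gene method) restricts a variant table to a window around a gene. A minimal sketch, assuming a configured reference folder (the gene-coordinate file is downloaded on first use) and made-up positions:

import pandas as pd
from genal.genes import filter_by_gene_func

# Toy variant table; filter_by_gene_func expects at least CHR and POS columns.
variants = pd.DataFrame({
    "CHR": [19, 19, 19, 1],
    "POS": [45_410_000, 45_200_000, 46_900_000, 1_000_000],  # illustrative positions
    "SNP": ["var1", "var2", "var3", "var4"],
})

# Keep variants within a 1 Mb window (+/- 500 kb) around APOE, genome build 37.
near_apoe = filter_by_gene_func(variants, "APOE", id_type="symbol",
                                window_size=1_000_000, build="37")
print(near_apoe[["SNP", "POS", "Distance"]])

The returned frame carries a Distance column: 0 inside the gene, negative upstream of the gene start, positive downstream of the gene end.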

Genal draws on concepts from well-established R packages such as TwoSampleMR, MR-Presso, MendelianRandomization, and gwasvcf, adapting their proven methodologies to the Python environment. This approach ensures that users have access to tried and tested techniques with the versatility of Python’s data science tools.

91 |

To install the latest release, type:

92 |
pip install genal-python
 93 | 
94 |
95 |
96 |
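Once installed, the same functionality is available as plain functions operating on pandas DataFrames. The sketch below calls coloc_abf_func from genal/colocalization.py, shown earlier in this listing (it backs the Geno.colocalize method), on two made-up summary-statistics tables; with only five variants the posterior probabilities are meaningless and serve only to show the returned keys (nsnps and PP.H0.abf through PP.H4.abf).

import pandas as pd
from genal.colocalization import coloc_abf_func

# Two toy GWAS tables sharing SNP identifiers. BETA and SE are required;
# EA/NEA let the effect alleles be aligned between the two datasets.
trait1 = pd.DataFrame({
    "SNP":  ["var1", "var2", "var3", "var4", "var5"],
    "EA":   ["A", "C", "G", "T", "A"],
    "NEA":  ["G", "T", "A", "C", "G"],
    "BETA": [0.10, 0.02, -0.05, 0.30, 0.01],
    "SE":   [0.02, 0.02, 0.02, 0.03, 0.02],
})
trait2 = trait1.copy()
trait2["BETA"] = [0.08, 0.01, -0.04, 0.25, 0.02]

# sdY1/sdY2 are supplied directly here; alternatively EAF columns plus n1/n2
# can be provided and the trait standard deviations are estimated from them.
result = coloc_abf_func(trait1, trait2, trait1_type="quant", trait2_type="quant",
                        sdY1=1, sdY2=1)
print(result["nsnps"], result["PP.H4.abf"])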

Contents

97 |
98 | 106 |
107 |
108 |
109 |
110 |

Indices and tables

111 | 116 |
117 |

Citation

118 |

If you use genal in your work, please cite the following paper:

119 |
120 |
121 | [Rivier.2024] 122 |

Genal: A Python Toolkit for Genetic Risk Scoring and Mendelian Randomization 123 | Cyprien A. Rivier, Santiago Clocchiatti-Tuozzo, Shufan Huo, Victor Torres-Lopez, Daniela Renedo, Kevin N. Sheth, Guido J. Falcone, Julian N. Acosta. 124 | medRxiv. 2024 May 10.1101/2024.05.23.24307776.

125 |
126 |
127 |
128 |
129 |

References

130 |
131 |
132 | [Hemani.2018] 133 |

The MR-Base platform supports systematic causal inference across the human phenome. 134 | Hemani G, Zheng J, Elsworth B, Wade KH, Baird D, Haberland V, Laurin C, Burgess S, Bowden J, Langdon R, Tan VY, Yarmolinsky J, Shihab HA, Timpson NJ, Evans DM, Relton C, Martin RM, Davey Smith G, Gaunt TR, Haycock PC, The MR-Base Collaboration 135 | eLife. 2018 May 10.7554/eLife.34408. 136 | PMID: 29846171.

137 |
138 |
139 | [Verbanck.2018] 140 |

Detection of widespread horizontal pleiotropy in causal relationships inferred from Mendelian randomization between complex traits and diseases. 141 | Marie Verbanck, Chia-Yen Chen, Benjamin Neale, Ron Do. 142 | Nature Genetics 2018 May 10.1038/s41588-018-0099-7. 143 | PMID: 29686387.

144 |
145 |
146 | [Lyon.2020] 147 |

The variant call format provides efficient and robust storage of GWAS summary statistics. 148 | Matthew Lyon, Shea J Andrews, Ben Elsworth, Tom R Gaunt, Gibran Hemani, Edoardo Marcora. 149 | bioRxiv 2020 May 30 2020.05.29.115824v1. 150 | PMID: 33441155.

151 |
152 |
153 |
154 |
155 | 156 | 157 |
158 |
159 |
162 | 163 |
164 | 165 |
166 |

© Copyright 2023, Cyprien A. Rivier.

167 |
168 | 169 | Built with Sphinx using a 170 | theme 171 | provided by Read the Docs. 172 | 173 | 174 |
175 |
176 |
177 |
178 |
179 | 184 | 185 | 186 | -------------------------------------------------------------------------------- /genal/lift.py: -------------------------------------------------------------------------------- 1 | from pyliftover import LiftOver 2 | import os, subprocess 3 | import numpy as np 4 | import wget 5 | import gzip 6 | import shutil 7 | import uuid 8 | import pandas as pd 9 | from concurrent.futures import ThreadPoolExecutor, as_completed 10 | 11 | from .tools import read_config, create_tmp 12 | 13 | 14 | def lift_data( 15 | data, 16 | start="hg19", 17 | end="hg38", 18 | extraction_file=False, 19 | chain_file=None, 20 | name=None, 21 | liftover_path=None, 22 | object_id="tmp_id", 23 | ): 24 | """ 25 | Perform a liftover from one genetic build to another. If the chain file required for the liftover is not present, it will be downloaded. It"s also possible to manually provide the path to the chain file. 26 | If the dataset is large, it is suggested to use an alternate method (e.g., `lift_data_liftover`). 27 | 28 | Args: 29 | data (pd.DataFrame): The input data containing at least "CHR" and "POS" columns. 30 | start (str, optional): The current build of the data. Defaults to "hg19". 31 | end (str, optional): The target build for liftover. Defaults to "hg38". 32 | extraction_file (bool, optional): If True, also prints a CHR POS SNP space-delimited file for extraction. Defaults to False. 33 | chain_file (str, optional): Path to a local chain file for the lift. Overrides the start and end arguments if provided. 34 | name (str, optional): Specify a filename or filepath (without extension) for saving. If not provided, the data is not saved. 35 | liftover_path (str, optional): Specify the path to the USCS liftover executable. If not provided, the lift will be done in python (slower for large amount of SNPs). 36 | object_id (str, optional): Specify the object id for tmp file writing (internal use only) 37 | 38 | Raises: 39 | ValueError: If required columns are missing or if provided chain file path is incorrect. 40 | 41 | Returns: 42 | pd.DataFrame: Lifted data. 43 | 44 | Notes: 45 | Function for the :meth:`Geno.lift` method. 46 | """ 47 | 48 | # Prepare chain file and get its path 49 | chain_path = prepare_chain_file(chain_file, start, end) 50 | 51 | # Prepare the data for lifting: handle missing values in CHR, POS columns 52 | nrows = data.shape[0] 53 | data.dropna(subset=["CHR", "POS"], inplace=True) 54 | data.reset_index(drop=True, inplace=True) 55 | n_na = nrows - data.shape[0] 56 | if n_na: 57 | print( 58 | f"Excluded {n_na} SNPs ({n_na/nrows*100:.3f}%) with NaN values in CHR or POS columns." 59 | ) 60 | 61 | # Perform liftover with the liftover executable or in python 62 | if liftover_path: 63 | data = lift_coordinates_liftover(data, object_id, chain_path, liftover_path) 64 | else: 65 | data = lift_coordinates_python(data, chain_path) 66 | 67 | # Handle post-liftover operations 68 | data = post_lift_operations(data, name, extraction_file) 69 | 70 | return data 71 | 72 | 73 | def prepare_chain_file(chain_file, start, end): 74 | """Handle chain file loading, downloading if necessary. Return its path.""" 75 | if chain_file is not None: # If a local chain file is provided 76 | if not os.path.isfile(chain_file): 77 | raise ValueError("The provided path does not lead to a valid file.") 78 | print( 79 | "You provided a path to a local chain path which will be used for the lift." 
80 | ) 81 | chain_path = chain_file 82 | else: # Use the specified start and end builds to identify chain file 83 | # Construct chain filename 84 | chain_name = f"{start.lower()}To{end.capitalize()}.over.chain" 85 | config = read_config() 86 | ref_path = config["paths"]["ref_path"] 87 | chains_folder_path = os.path.join(ref_path, "chain_files") 88 | 89 | # Ensure directory for chain files exists 90 | if not os.path.exists(chains_folder_path): 91 | try: 92 | os.makedirs(chains_folder_path) 93 | except OSError: 94 | raise OSError( 95 | "Unable to create the 'tmp_GENAL' directory. Check permissions." 96 | ) 97 | 98 | # Check for the chain file locally or download it if necessary 99 | chain_path = os.path.join(chains_folder_path, chain_name) 100 | if not os.path.isfile(chain_path): 101 | print( 102 | f"The chain file to lift from {start} to {end} was not found. Attempting to download it..." 103 | ) 104 | # Download the chain file 105 | url = f"https://hgdownload.soe.ucsc.edu/goldenPath/{start.lower()}/liftOver/{chain_name}.gz" 106 | try: 107 | wget.download(url, out=chains_folder_path) 108 | # Decompress the downloaded file 109 | print(f"The download was successful. Unzipping...") 110 | with gzip.open(f"{chain_path}.gz", "rb") as f_in, open( 111 | chain_path, "wb" 112 | ) as f_out: 113 | shutil.copyfileobj(f_in, f_out) 114 | except Exception as e: 115 | print(f"The download was unsuccessful: {e}") 116 | print( 117 | "Consider downloading the chain file manually from the UCSC website and providing its path via the chain_file argument." 118 | ) 119 | raise FileNotFoundError("Chain file not found.") 120 | 121 | return chain_path 122 | 123 | 124 | def lift_coordinates_liftover(data, object_id, chain_path, liftover_path): 125 | """Lift data using the liftover executable and a chain file.""" 126 | # Add the executable part if not there 127 | if not os.path.isfile(liftover_path): 128 | liftover_path = os.path.join(liftover_path, "liftOver") 129 | # Check that it is indeed the path to liftOver executable 130 | try: 131 | process = subprocess.run( 132 | [liftover_path], 133 | stdout=subprocess.PIPE, 134 | stderr=subprocess.PIPE, 135 | timeout=5, 136 | text=True, 137 | ) 138 | if not process.stderr.startswith("liftOver"): 139 | raise TypeError( 140 | "The path provided is an executable, but not the liftOver executable. Check the path." 141 | ) 142 | except Exception as e: 143 | raise TypeError(e) 144 | print("Lifting coordinates using liftOver.") 145 | 146 | # Write data in correct format for liftOver 147 | create_tmp() 148 | data["CHR_liftover"] = "chr" + data.CHR.astype(str) 149 | to_lift_filename = os.path.join("tmp_GENAL", f"{object_id}.prelift") 150 | lifted_filename = os.path.join("tmp_GENAL", f"{object_id}.postlift") 151 | unmapped_filename = os.path.join("tmp_GENAL", f"{object_id}_unMapped") 152 | data[["CHR_liftover", "POS", "POS"]].to_csv( 153 | to_lift_filename, sep=" ", index=False, header=False 154 | ) 155 | 156 | # Call the liftOver software 157 | command = f"{liftover_path} {to_lift_filename} \ 158 | {chain_path} {lifted_filename} {unmapped_filename}" 159 | try: 160 | output = subprocess.run( 161 | command, shell=True, capture_output=True, text=True, check=True 162 | ) 163 | except Exception as e: 164 | print(f"Error running liftOver: {e}") 165 | raise ValueError("Error running liftOver. Check error message for more details.") 166 | 167 | ## Read the output, print the number of unlifted SNPs and remove them from the prelift data. 
168 | df_post = pd.read_csv(lifted_filename, sep="\t", header=None) 169 | unMapped = open(unmapped_filename, "r") 170 | Lines = unMapped.readlines() 171 | if len(Lines) > 0: 172 | print(f"{int(len(Lines)/2)} SNPs could not be lifted.") 173 | else: 174 | print(f"All SNPs have been lifted.") 175 | indices = list() 176 | for i in range(1, len(Lines), 2): 177 | c = Lines[i].strip() 178 | (chrom, pos, pos) = c.split("\t") 179 | indices.append(str(chrom) + ":" + str(pos)) 180 | drop_indices = data[(data.CHR_liftover.astype(str) + ":" + data.POS.astype(str)).isin(indices)].index 181 | data.drop(index=drop_indices, inplace=True) 182 | data.reset_index(drop=True, inplace=True) 183 | 184 | # Check the length of files 185 | if len(data) != len(df_post): 186 | raise ValueError( 187 | "There was a problem lifting with liftOver. Try lifting in python (liftover_path = None)." 188 | ) 189 | 190 | ## Merge prelift and postlift data. Unknown chr from the output of liftOver are assigned the value 99. SNPs mapped to unknown chr are deleted from the final data and their number printed. 191 | data["POS"] = df_post[1].astype(int) 192 | data["CHR"] = ( 193 | df_post[0] 194 | .str.split("chr", expand=True)[1] 195 | .str.split("_", expand=True)[0] 196 | .replace({"X": 99, "Y": 99, "Un": 99}) 197 | .astype(int) 198 | ) 199 | nrow_before = data.shape[0] 200 | drop_chr_indices = data[data.CHR == 99].index 201 | data.drop(index=drop_chr_indices, inplace=True) 202 | nrow_diff = nrow_before - data.shape[0] 203 | if nrow_diff > 0: 204 | print( 205 | f"{nrow_diff} SNPs were lifted to an unknown chromosome and deleted from the final files." 206 | ) 207 | data.drop(columns=["CHR_liftover"], inplace=True) 208 | return data 209 | 210 | 211 | def lift_coordinates_python(data, chain_path): 212 | """Perform liftover on data using the chain passed.""" 213 | lo = LiftOver(chain_path) 214 | 215 | # Print message 216 | print("Lifting coordinates in python...") 217 | nrows = data.shape[0] 218 | if nrows > 500000: 219 | print("Your data is large, this can take a few minutes...") 220 | 221 | # Create a list of tuples and lift 222 | coordinates = list(zip(data["CHR"], data["POS"])) 223 | 224 | # Perform the lift 225 | def convert_coordinate(args): 226 | return lo.convert_coordinate(f"chr{args[0]}", args[1], "-") 227 | 228 | results = list(ThreadPoolExecutor().map(convert_coordinate, coordinates)) 229 | 230 | data["POS"] = [res[0][1] if res else np.nan for res in results] 231 | data["CHR"] = [res[0][0].split("chr")[1] if res else np.nan for res in results] 232 | nrows = data.shape[0] 233 | data.dropna(subset=["POS", "CHR"], inplace=True) 234 | data["POS"] = data["POS"].astype("Int32") 235 | data["CHR"] = data["CHR"].astype("Int32") 236 | data.reset_index(drop=True, inplace=True) 237 | n_na = nrows - data.shape[0] 238 | if n_na: 239 | print(f"{n_na} SNPs ({n_na/nrows*100:.3f}%) could not be lifted.") 240 | else: 241 | print("All SNPs have been lifted.") 242 | return data 243 | 244 | 245 | def post_lift_operations(data, name, extraction_file): 246 | """Handle post-liftover operations like reporting, and saving results.""" 247 | if name: 248 | filename = os.path.splitext(name)[0] + ".txt" 249 | data.to_csv(f"{filename}", sep="\t", header=True, index=False) 250 | print(f"Lifted list of SNPs saved to {filename}") 251 | if extraction_file: 252 | if not ("SNP" in data.columns): 253 | data["SNP"] = data["CHR"].astype(str) + ":" + data["POS"].astype(str) 254 | data[["CHR", "POS", "SNP"]].to_csv( 255 | f"{name + '_lifted'}_extraction.txt", sep=" ", 
header=False, index=False 256 | ) 257 | print(f"Extraction file saved to {name+ '_lifted'}_extraction.txt") 258 | return data 259 | -------------------------------------------------------------------------------- /genal/MRpresso.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import statsmodels.formula.api as smf 4 | from concurrent.futures import ProcessPoolExecutor 5 | from sklearn.linear_model import LinearRegression 6 | from tqdm import tqdm 7 | from numpy.random import default_rng 8 | from functools import partial 9 | 10 | ##todo: implement the multivariable option, for the moment we assume only 1 BETA_e column 11 | 12 | 13 | # MR-PRESSO main function 14 | def mr_presso( 15 | data, 16 | BETA_e_columns=["BETA_e"], 17 | n_iterations=1000, 18 | outlier_test=True, 19 | distortion_test=True, 20 | significance_p=0.05, 21 | cpus=5, 22 | ): 23 | """ 24 | Perform the MR-PRESSO algorithm for detection of horizontal pleiotropy. 25 | 26 | Args: 27 | data (pd.DataFrame): DataFrame with at least 4 columns: BETA_o (outcome), SE_o, BETA_e (exposure), SE_e. 28 | BETA_e_columns (list): List of exposure beta columns. 29 | n_iterations (int): Number of steps performed (random data generation). 30 | outlier_test (bool): If True, identifies outlier SNPs responsible for horizontal pleiotropy. 31 | distortion_test (bool): If True, tests significant distortion in the causal estimates. 32 | significance_p (float): Statistical significance threshold for the detection of horizontal pleiotropy. 33 | cpus (int): Number of CPUs to use for parallel processing. 34 | 35 | Returns: 36 | mod_table (pd.DataFrame): DataFrame with the original and outlier-corrected inverse variance-weighted MR results. 37 | GlobalTest (dict): Dictionary with p-value of the global MR-PRESSO test. 38 | OutlierTest (pd.DataFrame): DataFrame with p-value for each SNP for the outlier test. 39 | BiasTest (dict): Dictionary with results of the distortion test. 
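        subset_data (pd.DataFrame or None): Data with the outlier SNPs removed (None if the outlier/distortion tests were not run or no outliers were found).

    Example (illustrative; assumes `harmonized` is a DataFrame with BETA_e, SE_e, BETA_o, SE_o columns):
        mod_table, global_test, outlier_test, bias_test, subset_data = mr_presso(harmonized, n_iterations=5000)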
40 | """ 41 | # Transforming the data 42 | data = data[["BETA_o", *BETA_e_columns, "SE_o", "SE_e"]].dropna() 43 | data[["BETA_o", *BETA_e_columns]] = data[["BETA_o", *BETA_e_columns]].multiply( 44 | np.sign(data[BETA_e_columns[0]]), axis=0 45 | ) 46 | data["Weights"] = 1 / (data["SE_o"] ** 2) 47 | 48 | if len(data) <= len(BETA_e_columns) + 2: 49 | raise Exception("Not enough instrumental variables (variants)") 50 | if len(data) >= n_iterations: 51 | raise Exception( 52 | "Not enough elements to compute empirical P-values, increase n_iterations" 53 | ) 54 | 55 | print(f"Running the MR-PRESSO algorithm with N = {n_iterations} iterations.") 56 | # 1- Computing the observed residual sum of squares (RSS) 57 | print("Computing the observed residual sum of squares...") 58 | RSSobs = getRSS_LOO(data, BETA_e_columns, outlier_test) 59 | 60 | # 2- Computing the distribution of expected residual sum of squares (RSS) 61 | print("Computing the global MR-PRESSO p-value...") 62 | partial_parallel_RSS_LOO = partial( 63 | parallel_RSS_LOO, data=data, BETA_e_columns=BETA_e_columns 64 | ) # Wrapper function freezing the parallel_RSS_LOO call 65 | with ProcessPoolExecutor(max_workers=cpus) as executor: 66 | results = list( 67 | tqdm( 68 | executor.map(partial_parallel_RSS_LOO, range(n_iterations)), 69 | total=n_iterations, 70 | desc="Generating random data", 71 | ncols=100, 72 | ) 73 | ) 74 | 75 | RSSexp = [res[0] for res in results] 76 | Random_data_e = np.vstack([r[1] for r in results]) 77 | Random_data_o = np.vstack([r[2] for r in results]) 78 | 79 | global_p = np.sum([r > (RSSobs[0] if outlier_test else RSSobs) for r in RSSexp]) / n_iterations # RSSobs is a tuple only when outlier_test=True 80 | 81 | if outlier_test: 82 | GlobalTest = {"RSSobs": RSSobs[0], "global_test_p": global_p} 83 | else: 84 | GlobalTest = {"RSSobs": RSSobs, "global_test_p": global_p} 85 | 86 | # 3- Computing the single IV outlier test 87 | if global_p < significance_p and outlier_test: 88 | print("Global p-value is below the significance threshold. Running the Outlier test.") 89 | 90 | if len(BETA_e_columns) == 1: 91 | Dif = data["BETA_o"].values - data["BETA_e"].values * RSSobs[1] 92 | Exp = Random_data_o - (Random_data_e * RSSobs[1]) 93 | else: 94 | raise ValueError("Outlier test not implemented for multivariable MR.") 95 | 96 | abs_diffs = np.abs(Exp.T) > np.abs(Dif)[:, np.newaxis] 97 | pvals = np.sum(abs_diffs, axis=1) / Exp.shape[0] 98 | 99 | OutlierTest = pd.DataFrame({"RSSobs": Dif**2, "Pvalue": pvals}) 100 | 101 | OutlierTest.index = data.index 102 | OutlierTest["Pvalue"] = np.minimum( 103 | OutlierTest["Pvalue"] * len(data), 1 104 | ) # Bonferroni correction 105 | if data.shape[0] / n_iterations > significance_p: 106 | print( 107 | f"Warning: the Outlier test is unstable. The {significance_p} significance threshold cannot be obtained with {n_iterations} distributions. Increase n_iterations." 108 | ) 109 | 110 | else: 111 | outlier_test = False 112 | OutlierTest = pd.DataFrame() 113 | 114 | # 4- Computing the test of the distortion of the causal estimate 115 | formula = f"BETA_o ~ -1 + {' + '.join(BETA_e_columns)}" 116 | mod_all = smf.wls(formula, data=data, weights=data["Weights"]).fit() 117 | 118 | BiasTest = {} 119 | subset_data = None 120 | 121 | if distortion_test and outlier_test: 122 | ## Is there an error in the MRPRESSO code? The outlier indices are supposed to be excluded from the expected bias computation (as per the paper).
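        # Idea of the Distortion test (descriptive note): compare the observed relative change
        # in the causal estimate after excluding the outlier SNPs with an empirical distribution
        # of changes obtained from randomly drawn SNP subsets, as described in the MR-PRESSO paper.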
123 | def get_random_bias(BETA_e_columns, data, ref_outlier): 124 | indices = np.concatenate( 125 | [ 126 | ref_outlier, 127 | np.random.choice( 128 | list(set(range(len(data))) - set(ref_outlier)), 129 | len(data) - len(ref_outlier), 130 | ), 131 | ] 132 | ) 133 | subset_data = data.iloc[indices[: -len(ref_outlier)]] 134 | mod_random = smf.wls( 135 | f"BETA_o ~ -1 + {' + '.join(BETA_e_columns)}", 136 | data=subset_data, 137 | weights=subset_data["Weights"], 138 | ).fit() 139 | return mod_random.params[BETA_e_columns] 140 | 141 | ref_outlier = OutlierTest.loc[OutlierTest["Pvalue"] <= significance_p].index 142 | 143 | if len(ref_outlier) > 0: 144 | if len(ref_outlier) < len(data): 145 | print(f"{len(ref_outlier)}/{len(data)} ({len(ref_outlier)/len(data)*100:.2f}%) outliers found. Running the Distortion test.") 146 | BiasExp = [ 147 | get_random_bias(BETA_e_columns, data, ref_outlier) 148 | for _ in range(n_iterations) 149 | ] 150 | BiasExp = pd.concat(BiasExp, axis=1).transpose() 151 | 152 | subset_data = data.drop(ref_outlier) 153 | mod_no_outliers = smf.wls( 154 | f"BETA_o ~ -1 + {' + '.join(BETA_e_columns)}", 155 | data=subset_data, 156 | weights=subset_data["Weights"], 157 | ).fit() 158 | 159 | BiasObs = ( 160 | mod_all.params[BETA_e_columns] 161 | - mod_no_outliers.params[BETA_e_columns] 162 | ) / abs(mod_no_outliers.params[BETA_e_columns]) 163 | BiasExp = (mod_all.params[BETA_e_columns] - BiasExp) / abs(BiasExp) 164 | 165 | p_value = np.sum(np.abs(BiasExp) > np.abs(BiasObs)) / n_iterations 166 | 167 | BiasTest = { 168 | "outliers_indices": list(ref_outlier), 169 | "distortion_test_coefficient": 100 * BiasObs.values[0], 170 | "distortion_test_p": p_value.iloc[0], 171 | } 172 | else: 173 | print("All SNPs considered as outliers. Skipping the Distortion test.") 174 | BiasTest = { 175 | "outliers_indices": "All SNPs considered as outliers", 176 | "distortion_test_coefficient": np.nan, 177 | "distortion_test_p": np.nan, 178 | } 179 | else: 180 | print("No significant outliers found. 
Skipping the Distortion test.") 181 | BiasTest = { 182 | "outliers_indices": "No significant outliers", 183 | "distortion_test_coefficient": np.nan, 184 | "distortion_test_p": np.nan, 185 | } 186 | 187 | # 5- Format 188 | row_original = { 189 | "exposure": BETA_e_columns[0], 190 | "method": "Raw", 191 | "nSNP": len(data), 192 | "b": mod_all.params["BETA_e"], 193 | "se": mod_all.bse["BETA_e"], 194 | "pval": mod_all.pvalues["BETA_e"], 195 | } 196 | if "mod_no_outliers" in locals(): 197 | row_corrected = { 198 | "exposure": BETA_e_columns[0], 199 | "method": "Outlier-corrected", 200 | "nSNP": len(data) - len(ref_outlier), 201 | "b": mod_no_outliers.params["BETA_e"], 202 | "se": mod_no_outliers.bse["BETA_e"], 203 | "pval": mod_no_outliers.pvalues["BETA_e"], 204 | } 205 | else: 206 | row_corrected = { 207 | "exposure": BETA_e_columns[0], 208 | "method": "Outlier-corrected", 209 | "nSNP": np.nan, 210 | "b": np.nan, 211 | "se": np.nan, 212 | "pval": np.nan, 213 | } 214 | 215 | mod_table = pd.DataFrame([row_original, row_corrected]) 216 | 217 | return mod_table, GlobalTest, OutlierTest, BiasTest, subset_data 218 | 219 | 220 | ## MR-PRESSO helper functions 221 | # Define the matrix power operator 222 | def power_eigen(x, n): 223 | values, vectors = np.linalg.eig(x) 224 | return vectors.dot(np.diag(values**n)).dot(vectors.T) 225 | 226 | 227 | # Function to compute the residual sum of squares in a LOO framework 228 | def getRSS_LOO(data, BETA_e_columns, returnIV): 229 | dataW = data[["BETA_o"] + BETA_e_columns].multiply(np.sqrt(data["Weights"]), axis=0) 230 | X = dataW[BETA_e_columns].values 231 | Y = dataW["BETA_o"].values 232 | 233 | # Matrix operations after LOO 234 | def loo_calculation(i): 235 | X_loo = np.delete(X, i, axis=0) 236 | Y_loo = np.delete(Y, i, axis=0) 237 | return power_eigen(X_loo.T.dot(X_loo), -1).dot(X_loo.T).dot(Y_loo) 238 | 239 | CausalEstimate_LOO = np.array([loo_calculation(i) for i in range(len(dataW))]) 240 | 241 | if len(BETA_e_columns) == 1: 242 | CausalEstimate_LOO = CausalEstimate_LOO.reshape(-1) 243 | RSS = np.nansum((Y - CausalEstimate_LOO * X.reshape(-1)) ** 2) 244 | else: 245 | raise ValueError("Needs to do the getRSS_LOO for multi exposure.") 246 | # RSS = np.nansum((Y - np.sum(CausalEstimate_LOO.T * X, axis=1)) ** 2) 247 | 248 | if returnIV: 249 | return (RSS, CausalEstimate_LOO) 250 | return RSS 251 | 252 | 253 | # Generate random data based on normal distributions 254 | def getRandomData(data, BETA_e_columns=["BETA_e"]): 255 | rng = default_rng() 256 | 257 | models = [] 258 | for i in range(len(data)): 259 | lm = LinearRegression(fit_intercept=False) 260 | data_i = data.drop(i) 261 | lm.fit( 262 | data_i[BETA_e_columns], data_i["BETA_o"], sample_weight=data_i["Weights"] 263 | ) 264 | models.append(lm) 265 | 266 | random_data_dict = {} 267 | for col, sd_col in zip(BETA_e_columns, ["SE_e"]): 268 | random_data_dict[col] = rng.normal(data[col], data[sd_col]) 269 | 270 | random_data_dict["BETA_o"] = [ 271 | rng.normal( 272 | model.predict(data.iloc[[i]][BETA_e_columns]), data.iloc[i]["SE_o"] 273 | ).item() 274 | for i, model in enumerate(models) 275 | ] 276 | random_data_dict["Weights"] = data["Weights"].values 277 | 278 | random_data_df = pd.DataFrame(random_data_dict) 279 | return random_data_df 280 | 281 | 282 | # Function for the parallel executor in step 2: generate random data and compute the expected residual sum of squares 283 | def parallel_RSS_LOO(i, data, BETA_e_columns): 284 | random_data = getRandomData(data, BETA_e_columns) 285 | 286 | rss_exp = 
getRSS_LOO(random_data, BETA_e_columns, False) 287 | return (rss_exp, random_data["BETA_e"].values, random_data["BETA_o"].values) 288 | -------------------------------------------------------------------------------- /genal/proxy.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os 4 | import subprocess 5 | import re 6 | import uuid 7 | 8 | from .tools import get_reference_panel_path, get_plink_path, run_plink_command 9 | 10 | ## TO DO: accept lists of CHR/POS instead of SNP names for these functions 11 | 12 | 13 | def query_outcome_proxy(df, ld, snps_to_extract, snps_df=[]): 14 | """ 15 | Extract the best proxies from a dataframe, as well as specific SNPs. 16 | 17 | Given a dataframe `df` (originating from Geno.data) and a dataframe of potential proxies 18 | (output from `find_proxies`), this function extracts the best proxies from `df` as well as 19 | the SNPs specified in `snps_to_extract`. 20 | This is suited for querying outcome data. 21 | 22 | Args: 23 | df (pd.DataFrame): Dataframe of SNP information with the usual Geno columns 24 | (SNP, BETA, SE, EAF, EA, NEA). EAF is not necessary. 25 | ld (pd.DataFrame): Dataframe of proxies (output from `find_proxies`). 26 | snps_to_extract (list): List of SNPs to extract in addition to the proxies. 27 | snps_df (list, optional): List of SNPs to choose the proxy from. Should be the list of 28 | SNPs in df. Can be provided to avoid recomputing it. Defaults to an empty list. 29 | 30 | Returns: 31 | pd.DataFrame: Dataframe with queried SNPs and their proxies. 32 | """ 33 | # If ld is None 34 | if not isinstance(ld, pd.DataFrame): 35 | raise ValueError("ld is None (The SNPs to be proxied were not found in the reference panel)") 36 | 37 | # If snps_df is empty, populate it with SNPs from df 38 | if not snps_df: 39 | snps_df = df.SNP.values 40 | 41 | # Filter proxies that are present in df 42 | ld = ld[ld.SNP_B.isin(snps_df)] 43 | 44 | # Remove original SNPs 45 | ld = ld[ld["SNP_A"] != ld["SNP_B"]] 46 | 47 | # Sort by r and select the best proxy for each SNP 48 | ld = ld.reindex(ld["R"].abs().sort_values(ascending=False).index) 49 | ld = ld.groupby("SNP_A").first().reset_index(drop=False) 50 | 51 | # Determine SNPs to query 52 | snps_to_query = set(snps_to_extract) | set(ld.SNP_B.values) 53 | df_queried = df[df.SNP.isin(snps_to_query)] 54 | 55 | # Merge dataframes and identify proxies 56 | output = df_queried.merge(ld, how="left", left_on="SNP", right_on="SNP_B") 57 | output["proxy"] = output["SNP_B"].notnull() 58 | 59 | # In the plink output, the alleles taken as reference for proxying are "MAJ_A" and "MAJ_B" (major alleles in the reference panel) 60 | # We want to use as effect allele for the original SNP its minor allele in the reference panel 61 | # So, we flip BETA if the proxied SNP's effect allele is the major allele in the reference panel 62 | conditions = [ 63 | output["EA"] == output["MAJ_B"], 64 | output["EA"] == output["NONMAJ_B"], 65 | ~output["proxy"], 66 | True, 67 | ] 68 | choices = [ 69 | -output["BETA"], # if EA == MAJ_B, flip the sign of BETA 70 | output["BETA"], # if EA == NONMAJ_B, BETA does not change 71 | output["BETA"], # if SNP_B is NaN (The original SNP was not proxied), BETA does not change 72 | np.nan, # if the original SNP was proxied but "EA" is neither "MAJ_A" nor "NONMAJ_A", BETA is NaN 73 | ] 74 | output["BETA"] = np.select(conditions, choices) 75 | 76 | # Flip BETA if the sign of R is negative: indicates that the positive 
correlation corresponds to MAJ_A with NONMAJ_B 77 | sign_r = np.sign(output["R"]) # Sign of R 78 | output["BETA"] = np.where(sign_r == -1, -output["BETA"], output["BETA"]) 79 | 80 | # Delete SNPs with mismatched alleles 81 | nrow = output.shape[0] 82 | output = output.dropna(subset=["BETA"]) 83 | if output.shape[0] < nrow: 84 | print( 85 | f"Deleted {nrow-output.shape[0]} base SNPs that did not have matching alleles in reference data." 86 | ) 87 | print(f"Found proxies for {output['proxy'].sum()} SNPs.") 88 | 89 | # Replace the proxied SNPs with the position and alleles of the original SNPs 90 | output["SNP"] = np.where(output["proxy"], output["SNP_A"], output["SNP"]) 91 | output["POS"] = np.where(output["proxy"], output["BP_A"], output["POS"]) 92 | output["CHR"] = np.where(output["proxy"], output["CHR_A"], output["CHR"]) 93 | output["EA"] = np.where(output["proxy"], output["NONMAJ_A"], output["EA"]) 94 | output["NEA"] = np.where(output["proxy"], output["MAJ_A"], output["NEA"]) 95 | if "EAF" in output.columns: 96 | output["EAF"] = np.where(output["proxy"], output["NONMAJ_FREQ_A"], output["EAF"]) 97 | 98 | # Drop columns related to ld 99 | output = output.drop(columns=ld.columns) 100 | 101 | return output 102 | 103 | 104 | def apply_proxies(df, ld, searchspace=None): 105 | """ 106 | Given a dataframe (coming from GENO.data attribute) and a dataframe of proxies 107 | (output from find_proxies), replace the SNPs in df with their best proxies, if they exist. 108 | This function is suited for exposure data (before running a PRS for instance). 109 | 110 | Args: 111 | df (DataFrame): Dataframe of SNP information with the usual GENO columns (SNP, BETA, SE, EAF, EA, NEA). EAF is not necessary. 112 | ld (DataFrame): Dataframe of proxies (output from find_proxies). 113 | searchspace (list, optional): List of SNPs to restrict the list of potential proxies. By default, includes all the proxies found. Using a searchspace can be done either at the find_proxies step or at this step, but it is much faster to use it at this step. 114 | 115 | Returns: 116 | DataFrame: A DataFrame with SNPs replaced by their best proxies, if they exist. 
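    Example (illustrative; assumes `exposure_df` follows the GENO column format and `ld` is the output of find_proxies):
        exposure_proxied = apply_proxies(exposure_df, ld, searchspace=genotyped_snp_list)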
117 | """ 118 | # If ld is None 119 | if not isinstance(ld, pd.DataFrame): 120 | raise ValueError("ld is None (The SNPs to be proxied were not found in the reference panel)") 121 | 122 | # Check mandatory columns 123 | mandatory_cols = ["EA", "SNP", "BETA"] 124 | for col in mandatory_cols: 125 | if col not in df.columns: 126 | raise ValueError(f"The column {col} is not found in the data!") 127 | 128 | # Filter by searchspace if provided 129 | if searchspace: 130 | print("Filtering the potential proxies with the searchspace provided.") 131 | ld = ld[ld.SNP_B.isin(searchspace)] 132 | 133 | # Remove original SNPs and sort by r 134 | ld = ld[ld["SNP_A"] != ld["SNP_B"]] 135 | ld = ld.reindex(ld["R"].abs().sort_values(ascending=False).index) 136 | 137 | # Select the best proxy for each SNP 138 | ld = ld.groupby("SNP_A").first().reset_index(drop=False) 139 | 140 | # Merge the dataframes 141 | output = df.merge(ld, how="left", left_on="SNP", right_on="SNP_A") 142 | output["proxy"] = pd.notnull(output["SNP_B"]) 143 | 144 | # In the plink output, the alleles taken as reference for proxying are "MAJ_A" and "MAJ_B" (major alleles in the reference panel) 145 | # We want to use as effect allele for the proxy SNP its minor allele in the reference panel 146 | # So, we flip BETA if the original SNP's effect allele is the major allele in the reference panel 147 | conditions = [ 148 | output["EA"] == output["MAJ_A"], 149 | output["EA"] == output["NONMAJ_A"], 150 | ~output["proxy"], 151 | True, 152 | ] 153 | choices = [ 154 | -output["BETA"], # if EA == MAJ_A, flip the sign of BETA 155 | output["BETA"], # if EA == NONMAJ_A, BETA does not change 156 | output["BETA"], # if SNP_B is NaN (The original SNP was not proxied), BETA does not change 157 | np.nan, # if the original SNP was proxied but "EA" is neither "MAJ_A" nor "NONMAJ_A", BETA is NaN 158 | ] 159 | output["BETA"] = np.select(conditions, choices) 160 | 161 | # Flip BETA if the sign of R is negative: indicates that the positive correlation corresponds to MAJ_A with NONMAJ_B 162 | sign_r = np.sign(output["R"]) # Sign of R 163 | output["BETA"] = np.where(sign_r == -1, -output["BETA"], output["BETA"]) 164 | 165 | # Delete SNPs with mismatched alleles 166 | nrow = output.shape[0] 167 | output = output.dropna(subset=["BETA"]) 168 | if output.shape[0] < nrow: 169 | print( 170 | f"Deleted {nrow-output.shape[0]} base SNPs that did not have matching alleles in reference data." 
171 | ) 172 | print(f"Found proxies for {output['proxy'].sum()} SNPs.") 173 | 174 | # Replace the original SNPs with their proxy (if proxied) 175 | # As said above, we use as effect allele the minor allele in the reference panel 176 | output["SNP"] = np.where(output["proxy"], output["SNP_B"], output["SNP"]) 177 | output["EA"] = np.where(output["proxy"], output["NONMAJ_B"], output["EA"]) 178 | if "POS" in output.columns: 179 | output["POS"] = np.where(output["proxy"], output["BP_B"], output["POS"]) 180 | if "CHR" in output.columns: 181 | output["CHR"] = np.where(output["proxy"], output["CHR_B"], output["CHR"]) 182 | if "NEA" in output.columns: 183 | output["NEA"] = np.where(output["proxy"], output["MAJ_B"], output["NEA"]) 184 | if "EAF" in output.columns: 185 | output["EAF"] = np.where(output["proxy"], output["NONMAJ_FREQ_B"], output["EAF"]) 186 | 187 | # Drop ld columns 188 | output.drop(columns=ld.columns, inplace=True) 189 | 190 | return output 191 | 192 | 193 | def find_proxies( 194 | snp_list, 195 | searchspace=None, 196 | reference_panel="EUR_37", 197 | kb=5000, 198 | r2=0.8, 199 | window_snps=1000000, 200 | threads=1, 201 | name=None 202 | ): 203 | """ 204 | Given a list of SNPs, return a table of proxies using PLINK 2.0. 205 | 206 | Args: 207 | snp_list (list): List of rsids. 208 | searchspace (list, optional): List of SNPs to include in the search. By default, includes the whole reference panel. 209 | reference_panel (str, optional): The reference population to get linkage disequilibrium values and find proxies. 210 | Acceptable populations are "EUR", "SAS", "AFR", "EAS", "AMR" and available builds are 37 and 38 ("EUR_38" or "AFR_37" etc.) 211 | Alternatively, accepts a path to a specific bed/bim/fam or pgen/pvar/psam panel. 212 | Default is "EUR_37". 213 | kb (int, optional): Width (in kb) of the genomic window to look for proxies. Defaults to 5000. 214 | r2 (float, optional): Minimum linkage disequilibrium value with the main SNP for a proxy to be included. Defaults to 0.8. 215 | window_snps (int, optional): Compute the LD value for SNPs that are not more than x SNPs apart from the main SNP. Defaults to 1000000 (equivalent to infinity). 216 | threads (int, optional): Number of threads to use. Defaults to 1. 217 | 218 | Returns: 219 | DataFrame: A DataFrame containing the proxies. Only biallelic SNPs are returned.
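    Example (illustrative; `rsids` stands for any list of rsIDs):
        ld = find_proxies(rsids, reference_panel="EUR_38", kb=5000, r2=0.8, threads=4)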
220 | """ 221 | # Ensure tmp_GENAL directory exists 222 | os.makedirs("tmp_GENAL/", exist_ok=True) 223 | 224 | # Generate a default name if none is provided 225 | if name is None: 226 | name = str(uuid.uuid4())[:8] 227 | 228 | # Convert snp_list to numpy array 229 | snp_list = np.array(list(snp_list)) 230 | 231 | # Check if searchspace is provided 232 | if searchspace is None: 233 | extract_arg = "" 234 | else: 235 | print("Searching proxies in the provided searchspace.") 236 | with open(f"tmp_GENAL/{name}_searchspace.txt", "w") as file: 237 | for s in list(searchspace) + list(snp_list): # concatenate the searchspace and the SNPs to proxy 238 | file.write(str(s) + "\n") 239 | extract_arg = f"--extract tmp_GENAL/{name}_searchspace.txt" 240 | 241 | # Save snp_list to a file 242 | np.savetxt(f"tmp_GENAL/{name}_snps_to_proxy.txt", snp_list, fmt="%s", delimiter=" ") 243 | 244 | # Get reference panel path and type 245 | ref_path, filetype = get_reference_panel_path(reference_panel) 246 | 247 | # Construct base command based on filetype 248 | base_cmd = f"{get_plink_path()}" 249 | if filetype == "bed": 250 | base_cmd += f" --bfile {ref_path}" 251 | else: # pgen 252 | base_cmd += f" --pfile {ref_path}" 253 | 260 | 261 | # Construct and execute the plink2 command 262 | command = ( 263 | f"{base_cmd} {extract_arg} " 264 | f"--r-unphased 'cols=chrom,pos,id,maj,nonmaj,freq' " 265 | f"--ld-snp-list tmp_GENAL/{name}_snps_to_proxy.txt " 266 | f"--ld-window-kb {kb} " 267 | f"--ld-window-r2 {r2} " 268 | f"--ld-window {window_snps} " 269 | f"--threads {threads} " 270 | f"--out tmp_GENAL/{name}_proxy.targets" 271 | ) 272 | 273 | run_plink_command(command) 274 | 275 | # Read log file to return amount of SNPs to be proxied present in the ref panel 276 | log_path = os.path.join("tmp_GENAL", f"{name}_proxy.targets.log") 277 | log_content = open(log_path).read() 278 | match = re.search(r'(\d+) variants? remaining', log_content) 279 | if match: 280 | n_present = int(match.group(1)) 281 | if n_present == 0: 282 | print("None of the SNPs to be proxied are present in the reference panel.") 283 | return None 284 | else: 285 | print(f"{n_present} SNPs to be proxied are present in the reference panel.") 286 | 287 | # Read and process the output 288 | try: 289 | ld = pd.read_csv(f"tmp_GENAL/{name}_proxy.targets.vcor", sep="\s+") 290 | except FileNotFoundError: 291 | print("No proxies found that meet the specified criteria.") 292 | return None 293 | 294 | # Rename columns to match the expected format 295 | ld.rename(columns={ 296 | 'ID_A': 'SNP_A', 297 | 'ID_B': 'SNP_B', 298 | '#CHROM_A': 'CHR_A', 299 | 'CHROM_B': 'CHR_B', 300 | 'POS_A': 'BP_A', 301 | 'POS_B': 'BP_B', 302 | 'UNPHASED_R': 'R', 303 | }, inplace=True) 304 | 305 | # Create PHASE column for compatibility 306 | #ld['PHASE'] = ld['A1'] + ld['B1'] + ld['A2'] + ld['B2'] 307 | 308 | # Filter out multiallelic SNPs 309 | #ld = ld[ld["PHASE"].str.len() == 4] 310 | #ld = ld.reset_index(drop=True) 311 | 312 | # Convert integer columns to Int64 type 313 | for int_col in ["CHR_A", "CHR_B", "BP_A", "BP_B"]: 314 | ld[int_col] = ld[int_col].astype("Int64") 315 | 316 | return ld 317 | -------------------------------------------------------------------------------- /genal/extract_prs.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os, subprocess, re, uuid
3 | from functools import partial 4 | from concurrent.futures import ProcessPoolExecutor 5 | 6 | from .tools import check_bfiles, check_pfiles, setup_genetic_path, get_plink_path 7 | 8 | 9 | ### ____________________ 10 | ### PRS functions 11 | ### ____________________ 12 | 13 | def prs_func(data, weighted=True, path=None, ram=20000, cpus=4, name=None): 14 | """ 15 | Compute a PRS (Polygenic Risk Score) using provided SNP-level data. Corresponds to the :meth:`Geno.prs` method 16 | """ 17 | # Get path and filetype 18 | path, filetype = setup_genetic_path(path) 19 | 20 | # Generate a default name if none is provided 21 | if name is None: 22 | name = str(uuid.uuid4())[:8] 23 | 24 | # Call extract_snps 25 | extracted_path = extract_snps_func(data.SNP, name, path, ram=ram, cpus=cpus) 26 | 27 | if extracted_path == "FAILED": 28 | raise ValueError("No SNPs were extracted from the genetic data and the PRS can't be computed.") 29 | 30 | # Additional check to ensure there are no duplicates in the data (need to think more about this, should be done upstream) 31 | data.drop_duplicates(subset=["SNP"], keep="first", inplace=True) 32 | if "CHR" in data.columns and "POS" in data.columns: 33 | data.drop_duplicates(subset=["CHR", "POS"], keep="first", inplace=True) 34 | 35 | # Write processed data to file and run plink on it 36 | data = data[["SNP", "EA", "BETA"]] 37 | data_path = os.path.join("tmp_GENAL", f"{name}_to_prs.txt") 38 | output_path = os.path.join("tmp_GENAL", f"{name}_prs") 39 | 40 | # Set BETA values to 1 if unweighted PRS is required 41 | if not weighted: 42 | data["BETA"] = 1 43 | print(f"Computing an unweighted PRS using {extracted_path} data.") 44 | else: 45 | print(f"Computing a weighted PRS using {extracted_path} data.") 46 | 47 | data.to_csv(data_path, sep="\t", index=False, header=True) 48 | 49 | # We can use --pfile since extract_snps now creates pgen files 50 | plink_command = f"{get_plink_path()} --memory {ram} --pfile {extracted_path} --threads {cpus} \ 51 | --score {data_path} 1 2 3 header --out {output_path} --allow-no-sex" 52 | 53 | # Check for empty dataframe 54 | n_snps = data.shape[0] 55 | if n_snps == 0: 56 | raise ValueError( 57 | "No SNPs remain for the polygenic risk score (PRS) calculation." 58 | ) 59 | 60 | try: 61 | output = subprocess.run( 62 | plink_command, shell=True, capture_output=True, text=True, check=True 63 | ) 64 | except subprocess.CalledProcessError as e: 65 | print(f"Error running PLINK command: {e}") 66 | print(f"PLINK stdout: {e.stdout}") 67 | print(f"PLINK stderr: {e.stderr}") 68 | raise ValueError("PLINK command failed. Check the error messages above for details.") 69 | 70 | # Read and process PRS results 71 | prs_file = output_path + ".sscore" 72 | log_file = output_path + ".log" 73 | if os.path.isfile(prs_file): #If the profile file exists: PRS was successful 74 | #Extracts the number of SNPs used for the PRS computation 75 | log_content = open(log_file).read() 76 | match = re.search(r'--score: (\d+) variant[s] processed', log_content) 77 | if match: 78 | n_predictors = int(match.group(1)) 79 | print( 80 | f"The PRS computation was successful and used {n_predictors}/{n_snps} ({n_predictors/n_snps*100:.3f}%) SNPs." 
81 | ) 82 | else: 83 | print("Could not extract the number of SNPs used for the PRS computation.") 84 | #Return scores 85 | df_score = pd.read_csv(prs_file, sep="\s+") 86 | df_score.rename(columns={"#FID": "FID"}, inplace=True) 87 | return df_score 88 | else: 89 | print(output.stdout) 90 | raise ValueError( 91 | f"The PRS computation was not successful. Check the {output_path + '.log'} file." 92 | ) 93 | 94 | 95 | ### _____________________ 96 | ### Extract SNPs functions 97 | ### _____________________ 98 | 99 | # We are currently excluding all multiallelic variants by forcing first on all duplicates. 100 | # Could be improved by keeping the relevant version of the multiallelic SNPs based on allele matching 101 | def extract_snps_func(snp_list, name=None, path=None, ram=20000, cpus=4): 102 | """ 103 | Extracts a list of SNPs from the given path. This function corresponds to the following Geno method: :meth:`Geno.extract_snps`. 104 | 105 | Args: 106 | snp_list (pd.Series): Series of SNPs to extract. 107 | name (str): Name prefix for the output files. 108 | path (str, optional): Path to the dataset. Defaults to the path from the configuration. 109 | 110 | Returns: 111 | str: path to the genetic files containing the extracted SNPs 112 | 113 | Raises: 114 | TypeError: Raises an error when no valid path is saved or when there's an incorrect format in the provided path. 115 | """ 116 | # Check if snp_list is empty Series 117 | if snp_list.empty: 118 | print("The provided SNP list is empty.") 119 | return "FAILED" 120 | 121 | # Generate a default name if none is provided 122 | if name is None: 123 | name = str(uuid.uuid4())[:8] 124 | 125 | # Get path and filetype 126 | path, filetype = setup_genetic_path(path) 127 | 128 | # Prepare the SNP list 129 | snp_list = snp_list.dropna() 130 | snp_list_name = f"{name}_list.txt" 131 | snp_list_path = os.path.join("tmp_GENAL", snp_list_name) 132 | snp_list.to_csv(snp_list_path, sep=" ", index=False, header=None) 133 | nrow = len(snp_list) 134 | 135 | # Check if the data is split by chromosome 136 | filetype_split = "split" if "$" in path else "combined" 137 | 138 | output_path = os.path.join("tmp_GENAL", f"{name}_allchr") 139 | if filetype_split == "split": 140 | ram_estimate_per_cpu = nrow/(1.5*10**2) 141 | n_cpus = max(1, int(ram // ram_estimate_per_cpu)) 142 | workers = min(n_cpus, cpus) 143 | merge_command, bedlist_path = extract_snps_from_split_data( 144 | name, path, output_path, snp_list_path, filetype, workers=workers 145 | ) 146 | handle_multiallelic_variants(name, merge_command, bedlist_path) 147 | else: 148 | extract_snps_from_combined_data(name, path, output_path, snp_list_path, filetype) 149 | 150 | #Check that at least 1 variant has been extracted. 
If not, return "FAILED" to warn downstream functions (prs, association_test) 151 | log_path = output_path + ".log" 152 | with open(log_path, 'r') as log_file: 153 | if " 0 variants remaining" in log_file.read(): 154 | print("None of the provided SNPs were found in the genetic data.") 155 | return "FAILED" 156 | else: 157 | if check_pfiles(output_path): 158 | print(f"Created pgen/pvar/psam fileset with extracted SNPs: {output_path}") 159 | else: 160 | print(f"Could not extract the SNPs from the provided genetic data: check plink .log file") 161 | # Report SNPs not found 162 | report_snps_not_found(nrow, name) 163 | 164 | return output_path 165 | 166 | 167 | def extract_command_parallel(task_id, name, path, snp_list_path, filetype): 168 | """ 169 | Helper function to run SNP extraction in parallel for different chromosomes. 170 | Args: 171 | task_id (int): Identifier for the task/chromosome. 172 | name (str): Name prefix for the output files. 173 | path (str): Path to the data set. 174 | snp_list_path (str): Path to the list of SNPs to extract. 175 | filetype (str): Type of genetic files ("bed" or "pgen") 176 | Returns: 177 | int: Returns the task_id if no valid files are found. 178 | """ 179 | input_path = path.replace("$", str(task_id)) 180 | 181 | # Check if files exist based on filetype 182 | if filetype == "bed" and not check_bfiles(input_path): 183 | return task_id 184 | elif filetype == "pgen" and not check_pfiles(input_path): 185 | return task_id 186 | 187 | output_path = os.path.join("tmp_GENAL", f"{name}_extract_chr{task_id}") 188 | 189 | # Build command based on filetype 190 | base_cmd = f"{get_plink_path()}" 191 | if filetype == "bed": 192 | base_cmd += f" --bfile {input_path}" 193 | else: # pgen 194 | base_cmd += f" --pfile {input_path}" 195 | 196 | command = f"{base_cmd} --extract {snp_list_path} --rm-dup force-first --make-pgen --out {output_path}" 197 | 198 | subprocess.run( 199 | command, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL 200 | ) 201 | 202 | 203 | def create_bedlist(bedlist_path, output_name, not_found): 204 | """ 205 | Creates a bedlist file for SNP extraction. 206 | Args: 207 | bedlist_path (str): Path to save the bedlist file. 208 | output_name (str): Base name for the output files. 209 | not_found (List[int]): List of chromosome numbers for which no files were found. 
210 | """ 211 | with open(bedlist_path, "w+") as bedlist_file: 212 | found = [] 213 | for i in range(1, 23): 214 | if i in not_found: 215 | print(f"bed/bim/fam or pgen/pvar/psam files not found for chr{i}.") 216 | elif check_pfiles(f"{output_name}_chr{i}"): 217 | bedlist_file.write(f"{output_name}_chr{i}\n") 218 | found.append(i) 219 | print(f"SNPs extracted for chr{i}.") 220 | else: 221 | print(f"No SNPs extracted for chr{i}.") 222 | return found 223 | 224 | 225 | def extract_snps_from_split_data(name, path, output_path, snp_list_path, filetype, workers=4): 226 | """Extract SNPs from data split by chromosome.""" 227 | print("Extracting SNPs for each chromosome...") 228 | num_tasks = 22 229 | partial_extract_command_parallel = partial( 230 | extract_command_parallel, 231 | name=name, 232 | path=path, 233 | snp_list_path=snp_list_path, 234 | filetype=filetype 235 | ) # Wrapper function 236 | with ProcessPoolExecutor(max_workers=workers) as executor: 237 | not_found = list( 238 | executor.map(partial_extract_command_parallel, range(1, num_tasks + 1)) 239 | ) 240 | 241 | # Merge extracted SNPs from each chromosome 242 | bedlist_name = f"{name}_bedlist.txt" 243 | bedlist_path = os.path.join("tmp_GENAL", bedlist_name) 244 | found = create_bedlist( 245 | bedlist_path, os.path.join("tmp_GENAL", f"{name}_extract"), not_found 246 | ) 247 | if len(found) == 0: 248 | raise Warning("No SNPs were extracted from any chromosome.") 249 | 250 | # If only one chromosome was extracted, no need to merge, simply rename the files 251 | if len(found) == 1: 252 | chr_path = os.path.join("tmp_GENAL", f"{name}_extract_chr{found[0]}") 253 | for ext in [".pgen", ".pvar", ".psam", ".log"]: 254 | os.rename(f"{chr_path}{ext}", f"{output_path}{ext}") 255 | return None, bedlist_path 256 | 257 | print("Merging SNPs extracted from each chromosome...") 258 | merge_command = f"{get_plink_path()} --pmerge-list {bedlist_path} pfile --out {output_path}" 259 | try: 260 | subprocess.run( 261 | merge_command, shell=True, capture_output=True, text=True, check=True 262 | ) 263 | except subprocess.CalledProcessError as e: 264 | print(f"Error running PLINK command: {e}") 265 | print(f"PLINK stdout: {e.stdout}") 266 | print(f"PLINK stderr: {e.stderr}") 267 | raise ValueError("PLINK command failed. 
Check the error messages above for details.") 268 | 269 | return merge_command, bedlist_path 270 | 271 | 272 | def extract_snps_from_combined_data(name, path, output_path, snp_list_path, filetype): 273 | """Extract SNPs from combined data.""" 274 | print("Extracting SNPs...") 275 | 276 | # Build command based on filetype 277 | base_cmd = f"{get_plink_path()}" 278 | if filetype == "bed": 279 | base_cmd += f" --bfile {path}" 280 | else: # pgen 281 | base_cmd += f" --pfile {path}" 282 | 283 | extract_command = f"{base_cmd} --extract {snp_list_path} --rm-dup force-first --make-pgen --out {output_path}" 284 | 285 | subprocess.run( 286 | extract_command, 287 | shell=True, 288 | stdout=subprocess.DEVNULL, 289 | stderr=subprocess.DEVNULL, 290 | ) 291 | 292 | 293 | def report_snps_not_found(nrow, name): 294 | """Report the number of SNPs not found in the data.""" 295 | 296 | def count_lines(filepath): 297 | with open(filepath, "r") as file: 298 | return sum(1 for line in file) 299 | 300 | file_path = os.path.join("tmp_GENAL", f"{name}_allchr.pvar") 301 | extracted_snps_count = count_lines(file_path)-1 #pvar files include column names 302 | delta_nrow = nrow - extracted_snps_count 303 | if delta_nrow > 0: 304 | print( 305 | f"{delta_nrow}({delta_nrow/nrow*100:.3f}%) SNPs were not extracted from the genetic data." 306 | ) 307 | 308 | # TODO: Check if this function is still needed with plink2 309 | def handle_multiallelic_variants(name, merge_command, bedlist_path): 310 | """Handle multiallelic variants detected during merging.""" 311 | 312 | if merge_command is None: 313 | return 314 | 315 | def remove_multiallelic(): 316 | missnp_path = os.path.join( 317 | "tmp_GENAL", f"{name}_allchr.vmiss" 318 | ) 319 | if not os.path.exists(missnp_path): 320 | return 0 321 | 322 | snps_to_exclude = pd.read_csv(missnp_path, header=None) 323 | for i in range(1, 23): 324 | pvar_path = os.path.join("tmp_GENAL", f"{name}_extract_chr{i}.pvar") 325 | if not os.path.isfile(pvar_path): 326 | continue 327 | pvar = pd.read_csv(pvar_path, sep="\t", header=None) 328 | # If no SNPs would be left for this chr: remove corresponding bedlist line 329 | n_to_exclude = len(set(pvar[2]).intersection(set(snps_to_exclude[0]))) 330 | if n_to_exclude == len(set(pvar[2])): 331 | print(f"No SNPs remaining for chromosome {i}.") 332 | tmp_filename = os.path.join("tmp_GENAL", "tmp_multiallelic") 333 | with open(bedlist_path, "r") as file, open( 334 | tmp_filename, "w" 335 | ) as temp_file: 336 | output_name = os.path.join("tmp_GENAL", f"{name}_extract") 337 | line_to_exclude = f"{output_name}_chr{i}\n" 338 | for line in file: 339 | if line != line_to_exclude: 340 | temp_file.write(line) 341 | # Replace the original file with the temporary file 342 | os.replace(tmp_filename, bedlist_path) 343 | 344 | # If there is at least one multiallelic SNP for this chr 345 | elif n_to_exclude > 0: 346 | pfile_path = os.path.join("tmp_GENAL", f"{name}_extract_chr{i}") 347 | command = f"{get_plink_path()} --pfile {pfile_path} --exclude {missnp_path} --make-pgen --out {pfile_path}" 348 | subprocess.run( 349 | command, 350 | shell=True, 351 | stdout=subprocess.DEVNULL, 352 | stderr=subprocess.DEVNULL, 353 | ) 354 | return len(snps_to_exclude) 355 | 356 | log_content = open(os.path.join("tmp_GENAL", f"{name}_allchr.log")).read() 357 | if "Error: Multiple" in log_content: 358 | print("Multiallelic variants detected in the genetic files: removing them before merging.") 359 | n_multiallelic = remove_multiallelic() 360 | print(f"Reattempting the merge after 
exclusion of {n_multiallelic} multiallelic variants.") 361 | subprocess.run( 362 | merge_command, 363 | shell=True, 364 | stdout=subprocess.DEVNULL, 365 | stderr=subprocess.DEVNULL, 366 | ) 367 | 368 | -------------------------------------------------------------------------------- /docs/build/_modules/genal/clump.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | genal.clump — genal v0.0 documentation 7 | 8 | 9 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 |

Source code for genal.clump

 75 | import os
 76 | import subprocess
 77 | import pandas as pd
 78 | import uuid
 79 | 
 80 | from .tools import read_config, get_plink19_path, get_reference_panel_path, create_tmp
 81 | 
 82 | 
 83 | 
84 | [docs] 85 | def clump_data( 86 | data, 87 | reference_panel="eur", 88 | kb=250, 89 | r2=0.1, 90 | p1=5e-8, 91 | p2=0.01, 92 | name="", 93 | ram=10000, 94 | ): 95 | """ 96 | Perform clumping on the given data using plink. Corresponds to the :meth:`Geno.clump` method. 97 | 98 | Args: 99 | data (pd.DataFrame): Input data with at least 'SNP' and 'P' columns. 100 | reference_panel (str): The reference population for linkage disequilibrium values. Accepts values "eur", "sas", "afr", "eas", "amr". Alternatively, a path leading to a specific bed/bim/fam reference panel can be provided. Default is "eur". 101 | kb (int, optional): Clumping window in terms of thousands of SNPs. Default is 250. 102 | r2 (float, optional): Linkage disequilibrium threshold, values between 0 and 1. Default is 0.1. 103 | p1 (float, optional): P-value threshold during clumping. SNPs above this value are not considered. Default is 5e-8. 104 | p2 (float, optional): P-value threshold post-clumping to further filter the clumped SNPs. If p2 < p1, it won't be considered. Default is 0.01. 105 | name (str, optional): Name used for the files created in the tmp_GENAL folder. 106 | ram (int, optional): Amount of RAM in MB to be used by plink. 107 | 108 | Returns: 109 | pd.DataFrame: Data after clumping, if any. 110 | """ 111 | plink19_path = get_plink19_path() 112 | 113 | # Create unique ID for the name if none is passed 114 | if not name: 115 | name = str(uuid.uuid4())[:8] 116 | 117 | # Save the relevant data columns to a temporary file 118 | to_clump_filename = os.path.join("tmp_GENAL", f"{name}_to_clump.txt") 119 | data[["SNP", "P"]].to_csv(to_clump_filename, index=False, sep="\t") 120 | 121 | # Construct and execute the plink clumping command 122 | output_path = os.path.join("tmp_GENAL", name) 123 | plink_command = f"{plink19_path} --memory {ram} --bfile {get_reference_panel_path(reference_panel)} \ 124 | --clump {to_clump_filename} --clump-kb {kb} --clump-r2 {r2} --clump-p1 {p1} \ 125 | --clump-p2 {p2} --out {output_path}" 126 | output = subprocess.run( 127 | plink_command, shell=True, capture_output=True, text=True, check=True 128 | ) 129 | 130 | # Check and print the outputs for relevant information 131 | if output.returncode != 0: 132 | raise RuntimeError( 133 | f"PLINK execution failed with the following error: {output.stderr}" 134 | ) 135 | if "more top variant IDs missing" in output.stderr: 136 | missing_variants = output.stderr.split("more top variant IDs missing")[0].split( 137 | "\n" 138 | )[-1] 139 | print(f"Warning: {missing_variants} top variant IDs missing") 140 | if "No significant --clump results." in output.stderr: 141 | print("No SNPs remaining after clumping.") 142 | return 143 | print(output.stdout.split("--clump: ")[1].split("\n")[0]) 144 | 145 | # Extract the list of clumped SNPs and get the relevant data subset 146 | clumped_filename = os.path.join("tmp_GENAL", f"{name}.clumped") 147 | if not os.path.exists(clumped_filename): 148 | raise FileNotFoundError(f"'{clumped_filename}' is missing.") 149 | plink_clumped = pd.read_csv(clumped_filename, sep="\s+", usecols=["SNP"]) 150 | clumped_data = data[data["SNP"].isin(plink_clumped["SNP"])] 151 | clumped_data.reset_index(drop=True, inplace=True) 152 | return clumped_data
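# Illustrative usage sketch: assuming `gwas_df` is a GWAS summary-statistics DataFrame with
# "SNP" and "P" columns and the reference panels are configured, clumping could be called as:
#     clumped = clump_data(gwas_df, reference_panel="eur", kb=250, r2=0.1, p1=5e-8, p2=0.01)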
© Copyright 2023, Cyprien A. Rivier. Built with Sphinx using a theme provided by Read the Docs.
176 | 181 | 182 | 183 | -------------------------------------------------------------------------------- /genal/association.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from pandas.api.types import is_numeric_dtype 4 | import scipy.stats as st 5 | import os, subprocess 6 | 7 | from .extract_prs import check_pfiles 8 | from .tools import get_plink_path, run_plink_command 9 | 10 | 11 | def association_test_func_plink2(data, covar_list, standardize, name, data_pheno, pheno_type): 12 | """ 13 | Conduct single-SNP association tests against a phenotype. 14 | 15 | This function performs a series of operations: 16 | 1. Checks for necessary preliminary steps. 17 | 2. Updates the PSAM file with the phenotype data. 18 | 3. Creates a covariate file if required. 19 | 4. Runs a PLINK association test. 20 | 5. Processes the results and returns them. 21 | 22 | Args: 23 | data (pd.DataFrame): Genetic data with the standard Geno columns. 24 | covar_list (list): List of column names in the data_pheno DataFrame to use as covariates. 25 | standardize (bool): Flag indicating if the phenotype needs standardization. 26 | name (str): Prefix for the filenames used during the process. 27 | data_pheno (pd.DataFrame): Phenotype data with at least an IID and PHENO columns. 28 | pheno_type (str): Type of phenotype ('binary' or 'quant'). 29 | 30 | Returns: 31 | pd.DataFrame: Processed results of the association test. 32 | 33 | This function corresponds to the following Geno method: :meth:`Geno.association_test`. 34 | """ 35 | 36 | # Check necessary files are available 37 | genetic_path = os.path.join("tmp_GENAL", f"{name}_allchr") 38 | print(genetic_path) 39 | if not check_pfiles(genetic_path): 40 | raise FileNotFoundError( 41 | "Run the extract_snps() method before performing association tests." 42 | ) 43 | if data.shape[0] == 0: 44 | raise ValueError( 45 | "No SNPs for the association tests. Check the .data or .data_clumped dataframes." 46 | ) 47 | 48 | # Update phenotype in the PSAM file 49 | psam = _prepare_psam_file(genetic_path, data_pheno, pheno_type, standardize) 50 | 51 | # Prepare covariate file if covariates are provided 52 | covar_list, covar_filename = _handle_covariates(covar_list, data_pheno, name) 53 | 54 | # Execute PLINK association test 55 | output = _run_plink2_assoc_test( 56 | genetic_path, name, covar_filename, covar_list, pheno_type 57 | ) 58 | 59 | # Process and return results 60 | return _process_results_plink2(output, data, pheno_type) 61 | 62 | def _run_plink2_assoc_test( 63 | genetic_path, name, covar_filename, covar_list, pheno_type 64 | ): 65 | """Helper function to execute the PLINK 2.0 association test.""" 66 | 67 | print( 68 | f"Running {'linear' if pheno_type == 'quant' else 'logistic'} association tests on {genetic_path} data " 69 | f"{f'with adjustment for: {covar_list}' if len(covar_list) > 0 else 'without covariates. This is not recommended'}." 
70 | ) 71 | 72 | output = os.path.join("tmp_GENAL", name) 73 | 74 | # Build PLINK 2.0 command - we can use --pfile since extract_snps now creates pgen files 75 | command = [ 76 | get_plink_path(), 77 | "--pfile", genetic_path, 78 | "--glm", 79 | *(["allow-no-covars"] if len(covar_list) == 0 else []), 80 | "no-x-sex", 81 | "--no-input-missing-phenotype", 82 | "--pheno-name", "PHENO1" 83 | ] 84 | 85 | if len(covar_list) > 0: 86 | command.extend([ 87 | "--covar", covar_filename, 88 | "--covar-name", ",".join(covar_list) 89 | ]) 90 | 91 | command.extend(["--out", output]) 92 | 93 | try: 94 | subprocess.run(command, capture_output=True, text=True, check=True) 95 | except Exception as e: 96 | #Handle the case where the association fails because of numerical instability in the covariates 97 | if "scales vary too widely" in str(e): 98 | print("The association test failed because of numerical instability in the covariates. Rescaling the covariates.") 99 | command.extend(["--covar-variance-standardize"]) 100 | run_plink_command(command) 101 | 102 | else: 103 | print(f"Error running PLINK command: {e}") 104 | print(f"PLINK stdout: {e.stdout}") 105 | print(f"PLINK stderr: {e.stderr}") 106 | raise ValueError("PLINK command failed. Check the error messages above for details.") 107 | 108 | return output 109 | 110 | def _process_results_plink2(output, data, pheno_type): 111 | """Helper function to process results after the PLINK association test.""" 112 | # Path to PLINK results 113 | method = "logistic.hybrid" if pheno_type == "binary" else "linear" 114 | results_path = output + f".PHENO1.glm." + method 115 | assoc = pd.read_csv(results_path, delimiter="\s+") 116 | 117 | # Filter to keep only coefficients corresponding to our phenotype 118 | assoc = assoc[assoc["TEST"] == "ADD"] 119 | 120 | # If logistic regression, log-transform the odds ratio 121 | assoc["BETA"] = np.log(assoc.OR) if pheno_type == "binary" else assoc.BETA 122 | 123 | n_na = assoc["BETA"].isna().sum() 124 | 125 | # Rename columns 126 | assoc.rename(columns={"#CHROM": "CHR", "LOG(OR)_SE": "SE"}, errors="ignore", inplace=True) 127 | 128 | # Merge results with the clumped data 129 | data = data.drop(axis=1, columns=["BETA", "SE", "P"], errors="ignore").merge( 130 | assoc[["CHR","POS", "BETA", "SE", "A1", "P"]], how="inner", on=["CHR", "POS"] 131 | ) 132 | 133 | # Adjust beta values based on allele match 134 | data["BETA"] = np.where( 135 | data.EA == data.A1, data.BETA, np.where(data.NEA == data.A1, -data.BETA, np.nan) 136 | ) 137 | 138 | # Drop unnecessary columns 139 | data = data.drop( 140 | axis=1, columns=["A1"], errors="ignore" 141 | ) 142 | 143 | # Remove rows with mismatches in allele columns and notify the user 144 | nrow_previous = data.shape[0] 145 | data = data.dropna(subset="BETA") 146 | delta_nrow = nrow_previous - data.shape[0] - n_na 147 | if (delta_nrow > 0) or (n_na > 0): 148 | print( 149 | f"{f'{n_na}({n_na/nrow_previous*100:.3f}%) SNP-trait tests returned NA value and ' if n_na>0 else ''}{delta_nrow}({delta_nrow/nrow_previous*100:.3f}%) SNPs removed due to allele discrepancies between the main data and the genetic data." 
150 | ) 151 | return data 152 | 153 | 154 | def _prepare_psam_file(genetic_path, data_pheno, pheno_type, standardize): 155 | """Helper function to prepare the PSAM file with phenotype data.""" 156 | # Read the PSAM file 157 | psam = pd.read_csv(genetic_path + ".psam", delimiter="\t") 158 | 159 | # Ensure IID column types match before merging to prevent errors 160 | if '#IID' in psam.columns and 'IID' in data_pheno.columns: 161 | psam['#IID'] = psam['#IID'].astype(data_pheno['IID'].dtype) 162 | elif 'IID' in psam.columns and 'IID' in data_pheno.columns: 163 | psam['IID'] = psam['IID'].astype(data_pheno['IID'].dtype) 164 | 165 | # Merge phenotype data with the PSAM dataframe depending on column present 166 | if "#FID" in psam.columns: 167 | data_pheno_trait = data_pheno[["FID", "IID", "PHENO"]].rename(columns={"FID": "#FID", "PHENO": "PHENO1"}).copy() 168 | psam = psam.merge(data_pheno_trait, how="left", on=["#FID", "IID"], indicator=True) 169 | else: 170 | data_pheno_trait = data_pheno[["IID", "PHENO"]].rename(columns={"IID": "#IID", "PHENO": "PHENO1"}).copy() 171 | psam = psam.merge(data_pheno_trait, how="left", on=["#IID"], indicator=True) 172 | 173 | # Verify that the merge was successful 174 | if (psam["_merge"] == "both").sum() == 0: 175 | raise ValueError( 176 | "The IDs in the phenotype dataframe are inconsistent with those in the genetic dataset. Call set_phenotype() method again, specifying the correct column names for the genetic IDs (IID and FID)." 177 | ) 178 | psam.drop(axis=1, columns=["_merge"], inplace=True, errors="ignore") 179 | 180 | # Count the number of individuals with a valid phenotype trait 181 | n_non_na = psam.shape[0] - psam.PHENO1.isna().sum() 182 | print( 183 | f"{n_non_na} individuals are present in the genetic data and have a valid phenotype trait." 184 | ) 185 | 186 | # Update phenotype values based on its type 187 | if pheno_type == "binary": 188 | psam["PHENO1"] = psam["PHENO1"] + 1 189 | psam["PHENO1"] = psam["PHENO1"].astype("Int64") 190 | psam["PHENO1"] = psam["PHENO1"].astype(str).replace('', 'NA') 191 | if (pheno_type == "quant") & (standardize == True): 192 | # Standardizing for quantitative phenotypes 193 | print( 194 | "Standardizing the phenotype to approximate a normal distribution. Use standardize = False if you do not want to standardize." 195 | ) 196 | psam["PHENO1"] = (psam["PHENO1"] - psam["PHENO1"].mean(skipna=True)) / psam["PHENO1"].std(skipna=True) 197 | psam["PHENO1"] = psam["PHENO1"].fillna('NA') 198 | 199 | # Make sure the SEX column is not empty without modifying existing values 200 | psam["SEX"] = psam["SEX"].replace('', 'NA').fillna('NA') 201 | 202 | psam.to_csv(genetic_path + ".psam", sep="\t", index=False) 203 | return psam 204 | 205 | 206 | def _handle_covariates(covar_list, data_pheno, name): 207 | """Helper function to prepare the covariate file.""" 208 | if len(covar_list) > 0: 209 | # Ensure all covariates are present in phenotype data 210 | for col in covar_list: 211 | if col not in data_pheno.columns: 212 | raise TypeError( 213 | f"The {col} column is not found in the .phenotype dataframe." 214 | ) 215 | # Select required columns and rename columns 216 | data_cov = data_pheno[["FID", "IID"] + covar_list].copy() 217 | 218 | # Ensure the covariates are numeric and not trivial (lead to association fail) 219 | for col in covar_list: 220 | if data_pheno[col].nunique() == 1: 221 | print( 222 | f"The {col} covariate contains only one value and is removed from the tests." 
223 | ) 224 | data_cov.drop(axis=1, columns=[col], inplace=True) 225 | covar_list.remove(col) 226 | if not pd.api.types.is_numeric_dtype(data_pheno[col]): 227 | print( 228 | f"The {col} covariate is not numeric and is removed from the tests." 229 | ) 230 | data_cov.drop(axis=1, columns=[col], inplace=True, errors="ignore") 231 | covar_list.remove(col) 232 | 233 | # Remove rows with NA values and print their number 234 | nrows = data_cov.shape[0] 235 | data_cov.dropna(inplace=True) 236 | removed_rows = nrows - data_cov.shape[0] 237 | if removed_rows > 0: 238 | print( 239 | f"{removed_rows}({removed_rows/nrows*100:.3f}%) individuals have NA values in the covariates columns and will be excluded from the association tests." 240 | ) 241 | 242 | # Define the covariate filename 243 | covar_filename = os.path.join("tmp_GENAL", f"{name}_covar.cov") 244 | # Ensure FID and IID are in integer format and write the covariate file 245 | data_cov["IID"] = data_cov["IID"].astype("Int64") 246 | data_cov["FID"] = data_cov["FID"].astype("Int64") 247 | data_cov.to_csv(covar_filename, sep=" ", header=True, index=False) 248 | covar = True 249 | else: 250 | covar = False 251 | covar_filename = None 252 | return covar_list, covar_filename 253 | 254 | 255 | ### __________________________ 256 | ### Set phenotype functions 257 | ### __________________________ 258 | 259 | def set_phenotype_func(data_original, PHENO, PHENO_type, IID, FID=None, alternate_control=False): 260 | """ 261 | Set a phenotype dataframe containing individual IDs and phenotype columns formatted for single-SNP association testing. 262 | 263 | Args: 264 | data (pd.DataFrame): Contains at least an individual IDs column and one phenotype column. 265 | IID (str): Name of the individual IDs column in data. 266 | PHENO (str): Name of the phenotype column in data. 267 | PHENO_type (str, optional): Type of the phenotype column. Either "quant" for quantitative (continuous) or "binary". 268 | The function tries to infer the type if not provided. 269 | FID (str, optional): Name of the family ID column in data. If not provided, FID will be set to IID values. 270 | alternate_control (bool): Assumes that for a binary trait, the controls are coded with the most frequent value. 271 | Use True to reverse the assumption. 272 | 273 | Returns: 274 | pd.DataFrame: The modified data. 275 | str: The inferred or provided PHENO_type. 
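    Example (illustrative; assumes `pheno_df` has an "eid" ID column and a binary "stroke" column):
        phenotype, pheno_type = set_phenotype_func(pheno_df, PHENO="stroke", PHENO_type="binary", IID="eid")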
    """
    data = data_original.copy()
    _validate_columns_existence(data, PHENO, IID, FID)

    data = _standardize_column_names(data, PHENO, IID, FID)
    PHENO_type = _determine_phenotype_type(data, PHENO_type)
    data = _validate_and_process_phenotype(data, PHENO, PHENO_type, alternate_control)
    _report_na_values(data)

    print("The phenotype data is stored in the .phenotype attribute.")
    return data, PHENO_type


def _validate_columns_existence(data, PHENO, IID, FID):
    """Checks if columns exist and raises errors if not."""
    # Check if PHENO is a string
    if not isinstance(PHENO, str):
        raise ValueError("The PHENO argument must be a string containing the name of the phenotype column.")
    # Check if IID is a string
    if not isinstance(IID, str):
        raise ValueError("The IID argument must be a string containing the name of the individual IDs column.")

    for column in [PHENO, IID]:
        # Raise an error if the column name is not provided
        if column is None:
            raise ValueError(f"Please provide a name for the {column} variable.")
        # Raise an error if the column does not exist in the data
        if column not in data.columns:
            raise ValueError(
                f"The column '{column}' is not present in the dataset. This column is required!"
            )

    # Handle FID column
    if FID is not None and FID not in data.columns:
        raise ValueError(f"The column '{FID}' is not present in the provided dataset.")

    # If IID or FID is numerical, convert to integer
    if is_numeric_dtype(data[IID]):
        data[IID] = data[IID].astype("Int64")
    if FID is not None and is_numeric_dtype(data[FID]):
        data[FID] = data[FID].astype("Int64")

    if data.shape[0] == 0:
        raise ValueError("The phenotype dataframe is empty.")


def _standardize_column_names(data, PHENO, IID, FID):
    """Standardizes the column names to 'IID' and 'PHENO'."""
    # Drop redundant columns if they exist and rename the target columns to standard names
    if PHENO != "PHENO":
        data.drop(axis=1, columns=["PHENO"], errors="ignore", inplace=True)
    if IID != "IID":
        data.drop(axis=1, columns=["IID"], errors="ignore", inplace=True)
    data.rename(columns={IID: "IID", PHENO: "PHENO"}, inplace=True)

    if FID is not None:
        if FID != "FID":
            data.drop(axis=1, columns=["FID"], errors="ignore", inplace=True)
        data.rename(columns={FID: "FID"}, inplace=True)
    else:
        data["FID"] = data["IID"]
        print(
            "The FID column was not provided. The FIDs are assumed to be the same as the IIDs."
        )

    return data


def _determine_phenotype_type(data, PHENO_type):
    """Guesses or validates the phenotype type."""
    # If the phenotype type is not given, deduce it from the unique values in the column
    if PHENO_type is None:
        if len(np.unique(data.PHENO.dropna())) == 2:
            print(
                "Detected a binary phenotype in the 'PHENO' column. Specify 'PHENO_type=\"quant\"' if this is incorrect."
            )
            return "binary"
        else:
            print(
                "Detected a quantitative phenotype in the 'PHENO' column. Specify 'PHENO_type=\"binary\"' if this is incorrect."
            )
            return "quant"
    return PHENO_type


def _validate_and_process_phenotype(data, PHENO, PHENO_type, alternate_control):
    """Validates the phenotype and processes it accordingly."""
    # Process the phenotype based on its type
    if PHENO_type == "binary":
        _process_binary_phenotype(data, PHENO, alternate_control)
    elif PHENO_type == "quant":
        _validate_quantitative_phenotype(data, PHENO)
    else:
        raise ValueError("Accepted values for 'PHENO_type' are 'binary' or 'quant'.")
    return data


def _process_binary_phenotype(data, PHENO, alternate_control):
    """Processes a binary phenotype."""
    # Ensure that the phenotype is binary
    if len(np.unique(data.PHENO.dropna())) != 2:
        raise ValueError(
            f"The '{PHENO}' column is not binary: it must contain exactly two distinct values."
        )

    if alternate_control:
        code_control = data.PHENO.value_counts().index[1]
        code_case = data.PHENO.value_counts().index[0]
    else:
        code_control = data.PHENO.value_counts().index[0]
        code_case = data.PHENO.value_counts().index[1]

    print(
        f"Identified {code_control} as the control code in 'PHENO'. {'Set alternate_control=True to reverse this interpretation.' if not alternate_control else ''}"
    )

    # Update the control and case codings
    data.replace({"PHENO": {code_control: 0, code_case: 1}}, inplace=True)

    # Print the number and percentage of cases
    n_case = int(data.PHENO.sum())
    print(
        f"There are {n_case} ({n_case/data.shape[0]*100:.3f}%) cases in the 'PHENO' column."
    )


def _validate_quantitative_phenotype(data, PHENO):
    """Validates a quantitative phenotype."""
    # Ensure that the phenotype is numeric
    if not is_numeric_dtype(data.PHENO):
        raise ValueError(
            f"The '{PHENO}' column must contain numeric values for a quantitative phenotype."
        )


def _report_na_values(data):
    """Reports the number of NA values in the 'IID' and 'PHENO' columns."""
    nrows = data.shape[0]
    n_nan_id = data.IID.isna().sum()
    n_nan_pheno = data.PHENO.isna().sum()

    # Report NA values in the IID and PHENO columns, if any exist
    if n_nan_id > 0:
        print(
            f"Detected {n_nan_id} NA values in the 'IID' column, accounting for {n_nan_id/nrows*100:.3f}% of entries. These will be omitted during analyses."
        )
    if n_nan_pheno > 0:
        print(
            f"Detected {n_nan_pheno} NA values in the 'PHENO' column, accounting for {n_nan_pheno/nrows*100:.3f}% of entries. These will be omitted during analyses."
        )
--------------------------------------------------------------------------------
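Note on the binary coding used above: _process_binary_phenotype recodes the two observed
values to 0 (control) and 1 (case), treating the most frequent value as the control unless
alternate_control=True, and _prepare_psam_file then shifts this to the 1/2 case-control
coding conventionally expected by PLINK in the .psam file. A minimal, self-contained sketch
of that behaviour on a toy pandas Series (illustrative only, not part of the package; the
variable names are made up):

import pandas as pd

# Toy binary phenotype: 2 is the most frequent value, so it is treated as the control code
pheno = pd.Series([2, 2, 2, 1, 1])

# Same logic as _process_binary_phenotype with alternate_control=False
code_control = pheno.value_counts().index[0]   # most frequent value -> control
code_case = pheno.value_counts().index[1]      # the other value -> case
recoded = pheno.replace({code_control: 0, code_case: 1})

# _prepare_psam_file then shifts 0/1 to the 1/2 coding (1 = control, 2 = case)
plink_coded = recoded + 1
print(plink_coded.tolist())   # [1, 1, 1, 2, 2]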