├── .DS_Store
├── .gitattributes
├── .github
│   └── workflows
│       └── install-ci.yml
├── .readthedocs.yml
├── Dockerfile
├── LICENSE
├── README.md
├── TODO.md
├── docs
│   ├── Makefile
│   ├── _build
│   │   ├── doctrees
│   │   │   ├── environment.pickle
│   │   │   ├── imputation.doctree
│   │   │   ├── index.doctree
│   │   │   ├── install
│   │   │   │   ├── linux.doctree
│   │   │   │   └── macosx.doctree
│   │   │   ├── installation.doctree
│   │   │   ├── pca.doctree
│   │   │   ├── pca
│   │   │   │   ├── joint.doctree
│   │   │   │   ├── normal.doctree
│   │   │   │   └── project.doctree
│   │   │   ├── phasing.doctree
│   │   │   ├── preimp_qc.doctree
│   │   │   ├── qb.doctree
│   │   │   └── tutorial.doctree
│   │   └── html
│   │       ├── .buildinfo
│   │       ├── _images
│   │       │   └── qc_workflow.png
│   │       ├── _sources
│   │       │   ├── imputation.rst.txt
│   │       │   ├── index.rst.txt
│   │       │   ├── install
│   │       │   │   ├── linux.rst.txt
│   │       │   │   └── macosx.rst.txt
│   │       │   ├── installation.rst.txt
│   │       │   ├── pca.rst.txt
│   │       │   ├── pca
│   │       │   │   ├── joint.rst.txt
│   │       │   │   ├── normal.rst.txt
│   │       │   │   └── project.rst.txt
│   │       │   ├── phasing.rst.txt
│   │       │   ├── preimp_qc.rst.txt
│   │       │   ├── qb.rst.txt
│   │       │   └── tutorial.rst.txt
│   │       ├── _static
│   │       │   ├── _sphinx_javascript_frameworks_compat.js
│   │       │   ├── basic.css
│   │       │   ├── css
│   │       │   │   ├── badge_only.css
│   │       │   │   ├── fonts
│   │       │   │   │   ├── Roboto-Slab-Bold.woff
│   │       │   │   │   ├── Roboto-Slab-Bold.woff2
│   │       │   │   │   ├── Roboto-Slab-Regular.woff
│   │       │   │   │   ├── Roboto-Slab-Regular.woff2
│   │       │   │   │   ├── fontawesome-webfont.eot
│   │       │   │   │   ├── fontawesome-webfont.svg
│   │       │   │   │   ├── fontawesome-webfont.ttf
│   │       │   │   │   ├── fontawesome-webfont.woff
│   │       │   │   │   ├── fontawesome-webfont.woff2
│   │       │   │   │   ├── lato-bold-italic.woff
│   │       │   │   │   ├── lato-bold-italic.woff2
│   │       │   │   │   ├── lato-bold.woff
│   │       │   │   │   ├── lato-bold.woff2
│   │       │   │   │   ├── lato-normal-italic.woff
│   │       │   │   │   ├── lato-normal-italic.woff2
│   │       │   │   │   ├── lato-normal.woff
│   │       │   │   │   └── lato-normal.woff2
│   │       │   │   └── theme.css
│   │       │   ├── custom.css
│   │       │   ├── doctools.js
│   │       │   ├── documentation_options.js
│   │       │   ├── file.png
│   │       │   ├── fonts
│   │       │   │   ├── Lato
│   │       │   │   │   ├── lato-bold.eot
│   │       │   │   │   ├── lato-bold.ttf
│   │       │   │   │   ├── lato-bold.woff
│   │       │   │   │   ├── lato-bold.woff2
│   │       │   │   │   ├── lato-bolditalic.eot
│   │       │   │   │   ├── lato-bolditalic.ttf
│   │       │   │   │   ├── lato-bolditalic.woff
│   │       │   │   │   ├── lato-bolditalic.woff2
│   │       │   │   │   ├── lato-italic.eot
│   │       │   │   │   ├── lato-italic.ttf
│   │       │   │   │   ├── lato-italic.woff
│   │       │   │   │   ├── lato-italic.woff2
│   │       │   │   │   ├── lato-regular.eot
│   │       │   │   │   ├── lato-regular.ttf
│   │       │   │   │   ├── lato-regular.woff
│   │       │   │   │   └── lato-regular.woff2
│   │       │   │   └── RobotoSlab
│   │       │   │       ├── roboto-slab-v7-bold.eot
│   │       │   │       ├── roboto-slab-v7-bold.ttf
│   │       │   │       ├── roboto-slab-v7-bold.woff
│   │       │   │       ├── roboto-slab-v7-bold.woff2
│   │       │   │       ├── roboto-slab-v7-regular.eot
│   │       │   │       ├── roboto-slab-v7-regular.ttf
│   │       │   │       ├── roboto-slab-v7-regular.woff
│   │       │   │       └── roboto-slab-v7-regular.woff2
│   │       │   ├── jquery.js
│   │       │   ├── js
│   │       │   │   ├── badge_only.js
│   │       │   │   ├── theme.js
│   │       │   │   └── versions.js
│   │       │   ├── language_data.js
│   │       │   ├── minus.png
│   │       │   ├── plus.png
│   │       │   ├── pygments.css
│   │       │   ├── searchtools.js
│   │       │   └── sphinx_highlight.js
│   │       ├── genindex.html
│   │       ├── imputation.html
│   │       ├── index.html
│   │       ├── install
│   │       │   ├── linux.html
│   │       │   └── macosx.html
│   │       ├── installation.html
│   │       ├── objects.inv
│   │       ├── pca.html
│   │       ├── pca
│   │       │   ├── joint.html
│   │       │   ├── normal.html
│   │       │   └── project.html
│   │       ├── phasing.html
│   │       ├── preimp_qc.html
│   │       ├── qb.html
│   │       ├── search.html
│   │       ├── searchindex.js
│   │       └── tutorial.html
│   ├── _static
│   │   └── custom.css
│   ├── conf.py
│   ├── images
│   │   └── qc_workflow.png
│   ├── imputation.rst
│   ├── index.rst
│   ├── installation.rst
│   ├── make.bat
│   ├── pca.rst
│   ├── pca
│   │   ├── joint.rst
│   │   ├── normal.rst
│   │   └── project.rst
│   ├── phasing.rst
│   ├── preimp_qc.rst
│   ├── qb.rst
│   ├── requirements.txt
│   └── tutorial.rst
├── env-setup.sh
├── gwaspy
│   ├── .DS_Store
│   ├── __init__.py
│   ├── check_alleles
│   │   ├── __init__.py
│   │   ├── check_alleles.py
│   │   └── flips.py
│   ├── imputation
│   │   ├── __init__.py
│   │   ├── concat_vcfs.py
│   │   ├── glimpse2_impute.py
│   │   ├── imputation.py
│   │   ├── impute.py
│   │   ├── impute5_impute.py
│   │   ├── impute_vcf.py
│   │   └── sex_aut_imp.py
│   ├── pca
│   │   ├── __init__.py
│   │   ├── assign_pop_labels.py
│   │   ├── filter_ref_data.py
│   │   ├── pca.py
│   │   ├── pca_filter_snps.py
│   │   ├── pca_joint.py
│   │   ├── pca_normal.py
│   │   └── pca_project.py
│   ├── phasing
│   │   ├── __init__.py
│   │   ├── concat_vcfs.py
│   │   ├── get_filebase.py
│   │   ├── phase.py
│   │   ├── phase_vcf.py
│   │   ├── phasing.py
│   │   ├── scatter_vcf.py
│   │   └── shapeit5_phase.py
│   ├── preimp_qc
│   │   ├── __init__.py
│   │   ├── aggregators.py
│   │   ├── annotations.py
│   │   ├── plots.py
│   │   ├── preimp_qc.py
│   │   └── report.py
│   └── utils
│       ├── __init__.py
│       ├── export_file.py
│       ├── get_file_size.py
│       ├── natural_sort.py
│       ├── read_file.py
│       ├── reference_liftover.py
│       └── sample_annotations.py
├── nf
│   ├── main.nf
│   ├── modules
│   │   ├── imputation.nf
│   │   └── phasing.nf
│   ├── nextflow.config
│   └── params.json
├── requirements.txt
├── setup.py
└── split_maps.sh
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/.DS_Store
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
*.bim filter=lfs diff=lfs merge=lfs -text
*.fam filter=lfs diff=lfs merge=lfs -text
*.bed filter=lfs diff=lfs merge=lfs -text
data/1kg_annotated.mt/** filter=lfs diff=lfs merge=lfs -text
--------------------------------------------------------------------------------
/.github/workflows/install-ci.yml:
--------------------------------------------------------------------------------
# Run installation checks

name: install

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:
  install:

    runs-on: ubuntu-20.04
    strategy:
      matrix:
        # Versions must be quoted: unquoted 3.10 is parsed as the YAML float 3.1
        python-version: ["3.10", "3.11"]

    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        python -m pip install flake8 pytest pypandoc
        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
    - name: Install GWASpy
      run: |
        python setup.py sdist
        pip3 install dist/gwaspy*
    - name: Check modules
      run: |
        preimp_qc --help
        pca --help
        imputation --help
        phasing --help
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
# https://docs.readthedocs.com/platform/stable/config-file/index.html

# Read the Docs configuration file for Sphinx projects
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required
version: 2

# Set the OS, Python version and other tools you might need
build:
  os: ubuntu-22.04
  tools:
    python: "3.12"
  # You can also specify other tool versions:
  # nodejs: "20"
  # rust: "1.70"
  # golang: "1.20"

# Build documentation in the "docs/" directory with Sphinx
sphinx:
  configuration: docs/conf.py
  # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs
  # builder: "dirhtml"
  # Fail on all warnings to avoid broken references
  # fail_on_warning: true

# Optionally build your docs in additional formats such as PDF and ePub
# formats:
#   - pdf
#   - epub

# Optional but recommended, declare the Python requirements required
# to build your documentation
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
python:
  install:
    - requirements: docs/requirements.txt
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM ubuntu:20.04
LABEL maintainer="Lindo Nkambule (lindonkambule116@gmail.com)"

ARG SAMTOOLS_VERSION=1.13

RUN apt-get update && apt-get install -y software-properties-common && \
    apt-get update && apt-get install -y \
        autoconf \
        automake \
        bzip2 \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        g++ \
        gcc \
        git \
        gzip \
        libboost-all-dev \
        libbz2-dev \
        libcurl4-openssl-dev \
        liblzma-dev \
        libncurses5-dev \
        libssl-dev \
        make \
        python3 \
        python3-pip \
        r-mathlib \
        sudo \
        unzip \
        wget \
        zlib1g-dev \
        && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    apt-get clean && \
    apt-get autoremove -y && \
    rm -rf /var/lib/{apt,dpkg,cache,log}/

# HTSLIB
RUN cd /opt && \
    wget --no-check-certificate https://github.com/samtools/htslib/releases/download/${SAMTOOLS_VERSION}/htslib-${SAMTOOLS_VERSION}.tar.bz2 && \
    tar xf htslib-${SAMTOOLS_VERSION}.tar.bz2 && rm htslib-${SAMTOOLS_VERSION}.tar.bz2 && cd htslib-${SAMTOOLS_VERSION} && \
    ./configure --enable-libcurl --enable-s3 --enable-gcs && \
    make && make install && make clean

COPY makefile_shapeit4 /opt

# SHAPEIT4
RUN git clone https://github.com/odelaneau/shapeit4.git && \
    cd shapeit4 && \
    mv makefile makefile.old && cp /opt/makefile_shapeit4 . && mv makefile_shapeit4 makefile && \
    make && \
    cd /shapeit4/maps && mkdir b37 b38 && gunzip *.gz && \
    tar -xf genetic_maps.b37.tar -C b37/ && \
    tar -xf genetic_maps.b38.tar -C b38/ && \
    rm *.tar

ENV PATH /shapeit4/bin/:${PATH}

# BCFTOOLS
RUN cd /opt && \
    wget --no-check-certificate https://github.com/samtools/bcftools/releases/download/${SAMTOOLS_VERSION}/bcftools-${SAMTOOLS_VERSION}.tar.bz2 && \
    tar -xf bcftools-${SAMTOOLS_VERSION}.tar.bz2 && rm bcftools-${SAMTOOLS_VERSION}.tar.bz2 && cd bcftools-${SAMTOOLS_VERSION} && \
    ./configure --with-htslib=/opt/htslib-${SAMTOOLS_VERSION} && make && make install && make clean

# EAGLE
RUN cd /opt && \
    wget https://data.broadinstitute.org/alkesgroup/Eagle/downloads/Eagle_v2.4.1.tar.gz && \
    gunzip Eagle_v2.4.1.tar.gz && \
    tar xvf Eagle_v2.4.1.tar && \
    cp /opt/Eagle_v2.4.1/tables/genetic_map_hg19_withX.txt.gz /opt && \
    cp /opt/Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt.gz /opt && \
    mv Eagle_v2.4.1/eagle /usr/local/bin/ && \
    rm -rf Eagle_v2.4.1*

# IMPUTE5
COPY impute5_v1.1.5.zip /opt
RUN cd /opt && \
    unzip impute5_v1.1.5.zip && cd impute5_v1.1.5 && \
    mv *_static /usr/local/bin/ && cd /opt && rm -rf impute5_v1.1.5*

# makeScaffold for building haplotype scaffolds for phasing
RUN pip3 install Cython --install-option="--no-cython-compile"

RUN git clone https://github.com/sinanshi/makeScaffold.git && \
    cd makeScaffold && rm makefile && \
    cmake . && \
    make && \
    mv src/scaffold /usr/local/bin/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 Broad Institute of MIT and Harvard

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# GWASpy

![install status](https://github.com/atgu/GWASpy/actions/workflows/install-ci.yml/badge.svg)
![PyPI version](https://badge.fury.io/py/gwaspy.svg)

Genome-wide association studies pypeline (GWASpy): a Python package for performing GWAS QC, PCA, haplotype phasing, and
genotype imputation.

## Installation
GWASpy is available through [PyPI](https://pypi.org/project/gwaspy/). To install, run the command:
```bash
pip3 install gwaspy
```

## Usage
For usage instructions, please visit the [GWASpy documentation](https://gwaspy.readthedocs.io/)

## Copyright and License
GWASpy is freely distributed under the [MIT License](https://github.com/atgu/GWASpy/blob/main/LICENSE)
--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
Todo list for GWASpy
====

## Todo

- [x] Update the report and preimp_qc.py sections to cater for different data types, e.g. case-only/control-only and
  case-control data
- [ ] Add filter functions for handling trio datasets (Mendel errors for IDs+SNPs and HWE p-value for SNPs) and
  update the report and preimp_qc.py sections
- [x] Add support for VCF files and include appropriate filter functions (also
  check https://blog.hail.is/whole-exome-and-whole-genome-sequencing-recommendations/) -> VCF from arrays differs from that from sequencing, so we don't need them here.
- [x] Currently, we're saving intermediate files in /tmp/. Work out a way to store these files temporarily
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = .
BUILDDIR      = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/docs/_build/doctrees/environment.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/doctrees/environment.pickle
--------------------------------------------------------------------------------
/docs/_build/doctrees/imputation.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/doctrees/imputation.doctree
--------------------------------------------------------------------------------
/docs/_build/doctrees/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/doctrees/index.doctree
--------------------------------------------------------------------------------
/docs/_build/doctrees/install/linux.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/doctrees/install/linux.doctree
--------------------------------------------------------------------------------
/docs/_build/doctrees/install/macosx.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/doctrees/install/macosx.doctree
--------------------------------------------------------------------------------
/docs/_build/doctrees/installation.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/doctrees/installation.doctree
--------------------------------------------------------------------------------
/docs/_build/doctrees/pca.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/doctrees/pca.doctree
--------------------------------------------------------------------------------
/docs/_build/doctrees/pca/joint.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/doctrees/pca/joint.doctree
--------------------------------------------------------------------------------
/docs/_build/doctrees/pca/normal.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/doctrees/pca/normal.doctree
--------------------------------------------------------------------------------
/docs/_build/doctrees/pca/project.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/doctrees/pca/project.doctree
--------------------------------------------------------------------------------
/docs/_build/doctrees/phasing.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/doctrees/phasing.doctree
--------------------------------------------------------------------------------
/docs/_build/doctrees/preimp_qc.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/doctrees/preimp_qc.doctree
--------------------------------------------------------------------------------
/docs/_build/doctrees/qb.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/doctrees/qb.doctree
--------------------------------------------------------------------------------
/docs/_build/doctrees/tutorial.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/doctrees/tutorial.doctree
--------------------------------------------------------------------------------
/docs/_build/html/.buildinfo:
--------------------------------------------------------------------------------
# Sphinx build info version 1
# This file records the configuration used when building these files. When it is not found, a full rebuild will be done.
config: 364152833ab1d15953fd0b10d3e8242a
tags: 645f666f9bcd5a90fca523b33c5a78b7
--------------------------------------------------------------------------------
/docs/_build/html/_images/qc_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_images/qc_workflow.png
--------------------------------------------------------------------------------
/docs/_build/html/_sources/imputation.rst.txt:
--------------------------------------------------------------------------------
.. _sec-imputation:

===================
Genotype Imputation
===================

Genotype imputation is the process of estimating missing genotypes from a haplotype or genotype reference panel. It
allows you to accurately evaluate the evidence for association at genetic markers that are not directly genotyped.
GWASpy has a module, :code:`imputation`, for running imputation using IMPUTE5. Because imputation can be a computationally
intensive task, we run it on multiple chunks in parallel, then merge the imputed chunks together at the end. Below are
examples of how to run imputation using either the HGDP+1kGP or your own reference panel.

Examples
########

**1. HGDP+1kGP reference panel**

.. code-block:: sh

    imputation --input-file gs://path/to/file.vcf.bgz --vcf-ref hgdp1kgp --output-filename my_outfilename --out-dir gs://path/to/output/dir --n-samples 1989 --n-ref-samples 4091 --billing-project my-billing-project

**2. Own reference panel**

.. code-block:: sh

    imputation --input-file gs://path/to/file.vcf.bgz --vcf-ref gs://path/to/ref_panel/ALL.chrCNUMBER.vcf --output-filename my_outfilename --out-dir gs://path/to/output/dir --n-samples 1989 --n-ref-samples 4091 --billing-project my-billing-project

.. warning::
    When using your own reference panel, make sure that you use the CNUMBER placeholder in the filename passed to :code:`--vcf-ref`

Arguments and options
#####################

.. list-table::
   :widths: 15 50
   :header-rows: 1

   * - Argument
     - Description
   * - :code:`--input-file`
     - Path to the target VCF, or to a TSV listing the target VCF/BAM files
   * - :code:`--vcf-ref`
     - Reference panel file to use for imputation
   * - :code:`--chromosomes`
     - Chromosome(s) to run imputation for. Default is :code:`all`
   * - :code:`--local`
     - Type of service. Default is the Service backend, where jobs are executed on a multi-tenant compute cluster in Google Cloud
   * - :code:`--billing-project`
     - Billing project to be used for the jobs
   * - :code:`--n-samples`
     - Number of target samples to be imputed. We use this to estimate resources for some of the jobs
   * - :code:`--n-ref-samples`
     - Number of reference samples. We use this to estimate resources for some of the jobs
   * - :code:`--software`
     - Software to use for imputation. Options: [:code:`beagle5`, :code:`impute5`]. Default is :code:`impute5`
   * - :code:`--output-filename`
     - Output filename without file extension
   * - :code:`--out-dir`
     - Path to where output files will be saved

Output
######
The resulting output is a VCF file per chromosome with imputed genotypes.
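
As a quick sanity check, an imputed chromosome can be loaded back into Hail. Below is a minimal sketch; the
per-chromosome output path is hypothetical, so adjust it to the filenames your run actually writes:

.. code-block:: python

    import hail as hl

    # Hypothetical path: one imputed, bgzip-compressed VCF per chromosome
    mt = hl.import_vcf("gs://path/to/output/dir/my_outfilename.chr22.vcf.bgz",
                       reference_genome="GRCh38", force_bgz=True)
    n_variants, n_samples = mt.count()
    print(f"chr22: {n_variants} variants across {n_samples} samples")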
--------------------------------------------------------------------------------
/docs/_build/html/_sources/index.rst.txt:
--------------------------------------------------------------------------------
==========
GWASpy 0.1
==========

GWASpy is an open-source Python package for scalable: (1) Pre-imputation QC; (2) Principal Component Analysis; (3) Haplotype phasing; and (4) Genotype
Imputation. See the `installation page <installation.html>`_ to get started
using GWASpy.

========
Contents
========

.. toctree::
    :maxdepth: 2

    Installation <installation>
    Hail Query and Batch <qb>
    Pre-Imputation QC <preimp_qc>
    Principal Component Analysis <pca>
    Haplotype Phasing <phasing>
    Genotype Imputation <imputation>
    Tutorial <tutorial>
--------------------------------------------------------------------------------
/docs/_build/html/_sources/install/linux.rst.txt:
--------------------------------------------------------------------------------
===========================
Install GWASpy on GNU/Linux
===========================

- Install Java 8.
- Install Python 3.6+.
- Install a recent version of the C and C++ standard libraries. GCC 5.0, LLVM
  version 3.4, or any later version will suffice.
- Install BLAS and LAPACK.
- Install TeX Live.
- Install GWASpy using pip.

On a recent Debian-like system, the following should suffice:

.. code-block:: sh

    apt-get install -y \
        openjdk-8-jre-headless \
        g++ \
        python3.6 python3-pip \
        libopenblas-base liblapack3 \
        texlive-pictures texlive-science texlive-latex-extra latexmk
    python3.6 -m pip install gwaspy
--------------------------------------------------------------------------------
/docs/_build/html/_sources/install/macosx.rst.txt:
--------------------------------------------------------------------------------
==========================
Install GWASpy on Mac OS X
==========================

- Install `Java 8 `__.
- Install Python 3.6+.
- Install MacTeX.
- Open Terminal.app and execute ``pip3 install gwaspy``.
--------------------------------------------------------------------------------
/docs/_build/html/_sources/installation.rst.txt:
--------------------------------------------------------------------------------
.. _sec-installation:

=================
Installing GWASpy
=================

GWASpy leverages Hail to enable efficient processing of data directly from Google Cloud. As such, the first step is to
install Hail as per the instructions `here `_. After you have installed Hail, GWASpy can be installed using

.. code-block:: sh

    pip install gwaspy

It is important to note that the command above installs GWASpy locally (or wherever you ran the command). For the
:code:`phasing` and :code:`imputation` modules, which use Hail Batch, this is enough. For the :code:`preimp_qc` and
:code:`pca` modules, which use Hail Query, you also have to ensure that the Dataproc cluster has GWASpy installed; there are
examples showing how to do this in the :ref:`preimp_qc` and :ref:`pca` sections.
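
To confirm the installation succeeded, you can query the installed package metadata. Below is a minimal sketch
using only the Python standard library (the package name on PyPI is ``gwaspy``):

.. code-block:: python

    from importlib.metadata import version

    # Prints the installed GWASpy version string
    print(version("gwaspy"))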
--------------------------------------------------------------------------------
/docs/_build/html/_sources/pca.rst.txt:
--------------------------------------------------------------------------------
.. _sec-pca:
.. _pca:

============================
Principal Component Analysis
============================

Principal component analysis (PCA) can be used to detect and quantify the genetic structure of populations.
In GWASpy, the :code:`pca` module can be run in 3 different ways: (1) normal PCA, without a reference panel; (2) joint PCA; or (3) projection PCA.

.. toctree::
    :maxdepth: 1

    Normal PCA <pca/normal>
    Joint PCA <pca/joint>
    Projection PCA <pca/project>

Arguments and options
#####################

.. list-table::
   :widths: 15 50
   :header-rows: 1

   * - Argument
     - Description
   * - :code:`--ref-dirname`
     - Path to where the reference data is
   * - :code:`--ref-basename`
     - Reference basename
   * - :code:`--ref-info`
     - Path to reference information. Tab-delimited file with sample IDs and their SuperPop labels
   * - :code:`--reference`
     - Genome reference build. Default is GRCh38. Options: [:code:`GRCh37`, :code:`GRCh38`]
   * - :code:`--pca-type`
     - Type of PCA to run. Default is normal. Options: [:code:`normal`, :code:`project`, :code:`joint`]
   * - :code:`--data-dirname`
     - Path to where the data is
   * - :code:`--data-basename`
     - Data basename
   * - :code:`--input-type`
     - Data input type. Options: [:code:`hail`, :code:`plink`, :code:`vcf`]
   * - :code:`--maf`
     - Include only SNPs with MAF >= NUM in the PCA. Default is 0.05
   * - :code:`--hwe`
     - Include only SNPs with HWE p-value >= NUM in the PCA. Default is 1e-03
   * - :code:`--geno`
     - Include only SNPs with call rate > NUM. Default is 0.98
   * - :code:`--ld-cor`
     - Squared correlation threshold (exclusive upper bound). Must be in the range [0.0, 1.0]. Default is 0.2
   * - :code:`--ld-window`
     - Window size in base pairs (inclusive upper bound). Default is 250000
   * - :code:`--npcs`
     - Number of PCs to use. Default is 20
   * - :code:`--relatedness-method`
     - Method to use for the inference of relatedness. Default is pc_relate. Options: [:code:`pc_relate`, :code:`ibd`, :code:`king`]
   * - :code:`--relatedness-thresh`
     - Threshold value to use in relatedness checks. Default is 0.98
   * - :code:`--prob`
     - Minimum probability of belonging to a given population for the population to be set. Default is 0.8
   * - :code:`--out-dir`
     - Path to where output files will be saved

Output
######
A tab-delimited file with the first 20 principal components (PCs) computed and
graphical visualizations of the PCs are generated.
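
The tab-delimited scores file can be loaded straight back into Hail for inspection. Below is a minimal sketch;
the scores filename is hypothetical, so adjust it to what the :code:`pca` module actually writes to :code:`--out-dir`:

.. code-block:: python

    import hail as hl

    # Hypothetical path: the PC scores file written by the pca module
    scores = hl.import_table("gs://my-gcs/bucket/test_data/my_data_basename.pca.scores.tsv",
                             impute=True)
    scores.describe()  # one column per PC is expected (20 by default)
    scores.show(5)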
--------------------------------------------------------------------------------
/docs/_build/html/_sources/pca/joint.rst.txt:
--------------------------------------------------------------------------------
================================
Joint PCA (with a reference)
================================

The joint PCA method works by first merging (joining) the input dataset with the reference dataset, by locus and allele(s).
This is followed by "normal" PCA on the merged dataset.

Below is an example of how you can run joint PCA via the command line or inside a Python script.

#. Python (inside a Python script)

    .. code-block:: python

        import gwaspy.pca as pca
        pca.pca.pca(data_dirname="gs://my-gcs/bucket/test_data/", data_basename="my_data_basename",
                    out_dir="gs://my-gcs/bucket/test_data/", input_type="my_input_type", reference="GRCh37",
                    pca_type="joint")

#. Command line

    .. code-block:: sh

        pca --data-dirname gs://my-gcs/bucket/test_data/ --data-basename my_data_basename --out-dir gs://my-gcs/bucket/test_data/ --input-type my_input_type --reference grch37 --pca-type joint
--------------------------------------------------------------------------------
/docs/_build/html/_sources/pca/normal.rst.txt:
--------------------------------------------------------------------------------
================================
Normal PCA (without a reference)
================================

GWASpy allows you to run normal PCA without any reference panel.

Below is an example of how you can run normal PCA without a reference, via the command line or inside a Python script.

#. Python (inside a Python script)

    .. code-block:: python

        import gwaspy.pca as pca
        pca.pca.pca(data_dirname="gs://my-gcs/bucket/test_data/", data_basename="my_data_basename",
                    out_dir="gs://my-gcs/bucket/test_data/", input_type="my_input_type", reference="GRCh37",
                    pca_type="normal")

#. Command line

    .. code-block:: sh

        pca --data-dirname gs://my-gcs/bucket/test_data/ --data-basename my_data_basename --out-dir gs://my-gcs/bucket/test_data/ --input-type my_input_type --reference grch37 --pca-type normal
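
If you want to plot the resulting PCs yourself, a minimal sketch with pandas and matplotlib is shown below; the
local filename and the ``PC1``/``PC2`` column names are assumptions, so adjust them to the scores file your run
produces:

.. code-block:: python

    import pandas as pd
    import matplotlib.pyplot as plt

    # Hypothetical local copy of the scores file written by the pca module
    scores = pd.read_table("my_data_basename.pca.scores.tsv")
    plt.scatter(scores["PC1"], scores["PC2"], s=5)
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.savefig("pca_normal_pc1_vs_pc2.png", dpi=150)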
--------------------------------------------------------------------------------
/docs/_build/html/_sources/pca/project.rst.txt:
--------------------------------------------------------------------------------
================================
Project PCA (with a reference)
================================

You can leverage reference panel information to see how samples in your data cluster on a "global" scale.
PCs are computed using the 1KG+HGDP dataset as a reference panel, and samples in the input dataset are then projected onto the 1KG+HGDP PC space.
A random forest classifier model, adopted from gnomAD, is then used to assign population ancestries in the input dataset.

Below is an example of how you can run projection PCA via the command line or inside a Python script.

#. Python (inside a Python script)

    .. code-block:: python

        import gwaspy.pca as pca
        pca.pca.pca(data_dirname="gs://my-gcs/bucket/test_data/", data_basename="my_data_basename",
                    out_dir="gs://my-gcs/bucket/test_data/", input_type="my_input_type", reference="GRCh37",
                    pca_type="project")

#. Command line

    .. code-block:: sh

        pca --data-dirname gs://my-gcs/bucket/test_data/ --data-basename my_data_basename --out-dir gs://my-gcs/bucket/test_data/ --input-type my_input_type --reference grch37 --pca-type project
--------------------------------------------------------------------------------
/docs/_build/html/_sources/phasing.rst.txt:
--------------------------------------------------------------------------------
.. _sec-phasing:

=================
Haplotype Phasing
=================

Knowing the phase of a haplotype allows us to impute low-frequency variants, which makes haplotype phasing an
important step before genotype imputation. GWASpy has a module, :code:`phasing`, for performing phasing. Phasing can
be run with or without a reference panel using SHAPEIT5.

GWASpy can handle both array and WGS data. For array data, the user can pass a VCF/BCF file with all the chromosomes,
and GWASpy will use SHAPEIT5 to phase the chromosomes in parallel. Since WGS data has more variants, phasing is parallelized across
multiple chunks in each chromosome. It's also important to note that phasing of WGS data includes phasing common
variants first, followed by phasing rare variants.

Another important aspect of phasing is the use of a reference panel. In many cases (small sample sizes), including a reference panel when
phasing improves accuracy. By default, GWASpy runs phasing without a reference panel, but there is an option to use a
reference panel as shown below.

Examples
########

**1. Without a reference panel**

.. code-block:: sh

    phasing --input-vcf gs://path/to/file.vcf.bgz --output-filename outfilename.phased --out-dir gs://path/to/output/dir --genome-build GRCh38 --billing-project my-billing-project

**2. HGDP+1KG reference panel**

Set :code:`--vcf-ref` to :code:`hgdp1kgp`

.. code-block:: sh

    phasing --input-vcf gs://path/to/file.vcf.bgz --output-filename my_outfilename --out-dir gs://path/to/output/dir --genome-build GRCh38 --billing-project my-billing-project --vcf-ref hgdp1kgp

**3. Own reference panel**

.. note::
    1. If you're using your own reference panel, make sure the files are bgzip-compressed.
    2. The chromosome X reference file must be named X and not 23

Say you have your reference panel files for each chromosome stored in gs://ref_panel/ALL.chr{1..22,X}.vcf,
you would pass the path to :code:`--vcf-ref` as gs://ref_panel/ALL.chr\ **CNUMBER**\ .vcf.
GWASpy uses **CNUMBER** as a placeholder for the chromosomes. Then you can run phasing as:

.. code-block:: sh

    phasing --input-vcf gs://path/to/file.vcf.bgz --output-filename outfilename.phased --out-dir gs://path/to/output/dir --genome-build GRCh38 --billing-project my-billing-project --vcf-ref gs://ref_panel/ALL.chrCNUMBER.vcf
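
To see what the placeholder expands to, the sketch below substitutes each chromosome name for **CNUMBER**
(illustration only; the actual substitution happens inside the :code:`phasing` module):

.. code-block:: python

    ref_template = "gs://ref_panel/ALL.chrCNUMBER.vcf"
    # Chromosome X is named "X", not "23"
    chromosomes = [str(c) for c in range(1, 23)] + ["X"]
    ref_files = [ref_template.replace("CNUMBER", c) for c in chromosomes]
    print(ref_files[0])   # gs://ref_panel/ALL.chr1.vcf
    print(ref_files[-1])  # gs://ref_panel/ALL.chrX.vcf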

.. note::
    For nextflow users, the idea is the same. The only difference is that you have to update the params.json file. Examples
    are provided in the tutorial section of the documentation.

Arguments and options
#####################

.. list-table::
   :widths: 15 50
   :header-rows: 1

   * - Argument
     - Description
   * - :code:`--input-vcf`
     - Path to where the VCF file to be phased is
   * - :code:`--vcf-ref`
     - VCF file for reference haplotypes if phasing with a reference panel
   * - :code:`--pedigree`
     - Pedigree (PLINK FAM) file
   * - :code:`--local`
     - Type of service. Default is the Service backend, where jobs are executed on a multi-tenant compute cluster in Google Cloud
   * - :code:`--billing-project`
     - Billing project to be used for the job(s)
   * - :code:`--genome-build`
     - Genome reference build. Default is GRCh38. Options: [:code:`GRCh37`, :code:`GRCh38`]
   * - :code:`--data-type`
     - Array or WGS data. Default is array. Options: [:code:`array`, :code:`wgs`]
   * - :code:`--fill-tags`
     - Whether or not to add the AC tag required by SHAPEIT5. Including :code:`--fill-tags` in your command will enable this step
   * - :code:`--software`
     - Software to use for phasing. Options: [:code:`beagle`, :code:`shapeit`]. Default is :code:`shapeit`
   * - :code:`--output-filename`
     - Output filename without file extension
   * - :code:`--out-dir`
     - Path to where output files will be saved

Output
######
The resulting output is a VCF file per chromosome with phased haplotypes.
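
A quick way to confirm phasing worked is to check the fraction of phased genotype calls in an output file.
Below is a minimal sketch in Hail; the per-chromosome output path is hypothetical:

.. code-block:: python

    import hail as hl

    # Hypothetical path: one phased VCF per chromosome
    mt = hl.import_vcf("gs://path/to/output/dir/outfilename.phased.chr1.vcf.bgz",
                       reference_genome="GRCh38", force_bgz=True)
    # After phasing, (nearly) all genotype calls should be phased
    print(mt.aggregate_entries(hl.agg.fraction(mt.GT.phased)))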
--------------------------------------------------------------------------------
/docs/_build/html/_sources/preimp_qc.rst.txt:
--------------------------------------------------------------------------------
.. _sec-pre_imputation_qc:
.. _preimp_qc:

===================================
Pre-Imputation Quality Control (QC)
===================================

Detecting and correcting issues such as genotyping errors, sample handling errors, and population stratification
is important in GWAS. The :code:`preimp_qc` module addresses these issues and cleans (QCs) your data. Below is a flow diagram
of the filters applied when QC'ing input data:

.. image:: images/qc_workflow.png
    :width: 1000px
    :height: 1900px
    :scale: 50 %
    :align: center


Arguments and options
#####################

.. list-table::
   :widths: 15 50
   :header-rows: 1

   * - Argument
     - Description
   * - :code:`--dirname`
     - Path to where the data is
   * - :code:`--basename`
     - Data basename
   * - :code:`--input-type`
     - Input type. Options: [:code:`hail`, :code:`plink`, :code:`vcf`]
   * - :code:`--export-type`
     - Export type. Options: [:code:`hail`, :code:`plink`, :code:`vcf`]
   * - :code:`--out-dir`
     - Directory path to where output files are going to be saved
   * - :code:`--annotations`
     - Annotations file to be used for annotating samples with information such as Sex and Phenotype
   * - :code:`--reference`
     - Reference genome build. Default is GRCh38. Options: [:code:`GRCh37`, :code:`GRCh38`]
   * - :code:`--report`
     - Whether or not to generate a QC PDF report. Default is True
   * - :code:`--liftover`
     - Whether or not to liftover input data to GRCh38. Default is False. Running :code:`preimp_qc` with :code:`--liftover` will activate liftover
   * - :code:`--pre-geno`
     - Include only SNPs with missing rate < NUM (before the ID filter); important for post-merge of multiple platforms
   * - :code:`--mind`
     - Include only IDs with missing rate < NUM
   * - :code:`--fhet-aut`
     - Include only IDs within NUM < FHET < NUM
   * - :code:`--fstat-y`
     - Include only female IDs with fhet < NUM
   * - :code:`--fstat-x`
     - Include only male IDs with fhet > NUM
   * - :code:`--geno`
     - Include only SNPs with missing rate < NUM
   * - :code:`--midi`
     - Include only SNPs with missing-rate difference (case/control) < NUM
   * - :code:`--withpna`
     - Include monomorphic (invariant) SNPs
   * - :code:`--maf`
     - Include only SNPs with MAF >= NUM
   * - :code:`--hwe-th-con`
     - HWE_controls < NUM
   * - :code:`--hwe-th-cas`
     - HWE_cases < NUM

Output(s)
##########
* QC'ed file(s), i.e. files with all the variants and/or samples that fail QC filters removed
* A detailed PDF QC report including pre- and post-QC variant/sample counts, and figures such as Manhattan and QQ plots


Examples
########

All the code below assumes the user already has a Dataproc cluster running as described in the `previous section `_

You can run pre-imputation QC using the :code:`preimp_qc` module (1) inside a Python script; or (2) via the command line

1. Python script - submitting a Python script to a cluster from a local machine (highly recommended)

    - First create a Python script on your local machine as below

    .. code-block:: python

        import gwaspy.preimp_qc as qc
        qc.preimp_qc.preimp_qc(dirname="gs://my-gcs/bucket/test_data/", basename="my_data_basename",
                               input_type="my_input_type")

    - Then run the following command to submit the script to the Dataproc cluster named `my-cluster-name`

    .. code-block:: sh

        hailctl dataproc submit my-cluster-name qc_script.py

2. Command line - requires the user to be SSH'ed into a cluster

    Users may encounter `this error `_ when trying to run things from the command line

    - This requires the user to be inside (`gcloud compute ssh`) the Dataproc cluster with GWASpy already installed

    .. code-block:: sh

        gcloud compute ssh "my-cluster-name-m"
        preimp_qc --dirname gs://my-gcs/bucket/test_data/ --basename my_data_basename --input-type my_input_type
--------------------------------------------------------------------------------
/docs/_build/html/_sources/qb.rst.txt:
--------------------------------------------------------------------------------
.. _sec-qb:

====================
Hail Query and Batch
====================

The four GWASpy modules use two different backends: :code:`preimp_qc` and :code:`pca` use Hail Query, while the
:code:`phasing` and :code:`imputation` modules use Batch (Hail Batch for Broad users and nextflow for non-Broad users).
Hail Query is well-suited for manipulating large genomics data in highly parallelised environments such as Dataproc.
`Batch `_, on the other hand, is good for batch processing (scheduling,
queueing, and executing) workloads on Google Cloud resources.

All the instructions below assume the user has a Google account and an active (Google) Cloud billing account

Query
#####

For running the :code:`preimp_qc` and :code:`pca` modules, you need to start a Dataproc cluster. Hail has a command-line
tool, `hailctl `_, for doing this, and it is installed automatically when
you install Hail. We highly recommend setting a maximum age for the cluster (:code:`--max-age`); this will ensure the cluster is
automatically deleted after the specified time.

Below is how you can start a cluster with GWASpy pre-installed:

.. code-block:: sh

    hailctl dataproc start my-cluster-name --region=us-central1 --packages gwaspy --max-age 4h

To shut down the cluster, you can run:

.. code-block:: sh

    hailctl dataproc stop my-cluster-name --region=us-central1

Batch
#####

The :code:`phasing` and :code:`imputation` modules use Batch as the backend. For Broad users with a Hail Batch account,
there is no setup needed; you can proceed to running the modules. For non-Broad users, we have a nextflow implementation
of the modules that requires a nextflow setup first. Follow the steps here to: `(1) install nextflow `_; and
`(2) setup Google Cloud Batch for nextflow `_
--------------------------------------------------------------------------------
/docs/_build/html/_static/_sphinx_javascript_frameworks_compat.js:
--------------------------------------------------------------------------------
1 | /* Compatability shim for jQuery and underscores.js. 2 | * 3 | * Copyright Sphinx contributors 4 | * Released under the two clause BSD licence 5 | */ 6 | 7 | /** 8 | * small helper function to urldecode strings 9 | * 10 | * See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURIComponent#Decoding_query_parameters_from_a_URL 11 | */ 12 | jQuery.urldecode = function(x) { 13 | if (!x) { 14 | return x 15 | } 16 | return decodeURIComponent(x.replace(/\+/g, ' ')); 17 | }; 18 | 19 | /** 20 | * small helper function to urlencode strings 21 | */ 22 | jQuery.urlencode = encodeURIComponent; 23 | 24 | /** 25 | * This function returns the parsed url parameters of the 26 | * current request. Multiple values per key are supported, 27 | * it will always return arrays of strings for the value parts. 28 | */ 29 | jQuery.getQueryParameters = function(s) { 30 | if (typeof s === 'undefined') 31 | s = document.location.search; 32 | var parts = s.substr(s.indexOf('?') + 1).split('&'); 33 | var result = {}; 34 | for (var i = 0; i < parts.length; i++) { 35 | var tmp = parts[i].split('=', 2); 36 | var key = jQuery.urldecode(tmp[0]); 37 | var value = jQuery.urldecode(tmp[1]); 38 | if (key in result) 39 | result[key].push(value); 40 | else 41 | result[key] = [value]; 42 | } 43 | return result; 44 | }; 45 | 46 | /** 47 | * highlight a given string on a jquery object by wrapping it in 48 | * span elements with the given class name.
49 | */ 50 | jQuery.fn.highlightText = function(text, className) { 51 | function highlight(node, addItems) { 52 | if (node.nodeType === 3) { 53 | var val = node.nodeValue; 54 | var pos = val.toLowerCase().indexOf(text); 55 | if (pos >= 0 && 56 | !jQuery(node.parentNode).hasClass(className) && 57 | !jQuery(node.parentNode).hasClass("nohighlight")) { 58 | var span; 59 | var isInSVG = jQuery(node).closest("body, svg, foreignObject").is("svg"); 60 | if (isInSVG) { 61 | span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); 62 | } else { 63 | span = document.createElement("span"); 64 | span.className = className; 65 | } 66 | span.appendChild(document.createTextNode(val.substr(pos, text.length))); 67 | node.parentNode.insertBefore(span, node.parentNode.insertBefore( 68 | document.createTextNode(val.substr(pos + text.length)), 69 | node.nextSibling)); 70 | node.nodeValue = val.substr(0, pos); 71 | if (isInSVG) { 72 | var rect = document.createElementNS("http://www.w3.org/2000/svg", "rect"); 73 | var bbox = node.parentElement.getBBox(); 74 | rect.x.baseVal.value = bbox.x; 75 | rect.y.baseVal.value = bbox.y; 76 | rect.width.baseVal.value = bbox.width; 77 | rect.height.baseVal.value = bbox.height; 78 | rect.setAttribute('class', className); 79 | addItems.push({ 80 | "parent": node.parentNode, 81 | "target": rect}); 82 | } 83 | } 84 | } 85 | else if (!jQuery(node).is("button, select, textarea")) { 86 | jQuery.each(node.childNodes, function() { 87 | highlight(this, addItems); 88 | }); 89 | } 90 | } 91 | var addItems = []; 92 | var result = this.each(function() { 93 | highlight(this, addItems); 94 | }); 95 | for (var i = 0; i < addItems.length; ++i) { 96 | jQuery(addItems[i].parent).before(addItems[i].target); 97 | } 98 | return result; 99 | }; 100 | 101 | /* 102 | * backward compatibility for jQuery.browser 103 | * This will be supported until firefox bug is fixed. 104 | */ 105 | if (!jQuery.browser) { 106 | jQuery.uaMatch = function(ua) { 107 | ua = ua.toLowerCase(); 108 | 109 | var match = /(chrome)[ \/]([\w.]+)/.exec(ua) || 110 | /(webkit)[ \/]([\w.]+)/.exec(ua) || 111 | /(opera)(?:.*version|)[ \/]([\w.]+)/.exec(ua) || 112 | /(msie) ([\w.]+)/.exec(ua) || 113 | ua.indexOf("compatible") < 0 && /(mozilla)(?:.*? 
rv:([\w.]+)|)/.exec(ua) || 114 | []; 115 | 116 | return { 117 | browser: match[ 1 ] || "", 118 | version: match[ 2 ] || "0" 119 | }; 120 | }; 121 | jQuery.browser = {}; 122 | jQuery.browser[jQuery.uaMatch(navigator.userAgent).browser] = true; 123 | } 124 | -------------------------------------------------------------------------------- /docs/_build/html/_static/css/badge_only.css: -------------------------------------------------------------------------------- 1 | .clearfix{*zoom:1}.clearfix:after,.clearfix:before{display:table;content:""}.clearfix:after{clear:both}@font-face{font-family:FontAwesome;font-style:normal;font-weight:400;src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713?#iefix) format("embedded-opentype"),url(fonts/fontawesome-webfont.woff2?af7ae505a9eed503f8b8e6982036873e) format("woff2"),url(fonts/fontawesome-webfont.woff?fee66e712a8a08eef5805a46892932ad) format("woff"),url(fonts/fontawesome-webfont.ttf?b06871f281fee6b241d60582ae9369b9) format("truetype"),url(fonts/fontawesome-webfont.svg?912ec66d7572ff821749319396470bde#FontAwesome) format("svg")}.fa:before{font-family:FontAwesome;font-style:normal;font-weight:400;line-height:1}.fa:before,a .fa{text-decoration:inherit}.fa:before,a .fa,li .fa{display:inline-block}li .fa-large:before{width:1.875em}ul.fas{list-style-type:none;margin-left:2em;text-indent:-.8em}ul.fas li .fa{width:.8em}ul.fas li .fa-large:before{vertical-align:baseline}.fa-book:before,.icon-book:before{content:"\f02d"}.fa-caret-down:before,.icon-caret-down:before{content:"\f0d7"}.fa-caret-up:before,.icon-caret-up:before{content:"\f0d8"}.fa-caret-left:before,.icon-caret-left:before{content:"\f0d9"}.fa-caret-right:before,.icon-caret-right:before{content:"\f0da"}.rst-versions{position:fixed;bottom:0;left:0;width:300px;color:#fcfcfc;background:#1f1d1d;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;z-index:400}.rst-versions a{color:#2980b9;text-decoration:none}.rst-versions .rst-badge-small{display:none}.rst-versions .rst-current-version{padding:12px;background-color:#272525;display:block;text-align:right;font-size:90%;cursor:pointer;color:#27ae60}.rst-versions .rst-current-version:after{clear:both;content:"";display:block}.rst-versions .rst-current-version .fa{color:#fcfcfc}.rst-versions .rst-current-version .fa-book,.rst-versions .rst-current-version .icon-book{float:left}.rst-versions .rst-current-version.rst-out-of-date{background-color:#e74c3c;color:#fff}.rst-versions .rst-current-version.rst-active-old-version{background-color:#f1c40f;color:#000}.rst-versions.shift-up{height:auto;max-height:100%;overflow-y:scroll}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:grey;display:none}.rst-versions .rst-other-versions hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:1px solid #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions .rst-other-versions .rtd-current-item{font-weight:700}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px;max-height:90%}.rst-versions.rst-badge .fa-book,.rst-versions.rst-badge .icon-book{float:none;line-height:30px}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version .fa-book,.rst-versions.rst-badge.shift-up .rst-current-version 
.icon-book{float:left}.rst-versions.rst-badge>.rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and (max-width:768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}}#flyout-search-form{padding:6px} -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/Roboto-Slab-Bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/css/fonts/Roboto-Slab-Bold.woff -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/Roboto-Slab-Bold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/css/fonts/Roboto-Slab-Bold.woff2 -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/Roboto-Slab-Regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/css/fonts/Roboto-Slab-Regular.woff -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/Roboto-Slab-Regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/css/fonts/Roboto-Slab-Regular.woff2 -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/fontawesome-webfont.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/css/fonts/fontawesome-webfont.eot -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/fontawesome-webfont.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/css/fonts/fontawesome-webfont.ttf -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/fontawesome-webfont.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/css/fonts/fontawesome-webfont.woff -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/fontawesome-webfont.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/css/fonts/fontawesome-webfont.woff2 -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/lato-bold-italic.woff: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/css/fonts/lato-bold-italic.woff -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/lato-bold-italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/css/fonts/lato-bold-italic.woff2 -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/lato-bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/css/fonts/lato-bold.woff -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/lato-bold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/css/fonts/lato-bold.woff2 -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/lato-normal-italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/css/fonts/lato-normal-italic.woff -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/lato-normal-italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/css/fonts/lato-normal-italic.woff2 -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/lato-normal.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/css/fonts/lato-normal.woff -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/lato-normal.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/css/fonts/lato-normal.woff2 -------------------------------------------------------------------------------- /docs/_build/html/_static/custom.css: -------------------------------------------------------------------------------- 1 | .wy-nav-content { 2 | max-width: none; 3 | } 4 | -------------------------------------------------------------------------------- /docs/_build/html/_static/doctools.js: -------------------------------------------------------------------------------- 1 | /* 2 | * Base JavaScript utilities for all Sphinx HTML documentation. 
3 | */ 4 | "use strict"; 5 | 6 | const BLACKLISTED_KEY_CONTROL_ELEMENTS = new Set([ 7 | "TEXTAREA", 8 | "INPUT", 9 | "SELECT", 10 | "BUTTON", 11 | ]); 12 | 13 | const _ready = (callback) => { 14 | if (document.readyState !== "loading") { 15 | callback(); 16 | } else { 17 | document.addEventListener("DOMContentLoaded", callback); 18 | } 19 | }; 20 | 21 | /** 22 | * Small JavaScript module for the documentation. 23 | */ 24 | const Documentation = { 25 | init: () => { 26 | Documentation.initDomainIndexTable(); 27 | Documentation.initOnKeyListeners(); 28 | }, 29 | 30 | /** 31 | * i18n support 32 | */ 33 | TRANSLATIONS: {}, 34 | PLURAL_EXPR: (n) => (n === 1 ? 0 : 1), 35 | LOCALE: "unknown", 36 | 37 | // gettext and ngettext don't access this so that the functions 38 | // can safely bound to a different name (_ = Documentation.gettext) 39 | gettext: (string) => { 40 | const translated = Documentation.TRANSLATIONS[string]; 41 | switch (typeof translated) { 42 | case "undefined": 43 | return string; // no translation 44 | case "string": 45 | return translated; // translation exists 46 | default: 47 | return translated[0]; // (singular, plural) translation tuple exists 48 | } 49 | }, 50 | 51 | ngettext: (singular, plural, n) => { 52 | const translated = Documentation.TRANSLATIONS[singular]; 53 | if (typeof translated !== "undefined") 54 | return translated[Documentation.PLURAL_EXPR(n)]; 55 | return n === 1 ? singular : plural; 56 | }, 57 | 58 | addTranslations: (catalog) => { 59 | Object.assign(Documentation.TRANSLATIONS, catalog.messages); 60 | Documentation.PLURAL_EXPR = new Function( 61 | "n", 62 | `return (${catalog.plural_expr})` 63 | ); 64 | Documentation.LOCALE = catalog.locale; 65 | }, 66 | 67 | /** 68 | * helper function to focus on search bar 69 | */ 70 | focusSearchBar: () => { 71 | document.querySelectorAll("input[name=q]")[0]?.focus(); 72 | }, 73 | 74 | /** 75 | * Initialise the domain index toggle buttons 76 | */ 77 | initDomainIndexTable: () => { 78 | const toggler = (el) => { 79 | const idNumber = el.id.substr(7); 80 | const toggledRows = document.querySelectorAll(`tr.cg-${idNumber}`); 81 | if (el.src.substr(-9) === "minus.png") { 82 | el.src = `${el.src.substr(0, el.src.length - 9)}plus.png`; 83 | toggledRows.forEach((el) => (el.style.display = "none")); 84 | } else { 85 | el.src = `${el.src.substr(0, el.src.length - 8)}minus.png`; 86 | toggledRows.forEach((el) => (el.style.display = "")); 87 | } 88 | }; 89 | 90 | const togglerElements = document.querySelectorAll("img.toggler"); 91 | togglerElements.forEach((el) => 92 | el.addEventListener("click", (event) => toggler(event.currentTarget)) 93 | ); 94 | togglerElements.forEach((el) => (el.style.display = "")); 95 | if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) togglerElements.forEach(toggler); 96 | }, 97 | 98 | initOnKeyListeners: () => { 99 | // only install a listener if it is really needed 100 | if ( 101 | !DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS && 102 | !DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS 103 | ) 104 | return; 105 | 106 | document.addEventListener("keydown", (event) => { 107 | // bail for input elements 108 | if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; 109 | // bail with special keys 110 | if (event.altKey || event.ctrlKey || event.metaKey) return; 111 | 112 | if (!event.shiftKey) { 113 | switch (event.key) { 114 | case "ArrowLeft": 115 | if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; 116 | 117 | const prevLink = document.querySelector('link[rel="prev"]'); 118 | if 
(prevLink && prevLink.href) { 119 | window.location.href = prevLink.href; 120 | event.preventDefault(); 121 | } 122 | break; 123 | case "ArrowRight": 124 | if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; 125 | 126 | const nextLink = document.querySelector('link[rel="next"]'); 127 | if (nextLink && nextLink.href) { 128 | window.location.href = nextLink.href; 129 | event.preventDefault(); 130 | } 131 | break; 132 | } 133 | } 134 | 135 | // some keyboard layouts may need Shift to get / 136 | switch (event.key) { 137 | case "/": 138 | if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) break; 139 | Documentation.focusSearchBar(); 140 | event.preventDefault(); 141 | } 142 | }); 143 | }, 144 | }; 145 | 146 | // quick alias for translations 147 | const _ = Documentation.gettext; 148 | 149 | _ready(Documentation.init); 150 | -------------------------------------------------------------------------------- /docs/_build/html/_static/documentation_options.js: -------------------------------------------------------------------------------- 1 | const DOCUMENTATION_OPTIONS = { 2 | VERSION: '0.1.0', 3 | LANGUAGE: 'en', 4 | COLLAPSE_INDEX: false, 5 | BUILDER: 'html', 6 | FILE_SUFFIX: '.html', 7 | LINK_SUFFIX: '.html', 8 | HAS_SOURCE: true, 9 | SOURCELINK_SUFFIX: '.txt', 10 | NAVIGATION_WITH_KEYS: false, 11 | SHOW_SEARCH_SUMMARY: true, 12 | ENABLE_SEARCH_SHORTCUTS: true, 13 | }; -------------------------------------------------------------------------------- /docs/_build/html/_static/file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/file.png -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-bold.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/fonts/Lato/lato-bold.eot -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/fonts/Lato/lato-bold.ttf -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/fonts/Lato/lato-bold.woff -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-bold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/fonts/Lato/lato-bold.woff2 -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-bolditalic.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/fonts/Lato/lato-bolditalic.eot 
-------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-bolditalic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/fonts/Lato/lato-bolditalic.ttf -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-bolditalic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/fonts/Lato/lato-bolditalic.woff -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-bolditalic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/fonts/Lato/lato-bolditalic.woff2 -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-italic.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/fonts/Lato/lato-italic.eot -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-italic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/fonts/Lato/lato-italic.ttf -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/fonts/Lato/lato-italic.woff -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/fonts/Lato/lato-italic.woff2 -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/fonts/Lato/lato-regular.eot -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/fonts/Lato/lato-regular.ttf -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-regular.woff: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/fonts/Lato/lato-regular.woff -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/fonts/Lato/lato-regular.woff2 -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-bold.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-bold.eot -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-bold.ttf -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-bold.woff -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-bold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-bold.woff2 -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-regular.eot -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-regular.ttf -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-regular.woff -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-regular.woff2: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-regular.woff2 -------------------------------------------------------------------------------- /docs/_build/html/_static/js/badge_only.js: -------------------------------------------------------------------------------- 1 | !function(e){var t={};function r(n){if(t[n])return t[n].exports;var o=t[n]={i:n,l:!1,exports:{}};return e[n].call(o.exports,o,o.exports,r),o.l=!0,o.exports}r.m=e,r.c=t,r.d=function(e,t,n){r.o(e,t)||Object.defineProperty(e,t,{enumerable:!0,get:n})},r.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},r.t=function(e,t){if(1&t&&(e=r(e)),8&t)return e;if(4&t&&"object"==typeof e&&e&&e.__esModule)return e;var n=Object.create(null);if(r.r(n),Object.defineProperty(n,"default",{enumerable:!0,value:e}),2&t&&"string"!=typeof e)for(var o in e)r.d(n,o,function(t){return e[t]}.bind(null,o));return n},r.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return r.d(t,"a",t),t},r.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r.p="",r(r.s=4)}({4:function(e,t,r){}}); -------------------------------------------------------------------------------- /docs/_build/html/_static/js/theme.js: -------------------------------------------------------------------------------- 1 | !function(n){var e={};function t(i){if(e[i])return e[i].exports;var o=e[i]={i:i,l:!1,exports:{}};return n[i].call(o.exports,o,o.exports,t),o.l=!0,o.exports}t.m=n,t.c=e,t.d=function(n,e,i){t.o(n,e)||Object.defineProperty(n,e,{enumerable:!0,get:i})},t.r=function(n){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(n,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(n,"__esModule",{value:!0})},t.t=function(n,e){if(1&e&&(n=t(n)),8&e)return n;if(4&e&&"object"==typeof n&&n&&n.__esModule)return n;var i=Object.create(null);if(t.r(i),Object.defineProperty(i,"default",{enumerable:!0,value:n}),2&e&&"string"!=typeof n)for(var o in n)t.d(i,o,function(e){return n[e]}.bind(null,o));return i},t.n=function(n){var e=n&&n.__esModule?function(){return n.default}:function(){return n};return t.d(e,"a",e),e},t.o=function(n,e){return Object.prototype.hasOwnProperty.call(n,e)},t.p="",t(t.s=0)}([function(n,e,t){t(1),n.exports=t(3)},function(n,e,t){(function(){var e="undefined"!=typeof window?window.jQuery:t(2);n.exports.ThemeNav={navBar:null,win:null,winScroll:!1,winResize:!1,linkScroll:!1,winPosition:0,winHeight:null,docHeight:null,isRunning:!1,enable:function(n){var t=this;void 0===n&&(n=!0),t.isRunning||(t.isRunning=!0,e((function(e){t.init(e),t.reset(),t.win.on("hashchange",t.reset),n&&t.win.on("scroll",(function(){t.linkScroll||t.winScroll||(t.winScroll=!0,requestAnimationFrame((function(){t.onScroll()})))})),t.win.on("resize",(function(){t.winResize||(t.winResize=!0,requestAnimationFrame((function(){t.onResize()})))})),t.onResize()})))},enableSticky:function(){this.enable(!0)},init:function(n){n(document);var e=this;this.navBar=n("div.wy-side-scroll:first"),this.win=n(window),n(document).on("click","[data-toggle='wy-nav-top']",(function(){n("[data-toggle='wy-nav-shift']").toggleClass("shift"),n("[data-toggle='rst-versions']").toggleClass("shift")})).on("click",".wy-menu-vertical .current ul li a",(function(){var 
t=n(this);n("[data-toggle='wy-nav-shift']").removeClass("shift"),n("[data-toggle='rst-versions']").toggleClass("shift"),e.toggleCurrent(t),e.hashChange()})).on("click","[data-toggle='rst-current-version']",(function(){n("[data-toggle='rst-versions']").toggleClass("shift-up")})),n("table.docutils:not(.field-list,.footnote,.citation)").wrap("
"),n("table.docutils.footnote").wrap("
"),n("table.docutils.citation").wrap("
"),n(".wy-menu-vertical ul").not(".simple").siblings("a").each((function(){var t=n(this);expand=n(''),expand.on("click",(function(n){return e.toggleCurrent(t),n.stopPropagation(),!1})),t.prepend(expand)}))},reset:function(){var n=encodeURI(window.location.hash)||"#";try{var e=$(".wy-menu-vertical"),t=e.find('[href="'+n+'"]');if(0===t.length){var i=$('.document [id="'+n.substring(1)+'"]').closest("div.section");0===(t=e.find('[href="#'+i.attr("id")+'"]')).length&&(t=e.find('[href="#"]'))}if(t.length>0){$(".wy-menu-vertical .current").removeClass("current").attr("aria-expanded","false"),t.addClass("current").attr("aria-expanded","true"),t.closest("li.toctree-l1").parent().addClass("current").attr("aria-expanded","true");for(let n=1;n<=10;n++)t.closest("li.toctree-l"+n).addClass("current").attr("aria-expanded","true");t[0].scrollIntoView()}}catch(n){console.log("Error expanding nav for anchor",n)}},onScroll:function(){this.winScroll=!1;var n=this.win.scrollTop(),e=n+this.winHeight,t=this.navBar.scrollTop()+(n-this.winPosition);n<0||e>this.docHeight||(this.navBar.scrollTop(t),this.winPosition=n)},onResize:function(){this.winResize=!1,this.winHeight=this.win.height(),this.docHeight=$(document).height()},hashChange:function(){this.linkScroll=!0,this.win.one("hashchange",(function(){this.linkScroll=!1}))},toggleCurrent:function(n){var e=n.closest("li");e.siblings("li.current").removeClass("current").attr("aria-expanded","false"),e.siblings().find("li.current").removeClass("current").attr("aria-expanded","false");var t=e.find("> ul li");t.length&&(t.removeClass("current").attr("aria-expanded","false"),e.toggleClass("current").attr("aria-expanded",(function(n,e){return"true"==e?"false":"true"})))}},"undefined"!=typeof window&&(window.SphinxRtdTheme={Navigation:n.exports.ThemeNav,StickyNav:n.exports.ThemeNav}),function(){for(var n=0,e=["ms","moz","webkit","o"],t=0;t0 56 | var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1 57 | var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1 58 | var s_v = "^(" + C + ")?" 
+ v; // vowel in stem 59 | 60 | this.stemWord = function (w) { 61 | var stem; 62 | var suffix; 63 | var firstch; 64 | var origword = w; 65 | 66 | if (w.length < 3) 67 | return w; 68 | 69 | var re; 70 | var re2; 71 | var re3; 72 | var re4; 73 | 74 | firstch = w.substr(0,1); 75 | if (firstch == "y") 76 | w = firstch.toUpperCase() + w.substr(1); 77 | 78 | // Step 1a 79 | re = /^(.+?)(ss|i)es$/; 80 | re2 = /^(.+?)([^s])s$/; 81 | 82 | if (re.test(w)) 83 | w = w.replace(re,"$1$2"); 84 | else if (re2.test(w)) 85 | w = w.replace(re2,"$1$2"); 86 | 87 | // Step 1b 88 | re = /^(.+?)eed$/; 89 | re2 = /^(.+?)(ed|ing)$/; 90 | if (re.test(w)) { 91 | var fp = re.exec(w); 92 | re = new RegExp(mgr0); 93 | if (re.test(fp[1])) { 94 | re = /.$/; 95 | w = w.replace(re,""); 96 | } 97 | } 98 | else if (re2.test(w)) { 99 | var fp = re2.exec(w); 100 | stem = fp[1]; 101 | re2 = new RegExp(s_v); 102 | if (re2.test(stem)) { 103 | w = stem; 104 | re2 = /(at|bl|iz)$/; 105 | re3 = new RegExp("([^aeiouylsz])\\1$"); 106 | re4 = new RegExp("^" + C + v + "[^aeiouwxy]$"); 107 | if (re2.test(w)) 108 | w = w + "e"; 109 | else if (re3.test(w)) { 110 | re = /.$/; 111 | w = w.replace(re,""); 112 | } 113 | else if (re4.test(w)) 114 | w = w + "e"; 115 | } 116 | } 117 | 118 | // Step 1c 119 | re = /^(.+?)y$/; 120 | if (re.test(w)) { 121 | var fp = re.exec(w); 122 | stem = fp[1]; 123 | re = new RegExp(s_v); 124 | if (re.test(stem)) 125 | w = stem + "i"; 126 | } 127 | 128 | // Step 2 129 | re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/; 130 | if (re.test(w)) { 131 | var fp = re.exec(w); 132 | stem = fp[1]; 133 | suffix = fp[2]; 134 | re = new RegExp(mgr0); 135 | if (re.test(stem)) 136 | w = stem + step2list[suffix]; 137 | } 138 | 139 | // Step 3 140 | re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/; 141 | if (re.test(w)) { 142 | var fp = re.exec(w); 143 | stem = fp[1]; 144 | suffix = fp[2]; 145 | re = new RegExp(mgr0); 146 | if (re.test(stem)) 147 | w = stem + step3list[suffix]; 148 | } 149 | 150 | // Step 4 151 | re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/; 152 | re2 = /^(.+?)(s|t)(ion)$/; 153 | if (re.test(w)) { 154 | var fp = re.exec(w); 155 | stem = fp[1]; 156 | re = new RegExp(mgr1); 157 | if (re.test(stem)) 158 | w = stem; 159 | } 160 | else if (re2.test(w)) { 161 | var fp = re2.exec(w); 162 | stem = fp[1] + fp[2]; 163 | re2 = new RegExp(mgr1); 164 | if (re2.test(stem)) 165 | w = stem; 166 | } 167 | 168 | // Step 5 169 | re = /^(.+?)e$/; 170 | if (re.test(w)) { 171 | var fp = re.exec(w); 172 | stem = fp[1]; 173 | re = new RegExp(mgr1); 174 | re2 = new RegExp(meq1); 175 | re3 = new RegExp("^" + C + v + "[^aeiouwxy]$"); 176 | if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) 177 | w = stem; 178 | } 179 | re = /ll$/; 180 | re2 = new RegExp(mgr1); 181 | if (re.test(w) && re2.test(w)) { 182 | re = /.$/; 183 | w = w.replace(re,""); 184 | } 185 | 186 | // and turn initial Y back to y 187 | if (firstch == "y") 188 | w = firstch.toLowerCase() + w.substr(1); 189 | return w; 190 | } 191 | } 192 | 193 | -------------------------------------------------------------------------------- /docs/_build/html/_static/minus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/minus.png 
-------------------------------------------------------------------------------- /docs/_build/html/_static/plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/_static/plus.png -------------------------------------------------------------------------------- /docs/_build/html/_static/pygments.css: -------------------------------------------------------------------------------- 1 | pre { line-height: 125%; } 2 | td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } 3 | span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } 4 | td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } 5 | span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } 6 | .highlight .hll { background-color: #ffffcc } 7 | .highlight { background: #f8f8f8; } 8 | .highlight .c { color: #3D7B7B; font-style: italic } /* Comment */ 9 | .highlight .err { border: 1px solid #FF0000 } /* Error */ 10 | .highlight .k { color: #008000; font-weight: bold } /* Keyword */ 11 | .highlight .o { color: #666666 } /* Operator */ 12 | .highlight .ch { color: #3D7B7B; font-style: italic } /* Comment.Hashbang */ 13 | .highlight .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */ 14 | .highlight .cp { color: #9C6500 } /* Comment.Preproc */ 15 | .highlight .cpf { color: #3D7B7B; font-style: italic } /* Comment.PreprocFile */ 16 | .highlight .c1 { color: #3D7B7B; font-style: italic } /* Comment.Single */ 17 | .highlight .cs { color: #3D7B7B; font-style: italic } /* Comment.Special */ 18 | .highlight .gd { color: #A00000 } /* Generic.Deleted */ 19 | .highlight .ge { font-style: italic } /* Generic.Emph */ 20 | .highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */ 21 | .highlight .gr { color: #E40000 } /* Generic.Error */ 22 | .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ 23 | .highlight .gi { color: #008400 } /* Generic.Inserted */ 24 | .highlight .go { color: #717171 } /* Generic.Output */ 25 | .highlight .gp { color: #000080; font-weight: bold } /* Generic.Prompt */ 26 | .highlight .gs { font-weight: bold } /* Generic.Strong */ 27 | .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ 28 | .highlight .gt { color: #0044DD } /* Generic.Traceback */ 29 | .highlight .kc { color: #008000; font-weight: bold } /* Keyword.Constant */ 30 | .highlight .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */ 31 | .highlight .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */ 32 | .highlight .kp { color: #008000 } /* Keyword.Pseudo */ 33 | .highlight .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */ 34 | .highlight .kt { color: #B00040 } /* Keyword.Type */ 35 | .highlight .m { color: #666666 } /* Literal.Number */ 36 | .highlight .s { color: #BA2121 } /* Literal.String */ 37 | .highlight .na { color: #687822 } /* Name.Attribute */ 38 | .highlight .nb { color: #008000 } /* Name.Builtin */ 39 | .highlight .nc { color: #0000FF; font-weight: bold } /* Name.Class */ 40 | .highlight .no { color: #880000 } /* Name.Constant */ 41 | .highlight .nd { color: #AA22FF } /* Name.Decorator */ 42 | .highlight .ni { color: #717171; font-weight: bold } /* Name.Entity */ 43 | .highlight .ne { color: 
#CB3F38; font-weight: bold } /* Name.Exception */ 44 | .highlight .nf { color: #0000FF } /* Name.Function */ 45 | .highlight .nl { color: #767600 } /* Name.Label */ 46 | .highlight .nn { color: #0000FF; font-weight: bold } /* Name.Namespace */ 47 | .highlight .nt { color: #008000; font-weight: bold } /* Name.Tag */ 48 | .highlight .nv { color: #19177C } /* Name.Variable */ 49 | .highlight .ow { color: #AA22FF; font-weight: bold } /* Operator.Word */ 50 | .highlight .w { color: #bbbbbb } /* Text.Whitespace */ 51 | .highlight .mb { color: #666666 } /* Literal.Number.Bin */ 52 | .highlight .mf { color: #666666 } /* Literal.Number.Float */ 53 | .highlight .mh { color: #666666 } /* Literal.Number.Hex */ 54 | .highlight .mi { color: #666666 } /* Literal.Number.Integer */ 55 | .highlight .mo { color: #666666 } /* Literal.Number.Oct */ 56 | .highlight .sa { color: #BA2121 } /* Literal.String.Affix */ 57 | .highlight .sb { color: #BA2121 } /* Literal.String.Backtick */ 58 | .highlight .sc { color: #BA2121 } /* Literal.String.Char */ 59 | .highlight .dl { color: #BA2121 } /* Literal.String.Delimiter */ 60 | .highlight .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */ 61 | .highlight .s2 { color: #BA2121 } /* Literal.String.Double */ 62 | .highlight .se { color: #AA5D1F; font-weight: bold } /* Literal.String.Escape */ 63 | .highlight .sh { color: #BA2121 } /* Literal.String.Heredoc */ 64 | .highlight .si { color: #A45A77; font-weight: bold } /* Literal.String.Interpol */ 65 | .highlight .sx { color: #008000 } /* Literal.String.Other */ 66 | .highlight .sr { color: #A45A77 } /* Literal.String.Regex */ 67 | .highlight .s1 { color: #BA2121 } /* Literal.String.Single */ 68 | .highlight .ss { color: #19177C } /* Literal.String.Symbol */ 69 | .highlight .bp { color: #008000 } /* Name.Builtin.Pseudo */ 70 | .highlight .fm { color: #0000FF } /* Name.Function.Magic */ 71 | .highlight .vc { color: #19177C } /* Name.Variable.Class */ 72 | .highlight .vg { color: #19177C } /* Name.Variable.Global */ 73 | .highlight .vi { color: #19177C } /* Name.Variable.Instance */ 74 | .highlight .vm { color: #19177C } /* Name.Variable.Magic */ 75 | .highlight .il { color: #666666 } /* Literal.Number.Integer.Long */ -------------------------------------------------------------------------------- /docs/_build/html/_static/sphinx_highlight.js: -------------------------------------------------------------------------------- 1 | /* Highlighting utilities for Sphinx HTML documentation. */ 2 | "use strict"; 3 | 4 | const SPHINX_HIGHLIGHT_ENABLED = true 5 | 6 | /** 7 | * highlight a given string on a node by wrapping it in 8 | * span elements with the given class name. 
9 | */ 10 | const _highlight = (node, addItems, text, className) => { 11 | if (node.nodeType === Node.TEXT_NODE) { 12 | const val = node.nodeValue; 13 | const parent = node.parentNode; 14 | const pos = val.toLowerCase().indexOf(text); 15 | if ( 16 | pos >= 0 && 17 | !parent.classList.contains(className) && 18 | !parent.classList.contains("nohighlight") 19 | ) { 20 | let span; 21 | 22 | const closestNode = parent.closest("body, svg, foreignObject"); 23 | const isInSVG = closestNode && closestNode.matches("svg"); 24 | if (isInSVG) { 25 | span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); 26 | } else { 27 | span = document.createElement("span"); 28 | span.classList.add(className); 29 | } 30 | 31 | span.appendChild(document.createTextNode(val.substr(pos, text.length))); 32 | const rest = document.createTextNode(val.substr(pos + text.length)); 33 | parent.insertBefore( 34 | span, 35 | parent.insertBefore( 36 | rest, 37 | node.nextSibling 38 | ) 39 | ); 40 | node.nodeValue = val.substr(0, pos); 41 | /* There may be more occurrences of search term in this node. So call this 42 | * function recursively on the remaining fragment. 43 | */ 44 | _highlight(rest, addItems, text, className); 45 | 46 | if (isInSVG) { 47 | const rect = document.createElementNS( 48 | "http://www.w3.org/2000/svg", 49 | "rect" 50 | ); 51 | const bbox = parent.getBBox(); 52 | rect.x.baseVal.value = bbox.x; 53 | rect.y.baseVal.value = bbox.y; 54 | rect.width.baseVal.value = bbox.width; 55 | rect.height.baseVal.value = bbox.height; 56 | rect.setAttribute("class", className); 57 | addItems.push({ parent: parent, target: rect }); 58 | } 59 | } 60 | } else if (node.matches && !node.matches("button, select, textarea")) { 61 | node.childNodes.forEach((el) => _highlight(el, addItems, text, className)); 62 | } 63 | }; 64 | const _highlightText = (thisNode, text, className) => { 65 | let addItems = []; 66 | _highlight(thisNode, addItems, text, className); 67 | addItems.forEach((obj) => 68 | obj.parent.insertAdjacentElement("beforebegin", obj.target) 69 | ); 70 | }; 71 | 72 | /** 73 | * Small JavaScript module for the documentation. 74 | */ 75 | const SphinxHighlight = { 76 | 77 | /** 78 | * highlight the search words provided in localstorage in the text 79 | */ 80 | highlightSearchWords: () => { 81 | if (!SPHINX_HIGHLIGHT_ENABLED) return; // bail if no highlight 82 | 83 | // get and clear terms from localstorage 84 | const url = new URL(window.location); 85 | const highlight = 86 | localStorage.getItem("sphinx_highlight_terms") 87 | || url.searchParams.get("highlight") 88 | || ""; 89 | localStorage.removeItem("sphinx_highlight_terms") 90 | url.searchParams.delete("highlight"); 91 | window.history.replaceState({}, "", url); 92 | 93 | // get individual terms from highlight string 94 | const terms = highlight.toLowerCase().split(/\s+/).filter(x => x); 95 | if (terms.length === 0) return; // nothing to do 96 | 97 | // There should never be more than one element matching "div.body" 98 | const divBody = document.querySelectorAll("div.body"); 99 | const body = divBody.length ? 
divBody[0] : document.querySelector("body"); 100 | window.setTimeout(() => { 101 | terms.forEach((term) => _highlightText(body, term, "highlighted")); 102 | }, 10); 103 | 104 | const searchBox = document.getElementById("searchbox"); 105 | if (searchBox === null) return; 106 | searchBox.appendChild( 107 | document 108 | .createRange() 109 | .createContextualFragment( 110 | '" 114 | ) 115 | ); 116 | }, 117 | 118 | /** 119 | * helper function to hide the search marks again 120 | */ 121 | hideSearchWords: () => { 122 | document 123 | .querySelectorAll("#searchbox .highlight-link") 124 | .forEach((el) => el.remove()); 125 | document 126 | .querySelectorAll("span.highlighted") 127 | .forEach((el) => el.classList.remove("highlighted")); 128 | localStorage.removeItem("sphinx_highlight_terms") 129 | }, 130 | 131 | initEscapeListener: () => { 132 | // only install a listener if it is really needed 133 | if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) return; 134 | 135 | document.addEventListener("keydown", (event) => { 136 | // bail for input elements 137 | if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; 138 | // bail with special keys 139 | if (event.shiftKey || event.altKey || event.ctrlKey || event.metaKey) return; 140 | if (DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS && (event.key === "Escape")) { 141 | SphinxHighlight.hideSearchWords(); 142 | event.preventDefault(); 143 | } 144 | }); 145 | }, 146 | }; 147 | 148 | _ready(() => { 149 | /* Do not call highlightSearchWords() when we are on the search page. 150 | * It will highlight words from the *previous* search query. 151 | */ 152 | if (typeof Search === "undefined") SphinxHighlight.highlightSearchWords(); 153 | SphinxHighlight.initEscapeListener(); 154 | }); 155 | -------------------------------------------------------------------------------- /docs/_build/html/genindex.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Index — GWASpy 0.1.0 documentation 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 |
[Sphinx-generated HTML index page: navigation and theme markup omitted. Footer: © Copyright 2024, Martin Lab, Broad Institute. Built with Sphinx using a theme provided by Read the Docs.]
-------------------------------------------------------------------------------- /docs/_build/html/install/linux.html: --------------------------------------------------------------------------------
[Sphinx-generated HTML page, markup omitted. Page title: "Install GWASpy on GNU/Linux — GWASpy 0.1.0 documentation". Recoverable body text:

Install GWASpy on GNU/Linux

- Install Java 8.
- Install Python 3.6+.
- Install a recent version of the C and C++ standard libraries. GCC 5.0, LLVM version 3.4, or any later versions suffice.
- Install BLAS and LAPACK.
- Install TeX Live.
- Install GWASpy using pip.

On a recent Debian-like system, the following should suffice:

    apt-get install -y \
        openjdk-8-jre-headless \
        g++ \
        python3.6 python3-pip \
        libopenblas-base liblapack3 \
        texlive-pictures texlive-science texlive-latex-extra latexmk
    python3.6 -m pip install gwaspy

Footer: © Copyright 2024, Martin Lab, Broad Institute. Built with Sphinx using a theme provided by Read the Docs.]
-------------------------------------------------------------------------------- /docs/_build/html/install/macosx.html: --------------------------------------------------------------------------------
[Sphinx-generated HTML page, markup omitted. Page title: "Install GWASpy on Mac OS X — GWASpy 0.1.0 documentation". Recoverable body text:

Install GWASpy on Mac OS X

- Install Java 8.
- Install Python 3.6+.
- Install MacTeX.
- Open Terminal.app and execute pip3 install gwaspy.

Footer: © Copyright 2024, Martin Lab, Broad Institute. Built with Sphinx using a theme provided by Read the Docs.]
-------------------------------------------------------------------------------- /docs/_build/html/installation.html: --------------------------------------------------------------------------------
[Sphinx-generated HTML page, markup omitted. Page title: "Installing GWASpy — GWASpy 0.1.0 documentation". Body text mirrors docs/installation.rst below. Footer: © Copyright 2024, Martin Lab, Broad Institute. Built with Sphinx using a theme provided by Read the Docs.]
-------------------------------------------------------------------------------- /docs/_build/html/objects.inv: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/_build/html/objects.inv
-------------------------------------------------------------------------------- /docs/_build/html/search.html: --------------------------------------------------------------------------------
[Sphinx-generated HTML search page: navigation, search form, and theme markup omitted. Page title: "Search — GWASpy 0.1.0 documentation". Footer: © Copyright 2024, Martin Lab, Broad Institute. Built with Sphinx using a theme provided by Read the Docs.]
-------------------------------------------------------------------------------- /docs/_static/custom.css: --------------------------------------------------------------------------------
.wy-nav-content {
    max-width: none;
}
-------------------------------------------------------------------------------- /docs/conf.py: --------------------------------------------------------------------------------
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))


# -- Project information -----------------------------------------------------

project = 'GWASpy'
copyright = '2024, Martin Lab, Broad Institute'
author = 'Lindokuhle Nkambule'

# The full version, including alpha/beta/rc tags
release = '0.1.0'


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

html_css_files = [
    'custom.css',
]
-------------------------------------------------------------------------------- /docs/images/qc_workflow.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/docs/images/qc_workflow.png
-------------------------------------------------------------------------------- /docs/imputation.rst: --------------------------------------------------------------------------------
.. _sec-imputation:

===================
Genotype Imputation
===================

Genotype imputation is the process of estimating missing genotypes using a haplotype or genotype reference panel.
It allows you to accurately evaluate the evidence for association at genetic markers that are not directly genotyped.
GWASpy has a module, :code:`imputation`, for running imputation using IMPUTE5. Because imputation can be a computationally
intensive task, we run it on multiple chunks in parallel, then merge the imputed chunks together at the end. Below are
examples of how to run imputation using either the HGDP+1kGP reference panel or your own reference panel.

Examples
########

**1. HGDP+1kGP reference panel**

.. code-block:: sh

    imputation --input-file gs://path/to/file.vcf.bgz --vcf-ref hgdp1kgp --output-filename my_outfilename --out-dir gs://path/to/output/dir --n-samples 1989 --n-ref-samples 4091 --billing-project my-billing-project

**2. Own reference panel**

.. code-block:: sh

    imputation --input-file gs://path/to/file.vcf.bgz --vcf-ref gs://path/to/ref_panel/ALL.chrCNUMBER.vcf --output-filename my_outfilename --out-dir gs://path/to/output/dir --n-samples 1989 --n-ref-samples 4091 --billing-project my-billing-project

.. warning::
    When using your own reference panel, make sure you use the CNUMBER placeholder in the filename passed to :code:`--vcf-ref`

Arguments and options
#####################

.. list-table::
   :widths: 15 50
   :header-rows: 1

   * - Argument
     - Description
   * - :code:`--input-file`
     - Path to the input VCF, or to a TSV listing the target VCF/BAM files
   * - :code:`--vcf-ref`
     - Reference panel file to use for imputation
   * - :code:`--chromosomes`
     - Chromosome(s) to run imputation for. Default is :code:`all`
   * - :code:`--local`
     - Run jobs locally. The default is the Service backend, where jobs are executed on a multi-tenant compute cluster in Google Cloud
   * - :code:`--billing-project`
     - Billing project to be used for the jobs
   * - :code:`--n-samples`
     - Number of target samples to be imputed. We use this to estimate resources for some of the jobs
   * - :code:`--n-ref-samples`
     - Number of reference samples. We use this to estimate resources for some of the jobs
   * - :code:`--software`
     - Software to use for imputation. Options: [:code:`beagle5`, :code:`impute5`]. Default is :code:`impute5`
   * - :code:`--output-filename`
     - Output filename without the file extension
   * - :code:`--out-dir`
     - Path to where output files will be saved

Output
######
The resulting output is a VCF file per chromosome with imputed genotypes.
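To sanity-check a per-chromosome output file, you can load it back into Hail. The sketch below is a minimal example; the
output path is a hypothetical placeholder, not a filename produced by GWASpy itself:

.. code-block:: python

    import hail as hl

    hl.init()

    # hypothetical path to one imputed per-chromosome VCF written by the imputation module
    mt = hl.import_vcf('gs://path/to/output/dir/my_outfilename_chr22.vcf.bgz',
                       force_bgz=True, reference_genome='GRCh38')

    # basic sanity checks: print the schema, then variant and sample counts
    mt.describe()
    print(mt.count())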
-------------------------------------------------------------------------------- /docs/index.rst: --------------------------------------------------------------------------------
==========
GWASpy 0.1
==========

GWASpy is an open-source Python package for scalable genetic analyses: (1) pre-imputation QC; (2) principal component
analysis; (3) haplotype phasing; and (4) genotype imputation. See the `installation page `_ to get started
using GWASpy.

========
Contents
========

.. toctree::
    :maxdepth: 2

    Installation
    Hail Query and Batch
    Pre-Imputation QC
    Principal Component Analysis
    Haplotype Phasing
    Genotype Imputation
    Tutorial
-------------------------------------------------------------------------------- /docs/installation.rst: --------------------------------------------------------------------------------
.. _sec-installation:

=================
Installing GWASpy
=================

GWASpy leverages Hail to enable efficient processing of data directly from Google Cloud. As such, the first step is to
install Hail as per the instructions `here `_. After you have installed Hail, GWASpy can be installed using

.. code-block:: sh

    pip install gwaspy

It is important to note that the command above installs GWASpy locally (or wherever you ran the command). For the
:code:`phasing` and :code:`imputation` modules, which use Hail Batch, this is enough. For the :code:`preimp_qc` and
:code:`pca` modules, which use Hail Query, you have to ensure that the Dataproc cluster has GWASpy installed; there are
examples showing how to do this in the :ref:`preimp_qc` and :ref:`pca` sections.
-------------------------------------------------------------------------------- /docs/make.bat: --------------------------------------------------------------------------------
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
-------------------------------------------------------------------------------- /docs/pca.rst: --------------------------------------------------------------------------------
.. _sec-pca:
.. _pca:

============================
Principal Component Analysis
============================

Principal component analysis (PCA) can be used to detect and quantify the genetic structure of populations.
In GWASpy, the :code:`pca` module can be run in 3 different ways: (1) normal PCA without a reference panel; (2) joint PCA with a reference panel; or (3) projection PCA onto a reference panel.

.. toctree::
    :maxdepth: 1

    Normal PCA
    Joint PCA
    Projection PCA
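At their core, all three modes rely on Hail's PCA machinery. The sketch below shows roughly what that computation looks
like in plain Hail; the input path is hypothetical, and GWASpy's actual implementation additionally applies the SNP
filtering controlled by the options described below:

.. code-block:: python

    import hail as hl

    # hypothetical input matrix table
    mt = hl.read_matrix_table('gs://my-gcs/bucket/test_data/my_data.mt')

    # PCA on HWE-normalized genotype calls; k controls the number of PCs
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(mt.GT, k=20, compute_loadings=True)

    # per-sample PC scores as a Hail Table
    scores.show(5)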
Arguments and options
#####################

.. list-table::
   :widths: 15 50
   :header-rows: 1

   * - Argument
     - Description
   * - :code:`--ref-dirname`
     - Path to where the reference data is
   * - :code:`--ref-basename`
     - Reference basename
   * - :code:`--ref-info`
     - Path to reference information. Tab-delimited file with sample IDs and their SuperPop labels
   * - :code:`--reference`
     - Genome reference build. Default is GRCh38. Options: [:code:`GRCh37`, :code:`GRCh38`]
   * - :code:`--pca-type`
     - Type of PCA to run. Default is normal. Options: [:code:`normal`, :code:`project`, :code:`joint`]
   * - :code:`--data-dirname`
     - Path to where the data is
   * - :code:`--data-basename`
     - Data basename
   * - :code:`--input-type`
     - Data input type. Options: [:code:`hail`, :code:`plink`, :code:`vcf`]
   * - :code:`--maf`
     - Include only SNPs with MAF >= NUM in PCA. Default is 0.05
   * - :code:`--hwe`
     - Include only SNPs with HWE >= NUM in PCA. Default is 1e-03
   * - :code:`--geno`
     - Include only SNPs with call-rate > NUM. Default is 0.98
   * - :code:`--ld-cor`
     - Squared correlation threshold (exclusive upper bound). Must be in the range [0.0, 1.0]. Default is 0.2
   * - :code:`--ld-window`
     - Window size in base pairs (inclusive upper bound). Default is 250000
   * - :code:`--npcs`
     - Number of PCs to use. Default is 20
   * - :code:`--relatedness-method`
     - Method to use for the inference of relatedness. Default is pc_relate. Options: [:code:`pc_relate`, :code:`ibd`, :code:`king`]
   * - :code:`--relatedness-thresh`
     - Threshold value to use in relatedness checks. Default is 0.98
   * - :code:`--prob`
     - Minimum probability of belonging to a given population for the population to be set. Default is 0.8
   * - :code:`--out-dir`
     - Path to where output files will be saved

Output
######
A tab-delimited file with the first 20 principal components (PCs) and
graphical visualizations of the PCs are generated.
-------------------------------------------------------------------------------- /docs/pca/joint.rst: --------------------------------------------------------------------------------
================================
Joint PCA (with a reference)
================================

The joint PCA method works by first merging (joining) the input dataset with the reference dataset, matching on locus and allele(s).
This is followed by "normal" PCA on the merged dataset.

Below is code showing how you can run joint PCA from the command line or inside a Python script.

#. Python (inside a Python script)

    .. code-block:: python

        import gwaspy.pca as pca
        pca.pca.pca(data_dirname="gs://my-gcs/bucket/test_data/", data_basename="my_data_basename",
                    out_dir="gs://my-gcs/bucket/test_data/", input_type="my_input_type", reference="GRCh37",
                    pca_type="joint")

#. Command line

    .. code-block:: sh

        pca --data-dirname gs://my-gcs/bucket/test_data/ --data-basename my_data_basename --out-dir gs://my-gcs/bucket/test_data/ --input-type my_input_type --reference grch37 --pca-type joint
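The merge step itself can be pictured in plain Hail: keep only the loci/alleles present in both datasets, then join the
two sample sets column-wise. A rough sketch with hypothetical table paths, not GWASpy's exact code:

.. code-block:: python

    import hail as hl

    data = hl.read_matrix_table('gs://my-gcs/bucket/test_data/data.mt')  # hypothetical input data
    ref = hl.read_matrix_table('gs://my-gcs/bucket/ref/ref.mt')          # hypothetical reference data

    # keep the intersection of variants, matching on the (locus, alleles) row key,
    # and reduce both datasets to a shared schema
    data = data.semi_join_rows(ref.rows()).select_entries('GT').select_rows()
    ref = ref.semi_join_rows(data.rows()).select_entries('GT').select_rows()

    # stack the two sample sets over the shared variants, then run PCA as usual
    merged = data.union_cols(ref)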
-------------------------------------------------------------------------------- /docs/pca/normal.rst: --------------------------------------------------------------------------------
================================
Normal PCA (without a reference)
================================

GWASpy allows you to run normal PCA without any reference panel.

Below is code showing how you can run normal PCA without a reference from the command line or inside a Python script.

#. Python (inside a Python script)

    .. code-block:: python

        import gwaspy.pca as pca
        pca.pca.pca(data_dirname="gs://my-gcs/bucket/test_data/", data_basename="my_data_basename",
                    out_dir="gs://my-gcs/bucket/test_data/", input_type="my_input_type", reference="GRCh37",
                    pca_type="normal")

#. Command line

    .. code-block:: sh

        pca --data-dirname gs://my-gcs/bucket/test_data/ --data-basename my_data_basename --out-dir gs://my-gcs/bucket/test_data/ --input-type my_input_type --reference grch37 --pca-type normal
-------------------------------------------------------------------------------- /docs/pca/project.rst: --------------------------------------------------------------------------------
================================
Project PCA (with a reference)
================================

You can leverage reference panel information to see how samples in your data cluster on a "global" scale.
PCs are computed using the 1KG+HGDP dataset as a reference panel, and samples in the input dataset are then projected onto the 1KG+HGDP PC space.
A random forest classifier model, adopted from gnomAD, is then used to assign population ancestries in the input dataset.

Below is code showing how you can run projection PCA from the command line or inside a Python script.

#. Python (inside a Python script)

    .. code-block:: python

        import gwaspy.pca as pca
        pca.pca.pca(data_dirname="gs://my-gcs/bucket/test_data/", data_basename="my_data_basename",
                    out_dir="gs://my-gcs/bucket/test_data/", input_type="my_input_type", reference="GRCh37",
                    pca_type="project")

#. Command line

    .. code-block:: sh

        pca --data-dirname gs://my-gcs/bucket/test_data/ --data-basename my_data_basename --out-dir gs://my-gcs/bucket/test_data/ --input-type my_input_type --reference grch37 --pca-type project
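Conceptually, the projection uses per-variant loadings and allele frequencies from the reference PCA to place new samples
in the existing PC space. Hail ships an experimental helper for this; the sketch below uses hypothetical inputs (including
the assumed :code:`pca_af` field name) and is not GWASpy's exact code:

.. code-block:: python

    import hail as hl

    mt = hl.read_matrix_table('gs://my-gcs/bucket/test_data/data.mt')    # hypothetical target data
    loadings = hl.read_table('gs://my-gcs/bucket/ref/ref_loadings.ht')   # hypothetical reference loadings

    # project target genotypes onto the reference PCs using the reference
    # loadings and reference allele frequencies (assumed stored as 'pca_af')
    scores = hl.experimental.pc_project(mt.GT, loadings.loadings, loadings.pca_af)
    scores.show(5)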
Own reference panel** 38 | 39 | .. note:: 40 | 1. If you're using your own reference panel, make sure the files are bgzip compressed. 41 | 2. The chromosome X reference file must be named X, not 23. 42 | 43 | Say you have your reference panel files for each chromosome stored in gs://ref_panel/ALL.chr{1..22,X}.vcf, 44 | you would pass the path to :code:`--vcf-ref` as gs://ref_panel/ALL.chr\ **CNUMBER**\ .vcf. 45 | GWASpy uses **CNUMBER** as a placeholder for the chromosomes. Then you can run phasing as: 46 | 47 | .. code-block:: sh 48 | 49 | phasing --input-vcf gs://path/to/file.vcf.bgz --output-filename outfilename.phased --out-dir gs://path/to/output/dir --genome-build GRCh38 --billing-project my-billing-project --vcf-ref gs://ref_panel/ALL.chrCNUMBER.vcf 50 | 51 | .. note:: 52 | For nextflow users, the idea is the same. The only difference is you have to update the params.json file. Examples 53 | are provided in the tutorial section of the documentation. 54 | 55 | Arguments and options 56 | ##################### 57 | 58 | .. list-table:: 59 | :widths: 15 50 60 | :header-rows: 1 61 | 62 | * - Argument 63 | - Description 64 | * - :code:`--input-vcf` 65 | - Path to the VCF file to be phased 66 | * - :code:`--vcf-ref` 67 | - VCF file for reference haplotypes if phasing with a reference panel 68 | * - :code:`--pedigree` 69 | - Pedigree (PLINK FAM) file 70 | * - :code:`--local` 71 | - Run the job(s) locally instead of on the Service backend. By default, jobs are executed on a multi-tenant compute cluster in Google Cloud 72 | * - :code:`--billing-project` 73 | - Billing project to be used for the job(s) 74 | * - :code:`--genome-build` 75 | - Genome reference build. Default is GRCh38. Options: [:code:`GRCh37`, :code:`GRCh38`] 76 | * - :code:`--data-type` 77 | - Array or WGS data. Default is array. Options: [:code:`array`, :code:`wgs`]. 78 | * - :code:`--fill-tags` 79 | - Whether or not to add the AC tag required by SHAPEIT5. Including :code:`--fill-tags` in your command will enable this step 80 | * - :code:`--software` 81 | - Software to use for phasing. Options: [:code:`beagle`, :code:`shapeit`]. Default is :code:`shapeit` 82 | * - :code:`--output-filename` 83 | - Output filename without file extension 84 | * - :code:`--out-dir` 85 | - Path to where output files will be saved 86 | 87 | Output 88 | ###### 89 | The resulting output is a VCF file per chromosome with phased haplotypes. 90 | -------------------------------------------------------------------------------- /docs/preimp_qc.rst: -------------------------------------------------------------------------------- 1 | .. _sec-pre_imputation_qc: 2 | .. _preimp_qc: 3 | 4 | =================================== 5 | Pre-Imputation Quality Control (QC) 6 | =================================== 7 | 8 | Detecting and correcting issues such as genotyping errors, sample handling errors, population stratification, etc., 9 | is important in GWAS. The :code:`preimp_qc` module addresses these issues and cleans (QC) your data. Below is a flow diagram 10 | of the filters applied when QC'ing input data: 11 | 12 | .. image:: images/qc_workflow.png 13 | :width: 1000px 14 | :height: 1900px 15 | :scale: 50 % 16 | :align: center 17 | 18 | 19 | Arguments and options 20 | ##################### 21 | 22 | .. list-table:: 23 | :widths: 15 50 24 | :header-rows: 1 25 | 26 | * - Argument 27 | - Description 28 | * - :code:`--dirname` 29 | - Path to where the data is 30 | * - :code:`--basename` 31 | - Data basename 32 | * - :code:`--input-type` 33 | - Input type.
Options: [:code:`hail`, :code:`plink`, :code:`vcf`] 34 | * - :code:`--export-type` 35 | - Export type. Options: [:code:`hail`, :code:`plink`, :code:`vcf`] 36 | * - :code:`--out-dir` 37 | - Directory path to where output files are going to be saved 38 | * - :code:`--annotations` 39 | - Annotations file to be used for annotating samples with information such as Sex and Phenotype 40 | * - :code:`--reference` 41 | - Reference genome build. Default is GRCh38. Options: [:code:`GRCh37`, :code:`GRCh38`] 42 | * - :code:`--report` 43 | - Generate a QC PDF report or not. Default is True 44 | * - :code:`--liftover` 45 | - Liftover input data to GRCh38 or not; default is False. Running :code:`preimp_qc` with :code:`--liftover` will activate liftover 46 | * - :code:`--pre-geno` 47 | - include only SNPs with missing-rate < NUM (before ID filter), important for post merge of multiple platforms 48 | * - :code:`--mind` 49 | - include only IDs with missing-rate < NUM 50 | * - :code:`--fhet-aut` 51 | - include only IDs within NUM < FHET < NUM 52 | * - :code:`--fstat-y` 53 | - include only female IDs with fhet < NUM 54 | * - :code:`--fstat-x` 55 | - include only male IDs with fhet > NUM 56 | * - :code:`--geno` 57 | - include only SNPs with missing-rate < NUM 58 | * - :code:`--midi` 59 | - include only SNPs with missing-rate-difference (case/control) < NUM 60 | * - :code:`--withpna` 61 | - include monomorphic (invariant) SNPs 62 | * - :code:`--maf` 63 | - include only SNPs with MAF >= NUM 64 | * - :code:`--hwe-th-con` 65 | - HWE_controls < NUM 66 | * - :code:`--hwe-th-cas` 67 | - HWE_cases < NUM 68 | 69 | Output(s) 70 | ########## 71 | * QC'ed file(s), i.e. file(s) with all the variants and/or samples that fail QC filters removed 72 | * A detailed PDF QC report including pre- and post-QC variant/sample counts, figures such as Manhattan and QQ plots, etc. 73 | 74 | 75 | Examples 76 | ######## 77 | 78 | All the code below assumes the user already has a Dataproc cluster running as described in the `previous section `_ 79 | 80 | You can run pre-imputation QC using the :code:`preimp_qc` module (1) inside a Python script; or (2) via the command line 81 | 82 | 1. Python script - submitting a Python script to a cluster from a local machine (Highly recommended) 83 | 84 | - First create a Python script (e.g. :code:`qc_script.py`) on your local machine as below 85 | 86 | .. code-block:: python 87 | 88 | import gwaspy.preimp_qc as qc 89 | qc.preimp_qc.preimp_qc(dirname="gs://my-gcs/bucket/test_data/", basename="my_data_basename", 90 | input_type="my_input_type") 91 | 92 | - Then run the following command to submit the script to the Dataproc cluster named `my-cluster-name` 93 | 94 | .. code-block:: sh 95 | 96 | hailctl dataproc submit my-cluster-name qc_script.py 97 | 98 | 2. Command line - requires the user to be SSH'ed into a cluster 99 | 100 | Users may encounter `this error `_ when trying to run things from the command line 101 | 102 | - This requires the user to be inside (`gcloud compute ssh`) the Dataproc cluster with GWASpy already installed 103 | 104 | .. code-block:: sh 105 | 106 | gcloud compute ssh "my-cluster-name-m" 107 | preimp_qc --dirname gs://my-gcs/bucket/test_data/ --basename my_data_basename --input-type my_input_type 108 | -------------------------------------------------------------------------------- /docs/qb.rst: -------------------------------------------------------------------------------- 1 | ..
_sec-qb: 2 | 3 | ==================== 4 | Hail Query and Batch 5 | ==================== 6 | 7 | The four GWASpy modules use two different backends: :code:`preimp_qc` and :code:`pca` use Hail Query, while 8 | the :code:`phasing` and :code:`imputation` modules use Batch (Hail Batch for Broad users and nextflow for non-Broad users). 9 | Hail Query is well-suited for manipulating large genomics data in highly parallelised environments such as Dataproc. 10 | `Batch `_, on the other hand, is good for batch processing (scheduling, 11 | queueing, and executing) workloads on Google Cloud resources. 12 | 13 | All the instructions below assume the user has a Google account and an active (Google) Cloud billing account. 14 | 15 | Query 16 | ##### 17 | 18 | For running the :code:`preimp_qc` and :code:`pca` modules, you need to start a Dataproc cluster. Hail has a command-line 19 | tool, `hailctl `_, for doing this, and it is installed automatically when 20 | you install Hail. We highly recommend setting a maximum age for the cluster (:code:`--max-age`); this ensures the cluster is 21 | automatically deleted after the specified time. 22 | 23 | Below is how you can start a cluster with GWASpy pre-installed: 24 | 25 | .. code-block:: sh 26 | 27 | hailctl dataproc start my-cluster-name --region=us-central1 --packages gwaspy --max-age 4h 28 | 29 | To shut down the cluster, you can run: 30 | 31 | .. code-block:: sh 32 | 33 | hailctl dataproc stop my-cluster-name --region=us-central1 34 | 35 | Batch 36 | ##### 37 | 38 | The :code:`phasing` and :code:`imputation` modules use Batch as the backend. For Broad users with a Hail Batch account, 39 | no setup is needed; you can proceed to running the modules. For non-Broad users, we have a nextflow implementation 40 | of the modules that requires nextflow setup first. Follow the steps here to: `(1) install nextflow `_; and 41 | `(2) setup Google Cloud Batch for nextflow `_ 42 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx-rtd-theme 2 | -------------------------------------------------------------------------------- /env-setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | PLATFORM="${OSTYPE}" 6 | 7 | # install pylatex dependencies 8 | case "$PLATFORM" in 9 | darwin*) 10 | install-pylatex-dependencies() { 11 | brew install --cask mactex 12 | eval "$(/usr/libexec/path_helper)" 13 | } 14 | ;; 15 | linux*) 16 | install-pylatex-dependencies() { 17 | yes Y | apt-get install texlive-pictures texlive-science texlive-latex-extra latexmk 18 | } 19 | ;; 20 | *) 21 | echo "unsupported platform $PLATFORM."
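exit 1  # added safeguard: abort here rather than falling through to call a function that was never defined on this platform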
22 | ;; 23 | esac 24 | 25 | install-pylatex-dependencies 26 | -------------------------------------------------------------------------------- /gwaspy/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/gwaspy/.DS_Store -------------------------------------------------------------------------------- /gwaspy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atgu/GWASpy/f8b9f5eccbb67fa5ab462007634db3862198aa6e/gwaspy/__init__.py -------------------------------------------------------------------------------- /gwaspy/check_alleles/__init__.py: -------------------------------------------------------------------------------- 1 | from gwaspy.check_alleles import flips 2 | __all__ = ['flips'] 3 | -------------------------------------------------------------------------------- /gwaspy/check_alleles/check_alleles.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Lindo Nkambule' 2 | 3 | import hailtop.batch as hb 4 | import hailtop.fs as hfs 5 | 6 | from hailtop.batch.job import Job 7 | 8 | 9 | def size(file: str): 10 | """ 11 | Convert the size from bytes to GiB 12 | :param file: path to file, str 13 | :return: file size in GiB 14 | """ 15 | file_info = hfs.stat(file)  # returns a named tuple 16 | size_gigs = file_info.size / (1024 * 1024 * 1024) 17 | 18 | return size_gigs 19 | 20 | 21 | def check_alleles_workflow( 22 | batch: hb.Batch = None, 23 | input_path: str = None, 24 | reference_path: str = None, 25 | output_filename: str = None, 26 | step: str = "check", 27 | fix_mode: str = "top", 28 | output_path: str = None): 29 | 30 | def get_stats( 31 | b: hb.batch.Batch, 32 | job_name: str = None, 33 | vcf: hb.ResourceGroup = None, 34 | ref_fasta: hb.ResourceGroup = None, 35 | output_name: str = None, 36 | out_dir: str = None, 37 | ncpu: int = 8, 38 | memory: str = 'standard', 39 | storage: int = None, 40 | img: str = 'docker.io/lindonkambule/gwaspy_phase_impute:latest', 41 | ) -> Job: 42 | j = b.new_job(name=f'Check alleles: {job_name}') 43 | 44 | j.image(img) 45 | j.memory(memory) 46 | j.cpu(ncpu) 47 | j.storage(f'{storage}Gi') 48 | 49 | j.command( 50 | f""" 51 | bcftools +fixref {vcf['vcf']} -- -f {ref_fasta['ref_fasta']} > stats.txt 52 | mv stats.txt {j.ofile} 53 | """ 54 | ) 55 | 56 | b.write_output(j.ofile, 57 | f'{out_dir}/check_alleles/{output_name}.stats.txt') 58 | 59 | return j 60 | 61 | def fix_alleles( 62 | b: hb.batch.Batch, 63 | job_name: str = None, 64 | vcf: hb.ResourceGroup = None, 65 | ref_fasta: hb.ResourceGroup = None, 66 | allele_mode: str = "top", 67 | output_name: str = None, 68 | out_dir: str = None, 69 | ncpu: int = 8, 70 | memory: str = 'standard', 71 | storage: int = None, 72 | img: str = 'docker.io/lindonkambule/gwaspy_phase_impute:latest', 73 | ) -> Job: 74 | j = b.new_job(name=f'Fix alleles: {job_name}') 75 | 76 | j.image(img) 77 | j.memory(memory) 78 | j.cpu(ncpu) 79 | j.storage(f'{storage}Gi') 80 | 81 | j.declare_resource_group( 82 | fixed_file={ 83 | 'bcf': '{root}.bcf', 84 | 'bcf.csi': '{root}.bcf.csi' 85 | } 86 | ) 87 | 88 | j.command( 89 | f""" 90 | bcftools +fixref {vcf['vcf']} -Ob -o {j.fixed_file['bcf']} -- -f {ref_fasta['ref_fasta']} -m {allele_mode} 91 | bcftools index --force {j.fixed_file['bcf']} --output {j.fixed_file['bcf.csi']} --threads {ncpu} 92 | """ 93 | ) 94 | 95 | b.write_output(j.fixed_file, 
f'{out_dir}/check_alleles/{output_name}.alleles.fixed') 97 | 98 | return j 99 | 100 | ref_fasta_in = batch.read_input_group(**{'ref_fasta': reference_path, 101 | 'ref_fasta_index': f'{reference_path}.fai'}) 102 | ref_size = round(size(reference_path)) 103 | 104 | if "CNUMBER" in input_path: # input VCF is already split by chromosome 105 | for i in range(1, 23): 106 | vcf_path = input_path.replace('CNUMBER', str(i)) 107 | input_idx = f'{vcf_path}.tbi' if hfs.exists(f'{vcf_path}.tbi') else f'{vcf_path}.csi' 108 | 109 | if not hfs.exists(input_idx): 110 | raise SystemExit('Input file needs to be indexed (.tbi or .csi). Found none, exiting') 111 | 112 | chrom_vcf = batch.read_input_group(**{'vcf': vcf_path, 113 | 'index': input_idx}) 114 | vcf_size = round(size(vcf_path)) 115 | disk_size = int(round(5.0 + vcf_size + ref_size)) 116 | 117 | if step == "check": 118 | get_stats( 119 | b=batch, 120 | job_name=vcf_path, 121 | vcf=chrom_vcf, 122 | ref_fasta=ref_fasta_in, 123 | output_name=f'{output_filename}_chr{i}', 124 | out_dir=output_path, 125 | storage=disk_size 126 | ) 127 | else: 128 | fix_alleles( 129 | b=batch, 130 | job_name=vcf_path, 131 | vcf=chrom_vcf, 132 | ref_fasta=ref_fasta_in, 133 | allele_mode=fix_mode, 134 | output_name=f'{output_filename}_chr{i}', 135 | out_dir=output_path, 136 | storage=disk_size 137 | ) 138 | 139 | else: # one input file with all the chromosomes 140 | vcf_path = input_path 141 | input_idx = f'{vcf_path}.tbi' if hfs.exists(f'{vcf_path}.tbi') else f'{vcf_path}.csi' 142 | 143 | if not hfs.exists(input_idx): 144 | raise SystemExit('Input file needs to be indexed (.tbi or .csi). Found none, exiting') 145 | 146 | chrom_vcf = batch.read_input_group(**{'vcf': input_path, 147 | 'index': input_idx}) 148 | 149 | vcf_size = round(size(vcf_path)) 150 | disk_size = int(round(5.0 + vcf_size + ref_size)) 151 | 152 | if step == "check": 153 | get_stats( 154 | b=batch, 155 | job_name=vcf_path, 156 | vcf=chrom_vcf, 157 | ref_fasta=ref_fasta_in, 158 | output_name=output_filename, 159 | out_dir=output_path, 160 | storage=disk_size 161 | ) 162 | else: 163 | fix_alleles( 164 | b=batch, 165 | job_name=vcf_path, 166 | vcf=chrom_vcf, 167 | ref_fasta=ref_fasta_in, 168 | allele_mode=fix_mode, 169 | output_name=output_filename, 170 | out_dir=output_path, 171 | storage=disk_size 172 | ) 173 | 174 | batch.run() 175 | -------------------------------------------------------------------------------- /gwaspy/check_alleles/flips.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Lindo Nkambule' 2 | 3 | import argparse 4 | import hailtop.batch as hb 5 | from gwaspy.check_alleles.check_alleles import check_alleles_workflow 6 | from typing import Union 7 | 8 | 9 | def run_checks_fix( 10 | backend: Union[hb.ServiceBackend, hb.LocalBackend] = None, 11 | input_vcf: str = None, 12 | ref_path: str = None, 13 | step: str = "check", 14 | fix_mode: str = "top", 15 | output_filename: str = None, 16 | out_dir: str = None 17 | ): 18 | b = hb.Batch(backend=backend, 19 | name=f'GWASpy-{step.upper()}-Alleles') 20 | 21 | check_alleles_workflow( 22 | batch=b, 23 | input_path=input_vcf, 24 | reference_path=ref_path, 25 | output_filename=output_filename, 26 | step=step, 27 | fix_mode=fix_mode, 28 | output_path=out_dir 29 | ) 30 | 31 | 32 | def main(): 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument('--input-vcf', type=str, required=True) 35 | parser.add_argument('--ref-fasta', type=str, 
default='gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta') 36 | parser.add_argument('--local', action='store_true') 37 | parser.add_argument('--billing-project', required=True) 38 | parser.add_argument('--step', type=str, default='check', choices=['check', 'fix']) 39 | parser.add_argument('--mode', type=str, default='top', choices=['flip', 'flip-all', 'id', 'ref-alt', 'stats', 'swap', 'top']) 40 | parser.add_argument('--output-filename', type=str, required=True) 41 | parser.add_argument('--out-dir', type=str, required=True) 42 | 43 | args = parser.parse_args() 44 | 45 | if args.local: 46 | backend = hb.LocalBackend() 47 | else: 48 | backend = hb.ServiceBackend(billing_project=args.billing_project, 49 | remote_tmpdir=f'{args.out_dir}/tmp/') 50 | 51 | run_checks_fix( 52 | backend=backend, 53 | input_vcf=args.input_vcf, 54 | ref_path=args.ref_fasta, 55 | step=args.step, 56 | fix_mode=args.mode, 57 | output_filename=args.output_filename, 58 | out_dir=args.out_dir) 59 | 60 | backend.close() 61 | -------------------------------------------------------------------------------- /gwaspy/imputation/__init__.py: -------------------------------------------------------------------------------- 1 | from gwaspy.imputation import impute 2 | __all__ = ['impute'] -------------------------------------------------------------------------------- /gwaspy/imputation/concat_vcfs.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Lindo Nkambule' 2 | 3 | import hailtop.batch as hb 4 | import hail as hl 5 | import pandas as pd 6 | from gwaspy.utils.get_file_size import bytes_to_gb 7 | from gwaspy.phasing.get_filebase import get_vcf_filebase 8 | from typing import List 9 | from typing import Union 10 | 11 | 12 | def concat_vcfs(b: hb.batch.Batch, 13 | vcf_basename: str = None, 14 | vcfs_to_merge: List = None, 15 | output_type: str = 'vcf', 16 | chrom: str = None, 17 | cpu: int = 16, 18 | memory: str = 'standard', 19 | docker_img: str = 'docker.io/lindonkambule/gwaspy:v1', 20 | out_dir: str = None): 21 | 22 | global index_cmd 23 | 24 | out_type = 'b' if output_type == 'bcf' else 'z' 25 | vcfs_sizes_sum = 0 26 | merge_vcf_i = '' 27 | 28 | out_filename = f'{vcf_basename}.{chrom}.merged.bcf' if output_type == 'bcf' else \ 29 | f'{vcf_basename}.{chrom}.merged.vcf.gz' 30 | out_index_name = f'{vcf_basename}.{chrom}.merged.bcf.csi' if output_type == 'bcf' else \ 31 | f'{vcf_basename}.{chrom}.merged.vcf.gz.csi' 32 | 33 | for line in vcfs_to_merge: 34 | vcfs_sizes_sum += 2 + bytes_to_gb(line) 35 | 36 | disk_size = int(round(10 + (2 * vcfs_sizes_sum))) 37 | threads = cpu - 1 38 | 39 | concat = b.new_job(name=f'concat-{vcf_basename}') 40 | concat.memory(memory) 41 | concat.storage(f'{disk_size}Gi') 42 | concat.image(docker_img) 43 | concat.cpu(cpu) 44 | 45 | for line in vcfs_to_merge: 46 | input_vcf = b.read_input_group(vcf=line, 47 | ind=f'{line}.csi') 48 | merge_vcf_i += f'{input_vcf.vcf} ' 49 | 50 | cmd = f''' 51 | bcftools concat \ 52 | --no-version \ 53 | --output-type {out_type} \ 54 | --output {out_filename} \ 55 | --threads {threads} \ 56 | {merge_vcf_i} 57 | ''' 58 | 59 | concat.command(cmd) 60 | # index the merged output 61 | concat.command(f'bcftools index --force {out_filename}') 62 | 63 | concat.command(f'mv {out_filename} {concat.ofile}') 64 | concat.command(f'mv {out_index_name} {concat.idx}') 65 | b.write_output(concat.ofile, f'{out_dir}/GWASpy/{vcf_basename}/Imputation/imputed_merged/{out_filename}') 66 | b.write_output(concat.idx, 
f'{out_dir}/GWASpy/{vcf_basename}/Imputation/imputed_merged/{out_index_name}') 67 | 68 | 69 | def run_concat(backend: Union[hb.ServiceBackend, hb.LocalBackend] = None, 70 | input_vcf: str = None, 71 | output_type: str = 'vcf', 72 | exclude_chrx: bool = False, 73 | cpu: int = 16, 74 | memory: str = 'standard', 75 | out_dir: str = None): 76 | 77 | print(f'\n2. CONCAT {input_vcf}\n') 78 | vcf_filebase = get_vcf_filebase(input_vcf) 79 | concat_b = hb.Batch(backend=backend, name=f'concat-imputed-chunks-{vcf_filebase}') 80 | 81 | # get the regions so we can map each file to its specific region 82 | regions = pd.read_csv(f'{out_dir}/GWASpy/{vcf_filebase}/Imputation/imputation.regions', sep='\t', names=['reg', 'ind']) 83 | regions_dict = pd.Series(regions.reg.values, index=regions.ind).to_dict() 84 | 85 | imputed_vcfs_chunks = hl.utils.hadoop_ls(f'{out_dir}/GWASpy/{vcf_filebase}/Imputation/imputed_chunks/*.bcf') 86 | 87 | max_chrom = 23 if exclude_chrx else 24 # 1-22 if autosomes only, else 1-23 88 | 89 | for i in range(1, max_chrom): 90 | if i == 23: 91 | chrom = 'chrX' 92 | else: 93 | chrom = f'chr{i}' 94 | 95 | chrom_phased_files_to_concat = [] 96 | 97 | for file in imputed_vcfs_chunks: 98 | f = file['path'] 99 | vcf_basename = get_vcf_filebase(f) 100 | file_index = int(vcf_basename.split('.')[-4]) 101 | file_region = regions_dict[file_index] 102 | map_chrom = file_region.split(':')[0] 103 | if map_chrom == chrom: 104 | chrom_phased_files_to_concat.append(f) 105 | 106 | # naturally sort the list of files to merge (e.g. 'chunk2' sorts before 'chunk10') 107 | from gwaspy.utils.natural_sort import natural_keys 108 | chrom_phased_files_to_concat.sort(key=natural_keys) 109 | 110 | concat_vcfs(b=concat_b, vcfs_to_merge=chrom_phased_files_to_concat, vcf_basename=vcf_filebase, 111 | output_type=output_type, chrom=chrom, cpu=cpu, memory=memory, out_dir=out_dir) 112 | 113 | concat_b.run() 114 | -------------------------------------------------------------------------------- /gwaspy/imputation/imputation.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Lindo Nkambule' 2 | 3 | import hailtop.batch as hb 4 | import argparse 5 | 6 | 7 | # ONCE YOU ADD FUNCTIONALITY FOR USING DIFFERENT REF PANELS, CHANGE n_panel* parameters and include cmd args 8 | def genotype_imputation(input_vcf: str = None, 9 | females_file: str = None, 10 | n_samples: int = None, 11 | n_panel_samples: int = 4099, 12 | buffer_region: int = 250, 13 | phasing_software: str = None, 14 | local: bool = False, 15 | exclude_chrx: bool = False, 16 | billing_project: str = None, 17 | memory: str = 'highmem', 18 | cpu: int = 16, 19 | stages: str = 'impute,concat', 20 | output_type: str = 'bcf', 21 | out_dir: str = None): 22 | # Error handling 23 | if not out_dir: 24 | raise SystemExit('Output directory not specified. Specify using --out-dir if running from the command line or' 25 | ' the out_dir argument if running inside a Python script') 26 | 27 | steps_list = stages.split(',') 28 | steps = [x.lower() for x in steps_list] 29 | unknown_steps = [i for i in steps if i not in ['impute', 'concat']] 30 | 31 | if len(unknown_steps) > 0: 32 | raise SystemExit(f'Incorrect process(es) {unknown_steps} selected. Options are [impute, concat]') 33 | 34 | if output_type.lower() not in ['bcf', 'vcf']: 35 | raise SystemExit(f'Incorrect output type {output_type} selected. Options are [bcf, vcf]') 36 | 37 | if memory.lower() not in ['lowmem', 'standard', 'highmem']: 38 | raise SystemExit(f'Incorrect memory type {memory} selected. 
Options are [lowmem, standard, highmem]') 39 | 40 | if not n_samples: 41 | raise SystemExit('Number of samples in input data not detected. Specify how many samples (integer) are in' 42 | ' the input data using --n-samples if running from the command line or' 43 | ' the n_samples argument if running inside a Python script') 44 | 45 | if local: 46 | backend = hb.LocalBackend() 47 | else: 48 | backend = hb.ServiceBackend(billing_project=billing_project, 49 | remote_tmpdir=f'{out_dir}/tmp/') 50 | 51 | # impute genotypes 52 | if 'impute' in steps: 53 | from gwaspy.imputation.sex_aut_imp import run_impute 54 | run_impute(backend=backend, input_vcf=input_vcf, females_file=females_file, n_samples=n_samples, 55 | n_panel_samples=n_panel_samples, phasing_software=phasing_software, exclude_chrx=exclude_chrx, 56 | memory=memory, buffer_region=buffer_region, out_dir=out_dir) 57 | 58 | # Concatenate imputed chunks 59 | if 'concat' in steps: 60 | from gwaspy.imputation.concat_vcfs import run_concat 61 | run_concat(backend=backend, input_vcf=input_vcf, output_type=output_type, exclude_chrx=exclude_chrx, cpu=cpu, 62 | memory=memory, out_dir=out_dir) 63 | 64 | 65 | def main(): 66 | parser = argparse.ArgumentParser() 67 | parser.add_argument('--input-vcf', type=str, required=True) 68 | parser.add_argument('--samples-file', type=str, required=True) 69 | parser.add_argument('--local', action='store_true') 70 | parser.add_argument('--exclude-chrx', action='store_true') 71 | parser.add_argument('--billing-project', required=True) 72 | parser.add_argument('--phasing-software', type=str, default='shapeit', choices=['eagle', 'shapeit']) 73 | parser.add_argument('--memory', type=str, default='highmem', choices=['lowmem', 'standard', 'highmem']) 74 | parser.add_argument('--cpu-concat', type=int, default=16) 75 | parser.add_argument('--n-samples', type=int, required=True) 76 | parser.add_argument('--buffer-region', type=int, default=250) 77 | parser.add_argument('--stages', type=str, default='impute,concat') 78 | parser.add_argument('--out-type', type=str, default='bcf', choices=['bcf', 'vcf']) 79 | parser.add_argument('--out-dir', required=True) 80 | 81 | args = parser.parse_args() 82 | 83 | genotype_imputation(input_vcf=args.input_vcf, females_file=args.samples_file, n_samples=args.n_samples, 84 | buffer_region=args.buffer_region, phasing_software=args.phasing_software, local=args.local, 85 | exclude_chrx=args.exclude_chrx, billing_project=args.billing_project, memory=args.memory, 86 | cpu=args.cpu_concat, stages=args.stages, output_type=args.out_type, out_dir=args.out_dir) 87 | 88 | 89 | if __name__ == '__main__': 90 | main() 91 | -------------------------------------------------------------------------------- /gwaspy/imputation/impute.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Lindo Nkambule' 2 | 3 | import argparse 4 | import hailtop.batch as hb 5 | from gwaspy.imputation.impute5_impute import impute5_imputation 6 | from gwaspy.imputation.glimpse2_impute import glimpse_phase_impute 7 | from typing import Union 8 | 9 | 10 | def run_impute(backend: Union[hb.ServiceBackend, hb.LocalBackend] = None, 11 | input_file: str = None, 12 | vcf_ref: str = None, 13 | chromosomes: str = "all", 14 | software: str = 'impute5', 15 | output_filename: str = None, 16 | n_samples: int = None, 17 | n_panel_samples: int = 4091, 18 | out_dir: str = None 19 | ): 20 | 21 | if software.lower() not in ['beagle5', 'glimpse2', 'impute5']: 22 | raise SystemExit(f'Incorrect software 
{software} selected. Options are [beagle5, glimpse2, impute5]') 23 | 24 | b = hb.Batch(backend=backend, 25 | name=f'GWASpy-Imputation-{software.upper()}') 26 | 27 | if vcf_ref == 'hgdp1kgp': 28 | print('\nIMPUTING GENOTYPES WITH HGDP+1KGP PANEL\n') 29 | ref_path = 'gs://gcp-public-data--gnomad/resources/hgdp_1kg/phased_haplotypes_v2/hgdp1kgp_chrCNUMBER.filtered.SNV_INDEL.phased.shapeit5.bcf' 30 | else: 31 | print('\nIMPUTING GENOTYPES WITH USER-DEFINED REFERENCE PANEL\n') 32 | ref_path = vcf_ref 33 | 34 | if software == 'impute5': 35 | print('\nIMPUTING GENOTYPES USING IMPUTE5\n') 36 | impute5_imputation( 37 | batch=b, 38 | input_path=input_file, 39 | reference_path=ref_path, 40 | chromosomes=chromosomes, 41 | output_filename=output_filename, 42 | n_samples=n_samples, 43 | n_panel_samples=n_panel_samples, 44 | output_path=out_dir 45 | ) 46 | elif software == 'glimpse2': 47 | glimpse_phase_impute( 48 | batch=b, 49 | bam_files=input_file, 50 | reference_path=ref_path, 51 | chromosomes=chromosomes, 52 | output_filename=output_filename, 53 | output_path=out_dir 54 | ) 55 | # else: TODO: add BEAGLE 56 | 57 | 58 | def main(): 59 | parser = argparse.ArgumentParser() 60 | parser.add_argument('--input-file', type=str, required=True) 61 | parser.add_argument('--vcf-ref', type=str, default='hgdp1kgp') 62 | parser.add_argument('--chromosomes', type=str, default='all') 63 | parser.add_argument('--local', action='store_true') 64 | parser.add_argument('--billing-project', required=True) 65 | parser.add_argument('--n-samples', type=int, required=True) 66 | parser.add_argument('--n-ref-samples', type=int, default=4091) 67 | parser.add_argument('--software', type=str, default='impute5', choices=['beagle5', 'glimpse2', 'impute5']) 68 | parser.add_argument('--output-filename', type=str, required=True) 69 | parser.add_argument('--out-dir', type=str, required=True) 70 | 71 | args = parser.parse_args() 72 | 73 | if args.local: 74 | backend = hb.LocalBackend() 75 | else: 76 | backend = hb.ServiceBackend(billing_project=args.billing_project, 77 | remote_tmpdir=f'{args.out_dir}/tmp/') 78 | 79 | run_impute(backend=backend, 80 | input_file=args.input_file, 81 | vcf_ref=args.vcf_ref, 82 | chromosomes=args.chromosomes, 83 | software=args.software, 84 | output_filename=args.output_filename, 85 | n_samples=args.n_samples, 86 | n_panel_samples=args.n_ref_samples, 87 | out_dir=args.out_dir) 88 | 89 | backend.close() 90 | -------------------------------------------------------------------------------- /gwaspy/imputation/impute_vcf.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Lindo Nkambule' 2 | 3 | import hailtop.batch as hb 4 | import hail as hl 5 | import pandas as pd 6 | from typing import Union 7 | from gwaspy.phasing.get_filebase import get_vcf_filebase 8 | from gwaspy.utils.get_file_size import bytes_to_gb 9 | 10 | 11 | def imputation(b: hb.batch.Batch, 12 | vcf: str = None, 13 | vcf_filename_no_ext: str = None, 14 | ref: hb.ResourceGroup = None, 15 | ref_size: Union[int, float] = None, 16 | region: str = None, 17 | chromosome: str = None, 18 | cpu: int = 8, 19 | memory: str = 'highmem', 20 | img: str = 'docker.io/lindonkambule/gwaspy:v1', 21 | threads: int = 7, 22 | out_dir: str = None): 23 | 24 | # in_vcf = b.read_input(vcf) 25 | in_vcf = b.read_input_group(**{'bcf': vcf, 26 | 'bcf.csi': f'{vcf}.csi'}) 27 | vcf_size = bytes_to_gb(vcf) 28 | 29 | output_file_name = vcf_filename_no_ext + '.imputed.bcf' 30 | file_dir = vcf_filename_no_ext.split('.')[0] 31 |
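# disk sizing heuristic: reserve room for the reference panel plus ~4x the
# input chunk size (input, imputed output, and index files)
32 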
| disk_size = ref_size + (vcf_size * 4) 33 | 34 | map_file = f'/shapeit4/maps/b38/{chromosome}.b38.gmap.gz' 35 | 36 | impute = b.new_job(name=output_file_name) 37 | impute.cpu(cpu) 38 | impute.memory(memory) 39 | impute.storage(f'{disk_size}Gi') 40 | impute.image(img) 41 | 42 | cmd = f''' 43 | impute5_1.1.5_static \ 44 | --h {ref.bcf} \ 45 | --m {map_file} \ 46 | --g {in_vcf.bcf} \ 47 | --r {region} \ 48 | --out-gp-field \ 49 | --o {output_file_name} \ 50 | --threads {threads} 51 | ''' 52 | 53 | impute.command(cmd) 54 | # index file to use when merging 55 | impute.command(f'bcftools index {output_file_name}') 56 | 57 | impute.command(f'mv {output_file_name} {impute.ofile}') 58 | impute.command(f'mv {output_file_name}.csi {impute.ind}') 59 | b.write_output(impute.ofile, f'{out_dir}/GWASpy/Imputation/{file_dir}/imputed_chunks/{output_file_name}') 60 | b.write_output(impute.ind, f'{out_dir}/GWASpy/Imputation/{file_dir}/imputed_chunks/{output_file_name}.csi') 61 | 62 | 63 | def run_impute(backend: Union[hb.ServiceBackend, hb.LocalBackend] = None, 64 | input_vcfs: str = None, 65 | phasing_software: str = None, 66 | memory: str = 'highmem', 67 | cpu: int = 8, 68 | threads: int = 7, 69 | out_dir: str = None): 70 | 71 | print(f'RUNNING IMPUTATION ON FILES PHASED WITH {phasing_software.upper()}') 72 | impute_b = hb.Batch(backend=backend, name=f'impute-phased-chunks') 73 | 74 | vcf_paths = pd.read_csv(input_vcfs, sep='\t', header=None) 75 | 76 | # get the regions so we can map each file to its specific region 77 | regions = pd.read_csv(f'{out_dir}/GWASpy/Phasing/regions.lines', sep='\t', names=['reg', 'ind']) 78 | regions_dict = pd.Series(regions.reg.values, index=regions.ind).to_dict() 79 | 80 | for index, row in vcf_paths.iterrows(): 81 | vcf = row[0] 82 | vcf_filebase = get_vcf_filebase(vcf) 83 | 84 | if phasing_software == 'shapeit': 85 | phased_vcfs_chunks = hl.utils.hadoop_ls(f'{out_dir}/GWASpy/Phasing/{vcf_filebase}/phased_scatter/*.shapeit.bcf') 86 | else: 87 | phased_vcfs_chunks = hl.utils.hadoop_ls(f'{out_dir}/GWASpy/Phasing/{vcf_filebase}/phased_scatter/*.eagle.bcf') 88 | 89 | for i in range(1, 24): 90 | if i == 23: 91 | chrom = 'chrX' 92 | else: 93 | chrom = f'chr{i}' 94 | 95 | ref_bcf = f'gs://gcp-public-data--gnomad/resources/hgdp_1kg/phased_haplotypes/hgdp.tgp.gwaspy.merged.{chrom}.merged.bcf' 96 | ref_ind = f'gs://gcp-public-data--gnomad/resources/hgdp_1kg/phased_haplotypes/hgdp.tgp.gwaspy.merged.{chrom}.merged.bcf.csi' 97 | ref_size = bytes_to_gb(ref_bcf) 98 | ref = impute_b.read_input_group(**{'bcf': ref_bcf, 99 | 'bcf.csi': ref_ind}) 100 | 101 | for file in phased_vcfs_chunks: 102 | f = file['path'] 103 | vcf_basename = get_vcf_filebase(f) 104 | file_index = int(vcf_basename.split('.')[-3]) 105 | file_region = regions_dict[file_index] 106 | map_chrom = file_region.split(':')[0] 107 | 108 | imp_out_filename = f'{vcf_basename}.imputed.bcf' 109 | file_dir = vcf_basename.split('.')[0] 110 | output_filepath_name = f'{out_dir}/GWASpy/Imputation/{file_dir}/imputed_chunks/{imp_out_filename}' 111 | 112 | if map_chrom == chrom: 113 | # check if imputed file already exists 114 | if hl.hadoop_exists(output_filepath_name): 115 | continue 116 | else: 117 | imputation(b=impute_b, vcf=f, vcf_filename_no_ext=vcf_basename, ref=ref, ref_size=ref_size, 118 | region=file_region, chromosome=chrom, cpu=cpu, memory=memory, 119 | threads=threads, out_dir=out_dir) 120 | 121 | impute_b.run() 122 | -------------------------------------------------------------------------------- /gwaspy/pca/__init__.py: 
-------------------------------------------------------------------------------- 1 | from gwaspy.pca import pca 2 | __all__ = ['pca'] -------------------------------------------------------------------------------- /gwaspy/pca/assign_pop_labels.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.ensemble import RandomForestClassifier 3 | from typing import Tuple 4 | import random 5 | 6 | 7 | def assign_population_pcs( 8 | pop_pc_pd: pd.DataFrame, 9 | num_pcs: int, 10 | known_col: str = 'SuperPop', 11 | fit: RandomForestClassifier = None, 12 | seed: int = 42, 13 | prop_train: float = 0.8, 14 | n_estimators: int = 100, 15 | min_prob: float = 0.9, 16 | output_col: str = 'pop', 17 | missing_label: str = 'oth' 18 | ) -> Tuple[pd.DataFrame, RandomForestClassifier]: 19 | """ 20 | This function uses a random forest model to assign population labels based on the results of PCA. 21 | Default values for model and assignment parameters are those used in gnomAD. 22 | :param pd.DataFrame pop_pc_pd: Pandas dataframe containing population PCs as well as a column with population labels 23 | :param str known_col: Column storing the known population labels 24 | :param RandomForestClassifier fit: fit from a previously trained random forest model (i.e., the output from a previous RandomForestClassifier() call) 25 | :param int num_pcs: number of population PCs on which to train the model 26 | :param int seed: Random seed 27 | :param float prop_train: Proportion of known data used for training 28 | :param int n_estimators: Number of trees to use in the RF model 29 | :param float min_prob: Minimum probability of belonging to a given population for the population to be set (otherwise set to `None`) 30 | :param str output_col: Output column storing the assigned population 31 | :param str missing_label: Label for samples for which the assignment probability is smaller than `min_prob` 32 | :return: Dataframe containing sample IDs and imputed population labels, trained random forest model 33 | :rtype: DataFrame, RandomForestClassifier 34 | """ 35 | 36 | print(f'{num_pcs} PCs to be used in population assignment') 37 | # Expand PC column 38 | pc_cols = ['PC{}'.format(i + 1) for i in range(num_pcs)] 39 | train_data = pop_pc_pd.loc[~pop_pc_pd[known_col].isnull()] 40 | 41 | N = len(train_data) 42 | 43 | # Split training data into subsamples for fitting and evaluating 44 | if not fit: 45 | random.seed(seed) 46 | train_subsample_ridx = random.sample(list(range(0, N)), int(N * prop_train)) 47 | train_fit = train_data.iloc[train_subsample_ridx] 48 | fit_samples = [x for x in train_fit['s']] 49 | evaluate_fit = train_data.loc[~train_data['s'].isin(fit_samples)] 50 | 51 | # Train RF 52 | training_set_known_labels = train_fit[known_col].values 53 | training_set_pcs = train_fit[pc_cols].values 54 | evaluation_set_pcs = evaluate_fit[pc_cols].values 55 | 56 | pop_clf = RandomForestClassifier(n_estimators=n_estimators, random_state=seed) 57 | pop_clf.fit(training_set_pcs, training_set_known_labels) 58 | print('Random forest feature importances are as follows: {}'.format(pop_clf.feature_importances_)) 59 | 60 | # Evaluate RF 61 | predictions = pop_clf.predict(evaluation_set_pcs) 62 | error_rate = 1 - sum(evaluate_fit[known_col] == predictions) / float(len(predictions)) 63 | print('Estimated error rate for RF model is {}'.format(error_rate)) 64 | else: 65 | pop_clf = fit 66 | 67 | # Classify data 68 | print('Classifying data')
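# label each sample with its most likely population, then attach the per-class
# probabilities; samples whose top probability falls below `min_prob` are
# relabeled as `missing_label` further down
69 | pop_pc_pd[output_col] = 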
pop_clf.predict(pop_pc_pd[pc_cols].values) 70 | probs = pop_clf.predict_proba(pop_pc_pd[pc_cols].values) 71 | probs = pd.DataFrame(probs, columns=[f'prob_{p}' for p in pop_clf.classes_]) 72 | 73 | pop_pc_pd = pd.concat([pop_pc_pd.reset_index(drop=True), probs.reset_index(drop=True)], axis=1) 74 | 75 | probs['max'] = probs.max(axis=1) 76 | pop_pc_pd.loc[probs['max'] < min_prob, output_col] = missing_label 77 | 78 | return pop_pc_pd, pop_clf -------------------------------------------------------------------------------- /gwaspy/pca/filter_ref_data.py: -------------------------------------------------------------------------------- 1 | # this is the script used to filter the reference 1KG+HGDP data used in PCA 2 | 3 | import hail as hl 4 | 5 | hl.init(default_reference='GRCh38') 6 | 7 | ref_mt = hl.read_matrix_table('gs://african-seq-data/hgdp_tgp/hgdp_tgp_postQC.mt') 8 | 9 | print("\nInitial number of SNPs before filtering: {}".format(ref_mt.count_rows())) 10 | filtered_ref = hl.variant_qc(ref_mt) 11 | 12 | print("filtering") 13 | 14 | filtered_ref = filtered_ref.filter_rows((filtered_ref.variant_qc.AF[0] > 0.05) & (filtered_ref.variant_qc.AF[0] < 0.95)) 15 | print("\nNumber of SNPs after MAF filtering: {}".format(filtered_ref.count_rows())) 16 | 17 | filtered_ref = filtered_ref.filter_rows(filtered_ref.variant_qc.call_rate > 0.999) 18 | print("\nNumber of SNPs after Call Rate filtering: {}".format(filtered_ref.count_rows())) 19 | 20 | # print("repartitioning") 21 | # filtered_ref = filtered_ref.repartition(n_partitions=100, shuffle=True) 22 | 23 | print("writing filtered mt") 24 | filtered_ref.write('gs://african-seq-data/hgdp_tgp/gwaspy_pca_ref/hgdp_1kg_filtered_maf_5_GRCh38.mt', overwrite=True) 25 | print("Done filtering") 26 | 27 | print("getting sample information") 28 | 29 | mt = hl.read_matrix_table('gs://african-seq-data/hgdp_tgp/gwaspy_pca_ref/hgdp_1kg_filtered_maf_5_GRCh38.mt') 30 | 31 | cols_ht = mt.cols() 32 | 33 | pops = cols_ht.select(cols_ht.hgdp_tgp_meta.Study.region) 34 | 35 | df = pops.to_pandas() 36 | 37 | df.columns = ['Sample', 'SuperPop'] 38 | 39 | old_pops_labs = ['Africa', 'America', 'Central_South_Asia', 'East_Asia', 'Europe', 'Middle_East', 'Oceania', 'SAS'] 40 | new_pops_labs = ['AFR', 'AMR', 'CSA', 'EAS', 'EUR', 'MID', 'OCE', 'CSA'] 41 | df['SuperPop'] = df['SuperPop'].replace(old_pops_labs, new_pops_labs) 42 | 43 | print(df['SuperPop'].value_counts()) 44 | 45 | print("exporting sample metadata") 46 | df.to_csv('gs://african-seq-data/hgdp_tgp/gwaspy_pca_ref/hgdp_1kg_sample_info.tsv', sep='\t', index=False) 47 | 48 | -------------------------------------------------------------------------------- /gwaspy/pca/pca.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Lindo Nkambule' 2 | 3 | import argparse 4 | import hail as hl 5 | 6 | 7 | def pca( 8 | ref_dirname: str = 'gs://gcp-public-data--gnomad/release/3.1/secondary_analyses/hgdp_1kg_v2/pca_results/', 9 | ref_basename: str = 'unrelateds_without_outliers', 10 | ref_info: str = 'gs://hgdp-1kg/hgdp_tgp/gwaspy_pca_ref/hgdp_1kg_sample_info.unrelateds.pca_outliers_removed.with_project.tsv', 11 | reference: str = 'GRCh38', pca_type: str = None, 12 | data_dirname: str = None, data_basename: str = None, input_type: str = None, 13 | maf: float = 0.05, hwe: float = 1e-3, call_rate: float = 0.98, 14 | ld_cor: float = 0.2, ld_window: int = 250000, n_pcs: int = 20, run_relatedness_check: bool = True, 15 | include_kinself: bool = False, relatedness_method: str = 'pc_relate', 
16 | relatedness_thresh: float = 0.1, prob_threshold: float = 0.8, out_dir: str = None): 17 | 18 | if not out_dir: 19 | raise Exception('\nOutput directory where files will be saved is not specified') 20 | 21 | hl.default_reference(new_default_reference=reference) 22 | 23 | if pca_type == 'project': 24 | print('\nRunning PCA using projection method') 25 | 26 | from gwaspy.pca.pca_project import run_pca_project 27 | run_pca_project(ref_dirname=ref_dirname, ref_basename=ref_basename, ref_info=ref_info, 28 | data_dirname=data_dirname, data_basename=data_basename, out_dir=out_dir, input_type=input_type, 29 | reference=reference, npcs=n_pcs, maf=maf, hwe=hwe, call_rate=call_rate, 30 | relatedness_method=relatedness_method, run_relatedness_check=run_relatedness_check, 31 | ld_cor=ld_cor, ld_window=ld_window, include_kinself=include_kinself, 32 | prob_threshold=prob_threshold) 33 | 34 | elif pca_type == 'joint': 35 | print('\nRunning PCA using joint method') 36 | from gwaspy.pca.pca_joint import run_pca_joint 37 | run_pca_joint(ref_dirname=ref_dirname, ref_basename=ref_basename, ref_info=ref_info, data_dirname=data_dirname, 38 | data_basename=data_basename, out_dir=out_dir, input_type=input_type, reference=reference, 39 | npcs=n_pcs, maf=maf, hwe=hwe, call_rate=call_rate, ld_cor=ld_cor, ld_window=ld_window, 40 | relatedness_method=relatedness_method, relatedness_thresh=relatedness_thresh, 41 | prob_threshold=prob_threshold) 42 | 43 | else: 44 | print('\nRunning PCA without a reference') 45 | from gwaspy.pca.pca_normal import run_pca_normal 46 | run_pca_normal(dirname=data_dirname, basename=data_basename, input_type=input_type, out_dir=out_dir, 47 | reference=reference, maf=maf, hwe=hwe, call_rate=call_rate, ld_cor=ld_cor, ld_window=ld_window, 48 | n_pcs=n_pcs, run_relatedness_check=run_relatedness_check, relatedness_method=relatedness_method, 49 | relatedness_thresh=relatedness_thresh, include_kinself=include_kinself) 50 | 51 | 52 | def main(): 53 | parser = argparse.ArgumentParser() 54 | # reference args 55 | parser.add_argument('--ref-dirname', default='gs://gcp-public-data--gnomad/release/3.1/secondary_analyses/hgdp_1kg_v2/pca_results/') 56 | parser.add_argument('--ref-basename', default='unrelateds_without_outliers') 57 | parser.add_argument('--ref-info', default='gs://hgdp-1kg/hgdp_tgp/gwaspy_pca_ref/hgdp_1kg_sample_info.unrelateds.pca_outliers_removed.with_project.tsv') 58 | parser.add_argument('--reference', type=str, default='GRCh38') 59 | parser.add_argument('--pca-type', type=str, default='normal', choices=['normal', 'project', 'joint']) 60 | 61 | # data args 62 | parser.add_argument('--data-dirname', type=str, required=True) 63 | parser.add_argument('--data-basename', type=str, required=True) 64 | parser.add_argument('--input-type', type=str, required=True, choices=['vcf', 'plink', 'hail']) 65 | 66 | # filter args 67 | parser.add_argument('--maf', type=float, default=0.05, help='include only SNPs with MAF >= NUM in PCA') 68 | parser.add_argument('--hwe', type=float, default=1e-3, help='include only SNPs with HWE >= NUM in PCA') 69 | parser.add_argument('--geno', type=float, default=0.98, help='include only SNPs with call-rate > NUM') 70 | parser.add_argument('--ld-cor', type=float, default=0.2, metavar="[0.0-1.0]",  # NOTE: argparse choices cannot express a float interval, so the valid range is documented in the help text 71 | help='Squared correlation threshold (exclusive upper bound). 
Must be in the range [0.0, 1.0]') 72 | parser.add_argument('--ld-window', type=int, default=250000, 73 | help='Window size in base pairs (inclusive upper bound)') 74 | parser.add_argument('--npcs', type=int, default=20, help='Number of PCs to use') 75 | parser.add_argument('--no-relatedness', action='store_false') 76 | parser.add_argument('--include-kinself', action='store_true') 77 | parser.add_argument('--relatedness-method', type=str, default='pc_relate', 78 | choices=['pc_relate', 'ibd', 'king'], help='Method to use for the inference of relatedness') 79 | parser.add_argument('--relatedness-thresh', type=float, default=0.1, 80 | help='Threshold value to use in relatedness checks') 81 | parser.add_argument('--prob', type=float, default=0.8, 82 | help='Minimum probability of belonging to a given population for the population to be set') 83 | parser.add_argument('--out-dir', type=str, required=True) 84 | 85 | args = parser.parse_args() 86 | 87 | # args.prob always carries a value (argparse default is 0.8), so no extra 88 | # missing-value check is needed before passing it on 89 | 90 | pca(ref_dirname=args.ref_dirname, ref_basename=args.ref_basename, ref_info=args.ref_info, reference=args.reference, 91 | pca_type=args.pca_type, input_type=args.input_type, data_dirname=args.data_dirname, 92 | data_basename=args.data_basename, maf=args.maf, hwe=args.hwe, call_rate=args.geno, ld_cor=args.ld_cor, 93 | ld_window=args.ld_window, n_pcs=args.npcs, run_relatedness_check=args.no_relatedness, 94 | include_kinself=args.include_kinself, relatedness_method=args.relatedness_method, 95 | relatedness_thresh=args.relatedness_thresh, prob_threshold=args.prob, out_dir=args.out_dir) 96 | 97 | print('\nDone running PCA') 98 | 99 | 100 | if __name__ == '__main__': 101 | main() 102 | 103 | -------------------------------------------------------------------------------- /gwaspy/pca/pca_filter_snps.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Lindo Nkambule' 2 | 3 | import hail as hl 4 | import pandas as pd 5 | 6 | 7 | def pca_filter_mt( 8 | in_mt: hl.MatrixTable, 9 | maf: float = 0.05, 10 | hwe: float = 1e-3, 11 | call_rate: float = 0.98, 12 | ld_cor: float = 0.2, 13 | ld_window: int = 250000): 14 | 15 | print("\nInitial number of SNPs before filtering: {}".format(in_mt.count_rows())) 16 | mt = hl.variant_qc(in_mt) 17 | print(f'\nFiltering out variants with MAF < {maf}') 18 | mt_filt = mt.annotate_rows(maf=hl.min(mt.variant_qc.AF)) 19 | mt_filt = mt_filt.filter_rows(mt_filt.maf > maf) 20 | 21 | print(f'\nFiltering out variants with HWE < {hwe:1e}') 22 | mt_filt = mt_filt.filter_rows(mt_filt.variant_qc.p_value_hwe > hwe) 23 | 24 | print(f'\nFiltering out variants with Call Rate < {call_rate}') 25 | mt_filt = mt_filt.filter_rows(mt_filt.variant_qc.call_rate >= call_rate) 26 | 27 | # no strand ambiguity 28 | print('\nFiltering out strand ambiguous variants') 29 | mt_filt = mt_filt.filter_rows(~hl.is_strand_ambiguous(mt_filt.alleles[0], mt_filt.alleles[1])) 30 | 31 | # MHC chr6:25-35Mb 32 | # chr8.inversion chr8:7-13Mb 33 | print('\nFiltering out variants in MHC [chr6:25M-35M] and chromosome 8 inversions [chr8:7M-13M]') 34 | intervals = ['chr6:25M-35M', 'chr8:7M-13M'] 35 | mt_filt = hl.filter_intervals(mt_filt, [hl.parse_locus_interval(x, reference_genome='GRCh38') for x in intervals], 36 | keep=False) 37 | 38 | # This step is expensive (on local machine) 39 | print(f'\nLD pruning using correlation threshold of {ld_cor} and window size of {ld_window}')
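# hl.ld_prune returns the subset of variants to KEEP (pairwise r^2 below the
# threshold within the window); filter_rows below restricts the matrix to them
40 | mt_ld_prune = hl.ld_prune(mt_filt.GT, r2=ld_cor, 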
bp_window_size=ld_window) 41 | mt_ld_pruned = mt_filt.filter_rows(hl.is_defined(mt_ld_prune[mt_filt.row_key])) 42 | print("\nNumber of SNPs after filtering: {}".format(mt_ld_pruned.count_rows())) 43 | 44 | return mt_ld_pruned 45 | 46 | 47 | def relatedness_check( 48 | in_mt: hl.MatrixTable = None, 49 | method: str = 'pc_relate', 50 | outdir: str = None, 51 | kin_estimate: float = 0.1, 52 | include_kinself: bool = False): 53 | 54 | if method == 'pc_relate': 55 | print("\nUsing PC-Relate for relatedness checks") 56 | # compute kinship statistic for every sample-pair 57 | if include_kinself: 58 | print("\nkinself will be included in exported tsv file") 59 | relatedness_ht = hl.pc_relate(in_mt.GT, 0.01, k=10, statistics='kin', include_self_kinship=include_kinself) 60 | 61 | print('exporting relatedness statistics to a tsv file') 62 | ht_export = relatedness_ht.key_by() 63 | ht_export = ht_export.select(ht_export.kin, i=ht_export.i.s, j=ht_export.j.s) 64 | ht_export.export(f'{outdir}relatedness_checks_pc_relate.tsv.bgz') 65 | 66 | print('getting related samples to be removed using maximal independent set') 67 | # only run maximal independent set step on sample-pairs with kinship above specified threshold 68 | 69 | # when include_kinself is True, the self-kinship pairs must be dropped first; otherwise every sample would fail 70 | # the relatedness check, since a sample's kinship with itself is ~0.5 in most cases (excluding inbreeding) 71 | if include_kinself: 72 | relatedness_ht = relatedness_ht.filter(relatedness_ht.i == relatedness_ht.j, keep=False) 73 | else: 74 | relatedness_ht = relatedness_ht 75 | pairs = relatedness_ht.filter(relatedness_ht['kin'] > kin_estimate) 76 | samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, False) 77 | samples = samples_to_remove.node.s.collect() 78 | 79 | elif method == 'ibd': 80 | print("\nUsing PLINK-style IBD for relatedness checks") 81 | in_mt = hl.variant_qc(in_mt) 82 | in_mt = in_mt.annotate_rows(maf=hl.min(in_mt.variant_qc.AF)) 83 | relatedness_ht = hl.identity_by_descent(in_mt, maf=in_mt['maf']) 84 | 85 | print('exporting relatedness statistics to a tsv file') 86 | relatedness_ht.export(f'{outdir}relatedness_checks_ibd.tsv.bgz') 87 | 88 | print('getting related samples to be removed using maximal independent set') 89 | # only run maximal independent set step on sample-pairs with kinship above specified threshold 90 | pairs = relatedness_ht.filter(relatedness_ht['ibd.PI_HAT'] > kin_estimate) 91 | samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, False) 92 | samples = samples_to_remove.node.collect() 93 | 94 | else: 95 | print("\nUsing KING for relatedness checks") 96 | if kin_estimate > 0.5: 97 | raise Exception("\nThe maximum kinship coefficient in KING is 0.5") 98 | relatedness_mt = hl.king(in_mt.GT) 99 | relatedness_ht = relatedness_mt.filter_entries((relatedness_mt.s_1 != relatedness_mt.s) & 100 | (relatedness_mt.phi >= kin_estimate)).entries() 101 | 102 | print('exporting relatedness statistics to a tsv file') 103 | relatedness_ht.export(f'{outdir}relatedness_checks_king.tsv.bgz') 104 | 105 | print('getting related samples to be removed using maximal independent set') 106 | samples_to_remove = hl.maximal_independent_set(relatedness_ht.s_1, relatedness_ht.s, False) 107 | samples = samples_to_remove.node.collect() 108 | 109 | if len(samples) > 0: 110 | # Do not remove samples that fail relatedness check 111 | # in_mt = in_mt.filter_cols(hl.literal(samples).contains(in_mt['s']), keep=False)
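# instead of dropping them from the MatrixTable, the failing sample IDs are
# exported below so the user can decide how to handle them
112 | print(f"\nNumber of 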
samples that fail relatedness checks: {len(samples)}") 113 | 114 | df = pd.DataFrame(samples, columns=['Sample']) 115 | ht = hl.Table.from_pandas(df) 116 | ht.export(f'{outdir}samples_failing_relatedness_checks.tsv') 117 | 118 | else: 119 | print("\nNo samples failed the relatedness check") 120 | 121 | return in_mt, samples 122 | -------------------------------------------------------------------------------- /gwaspy/phasing/__init__.py: -------------------------------------------------------------------------------- 1 | from gwaspy.phasing import phase 2 | __all__ = ['phase'] 3 | -------------------------------------------------------------------------------- /gwaspy/phasing/concat_vcfs.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Michael Wilson & Lindo Nkambule' 2 | 3 | import hailtop.batch as hb 4 | import hail as hl 5 | import pandas as pd 6 | from gwaspy.utils.get_file_size import bytes_to_gb 7 | from gwaspy.phasing.get_filebase import get_vcf_filebase 8 | from typing import List 9 | from typing import Union 10 | 11 | 12 | def concat_vcfs(b: hb.batch.Batch, 13 | vcf_basename: str = None, 14 | vcfs_to_merge: List = None, 15 | output_type: str = 'bcf', 16 | software: str = None, 17 | chrom: str = None, 18 | docker_img: str = 'docker.io/lindonkambule/gwaspy:v1', 19 | cpu: int = 8, 20 | out_dir: str = None): 21 | 22 | global index_cmd 23 | 24 | out_type = 'b' if output_type == 'bcf' else 'z' 25 | threads = cpu - 1 26 | vcfs_sizes_sum = 0 27 | merge_vcf_i = '' 28 | 29 | out_filename = f'{vcf_basename}.{chrom}.phased.{software}.bcf' if output_type == 'bcf' else \ 30 | f'{vcf_basename}.{chrom}.phased.{software}.vcf.gz' 31 | out_index_name = f'{vcf_basename}.{chrom}.phased.{software}.bcf.csi' if output_type == 'bcf' else \ 32 | f'{vcf_basename}.{chrom}.phased.{software}.vcf.gz.csi' 33 | 34 | for line in vcfs_to_merge: 35 | vcfs_sizes_sum += 1 + bytes_to_gb(line) 36 | 37 | mem = 'highmem' if vcfs_sizes_sum > 2 else 'standard' 38 | disk_size = 10 + vcfs_sizes_sum 39 | 40 | concat = b.new_job(name=f'concat-{vcf_basename}') 41 | concat.memory(mem) 42 | concat.storage(f'{disk_size}Gi') 43 | concat.image(docker_img) 44 | concat.cpu(cpu) 45 | 46 | for line in vcfs_to_merge: 47 | input_vcf = b.read_input_group(vcf=line, 48 | ind=f'{line}.csi') 49 | merge_vcf_i += f'{input_vcf.vcf} ' 50 | 51 | cmd = f''' 52 | bcftools concat \ 53 | --no-version \ 54 | --output-type {out_type} \ 55 | --output {out_filename} \ 56 | --threads {threads} \ 57 | --ligate \ 58 | {merge_vcf_i} 59 | ''' 60 | 61 | concat.command(cmd) 62 | # index the merged output 63 | concat.command(f'bcftools index {out_filename}') 64 | 65 | concat.command(f'mv {out_filename} {concat.ofile}') 66 | concat.command(f'mv {out_index_name} {concat.idx}') 67 | b.write_output(concat.ofile, f'{out_dir}/GWASpy/{vcf_basename}/Phasing/phased_merged/{out_filename}') 68 | b.write_output(concat.idx, f'{out_dir}/GWASpy/{vcf_basename}/Phasing/phased_merged/{out_index_name}') 69 | 70 | 71 | def run_concat(backend: Union[hb.ServiceBackend, hb.LocalBackend] = None, 72 | input_vcf: str = None, 73 | output_type: str = 'bcf', 74 | reference: str = 'GRCh38', 75 | software: str = None, 76 | out_dir: str = None): 77 | 78 | print(f'\n3. 
CONCAT {input_vcf}\n') 79 | vcf_filebase = get_vcf_filebase(input_vcf) 80 | 81 | # get the regions so we can map each file to its specific region 82 | regions = pd.read_csv(f'{out_dir}/GWASpy/{vcf_filebase}/Phasing/regions.lines', sep='\t', names=['reg', 'ind']) 83 | regions_dict = pd.Series(regions.reg.values, index=regions.ind).to_dict() 84 | 85 | concat_b = hb.Batch(backend=backend, name=f'concat-phased-chunks-{vcf_filebase}') 86 | 87 | if software == 'shapeit': 88 | phased_vcf_chunks = hl.utils.hadoop_ls(f'{out_dir}/GWASpy/{vcf_filebase}/Phasing/phased_scatter/*.shapeit.bcf') 89 | else: 90 | phased_vcf_chunks = hl.utils.hadoop_ls(f'{out_dir}/GWASpy/{vcf_filebase}/Phasing/phased_scatter/*.eagle.bcf') 91 | 92 | for i in range(1, 24): 93 | if reference == 'GRCh38': 94 | if i == 23: 95 | chrom = 'chrX' 96 | else: 97 | chrom = f'chr{i}' 98 | 99 | out_chrom_name = chrom 100 | else: 101 | chrom = str(i) 102 | out_chrom_name = f'chr{chrom}' 103 | 104 | chrom_phased_files_to_concat = [] 105 | 106 | for file in phased_vcf_chunks: 107 | f = file['path'] 108 | vcf_basename = get_vcf_filebase(f) 109 | file_index = int(vcf_basename.split('.')[-3]) 110 | file_region = regions_dict[file_index] 111 | map_chrom = file_region.split(':')[0] 112 | if map_chrom == chrom: 113 | chrom_phased_files_to_concat.append(f) 114 | 115 | # naturally sort the list of files to merge (e.g. 'chunk2' sorts before 'chunk10') 116 | from gwaspy.utils.natural_sort import natural_keys 117 | chrom_phased_files_to_concat.sort(key=natural_keys) 118 | 119 | # checkpoint to see if file already exists to avoid redoing things 120 | chrom_out_filename = f'{vcf_filebase}.{out_chrom_name}.phased.{software}.bcf' if output_type == 'bcf' else \ 121 | f'{vcf_filebase}.{out_chrom_name}.phased.{software}.vcf.gz' 122 | chrom_out_filename_path = f'{out_dir}/GWASpy/{vcf_filebase}/Phasing/phased_merged/{chrom_out_filename}' 123 | 124 | if hl.hadoop_exists(chrom_out_filename_path): 125 | continue 126 | else: 127 | concat_vcfs(b=concat_b, vcfs_to_merge=chrom_phased_files_to_concat, vcf_basename=vcf_filebase, 128 | output_type=output_type, software=software, chrom=out_chrom_name, out_dir=out_dir) 129 | 130 | concat_b.run() 131 | -------------------------------------------------------------------------------- /gwaspy/phasing/get_filebase.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Lindo Nkambule' 2 | 3 | import ntpath 4 | 5 | 6 | def get_vcf_filebase(file: str = None): 7 | 8 | vcf_name = ntpath.basename(file) 9 | if vcf_name.endswith('.gz'): 10 | file_no_ext = vcf_name[:-7]  # strip '.vcf.gz' 11 | elif vcf_name.endswith('.bgz'): 12 | file_no_ext = vcf_name[:-8]  # strip '.vcf.bgz' 13 | else: 14 | file_no_ext = vcf_name[:-4]  # strip '.vcf' or '.bcf' 15 | 16 | return file_no_ext 17 | -------------------------------------------------------------------------------- /gwaspy/phasing/phase.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Lindo Nkambule' 2 | 3 | import argparse 4 | import hailtop.batch as hb 5 | from gwaspy.phasing.shapeit5_phase import shapeit_phasing 6 | from typing import Union 7 | 8 | 9 | def run_phase(backend: Union[hb.ServiceBackend, hb.LocalBackend] = None, 10 | input_vcf: str = None, 11 | vcf_ref: str = None, 12 | fam_file: str = None, 13 | data_type: str = 'array', 14 | software: str = 'shapeit', 15 | genome_build: str = 'GRCh38', 16 | fill_tags: bool = False, 17 | output_filename: str = None, 18 | out_dir: str = None): 19 | 20 | if data_type.lower() not in ['array', 'wgs']: 21 | raise SystemExit(f'Incorrect data type 
--------------------------------------------------------------------------------
/gwaspy/phasing/phase.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Lindo Nkambule'
2 | 
3 | import argparse
4 | import hailtop.batch as hb
5 | from gwaspy.phasing.shapeit5_phase import shapeit_phasing
6 | from typing import Union
7 | 
8 | 
9 | def run_phase(backend: Union[hb.ServiceBackend, hb.LocalBackend] = None,
10 |               input_vcf: str = None,
11 |               vcf_ref: str = None,
12 |               fam_file: str = None,
13 |               data_type: str = 'array',
14 |               software: str = 'shapeit',
15 |               genome_build: str = 'GRCh38',
16 |               fill_tags: bool = False,
17 |               output_filename: str = None,
18 |               out_dir: str = None):
19 | 
20 |     if data_type.lower() not in ['array', 'wgs']:
21 |         raise SystemExit(f'Incorrect data type {data_type} selected. Options are [array, wgs]')
22 | 
23 |     if software.lower() not in ['beagle', 'shapeit']:
24 |         raise SystemExit(f'Incorrect software {software} selected. Options are [beagle, shapeit]')
25 | 
26 |     b = hb.Batch(backend=backend,
27 |                  name=f'GWASpy-Phasing-{software.upper()}')
28 | 
29 |     if vcf_ref:
30 |         if vcf_ref == 'hgdp1kgp':
31 |             print(f'\nPHASING {input_vcf} WITH HGDP+1KGP PANEL\n')
32 |             ref_path = 'gs://gcp-public-data--gnomad/resources/hgdp_1kg/phased_haplotypes_v2/hgdp1kgp_chrCNUMBER.filtered.SNV_INDEL.phased.shapeit5.bcf'
33 |         else:
34 |             print(f'\nPHASING {input_vcf} WITH USER-DEFINED REFERENCE PANEL\n')
35 |             ref_path = vcf_ref
36 |     else:
37 |         ref_path = None
38 |         print(f'\nPHASING {input_vcf} WITHOUT A REFERENCE PANEL\n')
39 | 
40 |     pedigree = b.read_input(fam_file) if fam_file else None
41 | 
42 |     if software == 'shapeit':
43 |         shapeit_phasing(
44 |             batch=b,
45 |             input_path=input_vcf,
46 |             reference_path=ref_path,
47 |             genome_build=genome_build,
48 |             fam_file=pedigree,
49 |             data_type=data_type,
50 |             fill_tags=fill_tags,
51 |             output_filename=output_filename,
52 |             output_path=out_dir)
53 |     # else: BEAGLE support is still to be added
54 | 
55 | 
56 | def main():
57 |     parser = argparse.ArgumentParser()
58 |     parser.add_argument('--input-vcf', type=str, required=True)
59 |     parser.add_argument('--vcf-ref', type=str, default=None)
60 |     parser.add_argument('--pedigree', type=str, default=None)
61 |     parser.add_argument('--local', action='store_true')
62 |     parser.add_argument('--billing-project', required=True)
63 |     parser.add_argument('--genome-build', type=str, default='GRCh38', choices=['GRCh37', 'GRCh38'])
64 |     parser.add_argument('--data-type', type=str, default='array', choices=['array', 'wgs'])
65 |     parser.add_argument('--fill-tags', action='store_true')
66 |     parser.add_argument('--software', type=str, default='shapeit', choices=['beagle', 'shapeit'])
67 |     parser.add_argument('--output-filename', type=str, required=True)
68 |     parser.add_argument('--out-dir', type=str, required=True)
69 | 
70 |     args = parser.parse_args()
71 | 
72 |     if args.local:
73 |         backend = hb.LocalBackend()
74 |     else:
75 |         backend = hb.ServiceBackend(billing_project=args.billing_project,
76 |                                     remote_tmpdir=f'{args.out_dir}/tmp/')
77 | 
78 |     run_phase(backend=backend,
79 |               input_vcf=args.input_vcf,
80 |               vcf_ref=args.vcf_ref,
81 |               fam_file=args.pedigree,
82 |               data_type=args.data_type,
83 |               software=args.software,
84 |               genome_build=args.genome_build,
85 |               fill_tags=args.fill_tags,
86 |               output_filename=args.output_filename,
87 |               out_dir=args.out_dir)
88 | 
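An illustrative invocation of the `phasing` console script that main() above backs (bucket paths and names hypothetical; --vcf-ref may be omitted to phase without a panel):

    phasing --input-vcf gs://my-bucket/data.vcf.bgz --vcf-ref hgdp1kgp \
        --billing-project my-project --output-filename data.phased --out-dir gs://my-bucket/out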
--------------------------------------------------------------------------------
/gwaspy/phasing/phasing.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Michael Wilson & Lindo Nkambule'
2 | 
3 | import hailtop.batch as hb
4 | import argparse
5 | 
6 | 
7 | def haplotype_phasing(input_vcf: str = None,
8 |                       vcf_ref: str = None,
9 |                       family_pedigree: str = None,
10 |                       local: bool = False,
11 |                       billing_project: str = None,
12 |                       software: str = 'shapeit',
13 |                       reference: str = 'GRCh38',
14 |                       max_win_size_cm: float = 10.0,
15 |                       overlap_size_cm: float = 2.0,
16 |                       scatter_memory: int = 26,
17 |                       cpu: int = 8,
18 |                       threads: int = 7,
19 |                       stages: str = 'scatter,phase,concat',
20 |                       output_type: str = 'bcf',
21 |                       out_dir: str = None):
22 |     # Error handling
23 |     if not out_dir:
24 |         raise SystemExit('Output directory not specified. Specify using --out-dir if running from the command line or '
25 |                          'the out_dir argument if running inside a Python script')
26 | 
27 |     steps_list = stages.split(',')
28 |     steps = [x.lower() for x in steps_list]
29 |     unknown_steps = [i for i in steps if i not in ['scatter', 'phase', 'concat']]
30 | 
31 |     if len(unknown_steps) > 0:
32 |         raise SystemExit(f'Incorrect process(es) {unknown_steps} selected. Options are [scatter, phase, concat]')
33 | 
34 |     if output_type.lower() not in ['bcf', 'vcf']:
35 |         raise SystemExit(f'Incorrect output type {output_type} selected. Options are [bcf, vcf]')
36 | 
37 |     if local:
38 |         backend = hb.LocalBackend()
39 |     else:
40 |         backend = hb.ServiceBackend(billing_project=billing_project,
41 |                                     remote_tmpdir=f'{out_dir}/tmp/')
42 | 
43 |     # Scatter VCF/BCF file(s)
44 |     if 'scatter' in steps:
45 |         from gwaspy.phasing.scatter_vcf import run_scatter
46 |         run_scatter(backend=backend, input_vcf=input_vcf, reference=reference, max_win_size_cm=max_win_size_cm,
47 |                     overlap_size_cm=overlap_size_cm, scatter_memory=scatter_memory, out_dir=out_dir)
48 | 
49 |     # Phase scattered chunks
50 |     if 'phase' in steps:
51 |         from gwaspy.phasing.phase_vcf import run_phase
52 |         run_phase(backend=backend, input_vcf=input_vcf, vcf_ref_path=vcf_ref, family_pedigree=family_pedigree,
53 |                   software=software, reference=reference, cpu=cpu, threads=threads, out_dir=out_dir)
54 | 
55 |     # Concatenate phased chunks
56 |     if 'concat' in steps:
57 |         from gwaspy.phasing.concat_vcfs import run_concat
58 |         run_concat(backend=backend, input_vcf=input_vcf, output_type=output_type, reference=reference,
59 |                    software=software, out_dir=out_dir)
60 | 
61 | 
62 | def main():
63 |     parser = argparse.ArgumentParser()
64 |     parser.add_argument('--input-vcf', type=str, required=True)
65 |     parser.add_argument('--vcf-ref', type=str, default=None)
66 |     parser.add_argument('--family-pedigree', type=str, default=None)
67 |     parser.add_argument('--local', action='store_true')
68 |     parser.add_argument('--billing-project', required=True)
69 |     parser.add_argument('--software', type=str, default='shapeit', choices=['eagle', 'shapeit'])
70 |     parser.add_argument('--reference', type=str, default='GRCh38', choices=['GRCh37', 'GRCh38'])
71 |     parser.add_argument('--max-win-size-cm', type=float, default=10.0)
72 |     parser.add_argument('--overlap-size-cm', type=float, default=2.0)
73 |     parser.add_argument('--cpu', type=int, default=8)
74 |     parser.add_argument('--scatter-mem', type=int, default=26)
75 |     parser.add_argument('--threads', type=int, default=7)
76 |     parser.add_argument('--stages', type=str, default='scatter,phase,concat')
77 |     parser.add_argument('--out-type', type=str, default='bcf', choices=['bcf', 'vcf'])
78 |     parser.add_argument('--out-dir', required=True)
79 | 
80 |     args = parser.parse_args()
81 | 
82 |     haplotype_phasing(input_vcf=args.input_vcf, vcf_ref=args.vcf_ref, family_pedigree=args.family_pedigree,
83 |                       local=args.local, billing_project=args.billing_project, software=args.software,
84 |                       reference=args.reference, max_win_size_cm=args.max_win_size_cm,
85 |                       overlap_size_cm=args.overlap_size_cm, scatter_memory=args.scatter_mem, cpu=args.cpu,
86 |                       threads=args.threads, stages=args.stages, output_type=args.out_type, out_dir=args.out_dir)
87 | 
88 | 
89 | if __name__ == '__main__':
90 |     main()
91 | 
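The stages argument above is a comma-separated subset of scatter/phase/concat, so a run can be resumed mid-pipeline; illustrative values:

    stages='scatter,phase,concat'   # full run (the default)
    stages='phase,concat'           # chunks already scattered; phase and merge only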
--------------------------------------------------------------------------------
/gwaspy/preimp_qc/__init__.py:
--------------------------------------------------------------------------------
1 | from gwaspy.preimp_qc import preimp_qc
2 | __all__ = ['preimp_qc']
3 | 
--------------------------------------------------------------------------------
/gwaspy/preimp_qc/aggregators.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Dan King'
2 | 
3 | import hail as hl
4 | 
5 | 
6 | def variant_qc_aggregator(mt) -> hl.expr.StructExpression:
7 |     """:func:`.variant_qc` as an aggregator."""
8 |     bound_exprs = {}
9 |     gq_dp_exprs = {}
10 | 
11 |     def has_field_of_type(name, dtype):
12 |         return name in mt.entry and mt[name].dtype == dtype
13 | 
14 |     if has_field_of_type('DP', hl.tint32):
15 |         gq_dp_exprs['dp_stats'] = hl.agg.stats(mt.DP).select('mean', 'stdev', 'min', 'max')
16 |     if has_field_of_type('GQ', hl.tint32):
17 |         gq_dp_exprs['gq_stats'] = hl.agg.stats(mt.GQ).select('mean', 'stdev', 'min', 'max')
18 |     if not has_field_of_type('GT', hl.tcall):
19 |         raise ValueError("'variant_qc': expect an entry field 'GT' of type 'call'")
20 |     bound_exprs['n_called'] = hl.agg.count_where(hl.is_defined(mt['GT']))
21 |     bound_exprs['n_not_called'] = hl.agg.count_where(hl.is_missing(mt['GT']))
22 |     n_cols = hl.agg.count()
23 |     bound_exprs['n_filtered'] = hl.int64(n_cols) - hl.agg.count()
24 |     bound_exprs['call_stats'] = hl.agg.call_stats(mt.GT, mt.alleles)
25 |     return hl.rbind(hl.struct(**bound_exprs),
26 |                     lambda e1: hl.rbind(
27 |                         hl.case().when(hl.len(mt.alleles) == 2,
28 |                                        hl.hardy_weinberg_test(e1.call_stats.homozygote_count[0],
29 |                                                               e1.call_stats.AC[1] - 2
30 |                                                               * e1.call_stats.homozygote_count[1],
31 |                                                               e1.call_stats.homozygote_count[1])
32 |                                        ).or_missing(),
33 |                         lambda hwe: hl.struct(**{
34 |                             **gq_dp_exprs,
35 |                             **e1.call_stats,
36 |                             'call_rate': hl.float(e1.n_called) / (e1.n_called + e1.n_not_called + e1.n_filtered),
37 |                             'n_called': e1.n_called,
38 |                             'n_not_called': e1.n_not_called,
39 |                             'n_filtered': e1.n_filtered,
40 |                             'n_het': e1.n_called - hl.sum(e1.call_stats.homozygote_count),
41 |                             'n_non_ref': e1.n_called - e1.call_stats.homozygote_count[0],
42 |                             'het_freq_hwe': hwe.het_freq_hwe,
43 |                             'p_value_hwe': hwe.p_value})))
44 | 
45 | 
46 | def agg_call_rate(mt: hl.MatrixTable):
47 |     # DOES NOT HANDLE filter_entries CORRECTLY!
48 |     n_called = hl.agg.count_where(hl.is_defined(mt['GT']))
49 | 
50 |     return hl.agg.filter(
51 |         ~(mt.exclude_row | mt.exclude_col),
52 |         n_called / hl.agg.count())
53 | 
54 | 
55 | def impute_sex_aggregator(call,
56 |                           aaf,
57 |                           aaf_threshold=0.0,
58 |                           include_par=False,
59 |                           female_threshold=0.4,
60 |                           male_threshold=0.8) -> hl.expr.StructExpression:
61 |     """:func:`.impute_sex` as an aggregator."""
62 |     mt = call._indices.source
63 |     rg = mt.locus.dtype.reference_genome
64 |     x_contigs = hl.literal(
65 |         hl.eval(
66 |             hl.map(lambda x_contig: hl.parse_locus_interval(x_contig, rg), rg.x_contigs)))
67 |     inbreeding = hl.agg.inbreeding(call, aaf)
68 |     is_female = hl.if_else(inbreeding.f_stat < female_threshold,
69 |                            True,
70 |                            hl.if_else(inbreeding.f_stat > male_threshold,
71 |                                       False,
72 |                                       hl.missing(hl.tbool)))
73 |     expression = hl.struct(is_female=is_female, **inbreeding)
74 |     if not include_par:
75 |         interval_type = hl.tarray(hl.tinterval(hl.tlocus(rg)))
76 |         par_intervals = hl.literal(rg.par, interval_type)
77 |         expression = hl.agg.filter(
78 |             ~par_intervals.any(lambda par_interval: par_interval.contains(mt.locus)),
79 |             expression)
80 |     expression = hl.agg.filter((aaf > aaf_threshold) & (aaf < (1 - aaf_threshold)), expression)
81 |     expression = hl.agg.filter(
82 |         x_contigs.any(lambda contig: contig.contains(mt.locus)),
83 |         expression)
84 | 
85 |     return expression
86 | 
87 | 
88 | def allele_types(mt):
89 |     from hail.expr.functions import _num_allele_type, _allele_types
90 |     allele_types = _allele_types[:]
91 |     allele_types.extend(['Transition', 'Transversion'])
92 |     allele_enum = {i: v for i, v in enumerate(allele_types)}
93 |     allele_ints = {v: k for k, v in allele_enum.items()}
94 | 
95 |     def allele_type(ref, alt):
96 |         return hl.bind(lambda at: hl.if_else(at == allele_ints['SNP'],
97 |                                              hl.if_else(hl.is_transition(ref, alt),
98 |                                                         allele_ints['Transition'],
99 |                                                         allele_ints['Transversion']),
100 |                                              at),
101 |                        _num_allele_type(ref, alt))
102 | 
103 |     return mt.alleles[1:].map(lambda alt: allele_type(mt.alleles[0], alt))
104 | 
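An illustrative use of the aggregators above (hypothetical MatrixTable `mt`; agg_call_rate additionally assumes boolean exclude_row/exclude_col annotations have already been defined, as its body requires):

    mt = mt.annotate_rows(variant_qc=variant_qc_aggregator(mt))
    mt = mt.annotate_rows(call_rate=agg_call_rate(mt))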
--------------------------------------------------------------------------------
/gwaspy/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from gwaspy.utils import read_file
2 | __all__ = ['read_file']
--------------------------------------------------------------------------------
/gwaspy/utils/export_file.py:
--------------------------------------------------------------------------------
1 | import hail as hl
2 | 
3 | 
4 | def export_qced_file(mt: hl.MatrixTable, out_dir: str, basename: str, export_type='hail'):
5 |     outname = basename + '_qced'
6 | 
7 |     if export_type == 'hail':
8 |         mt.write('{}GWASpy/Preimp_QC/{}.mt'.format(out_dir, outname), overwrite=True)
9 | 
10 |     elif export_type == 'plink':
11 |         hl.export_plink(dataset=mt, output='{}GWASpy/Preimp_QC/{}'.format(out_dir, outname), fam_id=mt.fam_id,
12 |                         ind_id=mt.s, pat_id=mt.pat_id, mat_id=mt.mat_id, is_female=mt.is_female, pheno=mt.is_case,
13 |                         varid=mt.rsid)
14 | 
15 |     else:
16 |         hl.export_vcf(mt, '{}GWASpy/Preimp_QC/{}.vcf.bgz'.format(out_dir, outname), tabix=True)
17 | 
--------------------------------------------------------------------------------
/gwaspy/utils/get_file_size.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | __author__ = 'Lindo Nkambule'
4 | 
5 | import hail as hl
6 | 
7 | 
8 | def bytes_to_gb(in_file: str):
9 |     """
10 |     Convert the size from bytes to GiB
11 |     :param in_file: path to file, str
12 |     :return: file size in GiB
13 |     """
14 | 
15 |     file_info = hl.utils.hadoop_stat(in_file)
16 |     size_bytes = file_info['size_bytes']
17 |     size_gigs = size_bytes / (1024 * 1024 * 1024)
18 | 
19 |     return size_gigs
20 | 
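A quick illustrative call of bytes_to_gb (hypothetical path; hadoop_stat resolves both local paths and gs:// URLs):

    bytes_to_gb('gs://my-bucket/data.vcf.bgz')  # a 3221225472-byte file -> 3.0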
--------------------------------------------------------------------------------
/gwaspy/utils/natural_sort.py:
--------------------------------------------------------------------------------
1 | import re
2 | 
3 | 
4 | def atoi(text):
5 |     return int(text) if text.isdigit() else text
6 | 
7 | 
8 | def natural_keys(text):
9 |     """
10 |     alist.sort(key=natural_keys) sorts in human order
11 |     http://nedbatchelder.com/blog/200712/human_sorting.html
12 |     (See Toothy's implementation in the comments)
13 |     """
14 |     return [atoi(c) for c in re.split(r'(\d+)', text)]
15 | 
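An illustrative sort (hypothetical chunk names) showing why run_concat uses natural ordering; a plain lexicographic sort would place chunk 10 before chunk 2:

    sorted(['data.10.shapeit.bcf', 'data.2.shapeit.bcf'], key=natural_keys)
    # -> ['data.2.shapeit.bcf', 'data.10.shapeit.bcf']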
--------------------------------------------------------------------------------
/gwaspy/utils/read_file.py:
--------------------------------------------------------------------------------
1 | import hail as hl
2 | import hailtop.fs as hfs
3 | from gwaspy.utils.sample_annotations import add_sample_annotations
4 | 
5 | 
6 | def read_plink(dirname: str, basename: str) -> hl.MatrixTable:
7 | 
8 |     in_mt: hl.MatrixTable = hl.import_plink(bed=dirname + basename + '.bed',
9 |                                             bim=dirname + basename + '.bim',
10 |                                             fam=dirname + basename + '.fam',
11 |                                             block_size=16)
12 | 
13 |     return in_mt
14 | 
15 | 
16 | def read_vcf(dirname: str, basename: str) -> hl.MatrixTable:
17 |     hl._set_flags(no_whole_stage_codegen='1')
18 | 
19 |     if hfs.exists(f"{dirname}{basename}.vcf.bgz"):
20 |         vcf_file = f"{dirname}{basename}.vcf.bgz"
21 |     elif hfs.exists(f"{dirname}{basename}.vcf.gz"):
22 |         vcf_file = f"{dirname}{basename}.vcf.gz"
23 |     else:
24 |         vcf_file = f"{dirname}{basename}.vcf"
25 | 
26 |     hl.import_vcf(vcf_file, force_bgz=True, block_size=16).write('{}GWASpy.preimpQC.mt'.format(dirname), overwrite=True)
27 | 
28 |     # unset flag to avoid locus_windows: 'locus_expr' global position must be in ascending order when LD pruning
29 |     # https://hail.zulipchat.com/#narrow/channel/123010-Hail-Query-0.2E2-support/topic/locus_windows.20Error/near/272143278
30 |     hl._set_flags(no_whole_stage_codegen=None)
31 |     in_mt = hl.read_matrix_table('{}GWASpy.preimpQC.mt'.format(dirname))
32 | 
33 |     # Unlike array data, a VCF might have multi-allelic sites
34 |     # split multi-allelic sites into bi-allelic
35 |     print("Checking for multi-allelic sites")
36 |     pre_split_multi_n = in_mt.count_rows()
37 |     bi = in_mt.filter_rows(hl.len(in_mt.alleles) == 2)
38 |     bi = bi.annotate_rows(a_index=hl.missing(hl.tint))
39 |     bi = bi.annotate_rows(was_split=False)
40 | 
41 |     multi = in_mt.filter_rows(hl.len(in_mt.alleles) > 2)
42 |     split = hl.split_multi_hts(multi)
43 | 
44 |     in_mt = split.union_rows(bi)
45 |     post_split_multi_n = in_mt.count_rows()
46 |     print("Number of additional bi-allelic rows created by splitting multi-allelic sites: {}".format(post_split_multi_n - pre_split_multi_n))
47 | 
48 |     return in_mt
49 | 
50 | 
51 | def read_mt(dirname: str, basename: str) -> hl.MatrixTable:
52 |     print(dirname + basename + ".mt")
53 |     in_mt: hl.MatrixTable = hl.read_matrix_table(dirname + basename + ".mt")
54 | 
55 |     return in_mt
56 | 
57 | 
58 | def read_infile(
59 |         input_type: str = None,
60 |         dirname: str = None, basename: str = None,
61 |         **kwargs):
62 | 
63 | 
64 | 
65 |     # vcf = kwargs.get('vcf')
66 |     annotations = kwargs.get('annotations')
67 | 
68 |     if input_type == 'plink':
69 |         mt = read_plink(dirname, basename)
70 | 
71 |     elif input_type == 'vcf':
72 |         mt = read_vcf(dirname, basename)
73 | 
74 |     else:
75 |         mt = read_mt(dirname, basename)
76 | 
77 |     if annotations:
78 |         mt = add_sample_annotations(mt, annotations)
79 | 
80 |     return mt
81 | 
--------------------------------------------------------------------------------
/gwaspy/utils/reference_liftover.py:
--------------------------------------------------------------------------------
1 | import hail as hl
2 | from gwaspy.utils.read_file import read_infile
3 | from gwaspy.utils.sample_annotations import add_sample_annotations
4 | 
5 | 
6 | def liftover_to_grch38(
7 |         input_type: str = None,
8 |         dirname: str = None,
9 |         basename: str = None,
10 |         **kwargs):
11 | 
12 |     lifted_over = f'{dirname}{basename}.liftover.grch38.mt'
13 |     print('\nLifting over to GRCh38')
14 |     mt = read_infile(input_type=input_type, dirname=dirname, basename=basename)
15 | 
16 |     annotations = kwargs.get('annotations')
17 |     if annotations:
18 |         mt = add_sample_annotations(mt, annotations)
19 | 
20 |     rg37 = hl.get_reference('GRCh37')
21 |     rg38 = hl.get_reference('GRCh38')
22 |     rg37.add_liftover('gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38)
23 | 
24 |     mt = mt.annotate_rows(new_locus=hl.liftover(mt.locus, 'GRCh38', include_strand=True), old_locus=mt.locus)
25 |     mt = mt.filter_rows(hl.is_defined(mt.new_locus) & ~mt.new_locus.is_negative_strand)
26 | 
27 |     mt = mt.key_rows_by(locus=mt.new_locus.result, alleles=mt.alleles)
28 | 
29 |     print(f'\nWriting out data lifted-over to GRCh38 to: {lifted_over}')
30 |     mt.write(lifted_over)
31 | 
32 |     return hl.read_matrix_table(lifted_over)
33 | 
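An illustrative call of liftover_to_grch38 above (hypothetical paths; the function writes the lifted-over MatrixTable next to the input and returns it re-read):

    mt = liftover_to_grch38(input_type='vcf', dirname='gs://my-bucket/', basename='mydata')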
--------------------------------------------------------------------------------
/gwaspy/utils/sample_annotations.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Lindo Nkambule'
2 | 
3 | import hail as hl
4 | import sys
5 | 
6 | 
7 | def add_sample_annotations(mt: hl.MatrixTable, annotations: str) -> hl.MatrixTable:
8 |     # use annotations file to annotate VCF
9 |     ann = hl.import_table(annotations, impute=False,
10 |                           types={'Sample': hl.tstr, 'Sex': hl.tstr, 'Pheno': hl.tstr}).key_by('Sample')
11 |     ann_cols = dict(ann.row)
12 | 
13 |     mt = mt.annotate_cols(annotations=ann[mt.s])
14 | 
15 |     if 'is_female' not in mt.col:
16 |         if 'Sex' in ann_cols:
17 |             mt = mt.annotate_cols(is_female=hl.if_else(((mt.annotations.Sex == 'F') |
18 |                                                         (mt.annotations.Sex == str(2)) |
19 |                                                         (mt.annotations.Sex == 'True') |
20 |                                                         (mt.annotations.Sex == 'Female')),
21 |                                                        True, False))
22 |         else:
23 |             print('Sex column is missing from annotations file. Please add it and run GWASpy again')
24 |             sys.exit(2)
25 | 
26 |     if 'is_case' not in mt.col:
27 |         if 'Pheno' in ann_cols:
28 |             mt = mt.annotate_cols(is_case=hl.if_else(((mt.annotations.Pheno == str(2)) |
29 |                                                       (mt.annotations.Pheno == 'True') |
30 |                                                       (mt.annotations.Pheno == 'Case')),
31 |                                                      True, False))
32 | 
33 |     return mt
34 | 
35 | 
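A minimal sketch of the annotations table add_sample_annotations expects (tab-separated with a header; sample rows hypothetical, using encodings the conditions above accept):

    Sample	Sex	Pheno
    SAMPLE001	F	Case
    SAMPLE002	M	2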
--------------------------------------------------------------------------------
/nf/modules/imputation.nf:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env nextflow
2 | 
3 | nextflow.enable.dsl=2
4 | 
5 | 
6 | process IMPUTE5 {
7 |     cpus 8
8 |     memory { 16.GB * task.attempt }
9 |     container 'docker.io/lindonkambule/gwaspy_phase_impute:latest'
10 |     tag "impute: ${irg}"
11 |     publishDir "${out_directory}", overwrite: true, mode:'copy', pattern: '*.bcf*'
12 | 
13 |     input:
14 |         tuple val(chrom), file(input), file(input_idx), file(ref), file(ref_idx), file(ref_bin), file(ref_fam), file(map_file), val(srg), val(irg), val(chk), val(out_directory)
15 | 
16 |     output:
17 |         tuple val(chrom), val(chk), path("${chk}imputed.chr${chrom}.bcf"), path("${chk}imputed.chr${chrom}.bcf.csi")
18 | 
19 |     // IMPUTE5 automatically indexes output file
20 |     script:
21 |         """
22 |         impute5_v1.2.0_static \
23 |             --h ${ref} \
24 |             --g ${input} \
25 |             --m ${map_file} \
26 |             --r ${irg} \
27 |             --buffer-region ${srg} \
28 |             --o ${chk}imputed.chr${chrom}.bcf
29 |         """
30 | }
31 | 
--------------------------------------------------------------------------------
/nf/nextflow.config:
--------------------------------------------------------------------------------
1 | // work directory where intermediate files will be stored
2 | workDir = 'gs://path/to/my/workdir'
3 | 
4 | process {
5 |     executor = 'google-batch'
6 |     errorStrategy = { task.exitStatus==null ? 'retry' : 'terminate' }
7 |     maxRetries = 3
8 | }
9 | 
10 | profiles {
11 |     gbatch {
12 |         google.project = 'my-billing-project'
13 |         google.location = 'us-central1'
14 |         batch.spot = true
15 |     }
16 | }
17 | 
--------------------------------------------------------------------------------
/nf/params.json:
--------------------------------------------------------------------------------
1 | {
2 |     "input_vcf": "gs://my-gcs/bucket/my_input_file.vcf",
3 |     "output_filename": "my_output_filename_prefix",
4 |     "out_dir": "gs://my-gcs/bucket/nf_phase_impute",
5 |     "impute": true,
6 |     "fill_tags": false,
7 |     "input_split_by_chrom": false,
8 |     "vcf_ref": "gs://gcp-public-data--gnomad/resources/hgdp_1kg/phased_haplotypes_v2/hgdp1kgp_chrCNUMBER.filtered.SNV_INDEL.phased.shapeit5",
9 |     "ref_format": "vcf",
10 |     "data_type": "array",
11 |     "maf": 0.001,
12 |     "common_chunks": "gs://my-gcs/bucket/chunks/b38/20cM/chunks_chrCNUMBER.txt",
13 |     "rare_chunks": "gs://my-gcs/bucket/chunks/b38/4cM/chunks_chrCNUMBER.txt",
14 |     "genetic_maps": "gs://my-gcs/bucket/resources/maps/chrCNUMBER.b38.gmap.gz"
15 | }
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | hail
2 | matplotlib>=3.3.3
3 | plotly>=5.7.0
4 | pandas>=0.25.3
5 | pylatex>=1.4.1
6 | numpy>=1.18.4
7 | scikit-learn~=0.21.3
8 | setuptools~=41.6.0
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | from setuptools import setup, find_packages
3 | 
4 | with open("README.md", "r") as fh:
5 |     long_description = fh.read()
6 | 
7 | classifiers = [
8 |     'Development Status :: 4 - Beta',
9 |     'Environment :: Console',
10 |     'Intended Audience :: Science/Research',
11 |     'License :: OSI Approved :: MIT License',
12 |     'Programming Language :: Python :: 3',
13 |     'Programming Language :: Python :: 3.2',
14 |     'Programming Language :: Python :: 3.3',
15 |     'Programming Language :: Python :: 3.4',
16 |     'Programming Language :: Python :: 3.5',
17 |     'Programming Language :: Python :: 3.6',
18 |     'Programming Language :: Python :: 3.7',
19 |     'Programming Language :: Python :: 3.8',
20 |     'Topic :: Scientific/Engineering :: Bio-Informatics',
21 |     'Operating System :: POSIX',
22 |     'Operating System :: Unix',
23 |     'Operating System :: MacOS'
24 | ]
25 | 
26 | setup(name='gwaspy',
27 |       version='0.1.2',
28 |       author='Lindokuhle Nkambule',
29 |       author_email='lnkambul@broadinstitute.org',
30 |       url='https://gwaspy.readthedocs.io/',
31 |       project_urls={"GitHub": "https://github.com/atgu/GWASpy"},
32 |       description='GWASpy: A Python package for performing GWAS QC, PCA, phasing, and genotype imputation.',
33 |       long_description=long_description,
34 |       long_description_content_type="text/markdown",
35 |       license='MIT',
36 |       packages=find_packages(),
37 |       entry_points={
38 |           'console_scripts': [
39 |               'preimp_qc = gwaspy.preimp_qc.preimp_qc:main',
40 |               'pca = gwaspy.pca.pca:main',
41 |               'imputation = gwaspy.imputation.impute:main',
42 |               # 'imputation = gwaspy.imputation.imputation:main',
43 |               'phasing = gwaspy.phasing.phase:main',
44 |               # 'phasing = gwaspy.phasing.phasing:main'
45 |               'checkalleleflips = gwaspy.check_alleles.flips:main'
46 |           ]
47 |       },
48 |       classifiers=classifiers,
49 |       keywords='',
50 |       # install_requires=required,
51 |       install_requires=['hail', 'matplotlib', 'numpy', 'pandas', 'pylatex', 'plotly', 'distinctipy'],
52 |       zip_safe=False
53 |       )
54 | 
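Once installed, the console scripts declared above can be smoke-tested directly (illustrative; each is a standard argparse CLI):

    preimp_qc --help
    pca --help
    phasing --help
    imputation --help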
--------------------------------------------------------------------------------
/split_maps.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | for ref in {17,19,38}
4 | do
5 |   # Eagle's bundled genetic map tables (installed under /opt)
6 |   map_file=/opt/Eagle_v2.4.1/tables/genetic_map_hg${ref}_withX.txt.gz
7 |   for chrom in {1..23}
8 |   do
9 |     echo -e 'chr position COMBINED_rate(cM/Mb) Genetic_Map(cM)' > /opt/genetic_maps_eagle/hg${ref}/genetic_map_hg${ref}_chr${chrom}_withX.txt
10 |     cat $map_file | gunzip | awk '{if ( ($1==CHROM) ) print $0}' CHROM=${chrom} >> /opt/genetic_maps_eagle/hg${ref}/genetic_map_hg${ref}_chr${chrom}_withX.txt
11 |     gzip /opt/genetic_maps_eagle/hg${ref}/genetic_map_hg${ref}_chr${chrom}_withX.txt
12 | 
13 |     # echo -e 'pos\tchr\tcM' > /opt/genetic_maps_shapeit/hg${ref}/genetic_map_hg${ref}_chr${chrom}_withX.txt
14 |     cat /opt/genetic_maps_eagle/hg${ref}/genetic_map_hg${ref}_chr${chrom}_withX.txt.gz | gunzip | awk '{print $2,"\t",$1,"\t",$4}' > /opt/genetic_maps_shapeit/hg${ref}/genetic_map_hg${ref}_chr${chrom}_withX.txt
15 |     sed 's/position/~~/g; s/Genetic_Map(cM)/cM/g; s/~~/pos/g' /opt/genetic_maps_shapeit/hg${ref}/genetic_map_hg${ref}_chr${chrom}_withX.txt | gzip > /opt/genetic_maps_shapeit/hg${ref}/genetic_map_hg${ref}_chr${chrom}_withX.txt.gz  # rename header columns to pos/chr/cM (via a '~~' placeholder)
16 |     rm /opt/genetic_maps_shapeit/hg${ref}/genetic_map_hg${ref}_chr${chrom}_withX.txt
17 |   done
18 | done
19 | 
20 | 
--------------------------------------------------------------------------------
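For reference, a sketch of the two per-chromosome map layouts the loop above emits (column order taken from the echo/awk/sed commands; the numeric rows are hypothetical):

    Eagle:    chr position COMBINED_rate(cM/Mb) Genetic_Map(cM)
              1   55550    2.981822             0.000000
    SHAPEIT:  pos     chr     cM
              55550   1       0.000000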