├── .Rbuildignore
├── .devcontainer
    └── devcontainer.json
├── .gitattributes
├── .github
    ├── .gitignore
    └── workflows
    │   ├── R-CMD-check.yaml
    │   └── pkgdown.yaml
├── .gitignore
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── NEWS.md
├── R
    ├── binaries.r
    ├── gwasglue.R
    ├── manipulate.r
    ├── proxy.r
    ├── pval_index.r
    ├── query.r
    ├── rsid_index.r
    ├── utils-pipe.R
    └── zzz.r
├── README.md
├── _pkgdown.yml
├── gwasvcf.Rproj
├── inst
    ├── extdata
    │   ├── data.vcf.gz
    │   ├── data.vcf.gz.tbi
    │   ├── eur.bed
    │   ├── eur.bim
    │   └── eur.fam
    └── sandpit
    │   ├── bmi_example.r
    │   ├── bmi_example_cp.r
    │   ├── harmonise_against_ref.r
    │   ├── misc
    │       ├── create_ref.sh
    │       ├── harmonise.r
    │       ├── harmonise_against_ref.r
    │       ├── query_times.html
    │       ├── query_times.rmd
    │       ├── skeleton.sh
    │       ├── vcf.html
    │       └── vcf.rmd
    │   └── test_extract.r
├── man
    ├── VariantAnnotation.Rd
    ├── check_bcftools.Rd
    ├── check_plink.Rd
    ├── create_ldref_sqlite.Rd
    ├── create_pval_index_from_vcf.Rd
    ├── create_rsidx_index_from_vcf.Rd
    ├── create_rsidx_sub_index.Rd
    ├── create_vcf.Rd
    ├── get_ld_proxies.Rd
    ├── gwasvcf_to_summaryset.Rd
    ├── merge_vcf.Rd
    ├── parse_chrompos.Rd
    ├── pipe.Rd
    ├── proxy_match.Rd
    ├── query_chrompos_bcftools.Rd
    ├── query_chrompos_file.Rd
    ├── query_chrompos_vcf.Rd
    ├── query_gwas.Rd
    ├── query_pval_bcftools.Rd
    ├── query_pval_file.Rd
    ├── query_pval_sqlite3.Rd
    ├── query_pval_vcf.Rd
    ├── query_pvali.Rd
    ├── query_rsid_bcftools.Rd
    ├── query_rsid_file.Rd
    ├── query_rsid_rsidx.Rd
    ├── query_rsid_vcf.Rd
    ├── query_rsidx.Rd
    ├── set_bcftools.Rd
    ├── set_plink.Rd
    ├── sqlite_ld_proxies.Rd
    ├── vcf_to_granges.Rd
    ├── vcf_to_tibble.Rd
    └── vcflist_overlaps.Rd
├── tests
    ├── testthat.R
    └── testthat
    │   ├── test_manipulate.r
    │   ├── test_proxy.R
    │   ├── test_pvali.r
    │   ├── test_query.r
    │   └── test_rsidx.r
└── vignettes
    ├── figure
        └── target-effects-plot-1.png
    ├── guide.Rmd
    ├── guide.Rmd.orig
    └── precompile.R


/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^LICENSE\.md$
 2 | ^mrbase\.oauth$
 3 | ^ieugwasr_oauth$
 4 | .travis.yml
 5 | ^_pkgdown\.yml$
 6 | ^docs$
 7 | ^pkgdown$
 8 | ^\.github$
 9 | ^.*\.Rproj$
10 | ^\.Rproj\.user$
11 | ^Dockerfile$
12 | ^\.devcontainer$
13 | 
14 | # Files generated by tests
15 | inst/extdata/eur.indels
16 | inst/extdata/eur.log
17 | inst/extdata/eur.nosex
18 | tests/testthat/temp.vcf
19 | 
20 | # Files created by vignettes
21 | ^vignettes/index\.rsidx$
22 | ^vignettes/temp\.vcf$
23 | ^vignettes/index\.pvali$
24 | 
25 | ^vignettes/precompile\.R$
26 | ^vignettes/figure$
27 | 


--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the README at:
 2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.245.2/containers/docker-existing-dockerfile
 3 | {
 4 | 	"name": "Existing Dockerfile",
 5 | 
 6 | 	// Sets the run context to one level up instead of the .devcontainer folder.
 7 | 	"context": "..",
 8 | 
 9 | 	// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
10 | 	"dockerFile": "../Dockerfile"
11 | 
12 | 	// Use 'forwardPorts' to make a list of ports inside the container available locally.
13 | 	// "forwardPorts": [],
14 | 
15 | 	// Uncomment the next line to run commands after the container is created - for example installing curl.
16 | 	// "postCreateCommand": "apt-get update && apt-get install -y curl",
17 | 
18 | 	// Uncomment when using a ptrace-based debugger like C++, Go, and Rust
19 | 	// "runArgs": [ "--cap-add=SYS_PTRACE", "--security-opt", "seccomp=unconfined" ],
20 | 
21 | 	// Uncomment to use the Docker CLI from inside the container. See https://aka.ms/vscode-remote/samples/docker-from-docker.
22 | 	// "mounts": [ "source=/var/run/docker.sock,target=/var/run/docker.sock,type=bind" ],
23 | 
24 | 	// Uncomment to connect as a non-root user if you've added one. See https://aka.ms/vscode-remote/containers/non-root.
25 | 	// "remoteUser": "vscode"
26 | }
27 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
 1 | # Auto detect text files and perform LF normalization
 2 | * text=auto
 3 | 
 4 | # Settings for Linguist Languages pane
 5 | *.html linguist-vendored
 6 | *.css linguist-vendored
 7 | *.js linguist-vendored
 8 | docs/* linguist-vendored
 9 | *.rdb linguist-vendored
10 | *.rdx linguist-vendored
11 | 


--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | 


--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | #
 4 | # NOTE: This workflow is overkill for most R packages and
 5 | # check-standard.yaml is likely a better choice.
 6 | # usethis::use_github_action("check-standard") will install it.
 7 | on:
 8 |   push:
 9 |     branches: [main, master]
10 |   pull_request:
11 | 
12 | name: R-CMD-check.yaml
13 | 
14 | permissions: read-all
15 | 
16 | jobs:
17 |   R-CMD-check:
18 |     runs-on: ${{ matrix.config.os }}
19 | 
20 |     name: ${{ matrix.config.os }} (${{ matrix.config.r }})
21 | 
22 |     strategy:
23 |       fail-fast: false
24 |       matrix:
25 |         config:
26 |           - {os: macos-latest,   r: 'release'}
27 | 
28 |           - {os: windows-latest, r: 'release'}
29 |           # use 4.0 or 4.1 to check with rtools40's older compiler
30 |           - {os: windows-latest, r: 'oldrel-4'}
31 | 
32 |           - {os: ubuntu-latest,  r: 'devel', http-user-agent: 'release'}
33 |           - {os: ubuntu-latest,  r: 'release'}
34 |           - {os: ubuntu-latest,  r: 'oldrel-1'}
35 |           - {os: ubuntu-latest,  r: 'oldrel-2'}
36 |           - {os: ubuntu-latest,  r: 'oldrel-3'}
37 |           - {os: ubuntu-latest,  r: 'oldrel-4'}
38 |           - {os: ubuntu-latest,  r: 'oldrel-5'}
39 | 
40 |     env:
41 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
42 |       R_KEEP_PKG_SOURCE: yes
43 | 
44 |     steps:
45 |       - uses: actions/checkout@v4
46 | 
47 |       - uses: r-lib/actions/setup-pandoc@v2
48 | 
49 |       - uses: r-lib/actions/setup-r@v2
50 |         with:
51 |           r-version: ${{ matrix.config.r }}
52 |           http-user-agent: ${{ matrix.config.http-user-agent }}
53 |           use-public-rspm: true
54 | 
55 |       - uses: r-lib/actions/setup-r-dependencies@v2
56 |         with:
57 |           extra-packages: any::rcmdcheck
58 |           needs: check
59 | 
60 |       - uses: r-lib/actions/check-r-package@v2
61 |         with:
62 |           upload-snapshots: true
63 |           build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")'
64 | 


--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | on:
 4 |   push:
 5 |     branches: [main, master]
 6 |   pull_request:
 7 |   release:
 8 |     types: [published]
 9 |   workflow_dispatch:
10 | 
11 | name: pkgdown.yaml
12 | 
13 | permissions: read-all
14 | 
15 | jobs:
16 |   pkgdown:
17 |     runs-on: ubuntu-latest
18 |     # Only restrict concurrency for non-PR jobs
19 |     concurrency:
20 |       group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
21 |     env:
22 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
23 |     permissions:
24 |       contents: write
25 |     steps:
26 |       - uses: actions/checkout@v4
27 | 
28 |       - uses: r-lib/actions/setup-pandoc@v2
29 | 
30 |       - uses: r-lib/actions/setup-r@v2
31 |         with:
32 |           use-public-rspm: true
33 | 
34 |       - uses: r-lib/actions/setup-r-dependencies@v2
35 |         with:
36 |           extra-packages: any::pkgdown, local::.
37 |           needs: website
38 | 
39 |       - name: Build site
40 |         run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
41 |         shell: Rscript {0}
42 | 
43 |       - name: Deploy to GitHub pages 🚀
44 |         if: github.event_name != 'pull_request'
45 |         uses: JamesIves/github-pages-deploy-action@v4.5.0
46 |         with:
47 |           clean: false
48 |           branch: gh-pages
49 |           folder: docs
50 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .Rproj.user
 2 | .Rhistory
 3 | .RData
 4 | inst/doc
 5 | doc
 6 | Meta
 7 | 
 8 | # Files generated by tests
 9 | inst/extdata/eur.indels
10 | inst/extdata/eur.log
11 | inst/extdata/eur.nosex
12 | tests/testthat/temp.vcf
13 | 
14 | # Files created by vignettes
15 | vignettes/index.rsidx
16 | vignettes/temp.vcf
17 | vignettes/index.pvali
18 | 
19 | docs/
20 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: gwasvcf
 2 | Title: Tools for Dealing with GWAS Summary Data in VCF Format
 3 | Version: 0.1.4
 4 | Authors@R: c(
 5 |     person("Gibran", "Hemani", , "g.hemani@bristol.ac.uk", role = c("aut", "cre"),
 6 |            comment = c(ORCID = "0000-0003-0920-1055")),
 7 |     person("Tom", "Palmer", , "tom.palmer@bristol.ac.uk", role = "ctb",
 8 |            comment = c(ORCID = "0000-0003-4655-4511")),
 9 |     person("Rita", "Rasteiro", , "rita.rasteiro@bristol.ac.uk", role = "ctb",
10 |            comment = c(ORCID = "0000-0002-4217-3060"))
11 |   )
12 | Description: Tools for dealing with GWAS summary data in VCF format.
13 |     Includes reading, querying, writing, as well as helper functions such
14 |     as LD proxy searches.
15 | License: MIT + file LICENSE
16 | URL: https://github.com/mrcieu/gwasvcf, https://mrcieu.github.io/gwasvcf/
17 | BugReports: https://github.com/mrcieu/gwasvcf/issues
18 | Depends: 
19 |     R (>= 4.0.0)
20 | Imports:
21 |     BiocGenerics,
22 |     Biostrings,
23 |     data.table,
24 |     dplyr,
25 |     genetics.binaRies,
26 |     GenomeInfoDb,
27 |     GenomicRanges,
28 |     gwasglue2,
29 |     IRanges,
30 |     magrittr,
31 |     RCurl,
32 |     rlang,
33 |     Rsamtools,
34 |     RSQLite,
35 |     S4Vectors,
36 |     stringr,
37 |     SummarizedExperiment,
38 |     utils,
39 |     VariantAnnotation
40 | Suggests: 
41 |     knitr,
42 |     rmarkdown,
43 |     testthat
44 | VignetteBuilder: 
45 |     knitr
46 | Remotes:
47 |     github::mrcieu/genetics.binaRies,
48 |     github::mrcieu/gwasglue2
49 | Encoding: UTF-8
50 | Roxygen: list(markdown = TRUE)
51 | RoxygenNote: 7.3.2
52 | SystemRequirements: GNU unzip, sqlite3
53 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2019
2 | COPYRIGHT HOLDER: Gibran Hemani
3 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | Copyright (c) 2019 Gibran Hemani
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export("%>%")
 4 | export(check_bcftools)
 5 | export(check_plink)
 6 | export(create_ldref_sqlite)
 7 | export(create_pval_index_from_vcf)
 8 | export(create_rsidx_index_from_vcf)
 9 | export(create_rsidx_sub_index)
10 | export(create_vcf)
11 | export(get_ld_proxies)
12 | export(gwasvcf_to_summaryset)
13 | export(merge_vcf)
14 | export(parse_chrompos)
15 | export(proxy_match)
16 | export(query_chrompos_bcftools)
17 | export(query_chrompos_file)
18 | export(query_chrompos_vcf)
19 | export(query_gwas)
20 | export(query_pval_bcftools)
21 | export(query_pval_file)
22 | export(query_pval_sqlite3)
23 | export(query_pval_vcf)
24 | export(query_pvali)
25 | export(query_rsid_bcftools)
26 | export(query_rsid_file)
27 | export(query_rsid_rsidx)
28 | export(query_rsid_vcf)
29 | export(query_rsidx)
30 | export(set_bcftools)
31 | export(set_plink)
32 | export(sqlite_ld_proxies)
33 | export(vcf_to_granges)
34 | export(vcf_to_tibble)
35 | export(vcflist_overlaps)
36 | import(VariantAnnotation)
37 | importFrom(magrittr,"%>%")
38 | importFrom(rlang,.data)
39 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
 1 | # gwasvcf 0.1.4
 2 | 
 3 | * Add sqlite3 to DESCRIPTION SystemRequirements for `create_pval_index_from_vcf()`
 4 | * Update some URLs within the package documentation
 5 | 
 6 | # gwasvcf 0.1.3
 7 | 
 8 | * Fix for security message in `get_ld_proxies()` (thanks @mattlee821)
 9 | 
10 | # gwasvcf 0.1.2
11 | 
12 | * New `gwasvcf_to_summaryset()` function to create a [gwasglue2](https://mrcieu.github.io/gwasglue2/) SummarySet object from a vcf file
13 | * Fixed error in `get_ld_proxies()` related to the `validate` argument, which is deprecated in `as_tibble()` (tibble 2.0.0)
14 | 


--------------------------------------------------------------------------------
/R/binaries.r:
--------------------------------------------------------------------------------
  1 |  #' Check if the tools_bcftools option is set
  2 | #'
  3 | #' See set_bcftools() for more information
  4 | #'
  5 | #'
  6 | #' @export
  7 | #' @return TRUE or FALSE
  8 | check_bcftools <- function()
  9 | {
 10 | 	if(is.null(options()[["tools_bcftools"]]))
 11 | 	{
 12 | 		message("'tools_bcftools' option is not set, using native read which may be substantially slower. See 'set_bcftools' for information.")
 13 | 		return(FALSE)
 14 | 	}
 15 | 	filecheck <- file.exists(options()[["tools_bcftools"]])
 16 | 	if(filecheck)
 17 | 	{
 18 | 		return(TRUE)
 19 | 	}
 20 | 	pathcheck <- any(sapply(strsplit(Sys.getenv("PATH"), split=":"), function(x) file.exists(file.path(x, options()[["tools_bcftools"]]))))
 21 | 	if(pathcheck)
 22 | 	{
 23 | 		return(TRUE)
 24 | 	}
 25 | 	message("'tools_bcftools' option does not point to an existing file, using native read which may be substantially slower. See 'set_bcftools' for information.")
 26 | 	return(FALSE)
 27 | }
 28 | 
 29 | 
 30 | #' Check if the tools_plink option is set
 31 | #'
 32 | #' See set_plink() for more information
 33 | #'
 34 | #'
 35 | #' @export
 36 | #' @return TRUE or FALSE
 37 | check_plink <- function()
 38 | {
 39 | 	if(is.null(options()[["tools_plink"]]))
 40 | 	{
 41 | 		message("'tools_plink' option is not set. See 'set_plink' for information.")
 42 | 		return(FALSE)
 43 | 	}
 44 | 	filecheck <- file.exists(options()[["tools_plink"]])
 45 | 	if(filecheck)
 46 | 	{
 47 | 		return(TRUE)
 48 | 	}
 49 | 	pathcheck <- any(sapply(strsplit(Sys.getenv("PATH"), split=":"), function(x) file.exists(file.path(x, options()[["tools_plink"]]))))
 50 | 	if(pathcheck)
 51 | 	{
 52 | 		return(TRUE)
 53 | 	}
 54 | 	message("'tools_plink' option is not set. See 'set_plink' for information.")
 55 | 	return(FALSE)
 56 | }
 57 | 
 58 | #' Set bcftools binary location
 59 | #'
 60 | #'
 61 | #' @param path If "" (default), then will use the MRCIEU/genetics.binaRies to get binaries that are appropriate for the detected operating system. Otherwise, provide the path to the bcftools binary. If NULL then will set the option to NULL.
 62 | #'
 63 | #' @export
 64 | #' @return NULL, sets option 'tools_bcftools'
 65 | set_bcftools <- function(path="")
 66 | {
 67 | 	if(is.null(path))
 68 | 	{
 69 | 		options(tools_bcftools = NULL)
 70 | 	} else if(path == "")
 71 | 	{
 72 | 		a <- requireNamespace("genetics.binaRies")
 73 | 		if(a)
 74 | 		{
 75 | 			message("Path not provided, using binaries in the MRCIEU/genetics.binaRies package")
 76 | 			options(tools_bcftools = genetics.binaRies::get_bcftools_binary())
 77 | 		} else {
 78 | 			stop("Please provide a path to bcftools binary or run devtools::install_github('MRCIEU/genetics.binaRies')")
 79 | 		}
 80 | 	} else {
 81 | 		options(tools_bcftools = path)
 82 | 	}
 83 | }
 84 | 
 85 | #' Set plink binary location
 86 | #'
 87 | #'
 88 | #' @param path If "" (default), then will use the MRCIEU/genetics.binaRies to get binaries that are appropriate for the detected operating system. Otherwise, provide the path to the plink binary. If NULL then will set the option to NULL.
 89 | #'
 90 | #' @export
 91 | #' @return NULL, sets option 'tools_plink'
 92 | set_plink <- function(path="")
 93 | {
 94 | 	if(is.null(path))
 95 | 	{
 96 | 		options(tools_plink = NULL)
 97 | 	} else if(path == "")
 98 | 	{
 99 | 		a <- requireNamespace("genetics.binaRies")
100 | 		if(a)
101 | 		{
102 | 			message("Path not provided, using binaries in the MRCIEU/genetics.binaRies package")
103 | 			options(tools_plink = genetics.binaRies::get_plink_binary())
104 | 		} else {
105 | 			stop("Please provide a path to plink binary or run devtools::install_github('MRCIEU/genetics.binaRies')")
106 | 		}
107 | 	} else {
108 | 		options(tools_plink = path)
109 | 	}
110 | }
111 | 


--------------------------------------------------------------------------------
/R/gwasglue.R:
--------------------------------------------------------------------------------
 1 | #  This file contains the functions to create a gwasglue2 SummarySet object
 2 | 
 3 | 
 4 | 
 5 | #' Create a SummarySet
 6 | #' 
 7 | #' Returns a gwasglue2 SummarySet object
 8 | #' @param vcf Path or URL to GWAS-VCF file or VCF object e.g. output from [VariantAnnotation::readVcf()], [create_vcf()] or [query_gwas()]
 9 | #' @export
10 | gwasvcf_to_summaryset <- function(vcf){
11 | 	# get metadata from vcf and create metadata object
12 | 	md <- gwasglue2::create_metadata(id = vcf@metadata$header@samples, build = unique(VariantAnnotation::meta(header(vcf))$contig$assembly))
13 |     
14 | 	# get summary data and create SummarySet
15 | 
16 |     s <- vcf %>% 
17 | 		vcf_to_tibble() %>% 
18 | 		gwasglue2::create_summaryset_from_gwasvcf(metadata = md)
19 | 
20 |     return(s)
21 | }
22 | 


--------------------------------------------------------------------------------
/R/manipulate.r:
--------------------------------------------------------------------------------
  1 | # All functions require v1.0 of the GWAS-VCF specification available from https://github.com/MRCIEU/gwas-vcf-specification/releases/tag/1.0.0
  2 | 
  3 | #' Create GWAS vcf
  4 | #'
  5 | #' @param chrom chrom vector
  6 | #' @param pos pos vector
  7 | #' @param nea nea vector
  8 | #' @param ea ea vector
  9 | #' @param snp Optional vector
 10 | #' @param ea_af Optional vector
 11 | #' @param effect Optional vector
 12 | #' @param se Optional vector
 13 | #' @param pval Optional vector
 14 | #' @param n Optional vector
 15 | #' @param ncase Optional vector
 16 | #' @param name Optional vector
 17 | #'
 18 | #' @export
 19 | #' @return vcf object
 20 | create_vcf <- function(chrom, pos, nea, ea, snp=NULL, ea_af=NULL, effect=NULL, se=NULL, pval=NULL, n=NULL, ncase=NULL, name=NULL)
 21 | {
 22 | 	stopifnot(length(chrom) == length(pos))
 23 | 	if(is.null(snp))
 24 | 	{
 25 | 		snp <- paste0(chrom, ":", pos)
 26 | 	}
 27 | 	nsnp <- length(chrom)
 28 | 	gen <- list()
 29 | 	if(!is.null(ea_af)) gen[["AF"]] <- matrix(ea_af, nsnp)
 30 | 	if(!is.null(effect)) gen[["ES"]] <- matrix(effect, nsnp)
 31 | 	if(!is.null(se)) gen[["SE"]] <- matrix(se, nsnp)
 32 | 	if(!is.null(pval)) gen[["LP"]] <- matrix(-log10(pval), nsnp)
 33 | 	if(!is.null(n)) gen[["SS"]] <- matrix(n, nsnp)
 34 | 	if(!is.null(ncase)) gen[["NC"]] <- matrix(ncase, nsnp)
 35 | 	if(!is.null(snp)) gen[["ID"]] <- matrix(snp, nsnp)
 36 | 	gen <- S4Vectors::SimpleList(gen)
 37 | 
 38 | 	gr <- GenomicRanges::GRanges(chrom, IRanges::IRanges(start=pos, end=pos + pmax(nchar(nea), nchar(ea)) - 1, names=snp))
 39 | 	coldata <- S4Vectors::DataFrame(Samples = length(name), row.names=name)
 40 | 
 41 | 	hdr <- VariantAnnotation::VCFHeader(
 42 | 		header = IRanges::DataFrameList(
 43 | 			fileformat = S4Vectors::DataFrame(Value="VCFv4.2", row.names="fileformat")
 44 | 		),
 45 | 		sample = name
 46 | 	)
 47 | 	VariantAnnotation::geno(hdr) <- S4Vectors::DataFrame(
 48 | 		Number = c("A", "A", "A", "A", "A", "A", "A"),
 49 | 		Type = c("Float", "Float", "Float", "Float", "Float", "Float", "String"),
 50 | 		Description = c(
 51 | 			"Effect size estimate relative to the alternative allele",
 52 | 			"Standard error of effect size estimate",
 53 | 			"-log10 p-value for effect estimate",
 54 | 			"Alternate allele frequency in the association study",
 55 | 			"Sample size used to estimate genetic effect",
 56 | 			"Number of cases used to estimate genetic effect",
 57 | 			"Study variant identifier"
 58 | 		),
 59 | 		row.names=c("ES", "SE", "LP", "AF", "SS", "NC", "ID")
 60 | 	)
 61 | 	VariantAnnotation::geno(hdr) <- subset(VariantAnnotation::geno(hdr), rownames(VariantAnnotation::geno(hdr)) %in% names(gen))
 62 | 
 63 | 	vcf <- VariantAnnotation::VCF(
 64 | 		rowRanges = gr,
 65 | 		colData = coldata,
 66 | 		exptData = list(
 67 | 			header = hdr
 68 | 		),
 69 | 		geno = gen
 70 | 	)
 71 | 	VariantAnnotation::alt(vcf) <- Biostrings::DNAStringSetList(as.list(ea))
 72 | 	VariantAnnotation::ref(vcf) <- Biostrings::DNAStringSet(nea)
 73 | 	VariantAnnotation::fixed(vcf)$FILTER <- "PASS"
 74 | 	return(sort(vcf))
 75 | }
 76 | 
#' Merge two GWAS VCF objects
#'
#' Returns merged intersection of two VCF objects. Variants are paired by
#' name, and only pairs whose REF/ALT alleles agree (either directly or with
#' REF and ALT swapped) are kept.
#'
#' @param a VCF object
#' @param b VCF object
#'
#' @export
#' @return SimpleList of VCF objects
#' @importFrom rlang .data
merge_vcf <- function(a, b)
{
	a <- VariantAnnotation::expand(a)
	b <- VariantAnnotation::expand(b)
	# o <- SummarizedExperiment::findOverlaps(a, b)
	# Pair variants by name: `from` indexes rows of `a` whose name also occurs
	# in `b`, `to` is the position of the first matching name in `b`
	o <- dplyr::tibble(
		from = which(names(a) %in% names(b)),
		to = match(names(a)[.data$from], names(b))
	)
	a <- a[o[["from"]],]
	b <- b[o[["to"]],]
	# Same alleles in the same orientation
	allele_match <- VariantAnnotation::ref(a) == VariantAnnotation::ref(b) & VariantAnnotation::alt(a) == VariantAnnotation::alt(b)
	# Same alleles with REF and ALT swapped between a and b
	# (note: this local variable shares its name with base::switch)
	switch <- VariantAnnotation::ref(a) == VariantAnnotation::alt(b) & VariantAnnotation::ref(b) == VariantAnnotation::alt(a)
	if(any(switch))
	{
		# Flip the sign of b's effect sizes (ES) for the swapped variants,
		# one sample column at a time
		for(i in 1:ncol(VariantAnnotation::geno(b)[["ES"]]))
		{
			VariantAnnotation::geno(b)[["ES"]][,i][switch] <- lapply(VariantAnnotation::geno(b)[["ES"]][,i][switch], function(x) x * -1)
		}
	}
	# Drop pairs whose alleles are incompatible
	a <- a[allele_match | switch, ]
	b <- b[allele_match | switch, ]

	# Combine each genotype field of a with the same-named field of b
	# NOTE(review): single-bracket `[x]` yields a one-element list rather than
	# the field matrix, and rbind() stacks rows rather than adding sample
	# columns - confirm this is the intended way to merge samples
	ab <- a
	temp <- lapply(names(VariantAnnotation::geno(ab)), function(x) rbind(VariantAnnotation::geno(ab)[x], VariantAnnotation::geno(b)[x])) %>% S4Vectors::SimpleList
	names(temp) <- names(VariantAnnotation::geno(ab))
	VariantAnnotation::geno(ab) <- temp

	# Header built from a's reference and meta plus the sample names of both inputs
	# NOTE(review): VCFHeader is called unqualified here, presumably resolved
	# through the package-level import of VariantAnnotation - confirm
	h <- VariantAnnotation::header(a)
	out <- VCFHeader(
		reference = VariantAnnotation::reference(h),
		samples = c(VariantAnnotation::samples(h), VariantAnnotation::samples(VariantAnnotation::header(b))),
		meta = VariantAnnotation::meta(h)
	)

	# NOTE(review): only the new header `out` is returned; the merged object
	# `ab` built above is not part of the return value, which does not match
	# the documented "SimpleList of VCF objects" - confirm intent
	return(S4Vectors::SimpleList(out))
}
124 | 
125 | 
126 | 
#' Convert vcf format to granges format
#'
#' Expands the VCF, takes its row ranges (with REF and ALT as character) and
#' attaches the genotype fields of a single sample as metadata columns.
#'
#' @param vcf Output from readVcf
#' @param id Only accepts one ID, so specify here if there are multiple GWAS datasets in the vcf
#'
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#'
#' @export
#' @return GRanges object, or NULL (with a message) if the VCF has length 0
vcf_to_granges <- function(vcf, id=NULL)
{
	stopifnot(inherits(vcf, c("ExpandedVCF", "CollapsedVCF")))
	if(length(vcf) == 0)
	{
		message("VCF has length 0")
		return(NULL)
	}
	# Default to the sample names in the header; exactly one is required
	if(is.null(id))
	{
		id <- VariantAnnotation::samples(VariantAnnotation::header(vcf))
	}
	stopifnot(length(id) == 1)

	expanded <- VariantAnnotation::expand(vcf)
	gr <- SummarizedExperiment::rowRanges(expanded)
	gr$`REF` <- as.character(gr$`REF`)
	gr$`ALT` <- as.character(gr$`ALT`)

	# Without genotype fields there is nothing more to attach
	if(length(VariantAnnotation::geno(expanded)) == 0)
	{
		return(gr)
	}

	# One column per genotype field, restricted to the requested sample
	geno_fields <- as.list(VariantAnnotation::geno(VariantAnnotation::expand(expanded)))
	sample_cols <- lapply(geno_fields, function(field) unlist(field[,id,drop=TRUE]))
	S4Vectors::values(gr) <- cbind(S4Vectors::values(gr), dplyr::bind_cols(sample_cols))
	S4Vectors::values(gr)[["id"]] <- id

	# Sample counts recorded in the header's SAMPLE metadata, when present,
	# are written to the NC / SS columns
	sample_meta <- VariantAnnotation::meta(VariantAnnotation::header(expanded))$SAMPLE
	n_variants <- length(gr)
	if("TotalCases" %in% names(sample_meta))
	{
		S4Vectors::values(gr)[["NC"]] <- rep(as.numeric(sample_meta$TotalCases), n_variants)
		S4Vectors::values(gr)[["SS"]] <- as.numeric(sample_meta$TotalCases) + rep(as.numeric(sample_meta$TotalControls), n_variants)
	} else if("TotalControls" %in% names(sample_meta)) {
		S4Vectors::values(gr)[["SS"]] <- rep(as.numeric(sample_meta$TotalControls), n_variants)
	}
	return(gr)
}
177 | 
178 | 
#' Convert vcf format to tibble (data frame)
#'
#' Converts the VCF with [vcf_to_granges()], adds the variant names as an
#' `rsid` column and returns the result as a tibble.
#'
#' @param vcf Output from readVcf
#' @param id Only accepts one ID, so specify here if there are multiple GWAS datasets in the vcf
#'
#' @export
#' @return tibble; an empty tibble if [vcf_to_granges()] returns NULL
vcf_to_tibble <- function(vcf, id=NULL)
{
	gr <- vcf_to_granges(vcf, id)
	if(is.null(gr))
	{
		dplyr::tibble()
	} else {
		# Row names of the ranges carry the variant identifiers
		S4Vectors::values(gr)[["rsid"]] <- names(gr)
		dplyr::as_tibble(gr, .name_repair = "minimal")
	}
}
196 | 
197 | 
#' Reduce list of VCFs to intersecting regions
#'
#' Loads or queries every item of `vcflist`, collapses each variant to its
#' start position, removes duplicated positions and keeps only the positions
#' that are present in all items.
#'
#' @param vcflist List of VCF objects, or list of VCF filenames, or mix of VCF objects and filenames
#' @param chrompos Either vector of chromosome and position ranges e.g. "1:1000" or "1:1000-2000", or data frame with columns `chrom`, `start`, `end`. If `NULL` (default), VCF objects are used as they are and filenames are read in full.
#'
#' @export
#' @return List of VCFs
vcflist_overlaps <- function(vcflist, chrompos=NULL)
{
	stopifnot(is.list(vcflist))
	if(length(vcflist) == 0)
	{
		# Nothing to intersect
		return(vcflist)
	}
	if(!is.null(chrompos))
	{
		chrompos <- parse_chrompos(chrompos)
		vcflist <- lapply(vcflist, function(x)
		{
			query_gwas(x, chrompos)
		})
	} else {
		# No region requested: keep VCF objects as they are, read files in full
		vcflist <- lapply(seq_along(vcflist), function(i)
		{
			x <- vcflist[[i]]
			if(inherits(x, c("CollapsedVCF", "ExpandedVCF")))
			{
				return(x)
			}
			if(is.character(x))
			{
				return(VariantAnnotation::readVcf(x))
			}
			stop("Item ", i, " in vcflist is neither VCF object nor path to VCF file")
		})
	}

	# collapse indels for sorting purposes
	vcflist <- lapply(vcflist, function(x)
	{
		SummarizedExperiment::end(x) <- SummarizedExperiment::start(x)
	
		# Simple approach to avoid duplicate positions due to snps and indels
		x <- x[!duplicated(SummarizedExperiment::start(x))]
		return(x)
	})

	# Ranges shared by every item, then restrict each item to those ranges
	o <- Reduce(IRanges::subsetByOverlaps, lapply(vcflist, SummarizedExperiment::rowRanges))
	vcflist <- lapply(vcflist, function(x) IRanges::subsetByOverlaps(x, o))
	return(vcflist)
}
257 | 


--------------------------------------------------------------------------------
/R/proxy.r:
--------------------------------------------------------------------------------
  1 | #' Find LD proxies for a set of SNPs
  2 | #'
  3 | #' @param rsid list of rs IDs
  4 | #' @param bfile ld reference panel
  5 | #' @param tag_kb =5000 Proxy parameter
  6 | #' @param tag_nsnp =5000 Proxy parameter
  7 | #' @param tag_r2 =0.6 Proxy parameter
  8 | #' @param searchspace Optional list of rs IDs to use as potential proxies 
  9 | #' @param threads Number of threads to use (=1)
 10 | #' @param out temporary output file
 11 | #'
 12 | #' @importFrom magrittr %>%
 13 | #' @importFrom rlang .data
 14 | #'
 15 | #' @export
 16 | #' @return data frame
 17 | get_ld_proxies <- function(rsid, bfile, searchspace=NULL, tag_kb=5000, tag_nsnp=5000, tag_r2=0.6, threads=1, out=tempfile())
 18 | {
 19 | 	stopifnot(check_plink())
 20 | 	searchspacename <- paste0(out, ".searchspace")
 21 | 	targetsname <- paste0(out, ".targets")
 22 | 	outname <- paste0(out, ".targets.ld.gz")
 23 | 	utils::write.table(rsid, file=targetsname, row.names = FALSE, col.names = FALSE, quote = FALSE)
 24 | 	if(!is.null(searchspace))
 25 | 	{
 26 | 		stopifnot(is.character(searchspace))
 27 | 
 28 | 		utils::write.table(unique(c(rsid, searchspace)), file=searchspacename, row.names = FALSE, col.names = FALSE, quote = FALSE)
 29 | 		extract_param <- paste0(" --extract ", searchspacename)
 30 | 	} else {
 31 | 		extract_param <- " " 
 32 | 	}
 33 | 	cmd <- paste0(options()[["tools_plink"]],
 34 | 		" --bfile ", bfile, 
 35 | 		extract_param,
 36 | 		" --keep-allele-order ",
 37 | 		" --r in-phase with-freqs gz",
 38 | 		" --ld-snp-list ", targetsname,
 39 | 		" --ld-window-kb ", tag_kb,
 40 | 		" --ld-window-r2 ", tag_r2,
 41 | 		" --ld-window ", tag_nsnp,
 42 | 		" --out ", targetsname,
 43 | 		" --threads ", threads,
 44 | 		" 2>&1 > /dev/null"
 45 | 	)
 46 | 	message("Finding proxies...")
 47 | 	system(cmd)
 48 | 
 49 | 	if (Sys.info()["sysname"] == "Windows") {
 50 | 	  stop("Currently, this function only works on macOS and Linux")
 51 | 	}
 52 | 	if (!file.exists(outname)) {
 53 | 		ld <- data.frame(CHR_A = integer(), BP_A = integer(), SNP_A = character(), MAF_A = double(), CHR_B = integer(), BP_B = integer(), 
 54 | 		SNP_B = character(), PHASE = character(), MAF_B = double(), R = double())
 55 | 		message("Index SNP not found in the reference panel")
 56 | 		return(ld)
 57 | 	}
 58 | 	ld <- data.table::fread(cmd = paste0("gunzip -c ", outname), header = TRUE) %>%
 59 | 		dplyr::as_tibble(.name_repair="minimal") %>%
 60 | 		dplyr::filter(.data[["R"]]^2 > tag_r2) %>%
 61 | 		dplyr::filter(.data[["SNP_A"]] != .data[["SNP_B"]]) %>%
 62 | 		dplyr::mutate(PHASE=gsub("/", "", .data[["PHASE"]])) %>%
 63 | 		dplyr::filter(nchar(.data[["PHASE"]]) == 4)
 64 | 	unlink(searchspacename)
 65 | 	unlink(targetsname)
 66 | 	unlink(paste0(targetsname, c(".log", ".nosex")))
 67 | 	unlink(outname)
 68 | 	if(nrow(ld) == 0)
 69 | 	{
 70 | 		message("No proxies found")
 71 | 		return(ld)
 72 | 	}
 73 | 	temp <- do.call(rbind, strsplit(ld[["PHASE"]], "")) %>% dplyr::as_tibble(.name_repair="minimal")
 74 | 	names(temp) <- c("A1", "B1", "A2", "B2")
 75 | 	ld <- cbind(ld, temp) %>% dplyr::as_tibble(.name_repair="minimal")
 76 | 	# ld <- dplyr::arrange(ld, desc(abs(R))) %>%
 77 | 	# 	dplyr::filter(!duplicated(SNP_A))
 78 | 	ld <- dplyr::arrange(ld, dplyr::desc(abs(.data[["R"]])))
 79 | 	message("Found ", nrow(ld), " proxies")
 80 | 	return(ld)
 81 | }
 82 | 
 83 | 
 84 | 
 85 | #' Lookup LD proxies from sqlite database
 86 | #'
 87 | #' @param rsids List of rsids
 88 | #' @param dbfile path to dbfile
 89 | #' @param tag_r2 minimum r2 value
 90 | #'
 91 | #' @importFrom magrittr %>%
 92 | #' @importFrom rlang .data
 93 | #'
 94 | #' @export
 95 | #' @return data frame
 96 | sqlite_ld_proxies <- function(rsids, dbfile, tag_r2)
 97 | {
 98 | 	conn <- RSQLite::dbConnect(RSQLite::SQLite(), dbfile)
 99 | 	numid <- gsub("rs", "", rsids) %>% paste(collapse=",")
100 | 	query <- paste0("SELECT DISTINCT * FROM tags WHERE SNP_A IN (", numid, ")")
101 | 	ld <- RSQLite::dbGetQuery(conn, query) %>% 
102 | 		dplyr::as_tibble(.name_repair="minimal") %>%
103 | 		dplyr::filter(.data[["R"]]^2 > tag_r2) %>%
104 | 		dplyr::filter(.data[["SNP_A"]] != .data[["SNP_B"]]) %>%
105 | 		dplyr::mutate(PHASE=gsub("/", "", .data[["PHASE"]])) %>%
106 | 		dplyr::filter(nchar(.data[["PHASE"]]) == 4) %>%
107 | 		dplyr::mutate(SNP_A = paste0("rs", .data[["SNP_A"]]), SNP_B = paste0("rs", .data[["SNP_B"]]))
108 | 
109 | 	temp <- do.call(rbind, strsplit(ld[["PHASE"]], "")) %>% dplyr::as_tibble(.name_repair="minimal")
110 | 	names(temp) <- c("A1", "B1", "A2", "B2")
111 | 	ld <- cbind(ld, temp) %>% dplyr::as_tibble(.name_repair="minimal")
112 | 	ld <- dplyr::arrange(ld, dplyr::desc(abs(.data[["R"]])))
113 | 	message("Found ", nrow(ld), " proxies")
114 | 	RSQLite::dbDisconnect(conn)
115 | 	return(ld)
116 | }
117 | 
118 | 
#' Extract SNPs from vcf file
#'
#' Finds proxies if necessary. Requested rsids that are absent from `vcf` are
#' looked up in an LD reference (plink `bfile`, or sqlite `dbfile`, which takes
#' precedence when both are given). For each such rsid one proxy present in
#' `vcf` is extracted and relabelled with the position and alleles of the
#' requested variant; the `ES` value is negated when the proxy's `REF` allele
#' differs from the phased allele `B1`. A `PR` field records the proxy rsid.
#'
#' @param vcf vcf file name
#' @param rsid list of rs IDs
#' @param bfile ld reference panel (plink)
#' @param proxies ="yes" If SNPs are absent then look for proxies (yes) or not (no). Can also mask all target SNPs and only return proxies (only), for testing purposes
#' @param tag_kb =5000 Proxy parameter
#' @param tag_nsnp =5000 Proxy parameter
#' @param tag_r2 =0.6 Proxy parameter
#' @param threads Number of threads to use (=1)
#' @param rsidx Path to rsidx index 
#' @param dbfile ld tag database (sqlite)
#'
#' @export
#' @return VCF object. With `proxies="only"` an empty VCF is returned when no proxy is available.
proxy_match <- function(vcf, rsid, bfile=NULL, proxies="yes", tag_kb=5000, tag_nsnp=5000, tag_r2=0.6, threads=1, rsidx=NULL, dbfile=NULL)
{
	if(is.null(dbfile) && is.null(bfile))
	{
		stop('please provide either bfile or dbfile')
	}
	if(!is.null(dbfile) && !is.null(bfile))
	{
		warning("bfile and dbfile both provided. Using dbfile.")
	}
	if(!(length(proxies) == 1 && proxies %in% c("yes", "no", "only")))
	{
		stop('proxies must be "yes", "no" or "only"')
	}
	if(proxies == "no")
	{
		return(query_gwas(vcf, rsid=rsid, rsidx=rsidx))
	}

	# IDs available in `vcf`, used to restrict plink's proxy candidates.
	# NULL means "unknown": proxies are then filtered after extraction instead.
	get_searchspace <- function()
	{
		if(!is.character(vcf))
		{
			message("Determining searchspace...")
			return(names(SummarizedExperiment::rowRanges(vcf)))
		}
		if(!(check_bcftools() && is.null(dbfile)))
		{
			return(NULL)
		}
		message("Determining searchspace...")
		searchspacename <- tempfile()
		on.exit(unlink(searchspacename), add=TRUE)
		cmd <- paste0(options()[["tools_bcftools"]], " query -f'%ID\n' ", shQuote(path.expand(vcf)), " > ", shQuote(searchspacename))
		system(cmd)
		scan(searchspacename, what="character", quiet=TRUE)
	}

	if(proxies == "yes")
	{
		message("Initial search...")
		o <- query_gwas(vcf, rsid=rsid, rsidx=rsidx)
		targets <- rsid[!rsid %in% names(o)]
		if(length(targets) == 0)
		{
			return(o)
		}
		message("Extracted ", length(rsid) - length(targets), " out of ", length(rsid), " rsids")
		message("Searching for proxies for ", length(targets), " rsids")
	} else {
		# "only": every requested rsid is treated as missing, and the fallback
		# result when nothing can be proxied is an empty VCF.
		o <- VariantAnnotation::VCF()
		targets <- rsid
	}

	searchspace <- get_searchspace()
	message("Proxy lookup...")
	if(is.null(dbfile))
	{
		ld <- get_ld_proxies(targets, bfile, searchspace=searchspace, tag_kb=tag_kb, tag_nsnp=tag_nsnp, tag_r2=tag_r2, threads=threads)
	} else {
		ld <- sqlite_ld_proxies(rsids=targets, dbfile=dbfile, tag_r2=tag_r2)
	}
	if(nrow(ld) == 0)
	{
		return(o)
	}

	# Keep the first listed proxy per target. When the searchspace is known all
	# candidates are already in `vcf`, so this can be done before extraction.
	if(!is.null(searchspace))
	{
		ld <- ld %>% dplyr::filter(!duplicated(.data[["SNP_A"]]))
	}
	message("Extracting proxies...")
	e <- query_gwas(vcf, rsid=ld[["SNP_B"]], rsidx=rsidx)

	if(is.null(searchspace))
	{
		# Candidates were not restricted up front: drop those absent from `vcf`
		# before choosing one proxy per target.
		ld <- subset(ld, ld[["SNP_B"]] %in% names(e)) %>%
			dplyr::filter(!duplicated(.data[["SNP_A"]]))		
	}
	e <- e[names(e) %in% ld[["SNP_B"]], ]
	message("Identified proxies for ", nrow(e), " of ", length(targets), " rsids")
	message("Aligning...")
	# Reorder the LD table so that row i describes extracted variant i
	index <- match(names(e), ld[["SNP_B"]])
	ld <- ld[index,]
	if(nrow(ld) == 0)
	{
		return(o)
	}
	stopifnot(all(ld[["SNP_B"]] == names(e)))
	# TRUE where the proxy's REF allele equals the phased allele B1; effects of
	# the remaining rows are negated below.
	sign_index <- GenomicRanges::mcols(SummarizedExperiment::rowRanges(e))[,"REF"] == ld[["B1"]]
	# Relabel the proxy records with the coordinates, name and alleles of the target variants
	gr <- GenomicRanges::GRanges(ld[["CHR_A"]], IRanges::IRanges(start=ld[["BP_A"]], end=ld[["BP_A"]], names=ld[["SNP_A"]]))
	fixeddat <- S4Vectors::DataFrame(
		REF=Biostrings::DNAStringSet(ld[["A1"]]), 
		ALT=Biostrings::DNAStringSetList(as.list(ld[["A2"]])), 
		QUAL=as.numeric(NA), 
		FILTER="PASS"
	)
	prox <- VariantAnnotation::VCF(
		rowRanges = gr,
		colData = SummarizedExperiment::colData(e),
		fixed = fixeddat,
		info = VariantAnnotation::info(e),
		exptData = list(
			header = VariantAnnotation::header(e)
		),
		geno = S4Vectors::SimpleList(
			lapply(VariantAnnotation::geno(e), `dimnames<-`, NULL)
		)
	)
	VariantAnnotation::geno(VariantAnnotation::header(prox)) <- rbind(VariantAnnotation::geno(VariantAnnotation::header(prox)), 
		S4Vectors::DataFrame(row.names="PR", Number="1", Type="String", Description="Proxy rsid")
	)
	VariantAnnotation::geno(prox)[["ES"]][!sign_index] <- {unlist(VariantAnnotation::geno(prox)[["ES"]][!sign_index]) * -1} %>% as.list
	VariantAnnotation::geno(prox)[["PR"]] <- matrix(ld[["SNP_B"]], length(ld[["SNP_B"]]), 1)

	if(proxies == "only")
	{
		return(prox)
	}
	# Directly extracted variants get an NA proxy field so both sets share the same geno fields
	VariantAnnotation::geno(VariantAnnotation::header(o)) <- rbind(VariantAnnotation::geno(VariantAnnotation::header(o)), S4Vectors::DataFrame(row.names="PR", Number="1", Type="String", Description="Proxy rsid"))
	VariantAnnotation::geno(o)[["PR"]] <- matrix(rep(NA, length(o)), length(o), 1)
	return(BiocGenerics::rbind(o, prox))
}
277 | 


--------------------------------------------------------------------------------
/R/pval_index.r:
--------------------------------------------------------------------------------
 1 | #' Create pval index from GWAS-VCF file
 2 | #'
 3 | #' Create a separate file called `<id>.pvali` which is used to speed up p-value queries.
 4 | #'
 5 | #' @param vcffile VCF filename
 6 | #' @param maximum_pval Maximum p-value to include. Default = 0.05
 7 | #' @param indexname index file name to create. Deletes existing file if exists.
 8 | #'
 9 | #' @export
10 | #' @return NULL
11 | create_pval_index_from_vcf <- function(vcffile, maximum_pval, indexname)
12 | {
13 |     stopifnot(!is.null(options()$tools_bcftools))
14 |     checksqlite3 <- system("which sqlite3")
15 |     if(checksqlite3 != 0) stop("sqlite3 not installed")
16 | 	fn <- tempfile()
17 |     id <- VariantAnnotation::samples(VariantAnnotation::scanVcfHeader(vcffile))
18 |     if(length(id) != 1)
19 |     {
20 |         stop("Not implemented for vcf files that don't have a single study")
21 |     }
22 |     cmd <- paste0(options()$tools_bcftools, " query -s ", id, " -i 'FORMAT/LP > ", -log10(maximum_pval), "' -f '%CHROM,%POS,[%LP]\n' ", vcffile, " | sort -r -k 3 > ", fn)
23 | 	message("Extracting pval info")
24 | 	system(cmd)
25 |     cmd <- c(
26 |         'CREATE TABLE pval_to_coord (chrom TEXT NOT NULL DEFAULT NULL, coord INTEGER NOT NULL DEFAULT NULL, LP REAL NOT NULL DEFAULT 0);',
27 |         '.separator ,',
28 |         paste0('.import ', fn, ' pval_to_coord'),
29 |         'CREATE INDEX idx_LP ON pval_to_coord (LP)'
30 |     )
31 |     print(cmd)
32 |     utils::write.table(cmd, file=paste0(fn, ".sql"), row.names = FALSE, col.names = FALSE, quote = FALSE)
33 | 	message("Generating index")
34 | 	cmd <- paste0("sqlite3 ", indexname, " < ", fn, ".sql")
35 | 	unlink(indexname)
36 | 	system(cmd)
37 | }
38 | 
#' Query pval from file using pvali index
#'
#' See create_pvali_index. The index is used to find the chromosome and
#' position of variants passing the threshold, which are then extracted from
#' the vcf file by position.
#'
#' @param pval pval threshold
#' @param vcffile Path to .vcf.gz GWAS summary data file
#' @param id If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter
#' @param pvali Path to pval index file
#'
#' @export
#' @return vcf object
query_pval_sqlite3 <- function(pval, vcffile, id=NULL, pvali)
{
	out <- query_pvali(pval, pvali)
	# Same progress message as the definition of this function in query.r
	message("Identified ", nrow(out), " variants passing threshold. Extracting...")
	return(
		query_gwas(vcffile, chrompos=data.frame(chrom=out$chrom, start=out$coord, end=out$coord), id=id)
	)
}
57 | 
#' Query pvali
#'
#' Returns the distinct rows of the `pval_to_coord` table whose `lp` value is
#' at least `-log10(pval)`.
#'
#' @param pval pval threshold (a single number, NOT -log10)
#' @param pvali Path to pval index file
#'
#' @export
#' @return data frame
query_pvali <- function(pval, pvali)
{
	stopifnot(is.numeric(pval), length(pval) == 1)
	conn <- RSQLite::dbConnect(RSQLite::SQLite(), pvali)
	# Close the connection even if the query fails
	on.exit(RSQLite::dbDisconnect(conn), add=TRUE)
	# Bind the threshold as a parameter rather than pasting it into the SQL
	# text, so values such as Inf (pval = 0) do not produce invalid SQL.
	out <- RSQLite::dbGetQuery(
		conn,
		"SELECT DISTINCT * FROM pval_to_coord WHERE lp >= ?",
		params = list(-log10(pval))
	)
	return(out)
}
73 | 


--------------------------------------------------------------------------------
/R/query.r:
--------------------------------------------------------------------------------
  1 | #' Query data from vcf file
  2 | #'
  3 | #' Read in GWAS summary data with filters on datasets (if multiple datasets per file) and/or chromosome/position, rsids or pvalues. Chooses most optimal choice for the detected operating system. Typically chrompos searches are the fastest. On Windows, rsid or pvalue filters from a file will be slow. 
  4 | #'
  5 | #' @param vcf Path or URL to GWAS-VCF file or VCF object e.g. output from [VariantAnnotation::readVcf()] or [create_vcf()]
  6 | #' @param chrompos Either vector of chromosome and position ranges e.g. "1:1000" or "1:1000-2000", or data frame with columns `chrom`, `start`, `end`.
  7 | #' @param rsid Vector of rsids
  8 | #' @param pval  P-value threshold (NOT -log10)
  9 | #' @param id If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter
 10 | #' @param rsidx Path to rsidx index file
 11 | #' @param pvali Path to pval index file
 12 | #' @param build ="GRCh37" Build of vcffile
 13 | #' @param os The operating system. Default is as detected. Determines the method used to perform query
 14 | #' @param proxies ="no" If SNPs are absent then look for proxies (yes) or not (no). Can also mask all target SNPs and only return proxies (only), for testing purposes. Currently only possible if querying rsid.
 15 | #' @param bfile =path to plink bed/bim/fam ld reference panel
 16 | #' @param dbfile =path to sqlite tag snp database
 17 | #' @param tag_kb =5000 Proxy parameter
 18 | #' @param tag_nsnp =5000 Proxy parameter
 19 | #' @param tag_r2 =0.6 Proxy parameter
 20 | #' @param threads =1 NUmber of threads
 21 | #'
 22 | #' @export
 23 | #' @return vcf object
 24 | query_gwas <- function(vcf, chrompos=NULL, rsid=NULL, pval=NULL, id=NULL, rsidx=NULL, pvali=NULL, build="GRCh37", os=Sys.info()[['sysname']], proxies="no", bfile=NULL, dbfile=NULL, tag_kb=5000, tag_nsnp=5000, tag_r2=0.6, threads=1)
 25 | {
 26 | 	if(is.character(vcf))
 27 | 	{
 28 | 		if(!file.exists(vcf))
 29 | 		{
 30 | 			if(!RCurl::url.exists(vcf))
 31 | 			{
 32 | 				stop("vcf appears to be a string but doesn't refer to an existing URL or local file")
 33 | 			}
 34 | 		} 
 35 | 		fileflag <- TRUE
 36 | 	} else {
 37 | 		stopifnot(inherits(vcf, c("CollapsedVCF", "ExpandedVCF")))
 38 | 		fileflag <- FALSE
 39 | 	}
 40 | 	if(sum(c(!is.null(chrompos), !is.null(rsid), !is.null(pval))) != 1)
 41 | 	{
 42 | 		stop("Must specify filters only for one of chrompos, rsid or pval")
 43 | 	}
 44 | 
 45 | 	if(proxies != "no")
 46 | 	{
 47 | 		if(is.null(rsid))
 48 | 		{
 49 | 			stop("Proxies can only be searched for if rsid query specified")
 50 | 		}
 51 | 	}
 52 | 
 53 | 	if(!is.null(chrompos))
 54 | 	{
 55 | 		if(!fileflag)
 56 | 		{
 57 | 			return(query_chrompos_vcf(chrompos, vcf))
 58 | 		} else {
 59 | 			if(!check_bcftools())
 60 | 			{
 61 | 				return(query_chrompos_file(chrompos, vcf, id, build))
 62 | 			} else {
 63 | 				return(query_chrompos_bcftools(chrompos, vcf, id))
 64 | 			}
 65 | 		}
 66 | 	}
 67 | 
 68 | 	if(!is.null(rsid))
 69 | 	{
 70 | 		stopifnot(proxies %in% c("yes", "no", "only"))
 71 | 		if(proxies != "no")
 72 | 		{
 73 | 			return(proxy_match(vcf, rsid, bfile=bfile, dbfile=dbfile, proxies=proxies, tag_kb=tag_kb, tag_nsnp=tag_nsnp, tag_r2=tag_r2, threads=threads))
 74 | 		}
 75 | 		if(!fileflag)
 76 | 		{
 77 | 			return(query_rsid_vcf(rsid, vcf))
 78 | 		} else {
 79 | 			if(!is.null(rsidx))
 80 | 			{
 81 | 				return(query_rsid_rsidx(rsid, vcf, id, rsidx))
 82 | 			}
 83 | 			if(!check_bcftools())
 84 | 			{
 85 | 				return(query_rsid_file(rsid, vcf, id, build))
 86 | 			} else {
 87 | 				return(query_rsid_bcftools(rsid, vcf, id))
 88 | 			}
 89 | 		}
 90 | 	}
 91 | 
 92 | 	if(!is.null(pval))
 93 | 	{
 94 | 		if(!fileflag)
 95 | 		{
 96 | 			return(query_pval_vcf(pval, vcf, id))
 97 | 		} else {
 98 | 			if(!is.null(pvali))
 99 | 			{
100 | 				message("Using pval index")
101 | 				return(query_pval_sqlite3(pval, vcf, id, pvali))
102 | 			}
103 | 			if(!check_bcftools())
104 | 			{
105 | 				return(query_pval_file(pval, vcf, id, build))
106 | 			} else {
107 | 				return(query_pval_bcftools(pval, vcf, id))
108 | 			}
109 | 		}		
110 | 	}
111 | }
112 | 
113 | 
114 | 
115 | 
# Build a GRanges object from a data frame with columns `chrom`, `start` and `end`
df_to_granges <- function(df)
{
	spans <- IRanges::IRanges(start=df[["start"]], end=df[["end"]])
	GenomicRanges::GRanges(seqnames=df[["chrom"]], ranges=spans)
}
120 | 
121 | 
122 | 
123 | 
124 | 
125 | 
#' Parse chromosome:position
#'
#' Takes data frame or vector of chromosome position ranges and parses to granges object.
#' A GRanges object is also accepted and returned as is, or widened when `radius` is given.
#'
#' @param chrompos Either vector of chromosome and position ranges e.g. "1:1000" or "1:1000-2000", or data frame with columns `chrom`, `start`, `end`.
#' @param radius Add radius to the specified positions. Default = NULL. Starts are not moved below 0
#'
#' @export
#' @return GRanges object
parse_chrompos <- function(chrompos, radius=NULL)
{
	if(inherits(chrompos, "GRanges"))
	{
		if(!is.null(radius))
		{
			# Use the accessor generics: GRanges keeps its coordinates in the
			# `ranges` slot and has no `start` / `end` slots of its own.
			chrompos <- GenomicRanges::GRanges(
				seqnames = GenomeInfoDb::seqnames(chrompos),
				ranges = IRanges::IRanges(
					start = pmax(BiocGenerics::start(chrompos) - radius, 0),
					end = BiocGenerics::end(chrompos) + radius
				),
				strand = BiocGenerics::strand(chrompos)
			)
		}
		return(chrompos)
	}
	if(is.data.frame(chrompos))
	{
		stopifnot(all(c("chrom", "start", "end") %in% names(chrompos)))
		# Apply the radius here too, as for the other input types
		if(!is.null(radius))
		{
			chrompos[["start"]] <- pmax(chrompos[["start"]] - radius, 0)
			chrompos[["end"]] <- chrompos[["end"]] + radius
		}
		return(df_to_granges(chrompos))
	}
	if(!is.character(chrompos))
	{
		stop("chrompos must be data frame with columns chrom, start, end, or character vector of <chr:pos> or <chr:start-end>")
	}

	# Split "<chr>:<pos>" or "<chr>:<start>-<end>" into its parts
	a <- stringr::str_split(chrompos, ":")
	chrom <- vapply(a, function(x) x[1], character(1))
	pos <- vapply(a, function(x) x[2], character(1))
	i <- grepl("-", pos)
	temp <- stringr::str_split(pos[i], "-")
	# Single positions use the same value for start and end
	pos1 <- pos
	pos2 <- pos
	pos1[i] <- vapply(temp, function(x) x[1], character(1))
	pos2[i] <- vapply(temp, function(x) x[2], character(1))
	pos1 <- suppressWarnings(as.numeric(pos1))
	pos2 <- suppressWarnings(as.numeric(pos2))
	if(any(is.na(pos1) | is.na(pos2)))
	{
		stop("chrompos must be data frame with columns chrom, start, end, or character vector of <chr:pos> or <chr:start-end>")
	}
	if(!is.null(radius))
	{
		pos1 <- pmax(0, pos1 - radius)
		pos2 <- pos2 + radius
	}
	return(df_to_granges(data.frame(chrom, start=pos1, end=pos2, stringsAsFactors=FALSE)))
}
178 | 
179 | 
180 | 
181 | 
#' Query vcf file, extracting by chromosome and position
#'
#' Reads the requested ranges from a tabix indexed vcf file using
#' [VariantAnnotation::readVcf()].
#'
#' @param chrompos Either vector of chromosome and position ranges e.g. "1:1000" or "1:1000-2000", or data frame with columns `chrom`, `start`, `end`.
#' @param vcffile Path to .vcf.gz GWAS summary data file
#' @param id If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter
#' @param build Default="GRCh37" Build of vcffile
#'
#' @export
#' @return VCF object
query_chrompos_file <- function(chrompos, vcffile, id=NULL, build="GRCh37")
{
	chrompos <- parse_chrompos(chrompos)
	if(!is.null(id))
	{
		param <- VariantAnnotation::ScanVcfParam(which=chrompos, samples=id)
	} else {
		param <- VariantAnnotation::ScanVcfParam(which=chrompos)
	}
	tab <- Rsamtools::TabixFile(vcffile)
	# Pass the ScanVcfParam built above (not the bare ranges) so that the
	# sample selection requested through `id` takes effect.
	vcf <- VariantAnnotation::readVcf(tab, build, param=param)
	return(vcf)
}
205 | 
206 | 
#' Query vcf file, extracting by rsid
#'
#' Pre-filters the raw lines of the file with a regular expression built from
#' the rsids, writes the surviving records to a temporary file, reads that
#' file back and finally keeps only records whose name exactly matches one of
#' the rsids.
#'
#' @param rsid Vector of rsids. Use DBSNP build (???)
#' @param vcffile Path to .vcf.gz GWAS summary data file
#' @param id If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter
#' @param build Default="GRCh37" Build of vcffile
#'
#' @export
#' @return VCF object
query_rsid_file <- function(rsid, vcffile, id=NULL, build="GRCh37")
{
	message("Note, this is much slower than searching by chromosome/position (e.g. see query_chrompos_file)")
	vcf <- Rsamtools::TabixFile(vcffile)
	# Build the pattern once instead of on every call of the filter
	pattern <- paste(rsid, collapse="|")
	fil <- function(x)
	{
		grepl(pattern, x)
	}

	tmpvcf <- tempfile()
	# Remove the intermediate file on every exit path
	on.exit(unlink(tmpvcf), add=TRUE)
	VariantAnnotation::filterVcf(vcf, build, tmpvcf, prefilters=S4Vectors::FilterRules(list(fil=fil)), verbose=TRUE)
	if(!is.null(id))
	{
		o <- VariantAnnotation::readVcf(tmpvcf, param=VariantAnnotation::ScanVcfParam(samples=id))
	} else {
		o <- VariantAnnotation::readVcf(tmpvcf)
	}

	# Grep isn't matching on exact word so do second pass here
	o <- query_rsid_vcf(rsid, o)
	return(o)
}
239 | 
240 | 
#' Query pval from vcf file
#'
#' Keeps the variants whose `LP` value for dataset `id` is greater than
#' `-log10(pval)`. Variants with a missing `LP` are dropped. The result
#' contains only dataset `id`, as for the other p-value query functions.
#'
#' @param pval P-value threshold (NOT -log10)
#' @param vcffile Path to tabix indexed vcf file
#' @param id If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter
#' @param build Default="GRCh37"
#'
#' @export
#' @return VCF object
query_pval_file <- function(pval, vcffile, id=NULL, build="GRCh37")
{
	if(is.null(id))
	{
		id <- VariantAnnotation::samples(VariantAnnotation::scanVcfHeader(vcffile))
	}
	stopifnot(length(id) == 1)
	message("Filtering p-value based on id ", id)
	message("Note, this is much slower than searching by chromosome/position (e.g. see query_chrompos_file)")
	vcf <- Rsamtools::TabixFile(vcffile)
	threshold <- -log10(pval)
	fil <- function(x)
	{
		lp <- VariantAnnotation::geno(x)[["LP"]][,id,drop=TRUE]
		# A missing LP must count as "does not pass" rather than yield NA
		!is.na(lp) & lp > threshold
	}
	tmpvcf <- tempfile()
	# Remove the intermediate file on every exit path
	on.exit(unlink(tmpvcf), add=TRUE)
	VariantAnnotation::filterVcf(vcf, build, tmpvcf, filters=S4Vectors::FilterRules(list(fil=fil)), verbose=TRUE)
	# Restrict the result to the dataset that was filtered on
	o <- VariantAnnotation::readVcf(tmpvcf, param=VariantAnnotation::ScanVcfParam(samples=id))
	return(o)
}
270 | 
271 | 
#' Query chrompos from vcf object
#'
#' Returns the variants of `vcf` overlapping any of the requested ranges, in
#' their original order, restricted to the datasets named in `id`.
#'
#' @param chrompos Either vector of chromosome and position ranges e.g. "1:1000" or "1:1000-2000", or data frame with columns `chrom`, `start`, `end`.
#' @param vcf VCF object (e.g. from readVcf)
#' @param id If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter. Default (NULL) keeps all datasets
#'
#' @export
#' @return VCF object
query_chrompos_vcf <- function(chrompos, vcf, id=NULL)
{
	samples <- VariantAnnotation::samples(VariantAnnotation::header(vcf))
	if(is.null(id))
	{
		id <- samples
	}
	# %in% rather than ==, so that several ids can be given without relying on recycling
	colid <- which(samples %in% id)
	chrompos <- parse_chrompos(chrompos)
	i <- IRanges::findOverlaps(SummarizedExperiment::rowRanges(vcf), chrompos) %>% S4Vectors::queryHits() %>% unique %>% sort
	vcf[i,colid]
}
291 | 
292 | 
#' Query rsid from vcf object
#'
#' Returns the variants of `vcf` whose row name is one of `rsid`, restricted to
#' the datasets named in `id`.
#'
#' @param rsid Vector of rsids
#' @param vcf VCF object (e.g. from readVcf)
#' @param id If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter. Default (NULL) keeps all datasets
#'
#' @export
#' @return VCF object
query_rsid_vcf <- function(rsid, vcf, id=NULL)
{
	samples <- VariantAnnotation::samples(VariantAnnotation::header(vcf))
	if(is.null(id))
	{
		id <- samples
	}
	# %in% rather than ==, so that several ids can be given without relying on recycling
	colid <- which(samples %in% id)
	vcf[rownames(vcf) %in% rsid,colid]
}
310 | 
311 | 
#' Query based on p-value threshold from vcf
#'
#' Returns the variants whose `LP` value for dataset `id` is greater than
#' `-log10(pval)`. Variants with a missing `LP` are dropped.
#'
#' @param pval P-value threshold (NOT -log10)
#' @param id If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter
#' @param vcf VCF object (e.g. from readVcf)
#'
#' @export
#' @return VCF object
query_pval_vcf <- function(pval, vcf, id=NULL)
{
	if(is.null(id))
	{
		id <- VariantAnnotation::samples(VariantAnnotation::header(vcf))
	}
	stopifnot(length(id) == 1)
	colid <- which(VariantAnnotation::samples(VariantAnnotation::header(vcf)) == id)
	lp <- VariantAnnotation::geno(vcf)[["LP"]][,colid,drop=TRUE]
	# which() drops comparisons that evaluate to NA, so a missing LP cannot
	# end up as an NA row subscript.
	vcf[which(lp > -log10(pval)),colid]
}
330 | 
331 | 
#' Query rsid using bcftools
#'
#' Writes the unique rsids to a temporary list, runs `bcftools view` with an
#' `ID=@<list>` include expression and reads the resulting vcf.
#'
#' @param rsid Vector of rsids
#' @param vcffile Path to .vcf.gz GWAS summary data file
#' @param id If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter
#'
#' @export
#' @return VCF object
query_rsid_bcftools <- function(rsid, vcffile, id=NULL)
{
	stopifnot(check_bcftools())
	bcftools <- options()[["tools_bcftools"]]
	if(is.null(id))
	{
		id <- VariantAnnotation::samples(VariantAnnotation::scanVcfHeader(vcffile))
	}
	# Omit -s altogether when there are no samples, as query_chrompos_bcftools does
	idclause <- if(length(id) == 0) "" else paste0("-s ", paste(id, collapse=","))
	tmp <- tempfile()
	snplist <- paste0(tmp, ".snplist")
	vcfout <- paste0(tmp, ".vcf")
	# Remove the intermediate files on every exit path
	on.exit(unlink(c(snplist, vcfout)), add=TRUE)
	utils::write.table(unique(rsid), file=snplist, row.names = FALSE, col.names = FALSE, quote = FALSE)
	cmd <- sprintf("%s view %s -i'ID=@%s' %s > %s", bcftools, idclause, snplist, shQuote(path.expand(vcffile)), shQuote(vcfout))
	system(cmd)
	o <- VariantAnnotation::readVcf(vcfout)
	return(o)
}
358 | 
#' Query p-value using bcftools
#'
#' Runs `bcftools view` with the include expression `FORMAT/LP > -log10(pval)`
#' and reads the resulting vcf.
#'
#' @param pval P-value threshold (NOT -log10)
#' @param vcffile Path to .vcf.gz GWAS summary data file
#' @param id If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter
#'
#' @export
#' @return vcf object
query_pval_bcftools <- function(pval, vcffile, id=NULL)
{
	stopifnot(check_bcftools())
	bcftools <- options()[["tools_bcftools"]]
	if(is.null(id))
	{
		id <- VariantAnnotation::samples(VariantAnnotation::scanVcfHeader(vcffile))
	}
	# Omit -s altogether when there are no samples, as query_chrompos_bcftools does
	idclause <- if(length(id) == 0) "" else paste0("-s ", paste(id, collapse=","))
	vcfout <- paste0(tempfile(), ".vcf")
	# Remove the intermediate file on every exit path
	on.exit(unlink(vcfout), add=TRUE)
	cmd <- sprintf("%s view %s -i 'FORMAT/LP > %s' %s > %s", bcftools, idclause, -log10(pval), shQuote(path.expand(vcffile)), shQuote(vcfout))
	system(cmd)
	o <- VariantAnnotation::readVcf(vcfout)
	return(o)
}
383 | 
#' Query chromosome and position using bcftools
#'
#' Writes the requested ranges to a temporary tab separated regions file
#' (chromosome, start, end), runs `bcftools view -R` on it and reads the
#' resulting vcf.
#'
#' @param chrompos Either vector of chromosome and position ranges e.g. "1:1000" or "1:1000-2000", or data frame with columns `chrom`, `start`, `end`.
#' @param vcffile Path to .vcf.gz GWAS summary data file
#' @param id If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter
#'
#' @export
#' @return vcf object
query_chrompos_bcftools <- function(chrompos, vcffile, id=NULL)
{
	stopifnot(check_bcftools())
	bcftools <- options()[["tools_bcftools"]]
	if(is.null(id))
	{
		id <- VariantAnnotation::samples(VariantAnnotation::scanVcfHeader(vcffile))
	}
	# No -s option at all when the file has no samples
	idclause <- if(length(id) == 0) "" else paste0("-s ", paste(id, collapse=","))

	chrompos <- parse_chrompos(chrompos)
	tmp <- tempfile()
	regions <- paste0(tmp, ".snplist")
	vcfout <- paste0(tmp, ".vcf")
	# Remove the intermediate files on every exit path
	on.exit(unlink(c(regions, vcfout)), add=TRUE)
	# The first three columns of the data frame form of the ranges are written as the regions
	utils::write.table(as.data.frame(chrompos)[,1:3], file=regions, sep="\t", row.names = FALSE, col.names = FALSE, quote = FALSE)

	cmd <- sprintf("%s view %s -R %s %s > %s", bcftools, idclause, shQuote(regions), shQuote(path.expand(vcffile)), shQuote(vcfout))
	system(cmd)
	o <- VariantAnnotation::readVcf(vcfout)
	return(o)
}
414 | 
415 | 
#' Query rsid from file using rsidx index
#'
#' See create_rsidx_index. The index is used to translate the rsids into
#' chromosome and position, and the vcf file is then queried by position.
#'
#' @param rsid Vector of rsids
#' @param vcffile Path to .vcf.gz GWAS summary data file
#' @param id If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter
#' @param rsidx Path to rsidx index file
#'
#' @export
#' @return vcf object
query_rsid_rsidx <- function(rsid, vcffile, id=NULL, rsidx)
{
	hits <- query_rsidx(rsid, rsidx)
	# Each indexed coordinate becomes a single-base range
	regions <- data.frame(chrom=hits$chrom, start=hits$coord, end=hits$coord)
	query_gwas(vcffile, chrompos=regions, id=id)
}
434 | 
#' Query rsidx
#'
#' Looks up rsids in the `rsid_to_coord` table of the index. The `rs` prefix
#' is stripped before the lookup, as the query matches on the numeric part.
#'
#' @param rsid Vector of rsids. Entries that are not `rs` followed by digits are ignored
#' @param rsidx Path to rsidx index file
#'
#' @export
#' @return data frame with the distinct matching rows of `rsid_to_coord`
query_rsidx <- function(rsid, rsidx)
{
	conn <- RSQLite::dbConnect(RSQLite::SQLite(), rsidx)
	# Close the connection even if the query fails
	on.exit(RSQLite::dbDisconnect(conn), add=TRUE)
	# Only purely numeric ids may be spliced into the SQL text; anything else
	# would make the statement invalid.
	numid <- gsub("rs", "", rsid)
	numid <- numid[grepl("^[0-9]+$", numid)]
	query <- paste0("SELECT DISTINCT * FROM rsid_to_coord WHERE rsid IN (", paste(numid, collapse=","), ")")
	out <- RSQLite::dbGetQuery(conn, query)
	return(out)
}
451 | 
452 | 
#' Query pval from file using pvali index
#'
#' See create_pvali_index. The index supplies the chromosome and position of
#' the variants passing the threshold; the vcf file is then queried by position.
#'
#' @param pval pval threshold
#' @param vcffile Path to .vcf.gz GWAS summary data file
#' @param id If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter
#' @param pvali Path to pval index file
#'
#' @export
#' @return vcf object
query_pval_sqlite3 <- function(pval, vcffile, id=NULL, pvali)
{
	hits <- query_pvali(pval, pvali)
	message("Identified ", nrow(hits), " variants passing threshold. Extracting...")
	# Each indexed coordinate becomes a single-base range
	regions <- data.frame(chrom=hits$chrom, start=hits$coord, end=hits$coord)
	query_gwas(vcffile, chrompos=regions, id=id)
}
472 | 
#' Query pvali
#'
#' Returns the distinct rows of the `pval_to_coord` table whose `lp` value is
#' at least `-log10(pval)`.
#'
#' @param pval pval threshold (a single number, NOT -log10)
#' @param pvali Path to pval index file
#'
#' @export
#' @return data frame
query_pvali <- function(pval, pvali)
{
	stopifnot(is.numeric(pval), length(pval) == 1)
	conn <- RSQLite::dbConnect(RSQLite::SQLite(), pvali)
	# Close the connection even if the query fails
	on.exit(RSQLite::dbDisconnect(conn), add=TRUE)
	# Bind the threshold as a parameter rather than pasting it into the SQL
	# text, so values such as Inf (pval = 0) do not produce invalid SQL.
	out <- RSQLite::dbGetQuery(
		conn,
		"SELECT DISTINCT * FROM pval_to_coord WHERE lp >= ?",
		params = list(-log10(pval))
	)
	return(out)
}
488 | 


--------------------------------------------------------------------------------
/R/rsid_index.r:
--------------------------------------------------------------------------------
  1 | #' Create RSID index from VCF
  2 | #'
  3 | #' @param vcf VCF filename
  4 | #' @param indexname index file name to create. Deletes existing file if exists.
  5 | #'
  6 | #' @export
  7 | #' @return NULL
  8 | create_rsidx_index_from_vcf <- function(vcf, indexname)
  9 | {
 10 | 	fn <- tempfile()
 11 | 	if (Sys.info()["sysname"] == "Windows") {
 12 | 	  stop("Currently, this function only works on macOS and Linux")
 13 | 	}
 14 | 	cmd <- paste0("gunzip -c ", vcf, " | grep -v '#' | awk '{ print substr($3, 3), $1, $2 }' > ", fn, ".txt")
 15 | 	message("Extracting position info")
 16 | 	system(cmd)
 17 | 
 18 | 	cmd <- c(
 19 | 		'CREATE TABLE rsid_to_coord (rsid INTEGER PRIMARY KEY, chrom TEXT NULL DEFAULT NULL, coord INTEGER NOT NULL DEFAULT 0);',
 20 | 		'.separator " "',
 21 | 		paste0('.import ', fn, '.txt rsid_to_coord')
 22 | 	)
 23 | 	utils::write.table(cmd, file=paste0(fn, ".sql"), row.names = FALSE, col.names = FALSE, quote = FALSE)
 24 | 	message("Generating index")
 25 | 	cmd <- paste0("sqlite3 ", indexname, " < ", fn, ".sql")
 26 | 	unlink(indexname)
 27 | 	system(cmd)
 28 | }
 29 | 
 30 | #' Create new index from existing index using a subset of rsids
 31 | #'
 32 | #' Note this requires a modified version of plink that allows ld-window-r2 flag for --r option.
 33 | #' Available here: https://github.com/explodecomputer/plink-ng
 34 | #'
 35 | #' @param rsid Vector of rsids
 36 | #' @param rsidx Existing index
 37 | #' @param newindex New index (Note: will delete existing file if exists)
 38 | #'
 39 | #' @export
 40 | #' @return NULL, creates new index file
 41 | create_rsidx_sub_index <- function(rsid, rsidx, newindex)
 42 | {
 43 | 	out <- query_rsidx(rsid, rsidx)
 44 | 	unlink(newindex)
 45 | 	conn <- RSQLite::dbConnect(RSQLite::SQLite(), newindex)
 46 | 	RSQLite::dbWriteTable(conn, "rsid_to_coord", out)
 47 | 	RSQLite::dbExecute(conn, "CREATE INDEX rsid on rsid_to_coord(rsid);")
 48 | 	RSQLite::dbDisconnect(conn)
 49 | }
 50 | 
 51 | 
 52 | 
 53 | #' Create LD reference sqlite database for tags
 54 | #'
 55 | #' This is used for looking up proxies
 56 | #'
 57 | #' @param bfile path to plink file
 58 | #' @param dbname dbname to produce (overwrites existing if exists)
 59 | #' @param tag_r2 minimum tag r2
 60 | #'
 61 | #' @export
 62 | #' @return NULL
 63 | create_ldref_sqlite <- function(bfile, dbname, tag_r2=0.6)
 64 | {
 65 | 	stopifnot(check_plink())
 66 | 	message("identifying indels to remove")
 67 | 	cmd <- paste0("awk '{ if (length($5) != 1 || length($6) != 1) { print $2 }}' ", bfile, ".bim > ", bfile, ".indels")
 68 | 	system(cmd)
 69 | 
 70 | 	message("calculating ld tags")
 71 | 	cmd <- paste0(options()[["tools_plink"]], " --bfile ", bfile, " --keep-allele-order --exclude ", bfile, ".indels --r in-phase with-freqs gz --out ", bfile, " --ld-window-kb 250 --ld-window 1000 --ld-window-r2 ",  tag_r2)
 72 | 	system(cmd)
 73 | 
 74 | 	message("formatting")
 75 | 	if (Sys.info()["sysname"] == "Windows") {
 76 | 	  stop("Currently, this function only works on macOS and Linux")
 77 | 	}
 78 | 	cmd <- paste0("gunzip -c ", bfile, ".ld.gz | awk 'BEGIN {OFS=\",\"}  { if(NR != 1) { print substr($3, 3), $1, $2, $4, substr($7, 3), $5, $6, $9, $8, $10 }}' > ", bfile, ".ld.tab")
 79 | 	system(cmd)
 80 | 
 81 | 	message("creating sqlite db")
 82 | 	cmd <- c(
 83 | 		'CREATE TABLE tags (',
 84 | 		'	SNP_A INTEGER NOT NULL, ',
 85 | 		'	CHR_A TEXT NULL DEFAULT NULL, ',
 86 | 		'	BP_A INTEGER NOT NULL,',
 87 | 		'	MAF_A REAL NOT NULL,',
 88 | 		'	SNP_B INTEGER NOT NULL, ',
 89 | 		'	CHR_B TEXT NULL DEFAULT NULL, ',
 90 | 		'	BP_B INTEGER NOT NULL,',
 91 | 		'	MAF_B REAL NOT NULL,',
 92 | 		'	PHASE TEXT NOT NULL,',
 93 | 		'	R REAL NOT NULL',
 94 | 		');',
 95 | 		'CREATE INDEX SNP_A_INDEX ON tags(SNP_A);',
 96 | 		'.separator ","',
 97 | 		paste0(".import ", bfile, ".ld.tab tags")
 98 | 	)
 99 | 	unlink(paste0(bfile, ".ld.sqlite"))
100 | 	utils::write.table(cmd, file=paste0(bfile, ".ld.sqlite"), row.names = FALSE, col.names = FALSE, quote = FALSE)
101 | 	unlink(dbname)
102 | 	cmd <- paste0("sqlite3 ", dbname, " < ", bfile, ".ld.sqlite")
103 | 	system(cmd)
104 | 	unlink(paste0(bfile, ".ld.tab"))
105 | 	unlink(paste0(bfile, ".ld.gz"))
106 | 	unlink(paste0(bfile, ".ld.sqlite"))
107 | 	# unlink(paste0(bfile, ".indels"))
108 | }
109 | 


--------------------------------------------------------------------------------
/R/utils-pipe.R:
--------------------------------------------------------------------------------
 1 | #' Pipe operator
 2 | #'
 3 | #' See \code{magrittr::\link[magrittr]{\%>\%}} for details.
 4 | #'
 5 | #' @name %>%
 6 | #' @rdname pipe
 7 | #' @keywords internal
 8 | #' @export
 9 | #' @importFrom magrittr %>%
10 | #' @usage lhs \%>\% rhs
11 | NULL
12 | 
13 | #' VariantAnnotation
14 | #'
15 | #' @name VariantAnnotation
16 | #' @import VariantAnnotation
17 | NULL
18 | 


--------------------------------------------------------------------------------
/R/zzz.r:
--------------------------------------------------------------------------------
# Top-level code in a package's R/ files is evaluated when the package is
# built, not when the installed package is loaded, so the option has to be
# set from the load hook to take effect in a user's session.
# NOTE(review): assumes no other .onLoad is defined elsewhere in the package.
.onLoad <- function(libname, pkgname)
{
	# Silence data.table's message about fread() input being run as a shell
	# command, unless the user has already chosen a value for the option.
	if (is.null(getOption("datatable.fread.input.cmd.message"))) {
		options(datatable.fread.input.cmd.message=FALSE)
	}
	invisible(NULL)
}
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Reading, querying and writing GWAS summary data in VCF format
 2 | 
 3 | <!-- badges: start -->
 4 | [![Lifecycle:experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html)
 5 | [![R-CMD-check](https://github.com/MRCIEU/gwasvcf/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/MRCIEU/gwasvcf/actions/workflows/R-CMD-check.yaml)
 6 | <!-- badges: end -->
 7 | 
 8 | Complete GWAS summary datasets are now abundant. A large repository of curated, harmonised and QC'd datasets is available in the [IEU GWAS database](https://gwas.mrcieu.ac.uk/). They can be queried via the [API](https://api.opengwas.io/api/) directly, or through the [ieugwasr](https://github.com/mrcieu/ieugwasr) R package, or the [ieugwaspy](https://github.com/mrcieu/ieugwaspy) Python package. However, for faster querying that can be used in an HPC environment, accessing the data directly and not through cloud systems is advantageous.
 9 | 
10 | We developed a format for storing and harmonising GWAS summary data known as [GWAS VCF format](https://github.com/MRCIEU/gwas-vcf-specification/releases/tag/1.0.0) which can be created using [gwas2vcf](https://github.com/mrcieu/gwas2vcf). All the data in the [IEU GWAS database](https://gwas.mrcieu.ac.uk/) is available for download in this format. This R package provides fast and convenient functions for querying and creating GWAS summary data in GWAS VCF format (v1.0). See also [pygwasvcf](https://github.com/mrcieu/pygwasvcf), a Python 3 parser for querying GWAS VCF files.
11 | 
12 | This package includes:
13 | 
14 | - a wrapper around the [bioconductor/VariantAnnotation](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html) package, providing functions tailored to GWAS VCF for reading, querying, creating and writing GWAS VCF format files
15 | - some LD related functions such as using a reference panel to extract proxies, create LD matrices and perform LD clumping
16 | - functions for harmonising a dataset against the reference genome and creating GWAS VCF files.
17 | 
18 | See also the [gwasglue](https://github.com/MRCIEU/gwasglue) R package for methods to connect the VCF data to Mendelian randomization, colocalisation, fine mapping etc.
19 | 
20 | ## Installation
21 | 
22 | You can install a binary version from our [MRC IEU r-universe](https://mrcieu.r-universe.dev/builds) with
23 | 
24 | ```r
25 | install.packages('gwasvcf', repos = c('https://mrcieu.r-universe.dev', 'https://cloud.r-project.org'))
26 | ```
27 | 
28 | or install from the GitHub repo
29 | 
30 | ```r
31 | remotes::install_github("mrcieu/gwasvcf")
32 | ```
33 | 
34 | ## Usage
35 | 
36 | See vignettes here: [https://mrcieu.github.io/gwasvcf/](https://mrcieu.github.io/gwasvcf/).
37 | 
38 | ## Citation
39 | 
40 | If using GWAS-VCF files please reference the studies that you use and the following paper:
41 | 
42 | **The variant call format provides efficient and robust storage of GWAS summary statistics.** Matthew Lyon, Shea J Andrews, Ben Elsworth, Tom R Gaunt, Gibran Hemani, Edoardo Marcora. bioRxiv 2020.05.29.115824; doi: https://doi.org/10.1101/2020.05.29.115824 
43 | 
44 | 
45 | ## Reference datasets
46 | 
47 | Example GWAS VCF (GIANT 2010 BMI):
48 | 
49 | - [http://fileserve.mrcieu.ac.uk/vcf/IEU-a-2.vcf.gz](http://fileserve.mrcieu.ac.uk/vcf/IEU-a-2.vcf.gz)
50 | - [http://fileserve.mrcieu.ac.uk/vcf/IEU-a-2.vcf.gz.tbi](http://fileserve.mrcieu.ac.uk/vcf/IEU-a-2.vcf.gz.tbi)
51 | 
52 | 1000 genomes reference panels for LD for each superpopulation - used by default in OpenGWAS:
53 | 
54 | - [http://fileserve.mrcieu.ac.uk/ld/1kg.v3.tgz](http://fileserve.mrcieu.ac.uk/ld/1kg.v3.tgz)
55 | 
56 | RSID index for faster querying:
57 | 
58 | - [http://fileserve.mrcieu.ac.uk/vcf/annotations.vcf.gz.rsidx](http://fileserve.mrcieu.ac.uk/vcf/annotations.vcf.gz.rsidx)
59 | 
60 | 1000 genomes annotations in vcf format harmonised against human genome reference:
61 | 
62 | - [http://fileserve.mrcieu.ac.uk/vcf/1kg_v3_nomult.vcf.gz](http://fileserve.mrcieu.ac.uk/vcf/1kg_v3_nomult.vcf.gz)
63 | - [http://fileserve.mrcieu.ac.uk/vcf/1kg_v3_nomult.vcf.gz.tbi](http://fileserve.mrcieu.ac.uk/vcf/1kg_v3_nomult.vcf.gz.tbi)
64 | 
65 | ---
66 | 
67 | ### Notes
68 | 
69 | #### Example data
70 | 
71 | `data.vcf.gz` and `data.vcf.gz.tbi` contain the first few rows of the Speliotes 2010 BMI GWAS.
72 | 
73 | The `eur.bed`/`eur.bim`/`eur.fam` files cover the same range as `data.vcf.gz` and are taken from <http://fileserve.mrcieu.ac.uk/ld/data_maf0.01_rs_ref.tgz>.
74 | 


--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
1 | template:
2 |   bootstrap: 5
3 |   light-switch: true
4 | url: https://mrcieu.github.io/gwasvcf/
5 | 


--------------------------------------------------------------------------------
/gwasvcf.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | 
17 | BuildType: Package
18 | PackageUseDevtools: Yes
19 | PackageInstallArgs: --no-multiarch --with-keep.source
20 | PackageRoxygenize: rd,collate,namespace
21 | 


--------------------------------------------------------------------------------
/inst/extdata/data.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MRCIEU/gwasvcf/820267653ac7720926a13cac00b82c0a0ca840b6/inst/extdata/data.vcf.gz


--------------------------------------------------------------------------------
/inst/extdata/data.vcf.gz.tbi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MRCIEU/gwasvcf/820267653ac7720926a13cac00b82c0a0ca840b6/inst/extdata/data.vcf.gz.tbi


--------------------------------------------------------------------------------
/inst/extdata/eur.bed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MRCIEU/gwasvcf/820267653ac7720926a13cac00b82c0a0ca840b6/inst/extdata/eur.bed


--------------------------------------------------------------------------------
/inst/extdata/eur.fam:
--------------------------------------------------------------------------------
  1 | HG00097 HG00097 0 0 0 -9
  2 | HG00099 HG00099 0 0 0 -9
  3 | HG00100 HG00100 0 0 0 -9
  4 | HG00101 HG00101 0 0 0 -9
  5 | HG00102 HG00102 0 0 0 -9
  6 | HG00103 HG00103 0 0 0 -9
  7 | HG00105 HG00105 0 0 0 -9
  8 | HG00106 HG00106 0 0 0 -9
  9 | HG00107 HG00107 0 0 0 -9
 10 | HG00108 HG00108 0 0 0 -9
 11 | HG00109 HG00109 0 0 0 -9
 12 | HG00110 HG00110 0 0 0 -9
 13 | HG00111 HG00111 0 0 0 -9
 14 | HG00112 HG00112 0 0 0 -9
 15 | HG00113 HG00113 0 0 0 -9
 16 | HG00114 HG00114 0 0 0 -9
 17 | HG00115 HG00115 0 0 0 -9
 18 | HG00116 HG00116 0 0 0 -9
 19 | HG00117 HG00117 0 0 0 -9
 20 | HG00118 HG00118 0 0 0 -9
 21 | HG00119 HG00119 0 0 0 -9
 22 | HG00120 HG00120 0 0 0 -9
 23 | HG00121 HG00121 0 0 0 -9
 24 | HG00122 HG00122 0 0 0 -9
 25 | HG00123 HG00123 0 0 0 -9
 26 | HG00125 HG00125 0 0 0 -9
 27 | HG00126 HG00126 0 0 0 -9
 28 | HG00127 HG00127 0 0 0 -9
 29 | HG00128 HG00128 0 0 0 -9
 30 | HG00129 HG00129 0 0 0 -9
 31 | HG00130 HG00130 0 0 0 -9
 32 | HG00131 HG00131 0 0 0 -9
 33 | HG00132 HG00132 0 0 0 -9
 34 | HG00133 HG00133 0 0 0 -9
 35 | HG00136 HG00136 0 0 0 -9
 36 | HG00137 HG00137 0 0 0 -9
 37 | HG00138 HG00138 0 0 0 -9
 38 | HG00139 HG00139 0 0 0 -9
 39 | HG00140 HG00140 0 0 0 -9
 40 | HG00141 HG00141 0 0 0 -9
 41 | HG00142 HG00142 0 0 0 -9
 42 | HG00143 HG00143 0 0 0 -9
 43 | HG00145 HG00145 0 0 0 -9
 44 | HG00146 HG00146 0 0 0 -9
 45 | HG00148 HG00148 0 0 0 -9
 46 | HG00149 HG00149 0 0 0 -9
 47 | HG00150 HG00150 0 0 0 -9
 48 | HG00151 HG00151 0 0 0 -9
 49 | HG00154 HG00154 0 0 0 -9
 50 | HG00155 HG00155 0 0 0 -9
 51 | HG00157 HG00157 0 0 0 -9
 52 | HG00158 HG00158 0 0 0 -9
 53 | HG00159 HG00159 0 0 0 -9
 54 | HG00160 HG00160 0 0 0 -9
 55 | HG00171 HG00171 0 0 0 -9
 56 | HG00173 HG00173 0 0 0 -9
 57 | HG00174 HG00174 0 0 0 -9
 58 | HG00176 HG00176 0 0 0 -9
 59 | HG00177 HG00177 0 0 0 -9
 60 | HG00178 HG00178 0 0 0 -9
 61 | HG00179 HG00179 0 0 0 -9
 62 | HG00180 HG00180 0 0 0 -9
 63 | HG00181 HG00181 0 0 0 -9
 64 | HG00182 HG00182 0 0 0 -9
 65 | HG00183 HG00183 0 0 0 -9
 66 | HG00185 HG00185 0 0 0 -9
 67 | HG00186 HG00186 0 0 0 -9
 68 | HG00187 HG00187 0 0 0 -9
 69 | HG00188 HG00188 0 0 0 -9
 70 | HG00189 HG00189 0 0 0 -9
 71 | HG00190 HG00190 0 0 0 -9
 72 | HG00231 HG00231 0 0 0 -9
 73 | HG00232 HG00232 0 0 0 -9
 74 | HG00233 HG00233 0 0 0 -9
 75 | HG00234 HG00234 0 0 0 -9
 76 | HG00235 HG00235 0 0 0 -9
 77 | HG00236 HG00236 0 0 0 -9
 78 | HG00237 HG00237 0 0 0 -9
 79 | HG00238 HG00238 0 0 0 -9
 80 | HG00239 HG00239 0 0 0 -9
 81 | HG00240 HG00240 0 0 0 -9
 82 | HG00242 HG00242 0 0 0 -9
 83 | HG00243 HG00243 0 0 0 -9
 84 | HG00244 HG00244 0 0 0 -9
 85 | HG00245 HG00245 0 0 0 -9
 86 | HG00246 HG00246 0 0 0 -9
 87 | HG00250 HG00250 0 0 0 -9
 88 | HG00251 HG00251 0 0 0 -9
 89 | HG00252 HG00252 0 0 0 -9
 90 | HG00253 HG00253 0 0 0 -9
 91 | HG00254 HG00254 0 0 0 -9
 92 | HG00255 HG00255 0 0 0 -9
 93 | HG00256 HG00256 0 0 0 -9
 94 | HG00257 HG00257 0 0 0 -9
 95 | HG00258 HG00258 0 0 0 -9
 96 | HG00259 HG00259 0 0 0 -9
 97 | HG00260 HG00260 0 0 0 -9
 98 | HG00261 HG00261 0 0 0 -9
 99 | HG00262 HG00262 0 0 0 -9
100 | HG00263 HG00263 0 0 0 -9
101 | HG00264 HG00264 0 0 0 -9
102 | HG00265 HG00265 0 0 0 -9
103 | HG00266 HG00266 0 0 0 -9
104 | HG00267 HG00267 0 0 0 -9
105 | HG00268 HG00268 0 0 0 -9
106 | HG00269 HG00269 0 0 0 -9
107 | HG00271 HG00271 0 0 0 -9
108 | HG00272 HG00272 0 0 0 -9
109 | HG00273 HG00273 0 0 0 -9
110 | HG00274 HG00274 0 0 0 -9
111 | HG00275 HG00275 0 0 0 -9
112 | HG00276 HG00276 0 0 0 -9
113 | HG00277 HG00277 0 0 0 -9
114 | HG00278 HG00278 0 0 0 -9
115 | HG00280 HG00280 0 0 0 -9
116 | HG00281 HG00281 0 0 0 -9
117 | HG00282 HG00282 0 0 0 -9
118 | HG00284 HG00284 0 0 0 -9
119 | HG00285 HG00285 0 0 0 -9
120 | HG00288 HG00288 0 0 0 -9
121 | HG00290 HG00290 0 0 0 -9
122 | HG00304 HG00304 0 0 0 -9
123 | HG00306 HG00306 0 0 0 -9
124 | HG00308 HG00308 0 0 0 -9
125 | HG00309 HG00309 0 0 0 -9
126 | HG00310 HG00310 0 0 0 -9
127 | HG00311 HG00311 0 0 0 -9
128 | HG00313 HG00313 0 0 0 -9
129 | HG00315 HG00315 0 0 0 -9
130 | HG00318 HG00318 0 0 0 -9
131 | HG00319 HG00319 0 0 0 -9
132 | HG00320 HG00320 0 0 0 -9
133 | HG00321 HG00321 0 0 0 -9
134 | HG00323 HG00323 0 0 0 -9
135 | HG00324 HG00324 0 0 0 -9
136 | HG00325 HG00325 0 0 0 -9
137 | HG00326 HG00326 0 0 0 -9
138 | HG00327 HG00327 0 0 0 -9
139 | HG00328 HG00328 0 0 0 -9
140 | HG00329 HG00329 0 0 0 -9
141 | HG00330 HG00330 0 0 0 -9
142 | HG00331 HG00331 0 0 0 -9
143 | HG00332 HG00332 0 0 0 -9
144 | HG00334 HG00334 0 0 0 -9
145 | HG00335 HG00335 0 0 0 -9
146 | HG00336 HG00336 0 0 0 -9
147 | HG00337 HG00337 0 0 0 -9
148 | HG00338 HG00338 0 0 0 -9
149 | HG00339 HG00339 0 0 0 -9
150 | HG00341 HG00341 0 0 0 -9
151 | HG00342 HG00342 0 0 0 -9
152 | HG00343 HG00343 0 0 0 -9
153 | HG00344 HG00344 0 0 0 -9
154 | HG00345 HG00345 0 0 0 -9
155 | HG00346 HG00346 0 0 0 -9
156 | HG00349 HG00349 0 0 0 -9
157 | HG00350 HG00350 0 0 0 -9
158 | HG00351 HG00351 0 0 0 -9
159 | HG00353 HG00353 0 0 0 -9
160 | HG00355 HG00355 0 0 0 -9
161 | HG00356 HG00356 0 0 0 -9
162 | HG00357 HG00357 0 0 0 -9
163 | HG00358 HG00358 0 0 0 -9
164 | HG00360 HG00360 0 0 0 -9
165 | HG00361 HG00361 0 0 0 -9
166 | HG00362 HG00362 0 0 0 -9
167 | HG00364 HG00364 0 0 0 -9
168 | HG00365 HG00365 0 0 0 -9
169 | HG00366 HG00366 0 0 0 -9
170 | HG00367 HG00367 0 0 0 -9
171 | HG00368 HG00368 0 0 0 -9
172 | HG00369 HG00369 0 0 0 -9
173 | HG00371 HG00371 0 0 0 -9
174 | HG00372 HG00372 0 0 0 -9
175 | HG00373 HG00373 0 0 0 -9
176 | HG00375 HG00375 0 0 0 -9
177 | HG00376 HG00376 0 0 0 -9
178 | HG00378 HG00378 0 0 0 -9
179 | HG00379 HG00379 0 0 0 -9
180 | HG00380 HG00380 0 0 0 -9
181 | HG00381 HG00381 0 0 0 -9
182 | HG00382 HG00382 0 0 0 -9
183 | HG00383 HG00383 0 0 0 -9
184 | HG00384 HG00384 0 0 0 -9
185 | HG01334 HG01334 0 0 0 -9
186 | HG01500 HG01500 0 0 0 -9
187 | HG01501 HG01501 0 0 0 -9
188 | HG01503 HG01503 0 0 0 -9
189 | HG01504 HG01504 0 0 0 -9
190 | HG01506 HG01506 0 0 0 -9
191 | HG01507 HG01507 0 0 0 -9
192 | HG01509 HG01509 0 0 0 -9
193 | HG01510 HG01510 0 0 0 -9
194 | HG01512 HG01512 0 0 0 -9
195 | HG01513 HG01513 0 0 0 -9
196 | HG01515 HG01515 0 0 0 -9
197 | HG01516 HG01516 0 0 0 -9
198 | HG01518 HG01518 0 0 0 -9
199 | HG01519 HG01519 0 0 0 -9
200 | HG01521 HG01521 0 0 0 -9
201 | HG01522 HG01522 0 0 0 -9
202 | HG01524 HG01524 0 0 0 -9
203 | HG01525 HG01525 0 0 0 -9
204 | HG01527 HG01527 0 0 0 -9
205 | HG01528 HG01528 0 0 0 -9
206 | HG01530 HG01530 0 0 0 -9
207 | HG01531 HG01531 0 0 0 -9
208 | HG01536 HG01536 0 0 0 -9
209 | HG01537 HG01537 0 0 0 -9
210 | HG01602 HG01602 0 0 0 -9
211 | HG01603 HG01603 0 0 0 -9
212 | HG01605 HG01605 0 0 0 -9
213 | HG01606 HG01606 0 0 0 -9
214 | HG01607 HG01607 0 0 0 -9
215 | HG01608 HG01608 0 0 0 -9
216 | HG01610 HG01610 0 0 0 -9
217 | HG01612 HG01612 0 0 0 -9
218 | HG01613 HG01613 0 0 0 -9
219 | HG01615 HG01615 0 0 0 -9
220 | HG01617 HG01617 0 0 0 -9
221 | HG01618 HG01618 0 0 0 -9
222 | HG01619 HG01619 0 0 0 -9
223 | HG01620 HG01620 0 0 0 -9
224 | HG01623 HG01623 0 0 0 -9
225 | HG01624 HG01624 0 0 0 -9
226 | HG01625 HG01625 0 0 0 -9
227 | HG01626 HG01626 0 0 0 -9
228 | HG01628 HG01628 0 0 0 -9
229 | HG01630 HG01630 0 0 0 -9
230 | HG01631 HG01631 0 0 0 -9
231 | HG01632 HG01632 0 0 0 -9
232 | HG01668 HG01668 0 0 0 -9
233 | HG01669 HG01669 0 0 0 -9
234 | HG01670 HG01670 0 0 0 -9
235 | HG01672 HG01672 0 0 0 -9
236 | HG01673 HG01673 0 0 0 -9
237 | HG01675 HG01675 0 0 0 -9
238 | HG01676 HG01676 0 0 0 -9
239 | HG01678 HG01678 0 0 0 -9
240 | HG01679 HG01679 0 0 0 -9
241 | HG01680 HG01680 0 0 0 -9
242 | HG01682 HG01682 0 0 0 -9
243 | HG01684 HG01684 0 0 0 -9
244 | HG01685 HG01685 0 0 0 -9
245 | HG01686 HG01686 0 0 0 -9
246 | HG01694 HG01694 0 0 0 -9
247 | HG01695 HG01695 0 0 0 -9
248 | HG01697 HG01697 0 0 0 -9
249 | HG01699 HG01699 0 0 0 -9
250 | HG01700 HG01700 0 0 0 -9
251 | HG01702 HG01702 0 0 0 -9
252 | HG01704 HG01704 0 0 0 -9
253 | HG01705 HG01705 0 0 0 -9
254 | HG01707 HG01707 0 0 0 -9
255 | HG01708 HG01708 0 0 0 -9
256 | HG01709 HG01709 0 0 0 -9
257 | HG01710 HG01710 0 0 0 -9
258 | HG01746 HG01746 0 0 0 -9
259 | HG01747 HG01747 0 0 0 -9
260 | HG01756 HG01756 0 0 0 -9
261 | HG01757 HG01757 0 0 0 -9
262 | HG01761 HG01761 0 0 0 -9
263 | HG01762 HG01762 0 0 0 -9
264 | HG01765 HG01765 0 0 0 -9
265 | HG01766 HG01766 0 0 0 -9
266 | HG01767 HG01767 0 0 0 -9
267 | HG01768 HG01768 0 0 0 -9
268 | HG01770 HG01770 0 0 0 -9
269 | HG01771 HG01771 0 0 0 -9
270 | HG01773 HG01773 0 0 0 -9
271 | HG01775 HG01775 0 0 0 -9
272 | HG01776 HG01776 0 0 0 -9
273 | HG01777 HG01777 0 0 0 -9
274 | HG01779 HG01779 0 0 0 -9
275 | HG01781 HG01781 0 0 0 -9
276 | HG01783 HG01783 0 0 0 -9
277 | HG01784 HG01784 0 0 0 -9
278 | HG01785 HG01785 0 0 0 -9
279 | HG01786 HG01786 0 0 0 -9
280 | HG01789 HG01789 0 0 0 -9
281 | HG01790 HG01790 0 0 0 -9
282 | HG01791 HG01791 0 0 0 -9
283 | HG02215 HG02215 0 0 0 -9
284 | HG02219 HG02219 0 0 0 -9
285 | HG02220 HG02220 0 0 0 -9
286 | HG02221 HG02221 0 0 0 -9
287 | HG02223 HG02223 0 0 0 -9
288 | HG02224 HG02224 0 0 0 -9
289 | HG02230 HG02230 0 0 0 -9
290 | HG02231 HG02231 0 0 0 -9
291 | HG02232 HG02232 0 0 0 -9
292 | HG02233 HG02233 0 0 0 -9
293 | HG02235 HG02235 0 0 0 -9
294 | HG02236 HG02236 0 0 0 -9
295 | HG02238 HG02238 0 0 0 -9
296 | HG02239 HG02239 0 0 0 -9
297 | NA06984 NA06984 0 0 0 -9
298 | NA06985 NA06985 0 0 0 -9
299 | NA06986 NA06986 0 0 0 -9
300 | NA06989 NA06989 0 0 0 -9
301 | NA06994 NA06994 0 0 0 -9
302 | NA07000 NA07000 0 0 0 -9
303 | NA07037 NA07037 0 0 0 -9
304 | NA07048 NA07048 0 0 0 -9
305 | NA07051 NA07051 0 0 0 -9
306 | NA07056 NA07056 0 0 0 -9
307 | NA07347 NA07347 0 0 0 -9
308 | NA07357 NA07357 0 0 0 -9
309 | NA10847 NA10847 0 0 0 -9
310 | NA10851 NA10851 0 0 0 -9
311 | NA11829 NA11829 0 0 0 -9
312 | NA11830 NA11830 0 0 0 -9
313 | NA11831 NA11831 0 0 0 -9
314 | NA11832 NA11832 0 0 0 -9
315 | NA11840 NA11840 0 0 0 -9
316 | NA11843 NA11843 0 0 0 -9
317 | NA11881 NA11881 0 0 0 -9
318 | NA11892 NA11892 0 0 0 -9
319 | NA11893 NA11893 0 0 0 -9
320 | NA11894 NA11894 0 0 0 -9
321 | NA11918 NA11918 0 0 0 -9
322 | NA11919 NA11919 0 0 0 -9
323 | NA11920 NA11920 0 0 0 -9
324 | NA11930 NA11930 0 0 0 -9
325 | NA11931 NA11931 0 0 0 -9
326 | NA11932 NA11932 0 0 0 -9
327 | NA11933 NA11933 0 0 0 -9
328 | NA11992 NA11992 0 0 0 -9
329 | NA11994 NA11994 0 0 0 -9
330 | NA11995 NA11995 0 0 0 -9
331 | NA12003 NA12003 0 0 0 -9
332 | NA12004 NA12004 0 0 0 -9
333 | NA12005 NA12005 0 0 0 -9
334 | NA12006 NA12006 0 0 0 -9
335 | NA12043 NA12043 0 0 0 -9
336 | NA12044 NA12044 0 0 0 -9
337 | NA12045 NA12045 0 0 0 -9
338 | NA12046 NA12046 0 0 0 -9
339 | NA12058 NA12058 0 0 0 -9
340 | NA12144 NA12144 0 0 0 -9
341 | NA12154 NA12154 0 0 0 -9
342 | NA12155 NA12155 0 0 0 -9
343 | NA12156 NA12156 0 0 0 -9
344 | NA12234 NA12234 0 0 0 -9
345 | NA12249 NA12249 0 0 0 -9
346 | NA12272 NA12272 0 0 0 -9
347 | NA12273 NA12273 0 0 0 -9
348 | NA12275 NA12275 0 0 0 -9
349 | NA12282 NA12282 0 0 0 -9
350 | NA12283 NA12283 0 0 0 -9
351 | NA12286 NA12286 0 0 0 -9
352 | NA12287 NA12287 0 0 0 -9
353 | NA12340 NA12340 0 0 0 -9
354 | NA12341 NA12341 0 0 0 -9
355 | NA12342 NA12342 0 0 0 -9
356 | NA12347 NA12347 0 0 0 -9
357 | NA12348 NA12348 0 0 0 -9
358 | NA12383 NA12383 0 0 0 -9
359 | NA12399 NA12399 0 0 0 -9
360 | NA12400 NA12400 0 0 0 -9
361 | NA12413 NA12413 0 0 0 -9
362 | NA12414 NA12414 0 0 0 -9
363 | NA12489 NA12489 0 0 0 -9
364 | NA12546 NA12546 0 0 0 -9
365 | NA12716 NA12716 0 0 0 -9
366 | NA12717 NA12717 0 0 0 -9
367 | NA12718 NA12718 0 0 0 -9
368 | NA12748 NA12748 0 0 0 -9
369 | NA12749 NA12749 0 0 0 -9
370 | NA12750 NA12750 0 0 0 -9
371 | NA12751 NA12751 0 0 0 -9
372 | NA12760 NA12760 0 0 0 -9
373 | NA12761 NA12761 0 0 0 -9
374 | NA12762 NA12762 0 0 0 -9
375 | NA12763 NA12763 0 0 0 -9
376 | NA12775 NA12775 0 0 0 -9
377 | NA12776 NA12776 0 0 0 -9
378 | NA12777 NA12777 0 0 0 -9
379 | NA12778 NA12778 0 0 0 -9
380 | NA12812 NA12812 0 0 0 -9
381 | NA12813 NA12813 0 0 0 -9
382 | NA12814 NA12814 0 0 0 -9
383 | NA12815 NA12815 0 0 0 -9
384 | NA12827 NA12827 0 0 0 -9
385 | NA12828 NA12828 0 0 0 -9
386 | NA12829 NA12829 0 0 0 -9
387 | NA12830 NA12830 0 0 0 -9
388 | NA12842 NA12842 0 0 0 -9
389 | NA12843 NA12843 0 0 0 -9
390 | NA12872 NA12872 0 0 0 -9
391 | NA12873 NA12873 0 0 0 -9
392 | NA12874 NA12874 0 0 0 -9
393 | NA12878 NA12878 0 0 0 -9
394 | NA12889 NA12889 0 0 0 -9
395 | NA12890 NA12890 0 0 0 -9
396 | NA20502 NA20502 0 0 0 -9
397 | NA20503 NA20503 0 0 0 -9
398 | NA20504 NA20504 0 0 0 -9
399 | NA20505 NA20505 0 0 0 -9
400 | NA20506 NA20506 0 0 0 -9
401 | NA20507 NA20507 0 0 0 -9
402 | NA20508 NA20508 0 0 0 -9
403 | NA20509 NA20509 0 0 0 -9
404 | NA20510 NA20510 0 0 0 -9
405 | NA20511 NA20511 0 0 0 -9
406 | NA20512 NA20512 0 0 0 -9
407 | NA20513 NA20513 0 0 0 -9
408 | NA20514 NA20514 0 0 0 -9
409 | NA20515 NA20515 0 0 0 -9
410 | NA20516 NA20516 0 0 0 -9
411 | NA20517 NA20517 0 0 0 -9
412 | NA20518 NA20518 0 0 0 -9
413 | NA20519 NA20519 0 0 0 -9
414 | NA20520 NA20520 0 0 0 -9
415 | NA20521 NA20521 0 0 0 -9
416 | NA20522 NA20522 0 0 0 -9
417 | NA20524 NA20524 0 0 0 -9
418 | NA20525 NA20525 0 0 0 -9
419 | NA20527 NA20527 0 0 0 -9
420 | NA20528 NA20528 0 0 0 -9
421 | NA20529 NA20529 0 0 0 -9
422 | NA20530 NA20530 0 0 0 -9
423 | NA20531 NA20531 0 0 0 -9
424 | NA20532 NA20532 0 0 0 -9
425 | NA20533 NA20533 0 0 0 -9
426 | NA20534 NA20534 0 0 0 -9
427 | NA20535 NA20535 0 0 0 -9
428 | NA20536 NA20536 0 0 0 -9
429 | NA20538 NA20538 0 0 0 -9
430 | NA20539 NA20539 0 0 0 -9
431 | NA20540 NA20540 0 0 0 -9
432 | NA20541 NA20541 0 0 0 -9
433 | NA20542 NA20542 0 0 0 -9
434 | NA20543 NA20543 0 0 0 -9
435 | NA20544 NA20544 0 0 0 -9
436 | NA20581 NA20581 0 0 0 -9
437 | NA20582 NA20582 0 0 0 -9
438 | NA20585 NA20585 0 0 0 -9
439 | NA20586 NA20586 0 0 0 -9
440 | NA20587 NA20587 0 0 0 -9
441 | NA20588 NA20588 0 0 0 -9
442 | NA20589 NA20589 0 0 0 -9
443 | NA20752 NA20752 0 0 0 -9
444 | NA20753 NA20753 0 0 0 -9
445 | NA20754 NA20754 0 0 0 -9
446 | NA20755 NA20755 0 0 0 -9
447 | NA20756 NA20756 0 0 0 -9
448 | NA20757 NA20757 0 0 0 -9
449 | NA20758 NA20758 0 0 0 -9
450 | NA20759 NA20759 0 0 0 -9
451 | NA20760 NA20760 0 0 0 -9
452 | NA20761 NA20761 0 0 0 -9
453 | NA20762 NA20762 0 0 0 -9
454 | NA20763 NA20763 0 0 0 -9
455 | NA20764 NA20764 0 0 0 -9
456 | NA20765 NA20765 0 0 0 -9
457 | NA20766 NA20766 0 0 0 -9
458 | NA20767 NA20767 0 0 0 -9
459 | NA20768 NA20768 0 0 0 -9
460 | NA20769 NA20769 0 0 0 -9
461 | NA20770 NA20770 0 0 0 -9
462 | NA20771 NA20771 0 0 0 -9
463 | NA20772 NA20772 0 0 0 -9
464 | NA20773 NA20773 0 0 0 -9
465 | NA20774 NA20774 0 0 0 -9
466 | NA20775 NA20775 0 0 0 -9
467 | NA20778 NA20778 0 0 0 -9
468 | NA20783 NA20783 0 0 0 -9
469 | NA20785 NA20785 0 0 0 -9
470 | NA20786 NA20786 0 0 0 -9
471 | NA20787 NA20787 0 0 0 -9
472 | NA20790 NA20790 0 0 0 -9
473 | NA20792 NA20792 0 0 0 -9
474 | NA20795 NA20795 0 0 0 -9
475 | NA20796 NA20796 0 0 0 -9
476 | NA20797 NA20797 0 0 0 -9
477 | NA20798 NA20798 0 0 0 -9
478 | NA20799 NA20799 0 0 0 -9
479 | NA20800 NA20800 0 0 0 -9
480 | NA20801 NA20801 0 0 0 -9
481 | NA20802 NA20802 0 0 0 -9
482 | NA20803 NA20803 0 0 0 -9
483 | NA20804 NA20804 0 0 0 -9
484 | NA20805 NA20805 0 0 0 -9
485 | NA20806 NA20806 0 0 0 -9
486 | NA20807 NA20807 0 0 0 -9
487 | NA20808 NA20808 0 0 0 -9
488 | NA20809 NA20809 0 0 0 -9
489 | NA20810 NA20810 0 0 0 -9
490 | NA20811 NA20811 0 0 0 -9
491 | NA20812 NA20812 0 0 0 -9
492 | NA20813 NA20813 0 0 0 -9
493 | NA20814 NA20814 0 0 0 -9
494 | NA20815 NA20815 0 0 0 -9
495 | NA20818 NA20818 0 0 0 -9
496 | NA20819 NA20819 0 0 0 -9
497 | NA20821 NA20821 0 0 0 -9
498 | NA20822 NA20822 0 0 0 -9
499 | NA20826 NA20826 0 0 0 -9
500 | NA20827 NA20827 0 0 0 -9
501 | NA20828 NA20828 0 0 0 -9
502 | NA20832 NA20832 0 0 0 -9
503 | 


--------------------------------------------------------------------------------
/inst/sandpit/bmi_example.r:
--------------------------------------------------------------------------------
 1 | if(!require(gwasvcftools))
 2 | {
 3 | 	if(!required(devtools)) install.packages("devtools")
 4 | 	devtools::install_github("MRCIEU/gwasvcftools")
 5 | }
 6 | library(gwasvcftools)
 7 | library(argparse)
 8 | 
 9 | # create parser object
10 | parser <- ArgumentParser()
11 | parser$add_argument('--snplist', required=TRUE)
12 | parser$add_argument('--bcf-dir', required=TRUE)
13 | parser$add_argument('--gwas-id', required=TRUE)
14 | parser$add_argument('--out', required=TRUE)
15 | parser$add_argument('--bfile', required=TRUE)
16 | parser$add_argument('--get-proxies', action="store_true", default=FALSE)
17 | parser$add_argument('--vcf-ref', required=FALSE)
18 | parser$add_argument('--tag-r2', type="double", default=0.6)
19 | parser$add_argument('--tag-kb', type="double", default=5000)
20 | parser$add_argument('--tag-nsnp', type="double", default=5000)
21 | parser$add_argument('--palindrome-freq', type="double", default=0.4)
22 | parser$add_argument('--no-clean', action="store_true", default=FALSE)
23 | parser$add_argument('--rdsf-config', required=FALSE, default='')
24 | parser$add_argument('--instrument-list', required=FALSE)
25 | 
26 | 
27 | # args <- parser$parse_args()
28 | setwd("~/mr-eve/gwas-instrument-subsets/scripts")
29 | args <- parser$parse_args(c("--bfile", "../../vcf-reference-datasets/ukb/ukb_ref", "--gwas-id", "2", "--snplist", "temp1.txt", "--no-clean", "--out", "out", "--bcf-dir", "../../gwas-files", "--vcf-ref", "../../vcf-reference-datasets/1000g/1kg_v3_nomult.bcf", "--get-proxies"))
30 | print(args)
31 | tempname <- tempfile(pattern="extract", tmpdir=dirname(args[['out']]))
32 | bcf <- file.path(args[['bcf_dir']], args[['gwas_id']], "harmonised.bcf")
33 | snplist <- scan(args[['snplist']], what=character())
34 | 
35 | 
36 | # Test Different proxy options
37 | 
38 | o1 <- extract(bcf, snplist, tempname, "yes", args[["bfile"]], args[["vcf_ref"]])
39 | dim(o1)
40 | o2 <- extract(bcf, snplist, tempname, "no", args[["bfile"]])
41 | dim(o2)
42 | o3 <- extract(bcf, snplist, tempname, "only", args[["bfile"]], args[["vcf_ref"]])
43 | dim(o3)
44 | 
45 | 
46 | # Check that proxies are correctly oriented
47 | # Expect to see that the proxies (o3) have effect sizes that strongly correlate with the true effect sizes (o2)
48 | 
49 | a <- merge(o3, o2, by="ID")
50 | i <- a$ALT.x == a$ALT.y
51 | table(i)
52 | cor(a$B.x, a$B.y)
53 | plot(a$B.x, a$B.y)
54 | 
55 | 
56 | # Finally, check that the original elastic files are on the same strand as the harmonised data
57 | 
58 | o <- fread("gunzip -c ../../gwas-files/2/elastic.gz", he=FALSE)
59 | temp <- merge(o3, o, by.x="ID", by.y="V1")
60 | dim(temp)
61 | i <- temp$REF != temp$V2
62 | table(i)
63 | cor(temp$B, temp$V5)
64 | temp$B[i] <- temp$B[i] * -1
65 | cor(temp$B, temp$V5)
66 | plot(temp$B, temp$V5)
67 | 
68 | 
69 | 


--------------------------------------------------------------------------------
/inst/sandpit/bmi_example_cp.r:
--------------------------------------------------------------------------------
 1 | if(!require(gwasvcftools))
 2 | {
 3 | 	if(!required(devtools)) install.packages("devtools")
 4 | 	devtools::install_github("MRCIEU/gwasvcftools")
 5 | }
 6 | library(gwasvcftools)
 7 | library(argparse)
 8 | 
 9 | # create parser object
10 | parser <- ArgumentParser()
11 | parser$add_argument('--snplist', required=TRUE)
12 | parser$add_argument('--bcf-dir', required=TRUE)
13 | parser$add_argument('--gwas-id', required=TRUE)
14 | parser$add_argument('--out', required=TRUE)
15 | parser$add_argument('--bfile', required=TRUE)
16 | parser$add_argument('--get-proxies', action="store_true", default=FALSE)
17 | parser$add_argument('--vcf-ref', required=FALSE)
18 | parser$add_argument('--tag-r2', type="double", default=0.6)
19 | parser$add_argument('--tag-kb', type="double", default=5000)
20 | parser$add_argument('--tag-nsnp', type="double", default=5000)
21 | parser$add_argument('--palindrome-freq', type="double", default=0.4)
22 | parser$add_argument('--no-clean', action="store_true", default=FALSE)
23 | parser$add_argument('--rdsf-config', required=FALSE, default='')
24 | parser$add_argument('--instrument-list', required=FALSE)
25 | 
26 | 
27 | # args <- parser$parse_args()
28 | setwd("~/mr-eve/gwas-instrument-subsets/scripts")
29 | args <- parser$parse_args(c("--bfile", "../../vcf-reference-datasets/ukb/ukb_ref", "--gwas-id", "2", "--snplist", "temp2.txt", "--no-clean", "--out", "out", "--bcf-dir", "../../gwas-files", "--vcf-ref", "../../vcf-reference-datasets/1000g/1kg_v3_nomult.bcf", "--get-proxies"))
30 | print(args)
31 | tempname <- tempfile(pattern="extract", tmpdir=dirname(args[['out']]))
32 | bcf <- file.path(args[['bcf_dir']], args[['gwas_id']], "harmonised.bcf")
33 | snplist <- fread(args[['snplist']], header=FALSE, sep="\t")
34 | 
35 | 
36 | # Test Different proxy options
37 | 
38 | o1 <- extract(bcf, snplist, tempname, "yes", args[["bfile"]], args[["vcf_ref"]])
39 | dim(o1)
40 | o2 <- extract(bcf, snplist, tempname, "no", args[["bfile"]])
41 | dim(o2)
42 | o3 <- extract(bcf, snplist, tempname, "only", args[["bfile"]], args[["vcf_ref"]])
43 | dim(o3)
44 | 
45 | 
46 | # Check that proxies are correctly oriented
47 | # Expect to see that the proxies (o3) have effect sizes that strongly correlate with the true effect sizes (o2)
48 | 
49 | a <- merge(o3, o2, by="ID")
50 | i <- a$ALT.x == a$ALT.y
51 | table(i)
52 | cor(a$B.x, a$B.y)
53 | plot(a$B.x, a$B.y)
54 | 
55 | 
56 | # Finally, check that the original elastic files are on the same strand as the harmonised data
57 | 
58 | o <- fread("gunzip -c ../../gwas-files/2/elastic.gz", he=FALSE)
59 | temp <- merge(o3, o, by.x="ID", by.y="V1")
60 | dim(temp)
61 | i <- temp$REF != temp$V2
62 | table(i)
63 | cor(temp$B, temp$V5)
64 | temp$B[i] <- temp$B[i] * -1
65 | cor(temp$B, temp$V5)
66 | plot(temp$B, temp$V5)
67 | 
68 | 
69 | 


--------------------------------------------------------------------------------
/inst/sandpit/harmonise_against_ref.r:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | suppressPackageStartupMessages({
  3 | 	library(argparse)
  4 | 	library(dplyr)
  5 | 	library(TwoSampleMR)
  6 | 	# library(gwasvcftools)
  7 | 	library(unixtools)
  8 | 	library(devtools)
  9 | 	load_all()
 10 | })
 11 | 
 12 | # Command-line interface. Options that carry a default are declared optional, because argparse never applies the default of a required option
 13 | parser <- ArgumentParser()
 14 | parser$add_argument('--ref_file', required=TRUE)
 15 | parser$add_argument('--ref_build', required=TRUE)
 16 | parser$add_argument('--ref_info', required=TRUE)
 17 | parser$add_argument('--mrbase_id', required=TRUE)
 18 | parser$add_argument('--gwas_file', required=TRUE)
 19 | parser$add_argument('--gzipped', required=FALSE, type="integer", default=1)
 20 | parser$add_argument('--delimiter', default="\t", required=FALSE)
 21 | parser$add_argument('--skip', required=FALSE, type="integer", default=0)
 22 | parser$add_argument('--dbsnp_field', type="integer", required=TRUE)
 23 | parser$add_argument('--ea_field', type="integer", required=TRUE)
 24 | parser$add_argument('--nea_field', type="integer", required=FALSE, default=0)
 25 | parser$add_argument('--ea_af_field', type="integer", required=FALSE, default=0)
 26 | parser$add_argument('--effect_field', type="integer", required=FALSE, default=0)
 27 | parser$add_argument('--se_field', type="integer", required=FALSE, default=0)
 28 | parser$add_argument('--pval_field', type="integer", required=FALSE, default=0)
 29 | parser$add_argument('--n_field', type="integer", required=FALSE, default=0)
 30 | parser$add_argument('--info_field', type="integer", required=FALSE, default=0)
 31 | parser$add_argument('--z_field', type="integer", required=FALSE, default=0)
 32 | parser$add_argument('--out_type', required=FALSE, default="bcf")  # used below as the output file extension
 33 | parser$add_argument('--out', required=TRUE)
 34 | args <- parser$parse_args()
 35 | str(args)  # echo the parsed options for the run log
 36 | 
 37 | 
 38 | # Read in GWAS data
 39 | set.tempdir("tmp")
 40 | gwas <- read_gwas(
 41 | 	args[["gwas_file"]],
 42 | 	skip=args[["skip"]],
 43 | 	snp=args[["dbsnp_field"]],
 44 | 	gzipped=args[["gzipped"]],
 45 | 	delimiter=args[["delimiter"]],
 46 | 	ea=args[["ea_field"]],
 47 | 	nea=args[["nea_field"]],
 48 | 	ea_af=args[["ea_af_field"]],
 49 | 	effect=args[["effect_field"]],
 50 | 	se=args[["se_field"]],
 51 | 	pval=args[["pval_field"]],
 52 | 	n=args[["n_field"]],
 53 | 	info=args[["info_field"]],
 54 | 	z=args[["z_field"]]
 55 | )
 56 | 
 57 | 
 58 | # Read in ref
 59 | ref <- read_reference(args[["ref_file"]], gwas$SNP, args[["out"]]) 
 60 | 
 61 | # Harmonise the GWAS against the reference panel
 62 | harmonised <- harmonise_against_ref(gwas, ref)
 63 | save(harmonised, file="temp.rdata")  # snapshot of the harmonised object, written to the working directory
 64 | q()  # NOTE(review): the script quits here, so the metadata and VCF steps below never run - looks like a debugging stop; confirm before removing
 65 | 
 66 | # Gather metadata
 67 | metadata.input <- args
 68 | names(metadata.input) <- paste0("input.", names(metadata.input))
 69 | 
 70 | ao <- TwoSampleMR::available_outcomes(NULL)
 71 | metadata.gwas <- as.list(subset(ao, id == args[["mrbase_id"]]))
 72 | metadata.gwas[['path']] <- NULL
 73 | metadata.gwas[['filename']] <- NULL
 74 | names(metadata.gwas) <- paste0("gwas.", names(metadata.gwas))
 75 | 
 76 | metadata.counts <- as.list(attr(harmonised, "log"))
 77 | metadata.counts[['id.exposure']] <- NULL
 78 | metadata.counts[['id.outcome']] <- NULL
 79 | names(metadata.counts) <- paste0("counts.", names(metadata.counts))
 80 | 
 81 | metadata <- c(metadata.input, metadata.gwas, metadata.counts)
 82 | str(metadata)
 83 | 
 84 | # Create vcf format
 85 | vcf <- gwasvcftools::make_vcf(
 86 | 	ID = harmonised$ID,
 87 | 	ALT = harmonised$ALT,
 88 | 	REF = harmonised$REF,
 89 | 	B = harmonised$BETA,
 90 | 	SE = harmonised$SE,
 91 | 	PVAL = harmonised$PVALUE,
 92 | 	N = harmonised$N,
 93 | 	CHROM = harmonised$CHROM,
 94 | 	POS = harmonised$POS,
 95 | 	AF = harmonised$AF,
 96 | 	QUAL = harmonised$INFO,
 97 | 	FILTER = rep("PASS", nrow(harmonised)),
 98 | 	ZVALUE = harmonised$ZVALUE,
 99 | 	build = args[["ref_build"]],
100 | 	meta_data = metadata
101 | )
102 | 
103 | # Write vcf
104 | gwasvcftools::write_vcf(vcf, paste0(args[["out"]], ".", args[["out_type"]]))
105 | 
106 | 


--------------------------------------------------------------------------------
/inst/sandpit/misc/create_ref.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Download 1000 genomes phase 3 (release 20130502); set KG_URL to fetch from a mirror instead
 4 | KG_URL="${KG_URL:-ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502}"
 5 | wget "$KG_URL/ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
 6 | wget "$KG_URL/ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi"
 7 | wget "$KG_URL/ALL.chr10.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
 8 | wget "$KG_URL/ALL.chr10.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi"
 9 | wget "$KG_URL/ALL.chr11.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
10 | wget "$KG_URL/ALL.chr11.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi"
11 | wget "$KG_URL/ALL.chr12.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
12 | wget "$KG_URL/ALL.chr12.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi"
13 | wget "$KG_URL/ALL.chr13.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
14 | wget "$KG_URL/ALL.chr13.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi"
15 | wget "$KG_URL/ALL.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
16 | wget "$KG_URL/ALL.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi"
17 | wget "$KG_URL/ALL.chr15.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
18 | wget "$KG_URL/ALL.chr15.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi"
19 | wget "$KG_URL/ALL.chr16.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
20 | wget "$KG_URL/ALL.chr16.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi"
21 | wget "$KG_URL/ALL.chr17.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
22 | wget "$KG_URL/ALL.chr17.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi"
23 | wget "$KG_URL/ALL.chr18.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
24 | wget "$KG_URL/ALL.chr18.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi"
25 | wget "$KG_URL/ALL.chr19.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
26 | wget "$KG_URL/ALL.chr19.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi"
27 | wget "$KG_URL/ALL.chr2.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
28 | wget "$KG_URL/ALL.chr2.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi"
29 | wget "$KG_URL/ALL.chr20.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
30 | wget "$KG_URL/ALL.chr20.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi"
31 | wget "$KG_URL/ALL.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
32 | wget "$KG_URL/ALL.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi"
33 | wget "$KG_URL/ALL.chr22.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
34 | wget "$KG_URL/ALL.chr22.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi"
35 | wget "$KG_URL/ALL.chr3.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
36 | wget "$KG_URL/ALL.chr3.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi"
37 | wget "$KG_URL/ALL.chr4.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
38 | wget "$KG_URL/ALL.chr4.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi"
39 | wget "$KG_URL/ALL.chr5.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
40 | wget "$KG_URL/ALL.chr5.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi"
41 | wget "$KG_URL/ALL.chr6.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
42 | wget "$KG_URL/ALL.chr6.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi"
43 | wget "$KG_URL/ALL.chr7.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
44 | wget "$KG_URL/ALL.chr7.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi"
45 | wget "$KG_URL/ALL.chr8.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
46 | wget "$KG_URL/ALL.chr8.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi"
47 | wget "$KG_URL/ALL.chr9.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
48 | wget "$KG_URL/ALL.chr9.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi"
49 | wget "$KG_URL/ALL.chrMT.phase3_callmom-v0_4.20130502.genotypes.vcf.gz"
50 | wget "$KG_URL/ALL.chrMT.phase3_callmom-v0_4.20130502.genotypes.vcf.gz.tbi"
51 | wget "$KG_URL/ALL.chrX.phase3_shapeit2_mvncall_integrated_v1b.20130502.genotypes.vcf.gz"
52 | wget "$KG_URL/ALL.chrX.phase3_shapeit2_mvncall_integrated_v1b.20130502.genotypes.vcf.gz.tbi"
53 | wget "$KG_URL/ALL.chrY.phase3_integrated_v2a.20130502.genotypes.vcf.gz"
54 | wget "$KG_URL/ALL.chrY.phase3_integrated_v2a.20130502.genotypes.vcf.gz.tbi"
55 | 
56 | 
57 | for f in *vcf.gz  # one sites-only, indexed BCF per downloaded VCF
58 | do
59 | 	echo "$f"
60 | 	i=$(echo "$f" | cut -d "." -f 2)  # second dot-separated field of the file name, e.g. chr1
61 | 	echo "$i"
62 | 	bcftools view -G -Ob "$f" > "1kg_v3_$i.bcf"  # -G drops the per-sample genotype columns
63 | 	bcftools index "1kg_v3_$i.bcf"
64 | done
65 | 
66 | bcftools concat 1kg_v3_*.bcf -Ob > 1kg_v3.bcf
67 | 
68 | # Multi allelic SNPs are problematic for harmonisation because we would have to update TwoSampleMR functions to allow this.
69 | # Most GWASs drop multi-allelic SNPs so we can probably try to ignore, though remains to be seen how much we lose due to this strategy
70 | # Split and keep first biallelic version of each variant only
71 | # Is there a better way to handle multi-allelic SNPs?
72 | bcftools norm -m- 1kg_v3.bcf | bcftools norm -d all -Ob > 1kg_v3_nomult.bcf
73 | 
74 | # Note we can also use dbSNP which has ALT allele frequencies
75 | # ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/All_20180418.vcf.gz
76 | 
77 | # Or dbSNP filter of only common variants
78 | # ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/common_all_20180418.vcf.gz
79 | # Though this is slightly problematic because they have a CAF column instead of a standard AF column,
80 | # so handling multiallelic SNPs won't automatically handle this column
81 | 


--------------------------------------------------------------------------------
/inst/sandpit/misc/harmonise.r:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | library(TwoSampleMR)
 4 | library(data.table)
 5 | library(dplyr)
 6 | library(argparse)
 7 | library(magrittr)
 8 | 
 9 | parser <- ArgumentParser()  # command-line options for the harmonisation script
10 | parser$add_argument('--ref', required=TRUE)
11 | parser$add_argument('--gwas', required=TRUE)  # read unconditionally by fread() below, so it cannot be optional
12 | parser$add_argument('--out', required=TRUE)
13 | 
14 | args <- parser$parse_args()
15 | 
16 | 
17 | 
18 | # 1. Read in the GWAS
19 | # 2. Read in the reference data
20 | # 3. Harmonise GWAS against the reference
21 | # 4. Write to VCF
22 | 
23 | 
24 | # Read the GWAS
25 | # Just assuming the format used for uploading to elastic
26 | gwas <- data.table::fread(paste0("gunzip -c ", args[["gwas"]]))
27 | names(gwas) <- c("snp_col", "ea_col", "oa_col", "eaf_col", "beta_col", "se_col", "pval_col", "ncontrol_col")
28 | # This is a continuous GWAS so no ncase column
29 | gwas$ncase_col <- NA
30 | 
31 | 
32 | # Read the reference
33 | ref <- data.table::fread(paste0("gunzip -c ", args[["ref"]]))
34 | stopifnot(c("CHROM", "ID", "REF", "ALT", "MAF", "POS") %in% names(ref))
35 | 
36 | # For simplicity just keeping SNP Ids that are in common
37 | ref <- subset(ref, ID %in% gwas$snp_col)
38 | 
39 | # Put in some dummy variables for the reference for harmonising
40 | ref$beta <- 1
41 | ref$se <- 0.1
42 | ref$pval <- 0.1
43 | a <- TwoSampleMR::format_data(
44 | 	ref,
45 | 	type="exposure",
46 | 	snp_col="ID",
47 | 	effect_allele_col="ALT",
48 | 	other_allele_col="REF",
49 | 	eaf_col="MAF"
50 | )
51 | 
52 | b <- TwoSampleMR::format_data(gwas, type="outcome", 
53 | 	snp_col="snp_col",
54 | 	beta_col="beta_col",
55 | 	se_col="se_col",
56 | 	effect_allele_col="ea_col",
57 | 	other_allele_col="oa_col",
58 | 	eaf_col="eaf_col",
59 | 	ncase_col="ncase_col",
60 | 	ncontrol_col="ncontrol_col",
61 | 	pval_col="pval_col"
62 | )
63 | 
64 | # Is the gwas on the forward strand?
65 | action <- is_forward_strand(gwas$snp_col, gwas$ea_col, gwas$oa_col, ref$ID, ref$ALT, ref$REF)
66 | 
67 | # Harmonise the gwas according to the reference panel
68 | ab <- TwoSampleMR::harmonise_data(a, b, action=action)
69 | 
70 | gwas_h <- ab %$% 
71 | 	dplyr::data_frame(
72 | 		ID=SNP, 
73 | 		ALT=effect_allele.exposure,
74 | 		REF=other_allele.exposure,
75 | 		BETA=beta.outcome,
76 | 		SE=se.outcome,
77 | 		PVALUE=pval.outcome,
78 | 		AF=eaf.outcome,
79 | 		N=samplesize.outcome,
80 | 		NCASE=ncase.outcome,
81 | 		NCONTROL=ncontrol.outcome) %>% 
82 | 	dplyr::inner_join(subset(ref, select=c(ID,REF,ALT,CHROM,POS,MAF)), by=c("ID", "REF", "ALT"))
83 | 
84 | save(gwas_h, file=args[["out"]])
85 | 
86 | 


--------------------------------------------------------------------------------
/inst/sandpit/misc/harmonise_against_ref.r:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | suppressPackageStartupMessages(library(argparse))
  4 | suppressPackageStartupMessages(library(dplyr))
  5 | suppressPackageStartupMessages(library(TwoSampleMR))
  6 | suppressPackageStartupMessages(library(data.table))
  7 | suppressPackageStartupMessages(library(vcfR))
  8 | library(methods)
  9 | library(utils)
 10 | 
 11 | # create parser object
 12 | parser <- ArgumentParser()
 13 | 
 14 | parser$add_argument('--ref-file', required=TRUE)  # gzipped reference table, read with gunzip -c below
 15 | parser$add_argument('--ref-build', required=FALSE, default="b37")  # a required option never falls back to its default
 16 | parser$add_argument('--gwas-file', required=TRUE)
 17 | parser$add_argument('--gwas-header', required=FALSE, type="logical", default=FALSE)
 18 | parser$add_argument('--gwas-snp', type="integer", required=TRUE)  # column options are 1-based positions in the GWAS file
 19 | parser$add_argument('--gwas-ref', type="integer", required=FALSE, default=0)  # 0 = column absent, which read_dat() tests for
 20 | parser$add_argument('--gwas-alt', type="integer", required=TRUE)
 21 | parser$add_argument('--gwas-af', type="integer", required=FALSE, default=0)
 22 | parser$add_argument('--gwas-beta', type="integer", required=FALSE, default=0)
 23 | parser$add_argument('--gwas-se', type="integer", required=FALSE, default=0)
 24 | parser$add_argument('--gwas-pval', type="integer", required=FALSE, default=0)
 25 | parser$add_argument('--gwas-n0', type="integer", required=FALSE, default=0)
 26 | parser$add_argument('--gwas-n1', type="integer", required=FALSE, default=0)
 27 | parser$add_argument('--out', required=TRUE)
 28 | 
 29 | args <- parser$parse_args()
 30 | 
 31 | print(args)
 32 | 
 33 | # Read a GWAS summary table and standardise it with format_data().
 34 | #
 35 | # Arguments:
 36 | #   filename  Path to the table. Names ending in "gz" are streamed through
 37 | #             `gunzip -c`; anything else is handed to fread() directly.
 38 | #   type      Passed to format_data() as both `type` and `phenotype_col`.
 39 | #   header    Passed to fread() as its `header` argument.
 40 | #   snp, ref, alt, af, beta, se, pval, n0, n1
 41 | #             1-based position of each field in the table. Use 0 (or leave
 42 | #             the value NULL / NA) when the file has no such column.
 43 | #
 44 | # Returns whatever format_data() returns for the table.
 45 | #
 46 | # Every field marked as absent gets an all-NA placeholder column appended to
 47 | # the table, so format_data() is always given a column name for each field.
 48 | #
 49 | # Example:
 50 | #   gwas <- read_dat("elastic.gz", "outcome", FALSE, snp=1, ref=3, alt=2,
 51 | #                    af=4, beta=5, se=6, pval=7, n0=8, n1=0)
 52 | read_dat <- function(filename, type, header, snp, ref, alt, af, beta, se, pval, n0, n1)
 53 | {
 54 | 	if(grepl("gz$", filename))
 55 | 	{
 56 | 		# fread() is given a shell command here and reads that command's output
 57 | 		dat <- data.table::fread(paste0("gunzip -c ", filename), header=header)
 58 | 	} else {
 59 | 		dat <- data.table::fread(filename, header=header)
 60 | 	}
 61 | 	# Field positions, listed in the order their placeholders are appended
 62 | 	idx <- list(
 63 | 		snp=snp,
 64 | 		ref=ref,
 65 | 		alt=alt,
 66 | 		af=af,
 67 | 		beta=beta,
 68 | 		se=se,
 69 | 		pval=pval,
 70 | 		n0=n0,
 71 | 		n1=n1
 72 | 	)
 73 | 	cols <- character(length(idx))
 74 | 	names(cols) <- names(idx)
 75 | 	for(field in names(idx))
 76 | 	{
 77 | 		i <- idx[[field]]
 78 | 		# An argparse option that was not supplied and has no default arrives
 79 | 		# as NULL; `NULL == 0` is logical(0), which if() rejects, so emptiness
 80 | 		# and NA are tested before the comparison with 0
 81 | 		if(length(i) == 0 || is.na(i) || i == 0)
 82 | 		{
 83 | 			dat[[paste0("V", ncol(dat)+1)]] <- rep(NA, nrow(dat))
 84 | 			i <- ncol(dat)
 85 | 		}
 86 | 		cols[[field]] <- names(dat)[i]
 87 | 	}
 88 | 	# alt is the effect allele and ref the other allele; n1 = cases, n0 = controls
 89 | 	o <- format_data(
 90 | 		dat,
 91 | 		type=type,
 92 | 		phenotype_col=type,
 93 | 		snp_col=cols[["snp"]],
 94 | 		beta_col=cols[["beta"]],
 95 | 		se_col=cols[["se"]],
 96 | 		effect_allele_col=cols[["alt"]],
 97 | 		other_allele_col=cols[["ref"]],
 98 | 		eaf_col=cols[["af"]],
 99 | 		pval_col=cols[["pval"]],
100 | 		ncase_col=cols[["n1"]],
101 | 		ncontrol_col=cols[["n0"]]
102 | 	)
103 | 	return(o)
104 | }
104 | 
105 | 
106 | 
107 | # Read in gwas data
108 | gwas <- read_dat(
109 | 	args[["gwas_file"]],
110 | 	type="outcome",
111 | 	header=args[["gwas_header"]],
112 | 	snp=args[["gwas_snp"]],
113 | 	ref=args[["gwas_ref"]],
114 | 	alt=args[["gwas_alt"]],
115 | 	af=args[["gwas_af"]],
116 | 	beta=args[["gwas_beta"]],
117 | 	se=args[["gwas_se"]],
118 | 	pval=args[["gwas_pval"]],
119 | 	n0=args[["gwas_n0"]],
120 | 	n1=args[["gwas_n1"]]
121 | )
122 | 
123 | 
124 | # Read in ref
125 | 
126 | ref <- data.table::fread(paste0("gunzip -c ", args[["ref_file"]]))
127 | stopifnot(all(c("CHROM", "ID", "REF", "ALT", "AF", "POS") %in% names(ref)))
128 | 
129 | # For simplicity just keeping SNP Ids that are in common
130 | ref <- subset(ref, ID %in% gwas$SNP)
131 | 
132 | # Put in some dummy variables for the reference for harmonising
133 | ref$beta <- 1
134 | ref$se <- 0.1
135 | ref$pval <- 0.1
136 | a <- TwoSampleMR::format_data(
137 |         ref,
138 |         type="exposure",
139 |         snp_col="ID",
140 |         effect_allele_col="ALT",
141 |         other_allele_col="REF",
142 |         eaf_col="AF"
143 | )
144 | 
145 | # Check strand
146 | action <- TwoSampleMR::is_forward_strand(gwas$SNP, gwas$effect_allele.outcome, gwas$other_allele.outcome, ref$ID, ref$ALT, ref$REF, threshold=0.9)
147 | 
148 | # Harmonise
149 | dat <- TwoSampleMR::harmonise_data(a, gwas, action)
150 | 
151 | 
152 | # Harmonised GWAS columns plus CHROM/POS from the reference; with() replaces magrittr's %$%, which no library() call in this script attaches
153 | gwas_h <- with(dat, dplyr::tibble(
154 | 		ID=SNP,
155 | 		ALT=effect_allele.exposure,
156 | 		REF=other_allele.exposure,
157 | 		BETA=beta.outcome,
158 | 		SE=se.outcome,
159 | 		PVALUE=pval.outcome,
160 | 		AF=eaf.outcome,
161 | 		N=samplesize.outcome,
162 | 		NCASE=ncase.outcome,
163 | 		NCONTROL=ncontrol.outcome)) %>%
164 | 	dplyr::inner_join(subset(ref, select=c(ID,REF,ALT,CHROM,POS)), by=c("ID", "REF", "ALT"))
165 | 
166 | 
167 | # Create vcf format
168 | vcf <- TwoSampleMR::make_vcf(
169 |                 ID = gwas_h$ID,
170 |                 ALT = gwas_h$ALT,
171 |                 REF = gwas_h$REF,
172 |                 B = gwas_h$BETA,
173 |                 SE = gwas_h$SE,
174 |                 PVAL = gwas_h$PVALUE,
175 |                 N0 = gwas_h$NCONTROL,
176 |                 N1 = gwas_h$NCASE,
177 |                 CHROM = gwas_h$CHROM,
178 |                 POS = gwas_h$POS,
179 |                 AF = gwas_h$AF,
180 |                 QUAL = rep(NA, nrow(gwas_h)),
181 |                 FILTER = rep("PASS", nrow(gwas_h)),
182 |                 build = args[["ref_build"]]
183 |         )
184 | 
185 | # Write vcf
186 | TwoSampleMR::write_vcf(vcf, args[["out"]])
187 | 
188 | 
189 | 


--------------------------------------------------------------------------------
/inst/sandpit/misc/query_times.rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: Timing for different methods to query vcf files
 3 | author: Gibran Hemani
 4 | date: 2/10/2019
 5 | ---
 6 | 
 7 | 
 8 | Setup
 9 | 
10 | ```{r}
11 | library(knitr)
12 | opts_chunk$set(warning=FALSE, message=FALSE, cache=TRUE)
13 | library(devtools)
14 | load_all()  # load the package from the working tree rather than an installed copy
15 | fn <- system.file("extdata", "data.vcf.gz", package="gwasvcf", mustWork=TRUE)  # example VCF shipped in inst/extdata; error if it is not found
16 | chrompos <- "20:800000-4000000"
17 | pval <- 5e-8
18 | rsid <- c("rs3128126", "rs3121561", "rs3813193")
19 | ```
20 | 
21 | Reading in the vcf file
22 | 
23 | ```{r}
24 | system.time({
25 | 	v <- readVcf(fn)
26 | })
27 | ```
28 | 
29 | Different methods for searching for rsid:
30 | 
31 | ```{r}
32 | system.time({
33 | 	query_rsid_vcf(rsid, v)
34 | })
35 | system.time({
36 | 	query_rsid_file(rsid, fn)
37 | })
38 | system.time({
39 | 	query_rsid_bcftools(rsid, fn)
40 | })
41 | ```
42 | 
43 | Different methods for searching by p-value:
44 | 
45 | ```{r}
46 | system.time({
47 | 	query_pval_vcf(pval, v)
48 | })
49 | system.time({
50 | 	query_pval_file(pval, fn)
51 | })
52 | system.time({
53 | 	query_pval_bcftools(pval, fn)
54 | })
55 | ```
56 | 
57 | Different methods for searching by chrompos:
58 | 
59 | ```{r}
60 | system.time({
61 | 	query_chrompos_vcf(chrompos, v)
62 | })
63 | system.time({
64 | 	query_chrompos_file(chrompos, fn)
65 | })
66 | system.time({
67 | 	query_chrompos_bcftools(chrompos, fn)
68 | })
69 | ```
70 | 


--------------------------------------------------------------------------------
/inst/sandpit/misc/skeleton.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | 
 4 | 
 5 | # BCF file with:
 6 | # - correct REF and ALT alleles
 7 | # - AF field in INFO that is the ALT allele frequency
 8 | ref="../../../reference/1000g/1kg_v3_nomult.bcf"
 9 | 
10 | 
11 | # GWAS file ($HOME rather than ~, because the shell does not expand ~ inside quotes):
12 | gwas="$HOME/mr-eve/gwas-instrument-subsets/studies/2/elastic.gz"
13 | 
14 | 
15 | # 0. Clean GWAS
16 | 
17 | 
18 | 
19 | # STILL TO DO
20 | 
21 | 
22 | 
23 | # 1. check for name merges against sqlite
24 | # This file has rs ID merges
25 | # https://www.ncbi.nlm.nih.gov/projects/SNP/snp_db_table_description.cgi?t=RsMergeArch
26 | # Step 1 is to update any rs IDs in the GWAS based on this file
27 | 
28 | 
29 | 
30 | # STILL TO DO
31 | 
32 | 
33 | 
34 | # 2. Convert any chr:pos SNPs in the GWAS to rs IDs
35 | # Do this by extracting variants that are missing rs IDs in GWAS
36 | # and find the rs ID in the reference
37 | # and update the GWAS file
38 | 
39 | 
40 | 
41 | # STILL TO DO
42 | 
43 | 
44 | 
45 | # 3. Get subset of reference in tab format
46 | 
47 | 
48 | gunzip -c elastic.gz | cut -f 1 > snplist.txt
49 | wc -l snplist.txt
50 | # Keep the reference variants whose ID is listed in snplist.txt and write them as a headed, gzipped table
51 | time bcftools view -i'ID=@snplist.txt' "$ref" | bcftools query -f'%CHROM\t%POS\t%ID\t%REF\t%ALT\t%AF\n' | sed '1 i\
52 | CHROM\tPOS\tID\tREF\tALT\tAF
53 | ' | gzip -c > ref_extract.txt.gz
54 | 
55 | 
56 | 
57 | # 4. Harmonise the GWAS file against the reference
58 | # This needs to retain indels so use the TwoSampleMR::harmonise_data
59 | # function. It will: 
60 | # - switch effect alleles
61 | # - handle sequence coded indels
62 | # - convert D/I indels to sequence coding (as in the reference)
63 | # - check for forward strand and flip if necessary
64 | # - tries to harmonise with only effect allele if other allele not available
65 | # 4b. Write out to bcf format
66 | # After harmonising can use the TwoSampleMR::write_vcf function
67 | # It will create file based on extension and index.
68 | 
69 | 
70 | # Column options are positions in the GWAS file; 0 marks a column the file lacks (no case count here)
71 | Rscript harmonise_against_ref.r \
72 | --ref-file ref_extract.txt.gz \
73 | --ref-build b37 \
74 | --gwas-file "$gwas" \
75 | --gwas-header FALSE \
76 | --gwas-snp 1 \
77 | --gwas-ref 3 \
78 | --gwas-alt 2 \
79 | --gwas-af 4 \
80 | --gwas-beta 5 \
81 | --gwas-se 6 \
82 | --gwas-pval 7 \
83 | --gwas-n0 8 \
84 | --gwas-n1 0 \
85 | --out harmonised.bcf
86 | 
87 | 
88 | 
89 | # 5. Create report and json document of harmonising stats
90 | # This could be included above
91 | 
92 | 


--------------------------------------------------------------------------------
/inst/sandpit/misc/vcf.rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Using VCF to handle GWAS
  3 | author: Gibran Hemani
  4 | date: 18/11/2018
  5 | ---
  6 | 
  7 | ## Background
  8 | 
  9 | Using VCF as a format for storing GWAS summary data. Possible benefits:
 10 | 
 11 | * Well known standard format
 12 | * Potentially smaller after compression
 13 | * Very fast tools already exist
 14 | * Standardised tools can run implicit checks
 15 | * Indexing will help with regional lookups
 16 | * Standardised way to represent variant info including indels and multi-allelic variants
 17 | * Easy to update build
 18 | 
 19 | 
 20 | ## Specification
 21 | 
 22 | VCF has a detailed specification here: [http://samtools.github.io/hts-specs/VCFv4.3.pdf](http://samtools.github.io/hts-specs/VCFv4.3.pdf). We need an agreed way to apply the specification to GWAS summary data. Current implementation:
 23 | 
 24 | 1. Use only the first 8 fixed fields.
 25 | 2. QUAL will be set to missing (.) unless an obvious way to use it can be identified.
 26 | 3. ALT allele is always the effect allele. Ideally this is matched to a reference dataset. REF allele is always the non-effect allele
 27 | 4. For binary traits we want to store the number of cases and number of controls
 28 | 5. For continuous traits we use 0 for number of cases, and number of controls is the total sample size
 29 | 6. The INFO column will have fields describing the genetic association, as follows:
 30 | 	* B, Type = Float, Description = Effect size estimate relative to the alternative allele(s)
 31 | 	* SE, Type = Float, Description = Standard error of effect size estimate
 32 | 	* P, Type = Float, Description = P-value for effect estimate
 33 | 	* AF, Type = Float, Description = Alternate allele frequency
 34 | 	* N1, Type = Integer, Description = Number of cases. 0 if continuous trait
 35 | 	* N0, Type = Integer, Description = Number of controls. Total sample size if continuous trait
 36 | 7. FILTER is always PASS unless the variant does not meet some QC parameter.
 37 | 
 38 | The VCF header encapsulating this info will look like this:
 39 | 
 40 | ```
 41 | ##INFO=<ID=B,Number=A,Type=Float,Description="Effect size estimate relative to the alternative allele(s)">
 42 | ##INFO=<ID=SE,Number=A,Type=Float,Description="Standard error of effect size estimate">
 43 | ##INFO=<ID=P,Number=A,Type=Float,Description="P-value for effect estimate">
 44 | ##INFO=<ID=AF,Number=A,Type=Float,Description="Alternate allele frequency">
 45 | ##INFO=<ID=N1,Number=A,Type=Integer,Description="Number of cases. 0 if continuous trait">
 46 | ##INFO=<ID=N0,Number=A,Type=Integer,Description="Number of controls. Total sample size if continuous trait">
 47 | ```
 48 | 
 49 | Missing values throughout are specified as ".", as standard for VCF.
 50 | 
 51 | Custom annotations can be added i.e. ##gwas=casecontrol which are lowercase by convention
 52 | 
 53 | ## Reference FASTA
 54 | 
 55 | The reference fasta should be downloaded from the GATK bundle:
 56 | [b38/hg38](https://console.cloud.google.com/storage/browser/genomics-public-data/resources/broad/hg38/v0)
 57 | [b37/hg19](ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/b37)
 58 | 
 59 | ## Generating the data
 60 | 
 61 | To run the following, first clone the TwoSampleMR repository
 62 | 
 63 | ```bash
 64 | git clone git@github.com:MRCIEU/TwoSampleMR.git
 65 | ```
 66 | 
 67 | Then navigate to here:
 68 | 
 69 | ```bash
 70 | cd TwoSampleMR/vignettes/vcf
 71 | ```
 72 | 
 73 | 
 74 | ### Download some example datasets
 75 | 
 76 | Start with two datasets, a reference (for example 1000 genomes) and a GWAS summary dataset (e.g. Locke et al 2015 BMI analysis). First we will convert the GWAS dataset to be harmonised against the reference dataset
 77 | 
 78 | Download the example GWAS dataset:
 79 | 
 80 | ```{r engine='bash'}
 81 | wget -q -O bmi.txt.gz https://www.dropbox.com/s/ph7in04w6dki2tv/bmi.txt.gz?dl=0
 82 | gunzip -c bmi.txt.gz | head
 83 | gunzip -c bmi.txt.gz | wc -l
 84 | ```
 85 | 
 86 | Download the reference dataset:
 87 | 
 88 | ```{r engine='bash'}
 89 | wget -q -O ref.txt.gz https://www.dropbox.com/s/8vgg08zip2wkayk/ref.txt.gz?dl=0
 90 | gunzip -c ref.txt.gz | head
 91 | gunzip -c ref.txt.gz | wc -l
 92 | ```
 93 | 
 94 | ### Harmonise the GWAS against the reference
 95 | 
 96 | For simplicity I will just use the `harmonise_data` function in the `R/TwoSampleMR` package. This has limitations in that it throws away indels. The scripts that Denis is writing to harmonise against SNP-Base are going to be more appropriate, but this is just here for illustration.
 97 | 
 98 | ```{r engine='bash'}
 99 | Rscript harmonise.r --gwas bmi.txt.gz --ref ref.txt.gz --out harmonised.rdata
100 | ```
101 | 
102 | 
103 | ### Create VCF files from the harmonised object
104 | 
105 | Now that we have a file that has all the required columns:
106 | * CHROM
107 | * POS
108 | * ID (rs ID)
109 | * REF allele
110 | * ALT allele
111 | * BETA
112 | * SE
113 | * PVAL
114 | * NCASE
115 | * NCONTROL
116 | 
117 | And they are all harmonised to a reference dataset, we can produce a vcf file using a couple of functions in the `TwoSampleMR` package
118 | 
119 | ```{r}
120 | library(TwoSampleMR)
121 | library(dplyr)
122 | library(vcfR)
123 | library(methods)
124 | library(utils)
125 | 
126 | # This loads in the harmonised object that we just created - `gwas_h`
127 | load("harmonised.rdata")
128 | str(gwas_h)
129 | 
130 | vcf <- TwoSampleMR::make_vcf(
131 | 		ID = gwas_h$ID,
132 | 		ALT = gwas_h$ALT, 
133 | 		REF = gwas_h$REF, 
134 | 		B = gwas_h$BETA, 
135 | 		SE = gwas_h$SE, 
136 | 		PVAL = gwas_h$PVALUE, 
137 | 		N0 = gwas_h$NCONTROL, 
138 | 		N1 = gwas_h$NCASE, 
139 | 		CHROM = gwas_h$CHROM, 
140 | 		POS = gwas_h$POS, 
141 | 		AF = gwas_h$MAF, 
142 | 		QUAL = rep(NA, nrow(gwas_h)),
143 | 		FILTER = rep('PASS', nrow(gwas_h)), 
144 | 		build = "b37"
145 | 	)
146 | ```
147 | 
148 | We can see some basic stats about the file we just made using the `R/vcfR` package:
149 | 
150 | 
151 | ```{r}
152 | vcf
153 | ```
154 | 
155 | Finally, we can write the correctly formatted data to file:
156 | 
157 | ```{r}
158 | TwoSampleMR::write_vcf(vcf, "bmi.vcf.gz")
159 | TwoSampleMR::write_vcf(vcf, "bmi.bcf")
160 | ```
161 | 
162 | ## Testing the VCF files
163 | 
164 | All VCF files should undergo validation before use using [gatk](https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_variantutils_ValidateVariants.php).
165 | 
166 | ```bash
167 | gatk ValidateVariants \
168 | -R <ref.fa> -V <gwas.vcf> \
169 | --dbsnp <dbsnp.vcf>
170 | ```
171 | 
172 | We can use [bgzip](https://vcf.iobio.io/help.html), [tabix](https://vcf.iobio.io/help.html) and [bcftools](https://samtools.github.io/bcftools/) to work with VCF.
173 | 
174 | Examples of how to compress and index (though the R functions previously run have done this already by calling these tools).
175 | 
176 | Compress a `.vcf` with bgzip, then index:
177 | 
178 | ```bash
179 | bgzip -c bmi.vcf > bmi.vcf.gz
180 | bcftools index bmi.vcf.gz
181 | ```
182 | 
183 | Convert to `.bcf` which is a binary version of the text file:
184 | 
185 | ```bash
186 | bcftools view bmi.vcf.gz -Ob -o bmi.bcf
187 | bcftools index bmi.bcf
188 | ```
189 | 
190 | ### Compare the sizes
191 | 
192 | ```{r engine='bash'}
193 | # bcf and index
194 | du -sh bmi.bcf bmi.bcf.csi
195 | ```
196 | 
197 | ```{r engine='bash'}
198 | # vcf.gz and index
199 | du -sh bmi.vcf.gz bmi.vcf.gz.csi
200 | ```
201 | 
202 | ```{r engine='bash'}
203 | # original gzip file
204 | du -sh bmi.txt.gz
205 | ```
206 | 
207 | The original gzip file is smallest, but it doesn't contain chromosome and position info. Surprisingly, bcf format is almost double the size of the gzip format, and vcf.gz is somewhere in between. 
208 | 
209 | 
210 | ### Speed to extract by p-value
211 | 
212 | ```{r engine='bash'}
213 | # bcf
214 | time bcftools query -i'PVAL<5e-8' -f'%ID\n' bmi.bcf > extract.txt && wc -l extract.txt
215 | ```
216 | 
217 | ```{r engine='bash'}
218 | # vcf.gz
219 | time bcftools query -i'PVAL<5e-8' -f'%ID\n' bmi.vcf.gz > extract.txt && wc -l extract.txt
220 | ```
221 | 
222 | ```{r engine='bash'}
223 | # for comparison - original gzip file
224 | time gunzip -c bmi.txt.gz | awk -F '\t' '$7 < 5e-8 {print $1}' > extract.txt && wc -l extract.txt
225 | ```
226 | 
227 | Extracting using awk is very slow, bcf format is extremely fast, though how this compares to elastic is not clear.
228 | 
229 | ### Speed to extract by rs ID
230 | 
231 | ```{r engine='bash'}
232 | # bcf
233 | time bcftools view -i'ID=@extract.txt' -Ob bmi.bcf > extract.bcf
234 | ```
235 | 
236 | ```{r engine='bash'}
237 | # vcf.gz
238 | time bcftools view -i'ID=@extract.txt' -Oz bmi.vcf.gz > extract.vcf.gz
239 | ```
240 | 
241 | ```{r engine='bash'}
242 | # For comparison we can just try grepping from the original file.
243 | # time zfgrep -wf extract.txt bmi.txt.gz | gzip -c > test.txt.gz
244 | ## Not running this because it takes several minutes
245 | ```
246 | 
247 | ### Speed to extract by chromosome and position
248 | 
249 | Using chrom and position is even faster than extracting by rs ID.
250 | 
251 | ```{r engine='bash'}
252 | # Extract top hits again but save chrom and position
253 | time bcftools query -i'PVAL<5e-8' -f'%CHROM\t%POS\n' bmi.bcf > extract.txt && wc -l extract.txt
254 | ```
255 | 
256 | ```{r engine='bash'}
257 | # extract from bcf; -Ob so the redirected file really is BCF (the default output type is uncompressed VCF text)
258 | time bcftools filter -R extract.txt -Ob bmi.bcf > extract.bcf
259 | ```
260 | 
261 | ```{r engine='bash'}
262 | # extract from vcf.gz; -Oz writes bgzipped VCF, matching the .vcf.gz output name
263 | time bcftools filter -R extract.txt -Oz bmi.vcf.gz > extract.vcf.gz
264 | ```
265 | 
266 | 
267 | ## Create format used for elastic
268 | 
269 | This is the tab delimited file being uploaded to elastic search db:
270 | 
271 | ```{r engine='bash'}
272 | time bcftools query -f'%ID\t%ALT\t%REF\t%AF\t%B\t%SE\t%PVAL\t%N1\t%N0\n' bmi.bcf | sed 's@\t\.@\t@g' | grep -v '$\.' > elastic.txt
273 | head elastic.txt
274 | ```
275 | 
276 | Ideally would create it like this - 
277 | - no alleles
278 | - no total sample size (N)
279 | - for case/control N1 and N0 are number of cases and number of controls
280 | - for continuous N1 is 0 and N0 is total sample size
281 | 
282 | ```{r engine='bash'}
283 | time bcftools query -f'%ID\t%AF\t%B\t%SE\t%PVAL\t%N1\t%N0\n' bmi.bcf | sed 's@\t\.@\t@g' | grep -v '$\.' > elastic.txt
284 | head elastic.txt
285 | ```
286 | 
287 | 
288 | 
289 | 


--------------------------------------------------------------------------------
/inst/sandpit/test_extract.r:
--------------------------------------------------------------------------------
 1 | load_all()  # NOTE(review): presumably devtools::load_all(); devtools is only attached on line 43 - confirm it is attached first
 2 | s <- fread("~/mr-eve/mr-eve/instruments.txt")  # NOTE(review): presumably data.table::fread; nothing in this script attaches data.table
 3 | a <- extract(
 4 | 	bcf="~/mr-eve/gwas-files/7/data.bcf", 
 5 | 	snplist=s, 
 6 | 	tempname="temp", 
 7 | 	proxies="yes", 
 8 | 	bfile="~/mr-eve/vcf-reference-datasets/1000g_filtered/data_maf0.01_rs_snps",
 9 | 	vcf="~/mr-eve/vcf-reference-datasets/1000g/1kg_v3_nomult.bcf"
10 | )
11 | # Same call on gwas-files/7, but with only the sixth element of `s` as the SNP list
12 | a <- extract(
13 | 	bcf="~/mr-eve/gwas-files/7/data.bcf", 
14 | 	snplist=s[[6]], 
15 | 	tempname="temp", 
16 | 	proxies="yes", 
17 | 	bfile="~/mr-eve/vcf-reference-datasets/1000g_filtered/data_maf0.01_rs_snps",
18 | 	vcf="~/mr-eve/vcf-reference-datasets/1000g/1kg_v3_nomult.bcf"
19 | )
20 | 
21 | # Switch to gwas-files/2, again with the sixth element of `s` as the SNP list
22 | a <- extract(
23 | 	bcf="~/mr-eve/gwas-files/2/data.bcf", 
24 | 	snplist=s[[6]], 
25 | 	tempname="temp", 
26 | 	proxies="yes", 
27 | 	bfile="~/mr-eve/vcf-reference-datasets/1000g_filtered/data_maf0.01_rs_snps",
28 | 	vcf="~/mr-eve/vcf-reference-datasets/1000g/1kg_v3_nomult.bcf"
29 | )
30 | # gwas-files/2 with the whole of `s` as the SNP list; `a` keeps only this last result
31 | a <- extract(
32 | 	bcf="~/mr-eve/gwas-files/2/data.bcf", 
33 | 	snplist=s, 
34 | 	tempname="temp", 
35 | 	proxies="yes", 
36 | 	bfile="~/mr-eve/vcf-reference-datasets/1000g_filtered/data_maf0.01_rs_snps",
37 | 	vcf="~/mr-eve/vcf-reference-datasets/1000g/1kg_v3_nomult.bcf"
38 | )
39 | 
40 | 
41 | 
42 | 
43 | library(devtools)
44 | load_all()
45 | a <- TwoSampleMR::extract_instruments(2)
46 | fn <- system.file("data","IEU-a-2.vcf.gz", package="gwasvcftools")
47 | ldref <- "~/repo/mr-base-api/app/ld_files/data_maf0.01_rs"
48 | # get_ld_proxies() is (rsid, bfile, searchspace, tag_kb, ..., out): name the arguments so ldref is the plink panel; fn has no slot in this signature
49 | o <- get_ld_proxies(rsid = a$SNP, bfile = ldref, out = tempfile())
50 | 


--------------------------------------------------------------------------------
/man/VariantAnnotation.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/utils-pipe.R
3 | \name{VariantAnnotation}
4 | \alias{VariantAnnotation}
5 | \title{VariantAnnotation}
6 | \description{
7 | VariantAnnotation
8 | }
9 | 


--------------------------------------------------------------------------------
/man/check_bcftools.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/binaries.r
 3 | \name{check_bcftools}
 4 | \alias{check_bcftools}
 5 | \title{Check if the tools_bcftools option is set}
 6 | \usage{
 7 | check_bcftools()
 8 | }
 9 | \value{
10 | TRUE or FALSE
11 | }
12 | \description{
13 | See set_bcftools() for more information
14 | }
15 | 


--------------------------------------------------------------------------------
/man/check_plink.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/binaries.r
 3 | \name{check_plink}
 4 | \alias{check_plink}
 5 | \title{Check if the tools_plink option is set}
 6 | \usage{
 7 | check_plink()
 8 | }
 9 | \value{
10 | TRUE or FALSE
11 | }
12 | \description{
13 | See set_plink() for more information
14 | }
15 | 


--------------------------------------------------------------------------------
/man/create_ldref_sqlite.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/rsid_index.r
 3 | \name{create_ldref_sqlite}
 4 | \alias{create_ldref_sqlite}
 5 | \title{Create LD reference sqlite database for tags}
 6 | \usage{
 7 | create_ldref_sqlite(bfile, dbname, tag_r2 = 0.6)
 8 | }
 9 | \arguments{
10 | \item{bfile}{path to plink file}
11 | 
12 | \item{dbname}{dbname to produce (overwrites existing if exists)}
13 | 
14 | \item{tag_r2}{minimum tag r2}
15 | }
16 | \description{
17 | This is used for looking up proxies
18 | }
19 | 


--------------------------------------------------------------------------------
/man/create_pval_index_from_vcf.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pval_index.r
 3 | \name{create_pval_index_from_vcf}
 4 | \alias{create_pval_index_from_vcf}
 5 | \title{Create pval index from GWAS-VCF file}
 6 | \usage{
 7 | create_pval_index_from_vcf(vcffile, maximum_pval, indexname)
 8 | }
 9 | \arguments{
10 | \item{vcffile}{VCF filename}
11 | 
12 | \item{maximum_pval}{Maximum p-value to include. Default = 0.05}
13 | 
14 | \item{indexname}{index file name to create. Deletes existing file if exists.}
15 | }
16 | \description{
17 | Create a separate file called \verb{<id>.pvali} which is used to speed up p-value queries.
18 | }
19 | 


--------------------------------------------------------------------------------
/man/create_rsidx_index_from_vcf.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/rsid_index.r
 3 | \name{create_rsidx_index_from_vcf}
 4 | \alias{create_rsidx_index_from_vcf}
 5 | \title{Create RSID index from VCF}
 6 | \usage{
 7 | create_rsidx_index_from_vcf(vcf, indexname)
 8 | }
 9 | \arguments{
10 | \item{vcf}{VCF filename}
11 | 
12 | \item{indexname}{index file name to create. Deletes existing file if exists.}
13 | }
14 | \description{
15 | Create RSID index from VCF
16 | }
17 | 


--------------------------------------------------------------------------------
/man/create_rsidx_sub_index.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/rsid_index.r
 3 | \name{create_rsidx_sub_index}
 4 | \alias{create_rsidx_sub_index}
 5 | \title{Create new index from existing index using a subset of rsids}
 6 | \usage{
 7 | create_rsidx_sub_index(rsid, rsidx, newindex)
 8 | }
 9 | \arguments{
10 | \item{rsid}{Vector of rsids}
11 | 
12 | \item{rsidx}{Existing index}
13 | 
14 | \item{newindex}{New index (Note: will delete existing file if exists)}
15 | }
16 | \value{
17 | NULL, creates new index file
18 | }
19 | \description{
20 | Note this requires a modified version of plink that allows ld-window-r2 flag for --r option.
21 | Available here: https://github.com/explodecomputer/plink-ng
22 | }
23 | 


--------------------------------------------------------------------------------
/man/create_vcf.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/manipulate.r
 3 | \name{create_vcf}
 4 | \alias{create_vcf}
 5 | \title{Create GWAS vcf}
 6 | \usage{
 7 | create_vcf(
 8 |   chrom,
 9 |   pos,
10 |   nea,
11 |   ea,
12 |   snp = NULL,
13 |   ea_af = NULL,
14 |   effect = NULL,
15 |   se = NULL,
16 |   pval = NULL,
17 |   n = NULL,
18 |   ncase = NULL,
19 |   name = NULL
20 | )
21 | }
22 | \arguments{
23 | \item{chrom}{chrom vector}
24 | 
25 | \item{pos}{pos vector}
26 | 
27 | \item{nea}{nea vector}
28 | 
29 | \item{ea}{ea vector}
30 | 
31 | \item{snp}{Optional vector}
32 | 
33 | \item{ea_af}{Optional vector}
34 | 
35 | \item{effect}{Optional vector}
36 | 
37 | \item{se}{Optional vector}
38 | 
39 | \item{pval}{Optional vector}
40 | 
41 | \item{n}{Optional vector}
42 | 
43 | \item{ncase}{Optional vector}
44 | 
45 | \item{name}{Optional vector}
46 | }
47 | \value{
48 | vcf object
49 | }
50 | \description{
51 | Create GWAS vcf
52 | }
53 | 


--------------------------------------------------------------------------------
/man/get_ld_proxies.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/proxy.r
 3 | \name{get_ld_proxies}
 4 | \alias{get_ld_proxies}
 5 | \title{Find LD proxies for a set of SNPs}
 6 | \usage{
 7 | get_ld_proxies(
 8 |   rsid,
 9 |   bfile,
10 |   searchspace = NULL,
11 |   tag_kb = 5000,
12 |   tag_nsnp = 5000,
13 |   tag_r2 = 0.6,
14 |   threads = 1,
15 |   out = tempfile()
16 | )
17 | }
18 | \arguments{
19 | \item{rsid}{list of rs IDs}
20 | 
21 | \item{bfile}{ld reference panel}
22 | 
23 | \item{searchspace}{Optional list of rs IDs to use as potential proxies}
24 | 
25 | \item{tag_kb}{=5000 Proxy parameter}
26 | 
27 | \item{tag_nsnp}{=5000 Proxy parameter}
28 | 
29 | \item{tag_r2}{=0.6 Proxy parameter}
30 | 
31 | \item{threads}{Number of threads to use (=1)}
32 | 
33 | \item{out}{temporary output file}
34 | }
35 | \value{
36 | data frame
37 | }
38 | \description{
39 | Find LD proxies for a set of SNPs
40 | }
41 | 


--------------------------------------------------------------------------------
/man/gwasvcf_to_summaryset.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/gwasglue.R
 3 | \name{gwasvcf_to_summaryset}
 4 | \alias{gwasvcf_to_summaryset}
 5 | \title{Create a SummarySet}
 6 | \usage{
 7 | gwasvcf_to_summaryset(vcf)
 8 | }
 9 | \arguments{
10 | \item{vcf}{Path or URL to GWAS-VCF file or VCF object e.g. output from \code{\link[VariantAnnotation:readVcf-methods]{VariantAnnotation::readVcf()}}, \code{\link[=create_vcf]{create_vcf()}} or \code{\link[=query_gwas]{query_gwas()}}}
11 | }
12 | \description{
13 | Returns a gwasglue2 SummarySet object
14 | }
15 | 


--------------------------------------------------------------------------------
/man/merge_vcf.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/manipulate.r
 3 | \name{merge_vcf}
 4 | \alias{merge_vcf}
 5 | \title{Merge two GWAS VCF objects}
 6 | \usage{
 7 | merge_vcf(a, b)
 8 | }
 9 | \arguments{
10 | \item{a}{VCF object}
11 | 
12 | \item{b}{VCF object}
13 | }
14 | \value{
15 | SimpleList of VCF objects
16 | }
17 | \description{
18 | Returns merged intersection of two VCF objects
19 | }
20 | 


--------------------------------------------------------------------------------
/man/parse_chrompos.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/query.r
 3 | \name{parse_chrompos}
 4 | \alias{parse_chrompos}
 5 | \title{Parse chromosome:position}
 6 | \usage{
 7 | parse_chrompos(chrompos, radius = NULL)
 8 | }
 9 | \arguments{
10 | \item{chrompos}{Either vector of chromosome and position ranges e.g. "1:1000" or "1:1000-2000", or data frame with columns \code{chrom}, \code{start}, \code{end}.}
11 | 
12 | \item{radius}{Add radius to the specified positions. Default = NULL}
13 | }
14 | \value{
15 | GRanges object
16 | }
17 | \description{
18 | Takes data frame or vector of chromosome position ranges and parses to granges object
19 | }
20 | 


--------------------------------------------------------------------------------
/man/pipe.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils-pipe.R
 3 | \name{\%>\%}
 4 | \alias{\%>\%}
 5 | \title{Pipe operator}
 6 | \usage{
 7 | lhs \%>\% rhs
 8 | }
 9 | \description{
10 | See \code{magrittr::\link[magrittr]{\%>\%}} for details.
11 | }
12 | \keyword{internal}
13 | 


--------------------------------------------------------------------------------
/man/proxy_match.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/proxy.r
 3 | \name{proxy_match}
 4 | \alias{proxy_match}
 5 | \title{Extract SNPs from vcf file}
 6 | \usage{
 7 | proxy_match(
 8 |   vcf,
 9 |   rsid,
10 |   bfile = NULL,
11 |   proxies = "yes",
12 |   tag_kb = 5000,
13 |   tag_nsnp = 5000,
14 |   tag_r2 = 0.6,
15 |   threads = 1,
16 |   rsidx = NULL,
17 |   dbfile = NULL
18 | )
19 | }
20 | \arguments{
21 | \item{vcf}{vcf file name}
22 | 
23 | \item{rsid}{list of rs IDs}
24 | 
25 | \item{bfile}{ld reference panel (plink)}
26 | 
27 | \item{proxies}{="yes" If SNPs are absent then look for proxies (yes) or not (no). Can also mask all target SNPs and only return proxies (only), for testing purposes}
28 | 
29 | \item{tag_kb}{=5000 Proxy parameter}
30 | 
31 | \item{tag_nsnp}{=5000 Proxy parameter}
32 | 
33 | \item{tag_r2}{=0.6 Proxy parameter}
34 | 
35 | \item{threads}{Number of threads to use (=1)}
36 | 
37 | \item{rsidx}{Path to rsidx index}
38 | 
39 | \item{dbfile}{ld tag database (sqlite)}
40 | }
41 | \value{
42 | data frame
43 | }
44 | \description{
45 | Finds proxies if necessary
46 | }
47 | 


--------------------------------------------------------------------------------
/man/query_chrompos_bcftools.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/query.r
 3 | \name{query_chrompos_bcftools}
 4 | \alias{query_chrompos_bcftools}
 5 | \title{Query chromosome and position using bcftools}
 6 | \usage{
 7 | query_chrompos_bcftools(chrompos, vcffile, id = NULL)
 8 | }
 9 | \arguments{
10 | \item{chrompos}{Either vector of chromosome and position ranges e.g. "1:1000" or "1:1000-2000", or data frame with columns \code{chrom}, \code{start}, \code{end}.}
11 | 
12 | \item{vcffile}{Path to .vcf.gz GWAS summary data file}
13 | 
14 | \item{id}{If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter}
15 | }
16 | \value{
17 | vcf object
18 | }
19 | \description{
20 | Query chromosome and position using bcftools
21 | }
22 | 


--------------------------------------------------------------------------------
/man/query_chrompos_file.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/query.r
 3 | \name{query_chrompos_file}
 4 | \alias{query_chrompos_file}
 5 | \title{Query vcf file, extracting by chromosome and position}
 6 | \usage{
 7 | query_chrompos_file(chrompos, vcffile, id = NULL, build = "GRCh37")
 8 | }
 9 | \arguments{
10 | \item{chrompos}{Either vector of chromosome and position ranges e.g. "1:1000" or "1:1000-2000", or data frame with columns \code{chrom}, \code{start}, \code{end}.}
11 | 
12 | \item{vcffile}{Path to .vcf.gz GWAS summary data file}
13 | 
14 | \item{id}{If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter}
15 | 
16 | \item{build}{Default="GRCh37" Build of vcffile}
17 | }
18 | \value{
19 | VCF object
20 | }
21 | \description{
22 | Query vcf file, extracting by chromosome and position
23 | }
24 | 


--------------------------------------------------------------------------------
/man/query_chrompos_vcf.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/query.r
 3 | \name{query_chrompos_vcf}
 4 | \alias{query_chrompos_vcf}
 5 | \title{Query chrompos from vcf object}
 6 | \usage{
 7 | query_chrompos_vcf(chrompos, vcf, id = NULL)
 8 | }
 9 | \arguments{
10 | \item{chrompos}{Either vector of chromosome and position ranges e.g. "1:1000" or "1:1000-2000", or data frame with columns \code{chrom}, \code{start}, \code{end}.}
11 | 
12 | \item{vcf}{VCF object (e.g. from readVcf)}
13 | 
14 | \item{id}{If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter}
15 | }
16 | \value{
17 | VCF object
18 | }
19 | \description{
20 | Query chrompos from vcf object
21 | }
22 | 


--------------------------------------------------------------------------------
/man/query_gwas.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/query.r
 3 | \name{query_gwas}
 4 | \alias{query_gwas}
 5 | \title{Query data from vcf file}
 6 | \usage{
 7 | query_gwas(
 8 |   vcf,
 9 |   chrompos = NULL,
10 |   rsid = NULL,
11 |   pval = NULL,
12 |   id = NULL,
13 |   rsidx = NULL,
14 |   pvali = NULL,
15 |   build = "GRCh37",
16 |   os = Sys.info()[["sysname"]],
17 |   proxies = "no",
18 |   bfile = NULL,
19 |   dbfile = NULL,
20 |   tag_kb = 5000,
21 |   tag_nsnp = 5000,
22 |   tag_r2 = 0.6,
23 |   threads = 1
24 | )
25 | }
26 | \arguments{
27 | \item{vcf}{Path or URL to GWAS-VCF file or VCF object e.g. output from \code{\link[VariantAnnotation:readVcf-methods]{VariantAnnotation::readVcf()}} or \code{\link[=create_vcf]{create_vcf()}}}
28 | 
29 | \item{chrompos}{Either vector of chromosome and position ranges e.g. "1:1000" or "1:1000-2000", or data frame with columns \code{chrom}, \code{start}, \code{end}.}
30 | 
31 | \item{rsid}{Vector of rsids}
32 | 
33 | \item{pval}{P-value threshold (NOT -log10)}
34 | 
35 | \item{id}{If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter}
36 | 
37 | \item{rsidx}{Path to rsidx index file}
38 | 
39 | \item{pvali}{Path to pval index file}
40 | 
41 | \item{build}{="GRCh37" Build of vcffile}
42 | 
43 | \item{os}{The operating system. Default is as detected. Determines the method used to perform query}
44 | 
45 | \item{proxies}{="no" If SNPs are absent then look for proxies (yes) or not (no). Can also mask all target SNPs and only return proxies (only), for testing purposes. Currently only possible if querying rsid.}
46 | 
47 | \item{bfile}{Path to plink bed/bim/fam LD reference panel}
48 | 
49 | \item{dbfile}{Path to sqlite tag SNP database}
50 | 
51 | \item{tag_kb}{=5000 Proxy parameter}
52 | 
53 | \item{tag_nsnp}{=5000 Proxy parameter}
54 | 
55 | \item{tag_r2}{=0.6 Proxy parameter}
56 | 
57 | \item{threads}{=1 Number of threads}
58 | }
59 | \value{
60 | vcf object
61 | }
62 | \description{
63 | Read in GWAS summary data with filters on datasets (if multiple datasets per file) and/or chromosome/position, rsids or pvalues. Chooses most optimal choice for the detected operating system. Typically chrompos searches are the fastest. On Windows, rsid or pvalue filters from a file will be slow.
64 | }
65 | 


--------------------------------------------------------------------------------
/man/query_pval_bcftools.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/query.r
 3 | \name{query_pval_bcftools}
 4 | \alias{query_pval_bcftools}
 5 | \title{Query p-value using bcftools}
 6 | \usage{
 7 | query_pval_bcftools(pval, vcffile, id = NULL)
 8 | }
 9 | \arguments{
10 | \item{pval}{P-value threshold (NOT -log10)}
11 | 
12 | \item{vcffile}{Path to .vcf.gz GWAS summary data file}
13 | 
14 | \item{id}{If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter}
15 | }
16 | \value{
17 | vcf object
18 | }
19 | \description{
20 | Query p-value using bcftools
21 | }
22 | 


--------------------------------------------------------------------------------
/man/query_pval_file.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/query.r
 3 | \name{query_pval_file}
 4 | \alias{query_pval_file}
 5 | \title{Query pval from vcf file}
 6 | \usage{
 7 | query_pval_file(pval, vcffile, id = NULL, build = "GRCh37")
 8 | }
 9 | \arguments{
10 | \item{pval}{P-value threshold (NOT -log10)}
11 | 
12 | \item{vcffile}{Path to tabix indexed vcf file}
13 | 
14 | \item{id}{If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter}
15 | 
16 | \item{build}{Default="GRCh37"}
17 | }
18 | \value{
19 | VCF object
20 | }
21 | \description{
22 | Query pval from vcf file
23 | }
24 | 


--------------------------------------------------------------------------------
/man/query_pval_sqlite3.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pval_index.r, R/query.r
 3 | \name{query_pval_sqlite3}
 4 | \alias{query_pval_sqlite3}
 5 | \title{Query pval from file using pvali index}
 6 | \usage{
 7 | query_pval_sqlite3(pval, vcffile, id = NULL, pvali)
 8 | 
 9 | query_pval_sqlite3(pval, vcffile, id = NULL, pvali)
10 | }
11 | \arguments{
12 | \item{pval}{pval threshold}
13 | 
14 | \item{vcffile}{Path to .vcf.gz GWAS summary data file}
15 | 
16 | \item{id}{If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter}
17 | 
18 | \item{pvali}{Path to pval index file}
19 | }
20 | \value{
21 | vcf object
22 | 
23 | vcf object
24 | }
25 | \description{
26 | See create_pval_index_from_vcf()
27 | 
28 | See create_pval_index_from_vcf()
29 | }
30 | 


--------------------------------------------------------------------------------
/man/query_pval_vcf.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/query.r
 3 | \name{query_pval_vcf}
 4 | \alias{query_pval_vcf}
 5 | \title{Query based on p-value threshold from vcf}
 6 | \usage{
 7 | query_pval_vcf(pval, vcf, id = NULL)
 8 | }
 9 | \arguments{
10 | \item{pval}{P-value threshold (NOT -log10)}
11 | 
12 | \item{vcf}{VCF object (e.g. from readVcf)}
13 | 
14 | \item{id}{If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter}
15 | }
16 | \value{
17 | VCF object
18 | }
19 | \description{
20 | Query based on p-value threshold from vcf
21 | }
22 | 


--------------------------------------------------------------------------------
/man/query_pvali.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pval_index.r, R/query.r
 3 | \name{query_pvali}
 4 | \alias{query_pvali}
 5 | \title{Query pvali}
 6 | \usage{
 7 | query_pvali(pval, pvali)
 8 | 
 9 | query_pvali(pval, pvali)
10 | }
11 | \arguments{
12 | \item{pval}{pval threshold}
13 | 
14 | \item{pvali}{Path to pval index file}
15 | }
16 | \value{
17 | data frame
18 | 
19 | data frame
20 | }
21 | \description{
22 | Query pvali
23 | 
24 | Query pvali
25 | }
26 | 


--------------------------------------------------------------------------------
/man/query_rsid_bcftools.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/query.r
 3 | \name{query_rsid_bcftools}
 4 | \alias{query_rsid_bcftools}
 5 | \title{Query rsid using bcftools}
 6 | \usage{
 7 | query_rsid_bcftools(rsid, vcffile, id = NULL)
 8 | }
 9 | \arguments{
10 | \item{rsid}{Vector of rsids}
11 | 
12 | \item{vcffile}{Path to .vcf.gz GWAS summary data file}
13 | 
14 | \item{id}{If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter}
15 | }
16 | \value{
17 | VCF object
18 | }
19 | \description{
20 | Query rsid using bcftools
21 | }
22 | 


--------------------------------------------------------------------------------
/man/query_rsid_file.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/query.r
 3 | \name{query_rsid_file}
 4 | \alias{query_rsid_file}
 5 | \title{Query vcf file, extracting by rsid}
 6 | \usage{
 7 | query_rsid_file(rsid, vcffile, id = NULL, build = "GRCh37")
 8 | }
 9 | \arguments{
10 | \item{rsid}{Vector of rsids. Use DBSNP build (???)}
11 | 
12 | \item{vcffile}{Path to .vcf.gz GWAS summary data file}
13 | 
14 | \item{id}{If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter}
15 | 
16 | \item{build}{Default="GRCh37" Build of vcffile}
17 | }
18 | \value{
19 | VCF object
20 | }
21 | \description{
22 | Query vcf file, extracting by rsid
23 | }
24 | 


--------------------------------------------------------------------------------
/man/query_rsid_rsidx.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/query.r
 3 | \name{query_rsid_rsidx}
 4 | \alias{query_rsid_rsidx}
 5 | \title{Query rsid from file using rsidx index}
 6 | \usage{
 7 | query_rsid_rsidx(rsid, vcffile, id = NULL, rsidx)
 8 | }
 9 | \arguments{
10 | \item{rsid}{Vector of rsids}
11 | 
12 | \item{vcffile}{Path to .vcf.gz GWAS summary data file}
13 | 
14 | \item{id}{If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter}
15 | 
16 | \item{rsidx}{Path to rsidx index file}
17 | }
18 | \value{
19 | vcf object
20 | }
21 | \description{
22 | See create_rsidx_index_from_vcf()
23 | }
24 | 


--------------------------------------------------------------------------------
/man/query_rsid_vcf.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/query.r
 3 | \name{query_rsid_vcf}
 4 | \alias{query_rsid_vcf}
 5 | \title{Query rsid from vcf object}
 6 | \usage{
 7 | query_rsid_vcf(rsid, vcf, id = NULL)
 8 | }
 9 | \arguments{
10 | \item{rsid}{Vector of rsids}
11 | 
12 | \item{vcf}{VCF object (e.g. from readVcf)}
13 | 
14 | \item{id}{If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter}
15 | }
16 | \value{
17 | VCF object
18 | }
19 | \description{
20 | Query rsid from vcf object
21 | }
22 | 


--------------------------------------------------------------------------------
/man/query_rsidx.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/query.r
 3 | \name{query_rsidx}
 4 | \alias{query_rsidx}
 5 | \title{Query rsidx}
 6 | \usage{
 7 | query_rsidx(rsid, rsidx)
 8 | }
 9 | \arguments{
10 | \item{rsid}{Vector of rsids}
11 | 
12 | \item{rsidx}{Path to rsidx index file}
13 | }
14 | \value{
15 | data frame
16 | }
17 | \description{
18 | Query rsidx
19 | }
20 | 


--------------------------------------------------------------------------------
/man/set_bcftools.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/binaries.r
 3 | \name{set_bcftools}
 4 | \alias{set_bcftools}
 5 | \title{Set bcftools binary location}
 6 | \usage{
 7 | set_bcftools(path = "")
 8 | }
 9 | \arguments{
10 | \item{path}{If "" (default), then will use the MRCIEU/genetics.binaRies to get binaries that are appropriate for the detected operating system. Otherwise, provide the path to the bcftools binary. If NULL then will set the option to NULL.}
11 | }
12 | \value{
13 | NULL, sets option 'tools_bcftools'
14 | }
15 | \description{
16 | Set bcftools binary location
17 | }
18 | 


--------------------------------------------------------------------------------
/man/set_plink.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/binaries.r
 3 | \name{set_plink}
 4 | \alias{set_plink}
 5 | \title{Set plink binary location}
 6 | \usage{
 7 | set_plink(path = "")
 8 | }
 9 | \arguments{
10 | \item{path}{If "" (default), then will use the MRCIEU/genetics.binaRies to get binaries that are appropriate for the detected operating system. Otherwise, provide the path to the plink binary. If NULL then will set the option to NULL.}
11 | }
12 | \value{
13 | NULL, sets option 'tools_plink'
14 | }
15 | \description{
16 | Set plink binary location
17 | }
18 | 


--------------------------------------------------------------------------------
/man/sqlite_ld_proxies.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/proxy.r
 3 | \name{sqlite_ld_proxies}
 4 | \alias{sqlite_ld_proxies}
 5 | \title{Lookup LD proxies from sqlite database}
 6 | \usage{
 7 | sqlite_ld_proxies(rsids, dbfile, tag_r2)
 8 | }
 9 | \arguments{
10 | \item{rsids}{List of rsids}
11 | 
12 | \item{dbfile}{path to dbfile}
13 | 
14 | \item{tag_r2}{minimum r2 value}
15 | }
16 | \value{
17 | data frame
18 | }
19 | \description{
20 | Lookup LD proxies from sqlite database
21 | }
22 | 


--------------------------------------------------------------------------------
/man/vcf_to_granges.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/manipulate.r
 3 | \name{vcf_to_granges}
 4 | \alias{vcf_to_granges}
 5 | \title{Convert vcf format to granges format}
 6 | \usage{
 7 | vcf_to_granges(vcf, id = NULL)
 8 | }
 9 | \arguments{
10 | \item{vcf}{Output from readVcf}
11 | 
12 | \item{id}{Only accepts one ID, so specify here if there are multiple GWAS datasets in the vcf}
13 | }
14 | \value{
15 | GRanges object
16 | }
17 | \description{
18 | Convert vcf format to granges format
19 | }
20 | 


--------------------------------------------------------------------------------
/man/vcf_to_tibble.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/manipulate.r
 3 | \name{vcf_to_tibble}
 4 | \alias{vcf_to_tibble}
 5 | \title{Convert vcf format to tibble (data frame)}
 6 | \usage{
 7 | vcf_to_tibble(vcf, id = NULL)
 8 | }
 9 | \arguments{
10 | \item{vcf}{Output from readVcf}
11 | 
12 | \item{id}{Only accepts one ID, so specify here if there are multiple GWAS datasets in the vcf}
13 | }
14 | \value{
15 | Tibble (data frame)
16 | }
17 | \description{
18 | Convert vcf format to tibble (data frame)
19 | }
20 | 


--------------------------------------------------------------------------------
/man/vcflist_overlaps.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/manipulate.r
 3 | \name{vcflist_overlaps}
 4 | \alias{vcflist_overlaps}
 5 | \title{Reduce list of VCFs to intersecting regions}
 6 | \usage{
 7 | vcflist_overlaps(vcflist, chrompos)
 8 | }
 9 | \arguments{
10 | \item{vcflist}{List of VCF objects, or list of VCF filenames, or mix of VCF objects and filenames}
11 | 
12 | \item{chrompos}{Either vector of chromosome and position ranges e.g. "1:1000" or "1:1000-2000", or data frame with columns \code{chrom}, \code{start}, \code{end}.}
13 | }
14 | \value{
15 | List of VCFs
16 | }
17 | \description{
18 | Reduce list of VCFs to intersecting regions
19 | }
20 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)  # test framework that provides test_check()
2 | library(gwasvcf)  # the package under test
3 | 
4 | test_check("gwasvcf")  # run the testthat suite for the gwasvcf package
5 | 


--------------------------------------------------------------------------------
/tests/testthat/test_manipulate.r:
--------------------------------------------------------------------------------
 1 | # Tests for the VCF manipulation helpers vcflist_overlaps() and create_vcf().
 2 | context("VCF manipulations")
 3 | library(gwasvcf)
 4 | library(magrittr)
 5 | library(dplyr)
 6 | 
 7 | fn <- system.file("extdata", "data.vcf.gz", package = "gwasvcf")
 8 | 
 9 | # Read the bundled example file once; the overlapping row windows below are
10 | # all taken from this single object instead of re-reading the file each time.
11 | V <- VariantAnnotation::readVcf(fn)
12 | vcf1 <- V[1:70, ]
13 | vcf2 <- V[40:90, ]
14 | vcf3 <- V[60:92, ]
15 | vcf4 <- V[65:92, ]
16 | if (Sys.info()["sysname"] != "Windows") set_bcftools()
17 | 
18 | # Need to check what happens with multiallelic variants
19 | 
20 | test_that("vcflist_overlaps", {
21 |   skip_on_os("windows")
22 | 
23 |   # For every call, check how many VCFs come back and how many variants each
24 |   # of them retains.
25 |   o <- vcflist_overlaps(vcflist = list(vcf1, vcf2), chrompos = NULL)
26 |   expect_length(o, 2)
27 |   expect_true(all(vapply(o, length, integer(1)) == 31))
28 | 
29 |   # Mix of in-memory VCF objects and a filename
30 |   o <- vcflist_overlaps(vcflist = list(vcf1, vcf2, vcf3, fn), chrompos = "1:1-10000000")
31 |   expect_length(o, 4)
32 |   expect_true(all(vapply(o, length, integer(1)) == 11))
33 | 
34 |   o <- vcflist_overlaps(vcflist = list(vcf1, vcf2, vcf3, vcf4), chrompos = "1:1-10000000")
35 |   expect_length(o, 4)
36 |   expect_true(all(vapply(o, length, integer(1)) == 6))
37 | 
38 |   # Filenames only
39 |   o <- vcflist_overlaps(vcflist = list(fn, fn), chrompos = "1:1-10000000")
40 |   expect_length(o, 2)
41 |   expect_true(all(vapply(o, length, integer(1)) == 92))
42 | 
43 |   # A region on chromosome 2 is expected to return no variants
44 |   o <- vcflist_overlaps(vcflist = list(fn, fn), chrompos = "2:1-10000000")
45 |   expect_length(o, 2)
46 |   expect_true(all(vapply(o, length, integer(1)) == 0))
47 | })
48 | 
49 | 
50 | # Flatten the full dataset to a tibble; its columns feed create_vcf() below.
51 | vv <- V %>% vcf_to_granges() %>% dplyr::as_tibble()
52 | 
53 | test_that("create vcf", {
54 |   out <- vv %$% create_vcf(chrom=seqnames, pos=start, nea=REF, ea=ALT, snp=ID, ea_af=AF, effect=ES, se=SE, pval=10^-LP, n=SS, name="a")
55 |   # Write to a temporary file so the test leaves nothing behind in the
56 |   # working directory.
57 |   vcf_path <- tempfile(fileext = ".vcf")
58 |   VariantAnnotation::writeVcf(out, vcf_path)
59 |   expect_true(file.exists(vcf_path))
60 |   unlink(vcf_path)
61 | })
62 | 


--------------------------------------------------------------------------------
/tests/testthat/test_proxy.R:
--------------------------------------------------------------------------------
  1 | # Tests for LD-proxy lookups through query_gwas() and proxy_match(), run both
  2 | # with bcftools unset (set_bcftools(NULL)) and with bcftools configured.
  3 | context("Getting LD proxies")
  4 | library(gwasvcf)
  5 | library(genetics.binaRies)
  6 | 
  7 | vcffile <- system.file("extdata", "data.vcf.gz", package = "gwasvcf")
  8 | vcf <- VariantAnnotation::readVcf(vcffile)
  9 | # bfile is the path to eur.bed without its extension. The pattern is anchored
 10 | # and the dot escaped so that nothing else in the install path can be matched
 11 | # (an unescaped ".bed" would also hit e.g. "embedded").
 12 | bfile <- system.file("extdata", "eur.bed", package = "gwasvcf") %>% sub("\\.bed$", "", .)
 13 | 
 14 | set_plink()
 15 | 
 16 | test_that("query native", {
 17 |   skip_on_os("windows")
 18 | 
 19 |   set_bcftools(NULL)
 20 |   a <- query_gwas(vcffile, rsid = "rs4970420")
 21 |   expect_equal(nrow(a), 1)
 22 | 
 23 |   a <- query_gwas(vcf, rsid = "rs4970420")
 24 |   expect_equal(nrow(a), 1)
 25 | 
 26 |   # rs4442317 gives no rows on a direct lookup ...
 27 |   a <- query_gwas(vcffile, rsid = "rs4442317")
 28 |   expect_equal(nrow(a), 0)
 29 | 
 30 |   a <- query_gwas(vcf, rsid = "rs4442317")
 31 |   expect_equal(nrow(a), 0)
 32 | 
 33 |   # ... but one row once proxies are allowed
 34 |   a <- query_gwas(vcffile, rsid = "rs4442317", proxies = "yes", bfile = bfile, tag_r2 = 0.05)
 35 |   expect_equal(nrow(a), 1)
 36 | 
 37 |   a <- query_gwas(vcf, rsid = "rs4442317", proxies = "yes", bfile = bfile, tag_r2 = 0.05)
 38 |   expect_equal(nrow(a), 1)
 39 | 
 40 |   a <- query_gwas(vcffile, rsid = "rs9729550", proxies = "only", bfile = bfile, tag_r2 = 0.05)
 41 |   expect_equal(nrow(a), 1)
 42 | 
 43 |   a <- query_gwas(vcf, rsid = "rs9729550", proxies = "only", bfile = bfile, tag_r2 = 0.05)
 44 |   expect_equal(nrow(a), 1)
 45 | 
 46 | })
 47 | 
 48 | 
 49 | test_that("query bcftools", {
 50 |   skip_on_os("windows")
 51 | 
 52 |   set_bcftools()
 53 |   a <- query_gwas(vcffile, rsid = "rs4970420")
 54 |   expect_equal(nrow(a), 1)
 55 | 
 56 |   a <- query_gwas(vcf, rsid = "rs4970420")
 57 |   expect_equal(nrow(a), 1)
 58 | 
 59 |   a <- query_gwas(vcffile, rsid = "rs4442317")
 60 |   expect_equal(nrow(a), 0)
 61 | 
 62 |   a <- query_gwas(vcf, rsid = "rs4442317")
 63 |   expect_equal(nrow(a), 0)
 64 | 
 65 |   a <- query_gwas(vcffile, rsid = "rs4442317", proxies = "yes", bfile = bfile, tag_r2 = 0.05)
 66 |   expect_equal(nrow(a), 1)
 67 | 
 68 |   a <- query_gwas(vcffile, rsid = c("rs12565286", "rs4442317"), proxies = "yes", bfile = bfile, tag_r2 = 0.05)
 69 |   expect_equal(nrow(a), 2)
 70 | 
 71 |   a <- query_gwas(vcf, rsid = "rs4442317", proxies = "yes", bfile = bfile, tag_r2 = 0.05)
 72 |   expect_equal(nrow(a), 1)
 73 | 
 74 |   a <- query_gwas(vcf, rsid = c("rs12565286", "rs4442317"), proxies = "yes", bfile = bfile, tag_r2 = 0.05)
 75 |   expect_equal(nrow(a), 2)
 76 | 
 77 |   a <- query_gwas(vcffile, rsid = "rs9729550", proxies = "only", bfile = bfile, tag_r2 = 0.05)
 78 |   expect_equal(nrow(a), 1)
 79 | 
 80 |   a <- query_gwas(vcf, rsid = "rs9729550", proxies = "only", bfile = bfile, tag_r2 = 0.05)
 81 |   expect_equal(nrow(a), 1)
 82 | 
 83 | })
 84 | 
 85 | test_that("alignment native", {
 86 |   skip_on_os("windows")
 87 | 
 88 |   set_bcftools(NULL)
 89 |   rsid <- names(SummarizedExperiment::rowRanges(vcf))
 90 |   a <- proxy_match(vcf, rsid, bfile, proxies = "only")
 91 |   b <- query_gwas(vcf, rsid = rsid)
 92 |   # Position in `a` of each variant in `b` (NA when absent), so that the two
 93 |   # effect-size vectors are compared variant by variant.
 94 |   index <- match(names(b), names(a))
 95 |   expect_true(cor(vcf_to_granges(b)$ES, vcf_to_granges(a)$ES[index], use = "pairwise.complete.obs") > 0.5)
 96 | })
 97 | 
 98 | test_that("alignment bcftools", {
 99 |   skip_on_os("windows")
100 | 
101 |   set_bcftools()
102 |   rsid <- names(SummarizedExperiment::rowRanges(vcf))
103 |   a <- proxy_match(vcf, rsid, bfile, proxies = "only")
104 |   b <- query_gwas(vcf, rsid = rsid)
105 |   # Position in `a` of each variant in `b` (NA when absent), so that the two
106 |   # effect-size vectors are compared variant by variant.
107 |   index <- match(names(b), names(a))
108 |   expect_true(cor(vcf_to_granges(b)$ES, vcf_to_granges(a)$ES[index], use = "pairwise.complete.obs") > 0.5)
109 | })
110 | 


--------------------------------------------------------------------------------
/tests/testthat/test_pvali.r:
--------------------------------------------------------------------------------
 1 | # Tests for building a p-value index from a VCF file and querying with it.
 2 | context("Querying vcf files with pval index")
 3 | library(gwasvcf)
 4 | 
 5 | fn <- system.file("extdata", "data.vcf.gz", package = "gwasvcf")
 6 | if (Sys.info()["sysname"] != "Windows") set_bcftools()
 7 | 
 8 | indexname <- tempfile()
 9 | 
10 | test_that("create index", {
11 |   skip_on_os(c("windows", "linux"))
12 |   create_pval_index_from_vcf(fn, 0.4, indexname)
13 |   expect_true(file.exists(indexname))
14 | })
15 | 
16 | test_that("read in", {
17 |   skip_on_os(c("windows", "linux"))
18 |   out <- query_pvali(0.05, indexname)
19 |   expect_equal(nrow(out), 7)
20 | })
21 | 
22 | test_that("query with pvali", {
23 |   skip_on_os(c("windows", "linux"))
24 |   b <- query_gwas(fn, pval = 0.05, pvali = indexname)
25 |   expect_equal(nrow(b), 7)
26 | })
27 | 
28 | # Same threshold without the index: the number of rows must be the same.
29 | test_that("query without pvali", {
30 |   skip_on_os("windows")
31 |   b <- query_gwas(fn, pval = 0.05)
32 |   expect_equal(nrow(b), 7)
33 | })
34 | 
35 | unlink(indexname)
36 | 


--------------------------------------------------------------------------------
/tests/testthat/test_query.r:
--------------------------------------------------------------------------------
  1 | # Tests for query_gwas() and the lower-level query_* helpers, run against both
  2 | # the bundled VCF file and the same data read into memory.
  3 | context("Querying vcf files")
  4 | library(gwasvcf)
  5 | 
  6 | 
  7 | fn <- system.file("extdata", "data.vcf.gz", package = "gwasvcf")
  8 | vcf <- VariantAnnotation::readVcf(fn)
  9 | 
 10 | 
 11 | 
 12 | test_that("query_gwas", {
 13 |   chrompos <- c("1:800000-1000000")
 14 |   rsid <- c("rs3128126", "rs3121561", "rs3813193")
 15 |   id <- "IEU-a-2"
 16 |   pval <- 0.2
 17 | 
 18 |   # Each block runs the same query with os = "Darwin" and os = "Windows", with
 19 |   # and without an explicit id, and requires all four results to agree.
 20 |   expect_true({
 21 |     mac <- query_gwas(fn, chrompos = chrompos, os = "Darwin")
 22 |     win <- query_gwas(fn, chrompos = chrompos, os = "Windows")
 23 |     mac_id <- query_gwas(fn, chrompos = chrompos, id = id, os = "Darwin")
 24 |     win_id <- query_gwas(fn, chrompos = chrompos, id = id, os = "Windows")
 25 |     all(mac == win) && all(win == mac_id) && all(mac_id == win_id)
 26 |   })
 27 | 
 28 |   expect_true({
 29 |     mac <- query_gwas(fn, rsid = rsid, os = "Darwin")
 30 |     win <- query_gwas(fn, rsid = rsid, os = "Windows")
 31 |     mac_id <- query_gwas(fn, rsid = rsid, id = id, os = "Darwin")
 32 |     win_id <- query_gwas(fn, rsid = rsid, id = id, os = "Windows")
 33 |     all(mac == win) && all(win == mac_id) && all(mac_id == win_id)
 34 |   })
 35 | 
 36 |   expect_true({
 37 |     mac <- query_gwas(fn, pval = pval, os = "Darwin")
 38 |     win <- query_gwas(fn, pval = pval, os = "Windows")
 39 |     mac_id <- query_gwas(fn, pval = pval, id = id, os = "Darwin")
 40 |     win_id <- query_gwas(fn, pval = pval, id = id, os = "Windows")
 41 |     all(mac == win) && all(win == mac_id) && all(mac_id == win_id)
 42 |   })
 43 | 
 44 |   expect_true({
 45 |     mac <- query_gwas(vcf, chrompos = chrompos, os = "Darwin")
 46 |     win <- query_gwas(vcf, chrompos = chrompos, os = "Windows")
 47 |     mac_id <- query_gwas(vcf, chrompos = chrompos, id = id, os = "Darwin")
 48 |     win_id <- query_gwas(vcf, chrompos = chrompos, id = id, os = "Windows")
 49 |     all(mac == win) && all(win == mac_id) && all(mac_id == win_id)
 50 |   })
 51 | 
 52 |   expect_true({
 53 |     mac <- query_gwas(vcf, rsid = rsid, os = "Darwin")
 54 |     win <- query_gwas(vcf, rsid = rsid, os = "Windows")
 55 |     mac_id <- query_gwas(vcf, rsid = rsid, id = id, os = "Darwin")
 56 |     win_id <- query_gwas(vcf, rsid = rsid, id = id, os = "Windows")
 57 |     all(mac == win) && all(win == mac_id) && all(mac_id == win_id)
 58 |   })
 59 | 
 60 |   expect_true({
 61 |     mac <- query_gwas(vcf, pval = pval, os = "Darwin")
 62 |     win <- query_gwas(vcf, pval = pval, os = "Windows")
 63 |     mac_id <- query_gwas(vcf, pval = pval, id = id, os = "Darwin")
 64 |     win_id <- query_gwas(vcf, pval = pval, id = id, os = "Windows")
 65 |     all(mac == win) && all(win == mac_id) && all(mac_id == win_id)
 66 |   })
 67 | })
 68 | 
 69 | 
 70 | test_that("parse_chrompos", {
 71 |   expect_equal(parse_chrompos("1:10000") %>% length, 1)
 72 |   expect_equal(parse_chrompos("1:10000-100000") %>% length, 1)
 73 |   expect_equal(parse_chrompos(c("1:10000-10000", "2:100-200")) %>% length, 2)
 74 |   expect_equal(parse_chrompos(dplyr::tibble(chrom=c(1,2),start=c(10000,100), end=c(10000,200))) %>% length, 2)
 75 | })
 76 | 
 77 | 
 78 | test_that("vcf_to_granges", {
 79 |   g <- vcf_to_granges(vcf)
 80 |   expect_equal(length(g), length(vcf))
 81 | })
 82 | 
 83 | 
 84 | test_that("query_chrompos_file", {
 85 |   g <- parse_chrompos("1:800000-1000000")
 86 |   v <- query_chrompos_file(g, fn)
 87 |   expect_equal(length(v), 3)
 88 | })
 89 | 
 90 | 
 91 | test_that("query_rsid_file", {
 92 |   v <- query_rsid_file(c("rs3128126", "rs3121561", "rs3813193"), fn)
 93 |   expect_equal(length(v), 3)
 94 | })
 95 | 
 96 | 
 97 | test_that("query_pval_file", {
 98 |   v <- query_pval_file(0.2, fn)
 99 |   expect_true(length(v) < 92)
100 |   expect_true(length(v) > 5)
101 | })
102 | 
103 | 
104 | test_that("query_chrompos_vcf", {
105 |   v <- query_chrompos_vcf("1:800000-1000000", vcf)
106 |   expect_equal(length(v), 3)
107 | })
108 | 
109 | 
110 | test_that("query_rsid_vcf", {
111 |   v <- query_rsid_vcf(c("rs3128126", "rs3121561", "rs3813193"), vcf)
112 |   expect_equal(length(v), 3)
113 | })
114 | 
115 | 
116 | test_that("query_pval_vcf", {
117 |   v <- query_pval_vcf(0.2, vcf)
118 |   expect_true(length(v) < 92)
119 |   expect_true(length(v) > 5)
120 | })
121 | 
122 | 
123 | test_that("query_rsid_bcftools", {
124 |   skip_on_os("windows")
125 |   set_bcftools()
126 |   v <- query_rsid_bcftools(c("rs3128126", "rs3121561", "rs3813193"), fn)
127 |   expect_equal(length(v), 3)
128 | })
129 | 
130 | 
131 | test_that("query_pval_bcftools", {
132 |   skip_on_os("windows")
133 |   set_bcftools()
134 |   v <- query_pval_bcftools(0.2, fn)
135 |   expect_true(length(v) < 92)
136 |   expect_true(length(v) > 5)
137 | })
138 | 
139 | 
140 | test_that("query_chrompos_bcftools", {
141 |   skip_on_os("windows")
142 |   set_bcftools()
143 |   v <- query_chrompos_bcftools("1:800000-1000000", fn)
144 |   expect_equal(length(v), 3)
145 | })
146 | 
147 | 
148 | # The remaining tests read a remote file; they are skipped on CI and on mac, windows and linux.
149 | test_that("query_chrompos_bcftools url", {
150 |   skip_on_ci()
151 |   skip_on_os(c("mac", "windows", "linux"))
152 |   set_bcftools()
153 |   u <- "https://objectstorage.us-ashburn-1.oraclecloud.com/n/idrvm4tkz2a8/b/OpenGWAS/o/ieu-a/ieu-a-2/ieu-a-2.vcf.gz"
154 |   # Skip rather than fail when the remote file cannot be reached
155 |   skip_if_not(RCurl::url.exists(u), "remote VCF is not reachable")
156 |   v <- query_chrompos_bcftools("1:800000-1000000", u)
157 |   expect_equal(length(v), 3)
158 | })
159 | 
160 | test_that("query_gwas url chrompos", {
161 |   skip_on_ci()
162 |   skip_on_os(c("mac", "windows", "linux"))
163 |   set_bcftools()
164 |   u <- "https://objectstorage.us-ashburn-1.oraclecloud.com/n/idrvm4tkz2a8/b/OpenGWAS/o/ieu-a/ieu-a-2/ieu-a-2.vcf.gz"
165 |   v <- query_gwas(u, chrompos = "1:800000-1000000")
166 |   expect_equal(length(v), 3)
167 | })
168 | 
169 | test_that("query_gwas url pval", {
170 |   skip_on_ci()
171 |   skip_on_os(c("mac", "windows", "linux"))
172 |   set_bcftools()
173 |   u <- "https://objectstorage.us-ashburn-1.oraclecloud.com/n/idrvm4tkz2a8/b/OpenGWAS/o/ieu-a/ieu-a-2/ieu-a-2.vcf.gz"
174 |   v <- query_gwas(u, pval = 5e-8)
175 |   expect_equal(length(v), 2041)
176 | })
177 | 


--------------------------------------------------------------------------------
/tests/testthat/test_rsidx.r:
--------------------------------------------------------------------------------
 1 | # Tests for the rsid index (rsidx) helpers and the sqlite LD-proxy database.
 2 | context("Querying vcf files with rsidx")
 3 | library(gwasvcf)
 4 | 
 5 | 
 6 | fn <- system.file("extdata", "data.vcf.gz", package = "gwasvcf")
 7 | vcf <- VariantAnnotation::readVcf(fn)
 8 | 
 9 | indexname <- tempfile()
10 | 
11 | test_that("create index", {
12 |   skip_on_os(c("windows", "linux"))
13 |   create_rsidx_index_from_vcf(fn, indexname)
14 |   expect_true(file.exists(indexname))
15 | })
16 | 
17 | test_that("read in", {
18 |   skip_on_os(c("windows", "linux"))
19 |   out <- query_rsidx(head(names(vcf)), indexname)
20 |   expect_equal(nrow(out), 6)
21 | })
22 | 
23 | test_that("create sub index", {
24 |   skip_on_os(c("windows", "linux"))
25 |   newname <- tempfile()
26 |   create_rsidx_sub_index(head(names(vcf)), indexname, newname)
27 |   expect_true(file.exists(newname))
28 |   unlink(newname)
29 | })
30 | 
31 | test_that("query with rsidx", {
32 |   skip_on_os(c("windows", "linux"))
33 |   a <- query_gwas(fn, rsid = head(names(vcf)))
34 |   b <- query_gwas(fn, rsid = head(names(vcf)), rsidx = indexname)
35 |   expect_true(all(names(a) == names(b)))
36 | })
37 | 
38 | unlink(indexname)
39 | 
40 | 
41 | # From here on `fn` is the path to eur.bed without its extension. The pattern
42 | # is anchored and the dot escaped so only the trailing ".bed" is removed.
43 | fn <- system.file("extdata", "eur.bed", package = "gwasvcf") %>% sub("\\.bed$", "", .)
44 | dbfile <- tempfile()
45 | 
46 | set_plink()
47 | 
48 | test_that("tag db", {
49 |   skip_on_os(c("windows", "linux"))
50 |   create_ldref_sqlite(fn, dbfile, 0.04)
51 |   expect_true(file.exists(dbfile))
52 | })
53 | 
54 | test_that("sqlite_ld_proxies", {
55 |   skip("TODO: check this test")
56 |   m <- data.table::fread(paste0(fn, ".bim")) %>% {sample(.$V2, 100, replace=FALSE)}
57 |   ld <- sqlite_ld_proxies(m, dbfile, 0.2)
58 |   # Requires an expect_* condition here
59 | })
60 | 
61 | test_that("sqlite proxy", {
62 |   skip("TODO: check this test")
63 |   vcffile <- system.file("extdata", "data.vcf.gz", package = "gwasvcf")
64 |   set_bcftools()
65 |   a <- query_gwas(vcffile, rsid = "rs4442317", proxies = "yes", dbfile = dbfile, tag_r2 = 0.05)
66 |   expect_equal(nrow(a), 1)
67 | })
68 | 
69 | test_that("sqlite proxy multiple rsids", {
70 |   skip("TODO: check this test")
71 |   vcffile <- system.file("extdata", "data.vcf.gz", package = "gwasvcf")
72 |   set_bcftools()
73 |   a <- query_gwas(vcffile, rsid = c("rs12565286", "rs4442317"), proxies = "yes", dbfile = dbfile, tag_r2 = 0.05)
74 |   expect_equal(nrow(a), 2)
75 | })
76 | 
77 | test_that("sqlite proxy only", {
78 |   skip("TODO: check this test")
79 |   vcffile <- system.file("extdata", "data.vcf.gz", package = "gwasvcf")
80 |   set_bcftools()
81 |   a <- query_gwas(vcffile, rsid = c("rs12565286", "rs4442317"), proxies = "only", dbfile = dbfile, tag_r2 = 0.05)
82 |   expect_equal(nrow(a), 2)
83 |   b <- a %>% vcf_to_tibble()
84 |   expect_true(all(! b$ID %in% c("rs12565286", "rs4442317")))
85 | })
86 | 
87 | test_that("sqlite proxy no result", {
88 |   skip_on_os(c("windows", "linux"))
89 |   vcffile <- system.file("extdata", "data.vcf.gz", package = "gwasvcf")
90 |   set_bcftools()
91 |   a <- query_gwas(vcffile, rsid = "rs4442317", proxies = "yes", dbfile = dbfile, tag_r2 = 0.5)
92 |   expect_equal(nrow(a), 0)
93 | })
94 | 
95 | unlink(dbfile)
96 | 


--------------------------------------------------------------------------------
/vignettes/figure/target-effects-plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MRCIEU/gwasvcf/820267653ac7720926a13cac00b82c0a0ca840b6/vignettes/figure/target-effects-plot-1.png


--------------------------------------------------------------------------------
/vignettes/guide.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Reading, querying and writing GWAS summary data in VCF format"
  3 | output: rmarkdown::html_vignette
  4 | vignette: >
  5 |   %\VignetteIndexEntry{Reading, querying and writing GWAS summary data in VCF format}
  6 |   %\VignetteEngine{knitr::rmarkdown}
  7 |   %\VignetteEncoding{UTF-8}
  8 | ---
  9 | 
 10 | 
 11 | 
 12 | We developed a format for storing and harmonising GWAS summary data known as [GWAS VCF format](https://github.com/MRCIEU/gwas-vcf-specification). This format allows very fast queries by chromosome and position range, and it handles multiallelic variants and indels.
 13 | 
 14 | All the data in the [IEU GWAS database](https://gwas.mrcieu.ac.uk/) is available for download in the GWAS VCF format. This R package provides fast and convenient functions for querying and creating GWAS summary data in GWAS VCF format. The package builds on the [VariantAnnotation](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html) Bioconductor package, which itself is based on the widely used [SummarizedExperiment](https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html) Bioconductor package.
 15 | 
 16 | 
 17 | ## External tools
 18 | 
 19 | For some VCF querying functions it is faster to optionally use [bcftools](https://samtools.github.io/bcftools/bcftools.html), and when available the R package will use that strategy. To set a location for the bcftools package, use
 20 | 
 21 | ```r
 22 | library(gwasvcf)
 23 | set_bcftools('/path/to/bcftools')
 24 | ```
 25 | 
 26 | Note that there is no bcftools binary available for Windows, so some querying options will be slower on Windows.
 27 | 
 28 | For LD related functions the package uses [plink 1.90](https://www.cog-genomics.org/plink/1.9). You can specify the location of your plink installation by running
 29 | 
 30 | ```r
 31 | set_plink('/path/to/plink')
 32 | ```
 33 | 
 34 | Alternatively, you can automatically use the binaries bundled here: https://github.com/mrcieu/genetics.binaRies
 35 | 
 36 | ```r
 37 | remotes::install_github('mrcieu/genetics.binaRies')
 38 | set_plink()
 39 | set_bcftools()
 40 | ```
 41 | 
 42 | To unset a path:
 43 | 
 44 | ```r
 45 | set_plink(NULL)
 46 | set_bcftools(NULL)
 47 | ```
 48 | 
 49 | For this vignette we will use the bundled binaries in `genetics.binaRies`.
 50 | 
 51 | 
 52 | ``` r
 53 | suppressWarnings(suppressPackageStartupMessages({
 54 |   library(gwasvcf)
 55 |   library(VariantAnnotation)
 56 |   library(dplyr)
 57 |   library(magrittr)
 58 | }))
 59 | ```
 60 | 
 61 | ``` r
 62 | set_bcftools()
 63 | #> Path not provided, using binaries in the MRCIEU/genetics.binaRies package
 64 | ```
 65 | 
 66 | ## Reading in everything
 67 | 
 68 | To read an entire dataset use the `readVcf` function. As an example we'll use the bundled data which is a small subset of the Speliotes et al 2010 BMI GWAS.
 69 | 
 70 | 
 71 | ``` r
 72 | vcffile <- system.file("extdata", "data.vcf.gz", package="gwasvcf")
 73 | vcf <- readVcf(vcffile)
 74 | class(vcf)
 75 | #> [1] "CollapsedVCF"
 76 | #> attr(,"package")
 77 | #> [1] "VariantAnnotation"
 78 | ```
 79 | 
 80 | Please refer to the `VariantAnnotation` package documentation for full details about the `CollapsedVCF` object. A brief summary follows.
 81 | 
 82 | General info about the dataset can be obtained by calling it:
 83 | 
 84 | 
 85 | ``` r
 86 | vcf
 87 | #> class: CollapsedVCF 
 88 | #> dim: 92 1 
 89 | #> rowRanges(vcf):
 90 | #>   GRanges with 5 metadata columns: paramRangeID, REF, ALT, QUAL, FILTER
 91 | #> info(vcf):
 92 | #>   DataFrame with 1 column: AF
 93 | #> info(header(vcf)):
 94 | #>       Number Type  Description     
 95 | #>    AF A      Float Allele Frequency
 96 | #> geno(vcf):
 97 | #>   List of length 9: ES, SE, LP, AF, SS, EZ, SI, NC, ID
 98 | #> geno(header(vcf)):
 99 | #>       Number Type   Description                                                     
100 | #>    ES A      Float  Effect size estimate relative to the alternative allele         
101 | #>    SE A      Float  Standard error of effect size estimate                          
102 | #>    LP A      Float  -log10 p-value for effect estimate                              
103 | #>    AF A      Float  Alternate allele frequency in the association study             
104 | #>    SS A      Float  Sample size used to estimate genetic effect                     
105 | #>    EZ A      Float  Z-score provided if it was used to derive the EFFECT and SE f...
106 | #>    SI A      Float  Accuracy score of summary data imputation                       
107 | #>    NC A      Float  Number of cases used to estimate genetic effect                 
108 | #>    ID 1      String Study variant identifier
109 | ```
110 | 
111 | There are 92 rows and 1 column which means 92 SNPs and one GWAS. See the header information:
112 | 
113 | 
114 | ``` r
115 | header(vcf)
116 | #> class: VCFHeader 
117 | #> samples(1): IEU-a-2
118 | #> meta(4): fileformat META SAMPLE contig
119 | #> fixed(1): FILTER
120 | #> info(1): AF
121 | #> geno(9): ES SE ... NC ID
122 | ```
123 | 
124 | See the names of the GWAS datasets (in this case just one, and it refers to the IEU GWAS database ID name):
125 | 
126 | 
127 | ``` r
128 | samples(header(vcf))
129 | #> [1] "IEU-a-2"
130 | ```
131 | 
132 | In this case you can obtain information about this study through the `ieugwasr` package e.g. `ieugwasr::gwasinfo("IEU-a-2")`.
133 | 
134 | There are a few components within the object:
135 | 
136 | - `header` which has the meta data describing the dataset, including the association result variables
137 | - `rowRanges` which is information about each variant
138 | - `info` which is further metadata about each variant
139 | - `geno` which is the actual association results for each GWAS
140 | 
141 | The `rowRanges` object is a `GenomicRanges` class, which is useful for performing fast operations on chromosome position information.
142 | 
143 | 
144 | ``` r
145 | rowRanges(vcf)
146 | #> GRanges object with 92 ranges and 5 metadata columns:
147 | #>              seqnames    ranges strand | paramRangeID            REF
148 | #>                 <Rle> <IRanges>  <Rle> |     <factor> <DNAStringSet>
149 | #>   rs12565286        1    721290      * |           NA              G
150 | #>   rs11804171        1    723819      * |           NA              T
151 | #>    rs2977670        1    723891      * |           NA              G
152 | #>    rs3094315        1    752566      * |           NA              G
153 | #>    rs2073813        1    753541      * |           NA              G
154 | #>          ...      ...       ...    ... .          ...            ...
155 | #>     rs715643        1   1172907      * |           NA              C
156 | #>    rs6675798        1   1176597      * |           NA              T
157 | #>    rs6603783        1   1181751      * |           NA              T
158 | #>    rs6603785        1   1186502      * |           NA              A
159 | #>    rs6603787        1   1188225      * |           NA              G
160 | #>                             ALT      QUAL      FILTER
161 | #>              <DNAStringSetList> <numeric> <character>
162 | #>   rs12565286                  C        NA        PASS
163 | #>   rs11804171                  A        NA        PASS
164 | #>    rs2977670                  C        NA        PASS
165 | #>    rs3094315                  A        NA        PASS
166 | #>    rs2073813                  A        NA        PASS
167 | #>          ...                ...       ...         ...
168 | #>     rs715643                  T        NA        PASS
169 | #>    rs6675798                  C        NA        PASS
170 | #>    rs6603783                  C        NA        PASS
171 | #>    rs6603785                  T        NA        PASS
172 | #>    rs6603787                  T        NA        PASS
173 | #>   -------
174 | #>   seqinfo: 84 sequences from GRCh37 genome
175 | ```
176 | 
177 | ## Converting to simple dataframes
178 | 
179 | The VCF object is somewhat complex and you can read more about it in the [VariantAnnotation package documentation](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html). You can create various other formats that might be easier to use from it. For example, create a `GRanges` object which is great for fast chromosome-position operations
180 | 
181 | 
182 | ``` r
183 | vcf_to_granges(vcf)
184 | #> GRanges object with 92 ranges and 15 metadata columns:
185 | #>              seqnames    ranges strand | paramRangeID         REF         ALT
186 | #>                 <Rle> <IRanges>  <Rle> |     <factor> <character> <character>
187 | #>   rs12565286        1    721290      * |           NA           G           C
188 | #>   rs11804171        1    723819      * |           NA           T           A
189 | #>    rs2977670        1    723891      * |           NA           G           C
190 | #>    rs3094315        1    752566      * |           NA           G           A
191 | #>    rs2073813        1    753541      * |           NA           G           A
192 | #>          ...      ...       ...    ... .          ...         ...         ...
193 | #>     rs715643        1   1172907      * |           NA           C           T
194 | #>    rs6675798        1   1176597      * |           NA           T           C
195 | #>    rs6603783        1   1181751      * |           NA           T           C
196 | #>    rs6603785        1   1186502      * |           NA           A           T
197 | #>    rs6603787        1   1188225      * |           NA           G           T
198 | #>                   QUAL      FILTER        ES        SE        LP        AF        SS
199 | #>              <numeric> <character> <numeric> <numeric> <numeric> <numeric> <numeric>
200 | #>   rs12565286        NA        PASS   -0.0067    0.0145 0.1930060   0.93220  109823.0
201 | #>   rs11804171        NA        PASS   -0.0146    0.0175 0.3935110   0.96296   84828.0
202 | #>    rs2977670        NA        PASS    0.0044    0.0184 0.0909791   0.07143   68458.9
203 | #>    rs3094315        NA        PASS    0.0060    0.0065 0.4485500   0.15520  131544.0
204 | #>    rs2073813        NA        PASS    0.0035    0.0102 0.1357860        NA   64351.3
205 | #>          ...       ...         ...       ...       ...       ...       ...       ...
206 | #>     rs715643        NA        PASS    0.0019    0.0118 0.0594337   0.90833    121822
207 | #>    rs6675798        NA        PASS   -0.0013    0.0067 0.0725270   0.89170    223475
208 | #>    rs6603783        NA        PASS   -0.0002    0.0069 0.0101499   0.90000    220022
209 | #>    rs6603785        NA        PASS    0.0075    0.0104 0.3271640   0.91667    165964
210 | #>    rs6603787        NA        PASS    0.0025    0.0089 0.1085740        NA    199099
211 | #>                     EZ        SI        NC          ID          id
212 | #>              <numeric> <numeric> <numeric> <character> <character>
213 | #>   rs12565286        NA        NA        NA  rs12565286     IEU-a-2
214 | #>   rs11804171        NA        NA        NA  rs11804171     IEU-a-2
215 | #>    rs2977670        NA        NA        NA   rs2977670     IEU-a-2
216 | #>    rs3094315        NA        NA        NA   rs3094315     IEU-a-2
217 | #>    rs2073813        NA        NA        NA   rs2073813     IEU-a-2
218 | #>          ...       ...       ...       ...         ...         ...
219 | #>     rs715643        NA        NA        NA    rs715643     IEU-a-2
220 | #>    rs6675798        NA        NA        NA   rs6675798     IEU-a-2
221 | #>    rs6603783        NA        NA        NA   rs6603783     IEU-a-2
222 | #>    rs6603785        NA        NA        NA   rs6603785     IEU-a-2
223 | #>    rs6603787        NA        NA        NA   rs6603787     IEU-a-2
224 | #>   -------
225 | #>   seqinfo: 84 sequences from GRCh37 genome
226 | ```
227 | 
228 | Create a data frame:
229 | 
230 | 
231 | ``` r
232 | vcf_to_granges(vcf) %>% dplyr::as_tibble()
233 | #> # A tibble: 92 × 20
234 | #>    seqnames  start    end width strand paramRangeID REF   ALT    QUAL FILTER      ES
235 | #>    <fct>     <int>  <int> <int> <fct>  <fct>        <chr> <chr> <dbl> <chr>    <dbl>
236 | #>  1 1        721290 721290     1 *      <NA>         G     C        NA PASS   -0.0067
237 | #>  2 1        723819 723819     1 *      <NA>         T     A        NA PASS   -0.0146
238 | #>  3 1        723891 723891     1 *      <NA>         G     C        NA PASS    0.0044
239 | #>  4 1        752566 752566     1 *      <NA>         G     A        NA PASS    0.006 
240 | #>  5 1        753541 753541     1 *      <NA>         G     A        NA PASS    0.0035
241 | #>  6 1        754192 754192     1 *      <NA>         A     G        NA PASS    0.0077
242 | #>  7 1        768448 768448     1 *      <NA>         G     A        NA PASS   -0.0027
243 | #>  8 1        775659 775659     1 *      <NA>         A     G        NA PASS    0.0029
244 | #>  9 1        777122 777122     1 *      <NA>         A     T        NA PASS    0.0031
245 | #> 10 1        779322 779322     1 *      <NA>         A     G        NA PASS   -0.0062
246 | #> # ℹ 82 more rows
247 | #> # ℹ 9 more variables: SE <dbl>, LP <dbl>, AF <dbl>, SS <dbl>, EZ <dbl>, SI <dbl>,
248 | #> #   NC <dbl>, ID <chr>, id <chr>
249 | ```
250 | 
251 | The direct conversion to formats for tools such as TwoSampleMR, coloc, and many others can also be made using the [https://github.com/mrcieu/gwasglue](https://github.com/mrcieu/gwasglue) R package.
252 | 
253 | ## Reading in with filters
254 | 
255 | The `query_gwas()` function takes either a filename to a vcf file, or vcf object as the main argument. You can then query on `rsid`, `pval` or `chrompos`. For example
256 | 
257 | 
258 | ``` r
259 | vcfsubset <- query_gwas(vcffile, chrompos=c("1:1097291-1099437"))
260 | ```
261 | 
262 | and
263 | 
264 | 
265 | ``` r
266 | vcf <- readVcf(vcffile)
267 | vcfsubset <- query_gwas(vcf, chrompos=c("1:1097291-1099437"))
268 | ```
269 | 
270 | give identical results, but the former saves time and memory because it queries the file using an index and only reads in what is required.
271 | 
272 | Examples of other filters are here:
273 | 
274 | 
275 | ``` r
276 | vcf <- query_gwas(vcffile, rsid=c("rs3128126", "rs3121561", "rs3813193"))
277 | vcf
278 | #> class: CollapsedVCF 
279 | #> dim: 3 1 
280 | #> rowRanges(vcf):
281 | #>   GRanges with 5 metadata columns: paramRangeID, REF, ALT, QUAL, FILTER
282 | #> info(vcf):
283 | #>   DataFrame with 3 columns: AF, AC, AN
284 | #> info(header(vcf)):
285 | #>       Number Type    Description                                
286 | #>    AF A      Float   Allele Frequency                           
287 | #>    AC A      Integer Allele count in genotypes                  
288 | #>    AN 1      Integer Total number of alleles in called genotypes
289 | #> geno(vcf):
290 | #>   List of length 9: ES, SE, LP, AF, SS, EZ, SI, NC, ID
291 | #> geno(header(vcf)):
292 | #>       Number Type   Description                                                     
293 | #>    ES A      Float  Effect size estimate relative to the alternative allele         
294 | #>    SE A      Float  Standard error of effect size estimate                          
295 | #>    LP A      Float  -log10 p-value for effect estimate                              
296 | #>    AF A      Float  Alternate allele frequency in the association study             
297 | #>    SS A      Float  Sample size used to estimate genetic effect                     
298 | #>    EZ A      Float  Z-score provided if it was used to derive the EFFECT and SE f...
299 | #>    SI A      Float  Accuracy score of summary data imputation                       
300 | #>    NC A      Float  Number of cases used to estimate genetic effect                 
301 | #>    ID 1      String Study variant identifier
302 | ```
303 | 
304 | 
305 | ``` r
306 | vcf <- query_gwas(vcffile, pval=0.5)
307 | vcf
308 | #> class: CollapsedVCF 
309 | #> dim: 45 1 
310 | #> rowRanges(vcf):
311 | #>   GRanges with 5 metadata columns: paramRangeID, REF, ALT, QUAL, FILTER
312 | #> info(vcf):
313 | #>   DataFrame with 3 columns: AF, AC, AN
314 | #> info(header(vcf)):
315 | #>       Number Type    Description                                
316 | #>    AF A      Float   Allele Frequency                           
317 | #>    AC A      Integer Allele count in genotypes                  
318 | #>    AN 1      Integer Total number of alleles in called genotypes
319 | #> geno(vcf):
320 | #>   List of length 9: ES, SE, LP, AF, SS, EZ, SI, NC, ID
321 | #> geno(header(vcf)):
322 | #>       Number Type   Description                                                     
323 | #>    ES A      Float  Effect size estimate relative to the alternative allele         
324 | #>    SE A      Float  Standard error of effect size estimate                          
325 | #>    LP A      Float  -log10 p-value for effect estimate                              
326 | #>    AF A      Float  Alternate allele frequency in the association study             
327 | #>    SS A      Float  Sample size used to estimate genetic effect                     
328 | #>    EZ A      Float  Z-score provided if it was used to derive the EFFECT and SE f...
329 | #>    SI A      Float  Accuracy score of summary data imputation                       
330 | #>    NC A      Float  Number of cases used to estimate genetic effect                 
331 | #>    ID 1      String Study variant identifier
332 | ```
333 | 
334 | 
335 | ``` r
336 | vcf <- query_gwas(vcffile, chrompos=c("1:1097291-1099437"))
337 | vcf
338 | #> class: CollapsedVCF 
339 | #> dim: 2 1 
340 | #> rowRanges(vcf):
341 | #>   GRanges with 5 metadata columns: paramRangeID, REF, ALT, QUAL, FILTER
342 | #> info(vcf):
343 | #>   DataFrame with 3 columns: AF, AC, AN
344 | #> info(header(vcf)):
345 | #>       Number Type    Description                                
346 | #>    AF A      Float   Allele Frequency                           
347 | #>    AC A      Integer Allele count in genotypes                  
348 | #>    AN 1      Integer Total number of alleles in called genotypes
349 | #> geno(vcf):
350 | #>   List of length 9: ES, SE, LP, AF, SS, EZ, SI, NC, ID
351 | #> geno(header(vcf)):
352 | #>       Number Type   Description                                                     
353 | #>    ES A      Float  Effect size estimate relative to the alternative allele         
354 | #>    SE A      Float  Standard error of effect size estimate                          
355 | #>    LP A      Float  -log10 p-value for effect estimate                              
356 | #>    AF A      Float  Alternate allele frequency in the association study             
357 | #>    SS A      Float  Sample size used to estimate genetic effect                     
358 | #>    EZ A      Float  Z-score provided if it was used to derive the EFFECT and SE f...
359 | #>    SI A      Float  Accuracy score of summary data imputation                       
360 | #>    NC A      Float  Number of cases used to estimate genetic effect                 
361 | #>    ID 1      String Study variant identifier
362 | ```
363 | 
364 | It's possible to chain filters together e.g.
365 | 
366 | 
367 | ``` r
368 | vcf <- query_gwas(vcffile, rsid=c("rs3128126", "rs3121561", "rs3813193")) %>%
369 |     query_gwas(pval=0.5)
370 | vcf
371 | #> class: CollapsedVCF 
372 | #> dim: 1 1 
373 | #> rowRanges(vcf):
374 | #>   GRanges with 5 metadata columns: paramRangeID, REF, ALT, QUAL, FILTER
375 | #> info(vcf):
376 | #>   DataFrame with 3 columns: AF, AC, AN
377 | #> info(header(vcf)):
378 | #>       Number Type    Description                                
379 | #>    AF A      Float   Allele Frequency                           
380 | #>    AC A      Integer Allele count in genotypes                  
381 | #>    AN 1      Integer Total number of alleles in called genotypes
382 | #> geno(vcf):
383 | #>   List of length 9: ES, SE, LP, AF, SS, EZ, SI, NC, ID
384 | #> geno(header(vcf)):
385 | #>       Number Type   Description                                                     
386 | #>    ES A      Float  Effect size estimate relative to the alternative allele         
387 | #>    SE A      Float  Standard error of effect size estimate                          
388 | #>    LP A      Float  -log10 p-value for effect estimate                              
389 | #>    AF A      Float  Alternate allele frequency in the association study             
390 | #>    SS A      Float  Sample size used to estimate genetic effect                     
391 | #>    EZ A      Float  Z-score provided if it was used to derive the EFFECT and SE f...
392 | #>    SI A      Float  Accuracy score of summary data imputation                       
393 | #>    NC A      Float  Number of cases used to estimate genetic effect                 
394 | #>    ID 1      String Study variant identifier
395 | ```
396 | 
397 | It's possible to have multiple GWAS studies per vcf. You can specify specific GWAS studies to read in using e.g.
398 | 
399 | 
400 | ``` r
401 | vcf <- query_gwas(vcffile, rsid=c("rs3128126", "rs3121561", "rs3813193"), id="IEU-a-2")
402 | ```
403 | 
404 | Note that querying by chrompos is the fastest way to deal with VCFs, use this over rsid where possible when speed is an issue.
405 | 
406 | ## Indexing rsid values
407 | 
408 | Querying by rsid is slow. If a large number of queries by rsid are to be performed then it could be worth generating an index which would speed up the querying. This approach uses [SQLite](https://www.sqlite.org/index.html) to create a local database, linking rsid to chromosome and position. It strips out the 'rs' from the rs identifiers to enable fast searches by integer. The concept is based on that developed here: [bioforensics/rsidx](https://github.com/bioforensics/rsidx).
409 | 
410 | To create the index:
411 | 
412 | 
413 | ``` r
414 | create_rsidx_index_from_vcf(vcffile, "index.rsidx")
415 | #> Extracting position info
416 | #> Generating index
417 | ```
418 | 
419 | To query using the index:
420 | 
421 | 
422 | ``` r
423 | vcf <- query_gwas(vcffile, rsid=c("rs3128126", "rs3121561", "rs3813193"), rsidx="index.rsidx")
424 | ```
425 | 
426 | ## Indexing p-values
427 | 
428 | Querying by p-value is slow. It could be worth generating an index file for p-values to speed this up. Similar to rsid queries, it uses an sqlite database linking -log10 pvalues to chromosome and position. 
429 | 
430 | To create the index:
431 | 
432 | 
433 | ``` r
434 | create_pval_index_from_vcf(vcffile, maximum_pval=0.05, "index.pvali")
435 | #> Extracting pval info
436 | #> [1] "CREATE TABLE pval_to_coord (chrom TEXT NOT NULL DEFAULT NULL, coord INTEGER NOT NULL DEFAULT NULL, LP REAL NOT NULL DEFAULT 0);"
437 | #> [2] ".separator ,"                                                                                                                   
438 | #> [3] ".import /var/folders/9j/bw4vdrw94yndry3z9cv8ms1m0000gn/T//RtmpFnAKs1/file22f4fdc3a4d pval_to_coord"                             
439 | #> [4] "CREATE INDEX idx_LP ON pval_to_coord (LP)"
440 | #> Generating index
441 | ```
442 | 
443 | To query using the index:
444 | 
445 | 
446 | ``` r
447 | vcf <- query_gwas(vcffile, pval=0.05, pvali="index.pvali")
448 | #> Using pval index
449 | #> Identified 7 variants passing threshold. Extracting...
450 | ```
451 | 
452 | ## A note about chrompos
453 | 
454 | The fastest way to query VCFs is by specifying chromosome and position. You can specify individual positions or ranges, e.g.
455 | 
456 | 
457 | ``` r
458 | cp <- c("1:10000", "2:10000-20000")
459 | ```
460 | 
461 | or as a data frame
462 | 
463 | 
464 | ``` r
465 | cp <- dplyr::tibble(chrom=c(1,2), start=c(10000,10000), end=c(10000, 20000))
466 | ```
467 | 
468 | You can check what will be parsed out with:
469 | 
470 | 
471 | ``` r
472 | parse_chrompos(cp)
473 | #> GRanges object with 2 ranges and 0 metadata columns:
474 | #>       seqnames      ranges strand
475 | #>          <Rle>   <IRanges>  <Rle>
476 | #>   [1]        1       10000      *
477 | #>   [2]        2 10000-20000      *
478 | #>   -------
479 | #>   seqinfo: 2 sequences from an unspecified genome; no seqlengths
480 | ```
481 | 
482 | Querying by p-value or rsid is also possible but is slower as only chrompos is indexed. On Mac and Linux, rsid and p-value queries are performed by calls to bcftools. On Windows it uses VariantAnnotation directly, because bcftools binaries are not available. This is unfortunately somewhat slower. If many operations are being performed it might be faster to read in the whole dataset and perform queries that way.
483 | 
484 | ## LD proxies
485 | 
486 | If a set of rsids are requested from a vcf but some are absent, a reference panel can be used to search for LD proxies, extract them, and align the effects and alleles against the original variants that were requested.
487 | 
488 | There are two ways to perform the LD proxy search:
489 | 
490 | - using a set of genotyped samples as an LD reference panel (e.g. 1000 genomes data) - this is slow but relatively convenient
491 | - compiling an LD tag list from an LD reference panel - once generated this is very fast
492 | 
493 | ### Using an LD reference panel
494 | 
495 | An LD reference panel can be obtained from here: [http://fileserve.mrcieu.ac.uk/ld/data_maf0.01_rs_ref.tgz](http://fileserve.mrcieu.ac.uk/ld/data_maf0.01_rs_ref.tgz). This dataset comprises Europeans from the 1000 genomes project, in plink format, and including only SNPs with MAF > 0.01, and with the reference alleles aligned to the human genome reference sequence. For this vignette we can use a small subset of that dataset:
496 | 
497 | 
498 | ``` r
499 | ldfile <- system.file("extdata", "eur.bed", package="gwasvcf") %>% 
500 |   gsub(".bed", "", .)
501 | ```
502 | 
503 | We also need to provide a path to the plink binary used to generate LD calculations. This can be done through the `genetics.binaRies` package as with bcftools
504 | 
505 | 
506 | ``` r
507 | set_plink()
508 | #> Path not provided, using binaries in the MRCIEU/genetics.binaRies package
509 | ```
510 | 
511 | The rs4442317 variant is not present in the vcf file, i.e. if we query that variant:
512 | 
513 | 
514 | ``` r
515 | query_gwas(vcffile, rsid="rs4442317") %>% nrow
516 | #> [1] 0
517 | ```
518 | 
519 | 
520 | ``` r
521 | vcf <- query_gwas(vcffile, rsid="rs4442317", proxies="yes", bfile=ldfile, tag_r2=0.05)
522 | #> Initial search...
523 | #> Extracted 0 out of 1 rsids
524 | #> Searching for proxies for 1 rsids
525 | #> Determining searchspace...
526 | #> Proxy lookup...
527 | #> Finding proxies...
528 | #> Found 10 proxies
529 | #> Extrating proxies...
530 | #> Identified proxies for 1 of 1 rsids
531 | #> Aligning...
532 | vcf %>% vcf_to_granges()
533 | #> GRanges object with 1 range and 15 metadata columns:
534 | #>             seqnames    ranges strand |         REF         ALT      QUAL
535 | #>                <Rle> <IRanges>  <Rle> | <character> <character> <numeric>
536 | #>   rs4442317        1   1106784      * |           T           C        NA
537 | #>                  FILTER        ES        SE        LP        AF        SS        EZ
538 | #>             <character> <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
539 | #>   rs4442317        PASS    0.0059    0.0071  0.391474    0.8559    138001        NA
540 | #>                    SI        NC          ID          PR          id
541 | #>             <numeric> <numeric> <character> <character> <character>
542 | #>   rs4442317        NA        NA   rs4970420   rs4970420     IEU-a-2
543 | #>   -------
544 | #>   seqinfo: 1 sequence from an unspecified genome; no seqlengths
545 | ```
546 | 
547 | Here we see that the proxy variant is rs4970420.
548 | 
549 | You may also extract only the best available proxies even if the requested rsids are present, by using `proxies="only"`. An example of this shows that the effect size estimates for the proxy variants are aligned to the effect alleles of the target variants:
550 | 
551 | 
552 | 
553 | ``` r
554 | # Read vcf
555 | a <- readVcf(vcffile)
556 | 
557 | # Obtain the best LD proxy for each of the rsids
558 | b <- query_gwas(vcffile, rsid=names(a), proxies="only", bfile=ldfile, tag_r2=0.6)
559 | #> Determining searchspace...
560 | #> Proxy lookup...
561 | #> Finding proxies...
562 | #> Found 270 proxies
563 | #> Extrating proxies...
564 | #> Identified proxies for 52 of 1 rsids
565 | #> Aligning...
566 | 
567 | # Match the target data to the proxy data
568 | index <- match(names(b), names(a))
569 | 
570 | # Plot the target data effects against the proxy data effects
571 | plot(vcf_to_granges(b)$ES, vcf_to_granges(a)$ES[index])
572 | ```
573 | 
574 | <div class="figure">
575 | <img src="figure/target-effects-plot-1.png" alt="Plot of the target data effects against the proxy data effects"  />
576 | <p class="caption">Plot the target data effects against the proxy data effects</p>
577 | </div>
578 | 
579 | ### Compiling a list of tagging variants
580 | 
581 | Using the LD reference panel described above, it is possible to create a sqlite tag reference panel using the following commands. First get an example LD reference panel:
582 | 
583 | 
584 | ``` r
585 | ldfile <- system.file("extdata", "eur.bed", package="gwasvcf") %>% 
586 |   gsub(".bed", "", .)
587 | ```
588 | 
589 | We also need to provide a path to the plink binary used to generate LD calculations. This can be done through the `genetics.binaRies` package as with bcftools
590 | 
591 | 
592 | ``` r
593 | set_plink()
594 | #> Path not provided, using binaries in the MRCIEU/genetics.binaRies package
595 | ```
596 | 
597 | Now generate the tagging database
598 | 
599 | 
600 | ``` r
601 | dbfile <- tempfile()
602 | create_ldref_sqlite(ldfile, dbfile, tag_r2 = 0.05)
603 | #> identifying indels to remove
604 | #> calculating ld tags
605 | #> formatting
606 | #> creating sqlite db
607 | ```
608 | 
609 | Perform the query
610 | 
611 | 
612 | ``` r
613 | vcf <- query_gwas(vcffile, rsid="rs4442317", proxies="yes", dbfile=dbfile, tag_r2=0.05)
614 | #> Initial search...
615 | #> Extracted 0 out of 1 rsids
616 | #> Searching for proxies for 1 rsids
617 | #> Proxy lookup...
618 | #> Found 168 proxies
619 | #> Extrating proxies...
620 | #> Identified proxies for 1 of 1 rsids
621 | #> Aligning...
622 | vcf %>% vcf_to_granges()
623 | #> GRanges object with 1 range and 15 metadata columns:
624 | #>             seqnames    ranges strand |         REF         ALT      QUAL
625 | #>                <Rle> <IRanges>  <Rle> | <character> <character> <numeric>
626 | #>   rs4442317        1   1106784      * |           T           C        NA
627 | #>                  FILTER        ES        SE        LP        AF        SS        EZ
628 | #>             <character> <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
629 | #>   rs4442317        PASS    -4e-04    0.0066 0.0214999       0.9    233073        NA
630 | #>                    SI        NC          ID          PR          id
631 | #>             <numeric> <numeric> <character> <character> <character>
632 | #>   rs4442317        NA        NA  rs10907175  rs10907175     IEU-a-2
633 | #>   -------
634 | #>   seqinfo: 1 sequence from an unspecified genome; no seqlengths
635 | ```
636 | 
637 | 
638 | 
639 | ## Creating the VCF object from a data frame
640 | 
641 | If you have GWAS summary data in a text file or data frame, this can be converted to a VCF object.
642 | 
643 | 
644 | ``` r
645 | vcf <- readVcf(vcffile)
646 | vv <- vcf_to_granges(vcf) %>% dplyr::as_tibble()
647 | out <- vv %$% create_vcf(chrom=seqnames, pos=start, nea=REF, ea=ALT, snp=ID, ea_af=AF, effect=ES, se=SE, pval=10^-LP, n=SS, name="a")
648 | out
649 | #> class: CollapsedVCF 
650 | #> dim: 92 1 
651 | #> rowRanges(vcf):
652 | #>   GRanges with 4 metadata columns: REF, ALT, QUAL, FILTER
653 | #> info(vcf):
654 | #>   DataFrame with 0 columns: 
655 | #> geno(vcf):
656 | #>   List of length 6: AF, ES, SE, LP, SS, ID
657 | #> geno(header(vcf)):
658 | #>       Number Type   Description                                            
659 | #>    AF A      Float  Alternate allele frequency in the association study    
660 | #>    ES A      Float  Effect size estimate relative to the alternative allele
661 | #>    SE A      Float  Standard error of effect size estimate                 
662 | #>    LP A      Float  -log10 p-value for effect estimate                     
663 | #>    SS A      Float  Sample size used to estimate genetic effect            
664 | #>    ID A      String Study variant identifier
665 | ```
666 | 
667 | It's possible to write the vcf file:
668 | 
669 | 
670 | ``` r
671 | writeVcf(out, file="temp.vcf")
672 | ```
673 | 
674 | You may want to first harmonise the data so that all the non-effect alleles are aligned to the human genome reference. See the [gwasglue](https://github.com/MRCIEU/gwasglue) package for some functions to do this.
675 | 
676 | ## Creating a gwasglue2 SummarySet object from a vcf file
677 | 
678 | Although still under development, the [gwasglue2](https://mrcieu.github.io/gwasglue2/) package has several new features compared with its predecessor, including the use of S4 R objects.
679 | 
680 | It is possible to create a `SummarySet` object from a GWAS-VCF file or VCF object e.g. output from `VariantAnnotation::readVcf()`, `create_vcf()` or `query_gwas()` using the `gwasvcf_to_summaryset()` function.
681 | 
682 | For example:
683 | 
684 | 
685 | ``` r
686 | summaryset <- readVcf(vcffile) %>% 
687 |               gwasvcf_to_summaryset()
688 | ```
689 | 
690 | Once the `SummarySet` objects are created, it is possible to use `gwasglue2` to harmonise data, harmonise against a LD matrix, remap genomic coordinates to a different genome assembly, convert to other formats and more.
691 | 


--------------------------------------------------------------------------------
/vignettes/guide.Rmd.orig:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Reading, querying and writing GWAS summary data in VCF format"
  3 | output: rmarkdown::html_vignette
  4 | vignette: >
  5 |   %\VignetteIndexEntry{Reading, querying and writing GWAS summary data in VCF format}
  6 |   %\VignetteEngine{knitr::rmarkdown}
  7 |   %\VignetteEncoding{UTF-8}
  8 | ---
  9 | 
 10 | ```{r, include = FALSE}
 11 | knitr::opts_chunk$set(
 12 |   collapse = TRUE,
 13 |   comment = "#>"
 14 | )
 15 | ```
 16 | 
 17 | We developed a format for storing and harmonising GWAS summary data known as [GWAS VCF format](https://github.com/MRCIEU/gwas-vcf-specification). This format is very fast when querying chromosome and position ranges, and it handles multiallelic variants and indels.
 18 | 
 19 | All the data in the [IEU GWAS database](https://gwas.mrcieu.ac.uk/) is available for download in the GWAS VCF format. This R package provides fast and convenient functions for querying and creating GWAS summary data in GWAS VCF format. The package builds on the [VariantAnnotation](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html) Bioconductor package, which itself is based on the widely used [SummarizedExperiment](https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html) Bioconductor package.
 20 | 
 21 | 
 22 | ## External tools
 23 | 
 24 | For some VCF querying functions it is faster to optionally use [bcftools](https://samtools.github.io/bcftools/bcftools.html), and when available the R package will use that strategy. To set a location for the bcftools binary, use
 25 | 
 26 | ```r
 27 | library(gwasvcf)
 28 | set_bcftools('/path/to/bcftools')
 29 | ```
 30 | 
 31 | Note that there is no bcftools binary for Windows available, so some querying options will be slower on Windows.
 32 | 
 33 | For LD related functions the package uses [plink 1.90](https://www.cog-genomics.org/plink/1.9). You can specify the location of your plink installation by running
 34 | 
 35 | ```r
 36 | set_plink('/path/to/plink')
 37 | ```
 38 | 
 39 | Alternatively you can automatically use the binaries bundled here: https://github.com/mrcieu/genetics.binaRies
 40 | 
 41 | ```r
 42 | remotes::install_github('mrcieu/genetics.binaRies')
 43 | set_plink()
 44 | set_bcftools()
 45 | ```
 46 | 
 47 | To unset a path:
 48 | 
 49 | ```r
 50 | set_plink(NULL)
 51 | set_bcftools(NULL)
 52 | ```
 53 | 
 54 | For this vignette we will use the bundled binaries in `genetics.binaRies`.
 55 | 
 56 | ```{r}
 57 | suppressWarnings(suppressPackageStartupMessages({
 58 |   library(gwasvcf)
 59 |   library(VariantAnnotation)
 60 |   library(dplyr)
 61 |   library(magrittr)
 62 | }))
 63 | ```
 64 | ```{r eval=Sys.info()["sysname"] != "Windows"}
 65 | set_bcftools()
 66 | ```
 67 | 
 68 | ## Reading in everything
 69 | 
 70 | To read an entire dataset use the `readVcf` function. As an example we'll use the bundled data which is a small subset of the Speliotes et al 2010 BMI GWAS.
 71 | 
 72 | ```{r}
 73 | vcffile <- system.file("extdata", "data.vcf.gz", package="gwasvcf")
 74 | vcf <- readVcf(vcffile)
 75 | class(vcf)
 76 | ```
 77 | 
 78 | Please refer to the `VariantAnnotation` package documentation for full details about the `CollapsedVCF` object. A brief summary follows.
 79 | 
 80 | General info about the dataset can be obtained by calling it:
 81 | 
 82 | ```{r}
 83 | vcf
 84 | ```
 85 | 
 86 | There are 92 rows and 1 column which means 92 SNPs and one GWAS. See the header information:
 87 | 
 88 | ```{r}
 89 | header(vcf)
 90 | ```
 91 | 
 92 | See the names of the GWAS datasets (in this case just one, and it refers to the IEU GWAS database ID name):
 93 | 
 94 | ```{r}
 95 | samples(header(vcf))
 96 | ```
 97 | 
 98 | In this case you can obtain information about this study through the `ieugwasr` package e.g. `ieugwasr::gwasinfo("IEU-a-2")`.
 99 | 
100 | There are a few components within the object:
101 | 
102 | - `header` which has the meta data describing the dataset, including the association result variables
103 | - `rowRanges` which is information about each variant
104 | - `info` which is further metadata about each variant
105 | - `geno` which is the actual association results for each GWAS
106 | 
107 | The `rowRanges` object is a `GenomicRanges` class, which is useful for performing fast operations on chromosome position information.
108 | 
109 | ```{r}
110 | rowRanges(vcf)
111 | ```
112 | 
113 | ## Converting to simple dataframes
114 | 
115 | The VCF object is somewhat complex and you can read more about it in the [VariantAnnotation package documentation](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html). You can create various other formats that might be easier to use from it. For example, create a `GRanges` object which is great for fast chromosome-position operations
116 | 
117 | ```{r}
118 | vcf_to_granges(vcf)
119 | ```
120 | 
121 | Create a data frame:
122 | 
123 | ```{r}
124 | vcf_to_granges(vcf) %>% dplyr::as_tibble()
125 | ```
126 | 
127 | The direct conversion to formats for tools such as TwoSampleMR, coloc, and many others can also be made using the [https://github.com/mrcieu/gwasglue](https://github.com/mrcieu/gwasglue) R package.
128 | 
129 | ## Reading in with filters
130 | 
131 | The `query_gwas()` function takes either a filename to a vcf file, or vcf object as the main argument. You can then query on `rsid`, `pval` or `chrompos`. For example
132 | 
133 | ```{r}
134 | vcfsubset <- query_gwas(vcffile, chrompos=c("1:1097291-1099437"))
135 | ```
136 | 
137 | and
138 | 
139 | ```{r}
140 | vcf <- readVcf(vcffile)
141 | vcfsubset <- query_gwas(vcf, chrompos=c("1:1097291-1099437"))
142 | ```
143 | 
144 | give identical results, but the former saves time and memory because it queries the file using an index and only reads in what is required.
145 | 
146 | Examples of other filters are here:
147 | 
148 | ```{r}
149 | vcf <- query_gwas(vcffile, rsid=c("rs3128126", "rs3121561", "rs3813193"))
150 | vcf
151 | ```
152 | 
153 | ```{r}
154 | vcf <- query_gwas(vcffile, pval=0.5)
155 | vcf
156 | ```
157 | 
158 | ```{r}
159 | vcf <- query_gwas(vcffile, chrompos=c("1:1097291-1099437"))
160 | vcf
161 | ```
162 | 
163 | It's possible to chain filters together e.g.
164 | 
165 | ```{r}
166 | vcf <- query_gwas(vcffile, rsid=c("rs3128126", "rs3121561", "rs3813193")) %>%
167 |     query_gwas(pval=0.5)
168 | vcf
169 | ```
170 | 
171 | It's possible to have multiple GWAS studies per vcf. You can specify specific GWAS studies to read in using e.g.
172 | 
173 | ```{r}
174 | vcf <- query_gwas(vcffile, rsid=c("rs3128126", "rs3121561", "rs3813193"), id="IEU-a-2")
175 | ```
176 | 
177 | Note that querying by chrompos is the fastest way to deal with VCFs, use this over rsid where possible when speed is an issue.
178 | 
179 | ## Indexing rsid values
180 | 
181 | Querying by rsid is slow. If a large number of queries by rsid are to be performed then it could be worth generating an index which would speed up the querying. This approach uses [SQLite](https://www.sqlite.org/index.html) to create a local database, linking rsid to chromosome and position. It strips out the 'rs' from the rs identifiers to enable fast searches by integer. The concept is based on that developed here: [bioforensics/rsidx](https://github.com/bioforensics/rsidx).
182 | 
183 | To create the index:
184 | 
185 | ```{r eval=Sys.info()["sysname"] != "Windows"}
186 | create_rsidx_index_from_vcf(vcffile, "index.rsidx")
187 | ```
188 | 
189 | To query using the index:
190 | 
191 | ```{r eval=Sys.info()["sysname"] != "Windows"}
192 | vcf <- query_gwas(vcffile, rsid=c("rs3128126", "rs3121561", "rs3813193"), rsidx="index.rsidx")
193 | ```
194 | 
195 | ## Indexing p-values
196 | 
197 | Querying by p-value is slow. It could be worth generating an index file for p-values to speed this up. Similar to rsid queries, it uses an sqlite database linking -log10 pvalues to chromosome and position. 
198 | 
199 | To create the index:
200 | 
201 | ```{r eval=Sys.info()["sysname"] != "Windows"}
202 | create_pval_index_from_vcf(vcffile, maximum_pval=0.05, "index.pvali")
203 | ```
204 | 
205 | To query using the index:
206 | 
207 | ```{r eval=Sys.info()["sysname"] != "Windows"}
208 | vcf <- query_gwas(vcffile, pval=0.05, pvali="index.pvali")
209 | ```
210 | 
211 | ## A note about chrompos
212 | 
213 | The fastest way to query VCFs is by specifying chromosome and position. You can specify individual positions or ranges, e.g.
214 | 
215 | ```{r}
216 | cp <- c("1:10000", "2:10000-20000")
217 | ```
218 | 
219 | or as a data frame
220 | 
221 | ```{r}
222 | cp <- dplyr::tibble(chrom=c(1,2), start=c(10000,10000), end=c(10000, 20000))
223 | ```
224 | 
225 | You can check what will be parsed out with:
226 | 
227 | ```{r}
228 | parse_chrompos(cp)
229 | ```
230 | 
231 | Querying by p-value or rsid is also possible but is slower as only chrompos is indexed. On Mac and Linux, rsid and p-value queries are performed by calls to bcftools. On Windows it uses VariantAnnotation directly, because bcftools binaries are not available. This is unfortunately somewhat slower. If many operations are being performed it might be faster to read in the whole dataset and perform queries that way.
232 | 
233 | ## LD proxies
234 | 
235 | If a set of rsids are requested from a vcf but some are absent, a reference panel can be used to search for LD proxies, extract them, and align the effects and alleles against the original variants that were requested.
236 | 
237 | There are two ways to perform the LD proxy search:
238 | 
239 | - using a set of genotyped samples as an LD reference panel (e.g. 1000 genomes data) - this is slow but relatively convenient
240 | - compiling an LD tag list from an LD reference panel - once generated this is very fast
241 | 
242 | ### Using an LD reference panel
243 | 
244 | An LD reference panel can be obtained from here: [http://fileserve.mrcieu.ac.uk/ld/data_maf0.01_rs_ref.tgz](http://fileserve.mrcieu.ac.uk/ld/data_maf0.01_rs_ref.tgz). This dataset comprises Europeans from the 1000 genomes project, in plink format, and including only SNPs with MAF > 0.01, and with the reference alleles aligned to the human genome reference sequence. For this vignette we can use a small subset of that dataset:
245 | 
246 | ```{r}
247 | ldfile <- system.file("extdata", "eur.bed", package="gwasvcf") %>% 
248 |   gsub(".bed", "", .)
249 | ```
250 | 
251 | We also need to provide a path to the plink binary used to generate LD calculations. This can be done through the `genetics.binaRies` package as with bcftools
252 | 
253 | ```{r}
254 | set_plink()
255 | ```
256 | 
257 | The rs4442317 variant is not present in the vcf file, i.e. if we query that variant:
258 | 
259 | ```{r}
260 | query_gwas(vcffile, rsid="rs4442317") %>% nrow
261 | ```
262 | 
263 | ```{r eval=Sys.info()["sysname"] != "Windows"}
264 | vcf <- query_gwas(vcffile, rsid="rs4442317", proxies="yes", bfile=ldfile, tag_r2=0.05)
265 | vcf %>% vcf_to_granges()
266 | ```
267 | 
268 | Here we see that the proxy variant is `r vcf_to_granges(vcf)$PR`.
269 | 
270 | You may also extract only the best available proxies even if the requested rsids are present, by using `proxies="only"`. An example of this shows that the effect size estimates for the proxy variants are aligned to the effect alleles of the target variants:
271 | 
272 | 
273 | ```{r target-effects-plot, eval=Sys.info()["sysname"] != "Windows", fig.alt="Plot of the target data effects against the proxy data effects", fig.cap="Plot the target data effects against the proxy data effects"}
274 | # Read vcf
275 | a <- readVcf(vcffile)
276 | 
277 | # Obtain the best LD proxy for each of the rsids
278 | b <- query_gwas(vcffile, rsid=names(a), proxies="only", bfile=ldfile, tag_r2=0.6)
279 | 
280 | # Match the target data to the proxy data
281 | index <- match(names(b), names(a))
282 | 
283 | # Plot the target data effects against the proxy data effects
284 | plot(vcf_to_granges(b)$ES, vcf_to_granges(a)$ES[index])
285 | ```
286 | 
287 | ### Compiling a list of tagging variants
288 | 
289 | Using the LD reference panel described above, it is possible to create a sqlite tag reference panel using the following commands. First get an example LD reference panel:
290 | 
291 | ```{r}
292 | ldfile <- system.file("extdata", "eur.bed", package="gwasvcf") %>% 
293 |   sub("\\.bed$", "", .)  # strip only the literal trailing ".bed" to get the plink file prefix
294 | ```
295 | 
296 | We also need to provide a path to the plink binary used to generate LD calculations. This can be done through the `genetics.binaRies` package, as with bcftools.
297 | 
298 | ```{r}
299 | set_plink()
300 | ```
301 | 
302 | Now generate the tagging database
303 | 
304 | ```{r eval=Sys.info()["sysname"] != "Windows"}
305 | dbfile <- tempfile()
306 | create_ldref_sqlite(ldfile, dbfile, tag_r2 = 0.05)
307 | ```
308 | 
309 | Perform the query
310 | 
311 | ```{r eval=Sys.info()["sysname"] != "Windows"}
312 | vcf <- query_gwas(vcffile, rsid="rs4442317", proxies="yes", dbfile=dbfile, tag_r2=0.05)
313 | vcf %>% vcf_to_granges()
314 | ```
315 | 
316 | ```{r, echo=FALSE, eval=Sys.info()["sysname"] != "Windows"}
317 | unlink(dbfile)
318 | ```
319 | 
320 | ## Creating the VCF object from a data frame
321 | 
322 | If you have GWAS summary data in a text file or data frame, this can be converted to a VCF object.
323 | 
324 | ```{r}
325 | vcf <- readVcf(vcffile)
326 | vv <- vcf_to_granges(vcf) %>% dplyr::as_tibble()
327 | out <- vv %$% create_vcf(chrom=seqnames, pos=start, nea=REF, ea=ALT, snp=ID, ea_af=AF, effect=ES, se=SE, pval=10^-LP, n=SS, name="a")
328 | out
329 | ```
330 | 
331 | It's possible to write the vcf file:
332 | 
333 | ```{r, eval=FALSE}
334 | writeVcf(out, file="temp.vcf")
335 | ```
336 | 
337 | You may want to first harmonise the data so that all the non-effect alleles are aligned to the human genome reference. See the [gwasglue](https://github.com/MRCIEU/gwasglue) package for some functions to do this.
338 | 
339 | ## Creating a gwasglue2 SummarySet object from a vcf file
340 | 
341 | Although still under development, the [gwasglue2](https://mrcieu.github.io/gwasglue2/) package has several new features compared with its predecessor, including the use of S4 R objects.
342 | 
343 | It is possible to create a `SummarySet` object from a GWAS-VCF file or a VCF object (e.g. the output of `VariantAnnotation::readVcf()`, `create_vcf()` or `query_gwas()`) using the `gwasvcf_to_summaryset()` function.
344 | 
345 | For example:
346 | 
347 | ```{r, eval=FALSE}
348 | summaryset <- readVcf(vcffile) %>% 
349 |               gwasvcf_to_summaryset()
350 | ```
351 | 
352 | Once the `SummarySet` objects are created, it is possible to use `gwasglue2` to harmonise data, harmonise against an LD matrix, remap genomic coordinates to a different genome assembly, convert to other formats and more.
353 | 


--------------------------------------------------------------------------------
/vignettes/precompile.R:
--------------------------------------------------------------------------------
1 | # Execute the code from the vignette: knit the source document
2 | # (guide.Rmd.orig) into guide.Rmd. All paths are relative, so run this
3 | # script from the directory that contains vignettes/.
4 | knitr::knit("vignettes/guide.Rmd.orig", output = "vignettes/guide.Rmd")
5 | 
6 | # Move the generated figure next to the knitted vignette. file.rename()
7 | # does not create missing directories, so make sure the target exists first.
8 | dir.create("vignettes/figure", showWarnings = FALSE, recursive = TRUE)
9 | file.rename("figure/target-effects-plot-1.png", "vignettes/figure/target-effects-plot-1.png")
10 | 


--------------------------------------------------------------------------------