├── .Rbuildignore ├── .devcontainer └── devcontainer.json ├── .gitattributes ├── .github ├── .gitignore └── workflows │ ├── R-CMD-check.yaml │ └── pkgdown.yaml ├── .gitignore ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── NEWS.md ├── R ├── binaries.r ├── gwasglue.R ├── manipulate.r ├── proxy.r ├── pval_index.r ├── query.r ├── rsid_index.r ├── utils-pipe.R └── zzz.r ├── README.md ├── _pkgdown.yml ├── gwasvcf.Rproj ├── inst ├── extdata │ ├── data.vcf.gz │ ├── data.vcf.gz.tbi │ ├── eur.bed │ ├── eur.bim │ └── eur.fam └── sandpit │ ├── bmi_example.r │ ├── bmi_example_cp.r │ ├── harmonise_against_ref.r │ ├── misc │ ├── create_ref.sh │ ├── harmonise.r │ ├── harmonise_against_ref.r │ ├── query_times.html │ ├── query_times.rmd │ ├── skeleton.sh │ ├── vcf.html │ └── vcf.rmd │ └── test_extract.r ├── man ├── VariantAnnotation.Rd ├── check_bcftools.Rd ├── check_plink.Rd ├── create_ldref_sqlite.Rd ├── create_pval_index_from_vcf.Rd ├── create_rsidx_index_from_vcf.Rd ├── create_rsidx_sub_index.Rd ├── create_vcf.Rd ├── get_ld_proxies.Rd ├── gwasvcf_to_summaryset.Rd ├── merge_vcf.Rd ├── parse_chrompos.Rd ├── pipe.Rd ├── proxy_match.Rd ├── query_chrompos_bcftools.Rd ├── query_chrompos_file.Rd ├── query_chrompos_vcf.Rd ├── query_gwas.Rd ├── query_pval_bcftools.Rd ├── query_pval_file.Rd ├── query_pval_sqlite3.Rd ├── query_pval_vcf.Rd ├── query_pvali.Rd ├── query_rsid_bcftools.Rd ├── query_rsid_file.Rd ├── query_rsid_rsidx.Rd ├── query_rsid_vcf.Rd ├── query_rsidx.Rd ├── set_bcftools.Rd ├── set_plink.Rd ├── sqlite_ld_proxies.Rd ├── vcf_to_granges.Rd ├── vcf_to_tibble.Rd └── vcflist_overlaps.Rd ├── tests ├── testthat.R └── testthat │ ├── test_manipulate.r │ ├── test_proxy.R │ ├── test_pvali.r │ ├── test_query.r │ └── test_rsidx.r └── vignettes ├── figure └── target-effects-plot-1.png ├── guide.Rmd ├── guide.Rmd.orig └── precompile.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^LICENSE\.md$ 2 | ^mrbase\.oauth$ 3 | 
^ieugwasr_oauth$ 4 | .travis.yml 5 | ^_pkgdown\.yml$ 6 | ^docs$ 7 | ^pkgdown$ 8 | ^\.github$ 9 | ^.*\.Rproj$ 10 | ^\.Rproj\.user$ 11 | ^Dockerfile$ 12 | ^\.devcontainer$ 13 | 14 | # Files generated by tests 15 | inst/extdata/eur.indels 16 | inst/extdata/eur.log 17 | inst/extdata/eur.nosex 18 | tests/testthat/temp.vcf 19 | 20 | # Files created by vignettes 21 | ^vignettes/index\.rsidx$ 22 | ^vignettes/temp\.vcf$ 23 | ^vignettes/index\.pvali$ 24 | 25 | ^vignettes/precompile\.R$ 26 | ^vignettes/figure$ 27 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the README at: 2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.245.2/containers/docker-existing-dockerfile 3 | { 4 | "name": "Existing Dockerfile", 5 | 6 | // Sets the run context to one level up instead of the .devcontainer folder. 7 | "context": "..", 8 | 9 | // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename. 10 | "dockerFile": "../Dockerfile" 11 | 12 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 13 | // "forwardPorts": [], 14 | 15 | // Uncomment the next line to run commands after the container is created - for example installing curl. 16 | // "postCreateCommand": "apt-get update && apt-get install -y curl", 17 | 18 | // Uncomment when using a ptrace-based debugger like C++, Go, and Rust 19 | // "runArgs": [ "--cap-add=SYS_PTRACE", "--security-opt", "seccomp=unconfined" ], 20 | 21 | // Uncomment to use the Docker CLI from inside the container. See https://aka.ms/vscode-remote/samples/docker-from-docker. 22 | // "mounts": [ "source=/var/run/docker.sock,target=/var/run/docker.sock,type=bind" ], 23 | 24 | // Uncomment to connect as a non-root user if you've added one. 
See https://aka.ms/vscode-remote/containers/non-root. 25 | // "remoteUser": "vscode" 26 | } 27 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Settings for Linguist Languages pane 5 | *.html linguist-vendored 6 | *.css linguist-vendored 7 | *.js linguist-vendored 8 | docs/* linguist-vendored 9 | *.rdb linguist-vendored 10 | *.rdx linguist-vendored 11 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | # 4 | # NOTE: This workflow is overkill for most R packages and 5 | # check-standard.yaml is likely a better choice. 6 | # usethis::use_github_action("check-standard") will install it. 
7 | on: 8 | push: 9 | branches: [main, master] 10 | pull_request: 11 | 12 | name: R-CMD-check.yaml 13 | 14 | permissions: read-all 15 | 16 | jobs: 17 | R-CMD-check: 18 | runs-on: ${{ matrix.config.os }} 19 | 20 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 21 | 22 | strategy: 23 | fail-fast: false 24 | matrix: 25 | config: 26 | - {os: macos-latest, r: 'release'} 27 | 28 | - {os: windows-latest, r: 'release'} 29 | # use 4.0 or 4.1 to check with rtools40's older compiler 30 | - {os: windows-latest, r: 'oldrel-4'} 31 | 32 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 33 | - {os: ubuntu-latest, r: 'release'} 34 | - {os: ubuntu-latest, r: 'oldrel-1'} 35 | - {os: ubuntu-latest, r: 'oldrel-2'} 36 | - {os: ubuntu-latest, r: 'oldrel-3'} 37 | - {os: ubuntu-latest, r: 'oldrel-4'} 38 | - {os: ubuntu-latest, r: 'oldrel-5'} 39 | 40 | env: 41 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 42 | R_KEEP_PKG_SOURCE: yes 43 | 44 | steps: 45 | - uses: actions/checkout@v4 46 | 47 | - uses: r-lib/actions/setup-pandoc@v2 48 | 49 | - uses: r-lib/actions/setup-r@v2 50 | with: 51 | r-version: ${{ matrix.config.r }} 52 | http-user-agent: ${{ matrix.config.http-user-agent }} 53 | use-public-rspm: true 54 | 55 | - uses: r-lib/actions/setup-r-dependencies@v2 56 | with: 57 | extra-packages: any::rcmdcheck 58 | needs: check 59 | 60 | - uses: r-lib/actions/check-r-package@v2 61 | with: 62 | upload-snapshots: true 63 | build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")' 64 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | release: 8 | types: [published] 9 | workflow_dispatch: 10 | 11 | name: pkgdown.yaml 12 | 13 | permissions: read-all 14 | 15 | jobs: 16 | pkgdown: 17 | runs-on: ubuntu-latest 18 | # Only restrict concurrency for non-PR jobs 19 | concurrency: 20 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 21 | env: 22 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 23 | permissions: 24 | contents: write 25 | steps: 26 | - uses: actions/checkout@v4 27 | 28 | - uses: r-lib/actions/setup-pandoc@v2 29 | 30 | - uses: r-lib/actions/setup-r@v2 31 | with: 32 | use-public-rspm: true 33 | 34 | - uses: r-lib/actions/setup-r-dependencies@v2 35 | with: 36 | extra-packages: any::pkgdown, local::. 37 | needs: website 38 | 39 | - name: Build site 40 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 41 | shell: Rscript {0} 42 | 43 | - name: Deploy to GitHub pages 🚀 44 | if: github.event_name != 'pull_request' 45 | uses: JamesIves/github-pages-deploy-action@v4.5.0 46 | with: 47 | clean: false 48 | branch: gh-pages 49 | folder: docs 50 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | inst/doc 5 | doc 6 | Meta 7 | 8 | # Files generated by tests 9 | inst/extdata/eur.indels 10 | inst/extdata/eur.log 11 | inst/extdata/eur.nosex 12 | tests/testthat/temp.vcf 13 | 14 | # Files created by vignettes 15 | vignettes/index.rsidx 16 | vignettes/temp.vcf 17 | vignettes/index.pvali 18 | 19 | docs/ 20 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: gwasvcf 2 | Title: Tools for Dealing with GWAS Summary Data in VCF Format 3 | 
Version: 0.1.4 4 | Authors@R: c( 5 | person("Gibran", "Hemani", , "g.hemani@bristol.ac.uk", role = c("aut", "cre"), 6 | comment = c(ORCID = "0000-0003-0920-1055")), 7 | person("Tom", "Palmer", , "tom.palmer@bristol.ac.uk", role = "ctb", 8 | comment = c(ORCID = "0000-0003-4655-4511")), 9 | person("Rita", "Rasteiro", , "rita.rasteiro@bristol.ac.uk", role = "ctb", 10 | comment = c(ORCID = "0000-0002-4217-3060")) 11 | ) 12 | Description: Tools for dealing with GWAS summary data in VCF format. 13 | Includes reading, querying, writing, as well as helper functions such 14 | as LD proxy searches. 15 | License: MIT + file LICENSE 16 | URL: https://github.com/mrcieu/gwasvcf, https://mrcieu.github.io/gwasvcf/ 17 | BugReports: https://github.com/mrcieu/gwasvcf/issues 18 | Depends: 19 | R (>= 4.0.0) 20 | Imports: 21 | BiocGenerics, 22 | Biostrings, 23 | data.table, 24 | dplyr, 25 | genetics.binaRies, 26 | GenomeInfoDb, 27 | GenomicRanges, 28 | gwasglue2, 29 | IRanges, 30 | magrittr, 31 | RCurl, 32 | rlang, 33 | Rsamtools, 34 | RSQLite, 35 | S4Vectors, 36 | stringr, 37 | SummarizedExperiment, 38 | utils, 39 | VariantAnnotation 40 | Suggests: 41 | knitr, 42 | rmarkdown, 43 | testthat 44 | VignetteBuilder: 45 | knitr 46 | Remotes: 47 | github::mrcieu/genetics.binaRies, 48 | github::mrcieu/gwasglue2 49 | Encoding: UTF-8 50 | Roxygen: list(markdown = TRUE) 51 | RoxygenNote: 7.3.2 52 | SystemRequirements: GNU unzip, sqlite3 53 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2019 2 | COPYRIGHT HOLDER: Gibran Hemani 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2019 Gibran Hemani 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this 
software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export("%>%") 4 | export(check_bcftools) 5 | export(check_plink) 6 | export(create_ldref_sqlite) 7 | export(create_pval_index_from_vcf) 8 | export(create_rsidx_index_from_vcf) 9 | export(create_rsidx_sub_index) 10 | export(create_vcf) 11 | export(get_ld_proxies) 12 | export(gwasvcf_to_summaryset) 13 | export(merge_vcf) 14 | export(parse_chrompos) 15 | export(proxy_match) 16 | export(query_chrompos_bcftools) 17 | export(query_chrompos_file) 18 | export(query_chrompos_vcf) 19 | export(query_gwas) 20 | export(query_pval_bcftools) 21 | export(query_pval_file) 22 | export(query_pval_sqlite3) 23 | export(query_pval_vcf) 24 | export(query_pvali) 25 | export(query_rsid_bcftools) 26 | export(query_rsid_file) 27 | export(query_rsid_rsidx) 28 | export(query_rsid_vcf) 29 | export(query_rsidx) 30 | export(set_bcftools) 31 | export(set_plink) 32 | export(sqlite_ld_proxies) 33 | export(vcf_to_granges) 34 | export(vcf_to_tibble) 35 | export(vcflist_overlaps) 36 | import(VariantAnnotation) 37 | importFrom(magrittr,"%>%") 38 | importFrom(rlang,.data) 39 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # gwasvcf 0.1.4 2 | 3 | * Add sqlite3 to DESCRIPTION SystemRequirements for `create_pval_index_from_vcf()` 4 | * Update some URLs within the package documentation 5 | 6 | # gwasvcf 0.1.3 7 | 8 | * Fix for security message in `get_ld_proxies()` (thanks @mattlee821) 9 | 10 | # gwasvcf 0.1.2 11 | 12 | * New `gwasvcf_to_summaryset()` function to create a [gwasglue2](https://mrcieu.github.io/gwasglue2/) SummarySet object from a vcf file 13 | * Fixed error in `get_ld_proxies()` related with argument `validate`, deprecated in 
# Internal helper shared by check_bcftools() and check_plink().
# Returns TRUE if `binary` is an existing file, or the name of a file found
# inside one of the directories listed in the PATH environment variable.
# PATH is split on .Platform$path.sep rather than a hard-coded ":" so the
# lookup also works on platforms that use a different separator.
.tool_binary_exists <- function(binary)
{
  if(any(file.exists(binary)))
  {
    return(TRUE)
  }
  path_dirs <- strsplit(Sys.getenv("PATH"), split=.Platform$path.sep, fixed=TRUE)[[1]]
  any(file.exists(file.path(path_dirs, binary)))
}

#' Check if the tools_bcftools option is set
#'
#' See set_bcftools() for more information. The option is considered usable
#' when it is either the path of an existing file or the name of a file that
#' exists in one of the directories on the `PATH`.
#'
#' @export
#' @return TRUE if the option is set and resolves to an existing file,
#'   otherwise FALSE (with a message explaining why)
check_bcftools <- function()
{
  bcftools <- getOption("tools_bcftools")
  if(is.null(bcftools))
  {
    message("'tools_bcftools' option is not set, using native read which may be substantially slower. See 'set_bcftools' for information.")
    return(FALSE)
  }
  if(.tool_binary_exists(bcftools))
  {
    return(TRUE)
  }
  message("'tools_bcftools' option does not point to an existing file, using native read which may be substantially slower. See 'set_bcftools' for information.")
  return(FALSE)
}


#' Check if the tools_plink option is set
#'
#' See set_plink() for more information. The option is considered usable
#' when it is either the path of an existing file or the name of a file that
#' exists in one of the directories on the `PATH`.
#'
#' @export
#' @return TRUE if the option is set and resolves to an existing file,
#'   otherwise FALSE (with a message explaining why)
check_plink <- function()
{
  plink <- getOption("tools_plink")
  if(is.null(plink))
  {
    message("'tools_plink' option is not set. See 'set_plink' for information.")
    return(FALSE)
  }
  if(.tool_binary_exists(plink))
  {
    return(TRUE)
  }
  # The option IS set here, it just does not resolve to a file, so say that
  # instead of repeating the "not set" message.
  message("'tools_plink' option does not point to an existing file. See 'set_plink' for information.")
  return(FALSE)
}

#' Set bcftools binary location
#'
#' Called for its side effect of setting the 'tools_bcftools' option.
#'
#' @param path If "" (default), then will use the MRCIEU/genetics.binaRies to get binaries that are appropriate for the detected operating system. Otherwise, provide the path to the bcftools binary. If NULL then will set the option to NULL.
#'
#' @export
#' @return NULL, sets option 'tools_bcftools'
set_bcftools <- function(path="")
{
  if(is.null(path))
  {
    options(tools_bcftools = NULL)
  } else if(path == "")
  {
    # genetics.binaRies is only needed when no explicit path is given
    a <- requireNamespace("genetics.binaRies")
    if(a)
    {
      message("Path not provided, using binaries in the MRCIEU/genetics.binaRies package")
      options(tools_bcftools = genetics.binaRies::get_bcftools_binary())
    } else {
      stop("Please provide a path to bcftools binary or run devtools::install_github('MRCIEU/genetics.binaRies')")
    }
  } else {
    options(tools_bcftools = path)
  }
}
#' Set plink binary location
#'
#' Stores the location of the plink binary in the 'tools_plink' option.
#'
#' @param path If "" (default), then will use the MRCIEU/genetics.binaRies to get binaries that are appropriate for the detected operating system. Otherwise, provide the path to the plink binary. If NULL then will set the option to NULL.
#'
#' @export
#' @return NULL, sets option 'tools_plink'
set_plink <- function(path="")
{
  if(!is.null(path))
  {
    if(path == "")
    {
      # No explicit location given: fall back to the bundled binary, which
      # requires the genetics.binaRies package to be installed.
      if(!requireNamespace("genetics.binaRies"))
      {
        stop("Please provide a path to plink binary or run devtools::install_github('MRCIEU/genetics.binaRies')")
      }
      message("Path not provided, using binaries in the MRCIEU/genetics.binaRies package")
      path <- genetics.binaRies::get_plink_binary()
    }
  }
  # A NULL path clears the option; anything else is stored as given.
  options(tools_plink = path)
}
#' Create a SummarySet
#'
#' Returns a gwasglue2 SummarySet object
#' @param vcf Path to GWAS-VCF file (character input is read with
#'   [VariantAnnotation::readVcf()]) or VCF object e.g. output from
#'   [VariantAnnotation::readVcf()], [create_vcf()] or [query_gwas()]
#' @export
gwasvcf_to_summaryset <- function(vcf){
  # The code below reads slots of a VCF object, so a file name has to be
  # loaded first for the documented "path" input to work.
  if(is.character(vcf)){
    vcf <- VariantAnnotation::readVcf(vcf)
  }

  # get metadata from vcf and create metadata object
  md <- gwasglue2::create_metadata(id = vcf@metadata$header@samples, build = unique(VariantAnnotation::meta(VariantAnnotation::header(vcf))$contig$assembly))

  # get summary data and create SummarySet
  s <- vcf %>%
    vcf_to_tibble() %>%
    gwasglue2::create_summaryset_from_gwasvcf(metadata = md)

  return(s)
}

# All functions require v1.0 of the GWAS-VCF specification available from https://github.com/MRCIEU/gwas-vcf-specification/releases/tag/1.0.0

#' Create GWAS vcf
#'
#' Builds a VCF object from vectors of per-variant summary statistics. Each
#' optional statistic that is supplied becomes one genotype field.
#'
#' @param chrom chrom vector
#' @param pos pos vector (must be the same length as `chrom`)
#' @param nea nea vector, stored as the REF allele
#' @param ea ea vector, stored as the ALT allele
#' @param snp Optional vector of variant identifiers, stored as `ID`. Defaults to `chrom:pos`
#' @param ea_af Optional vector, stored as `AF`
#' @param effect Optional vector, stored as `ES`
#' @param se Optional vector, stored as `SE`
#' @param pval Optional vector of p-values, stored as `LP` after `-log10` transformation
#' @param n Optional vector, stored as `SS`
#' @param ncase Optional vector, stored as `NC`
#' @param name Optional vector, used as the sample name in the header and column data
#'
#' @export
#' @return vcf object
create_vcf <- function(chrom, pos, nea, ea, snp=NULL, ea_af=NULL, effect=NULL, se=NULL, pval=NULL, n=NULL, ncase=NULL, name=NULL)
{
  stopifnot(length(chrom) == length(pos))
  if(is.null(snp))
  {
    snp <- paste0(chrom, ":", pos)
  }
  nsnp <- length(chrom)

  # One single-column matrix per supplied statistic
  gen <- list()
  if(!is.null(ea_af)) gen[["AF"]] <- matrix(ea_af, nsnp)
  if(!is.null(effect)) gen[["ES"]] <- matrix(effect, nsnp)
  if(!is.null(se)) gen[["SE"]] <- matrix(se, nsnp)
  if(!is.null(pval)) gen[["LP"]] <- matrix(-log10(pval), nsnp)
  if(!is.null(n)) gen[["SS"]] <- matrix(n, nsnp)
  if(!is.null(ncase)) gen[["NC"]] <- matrix(ncase, nsnp)
  if(!is.null(snp)) gen[["ID"]] <- matrix(snp, nsnp)
  gen <- S4Vectors::SimpleList(gen)

  # Range end is extended by the longer of the two alleles so indels span
  # more than one base
  gr <- GenomicRanges::GRanges(chrom, IRanges::IRanges(start=pos, end=pos + pmax(nchar(nea), nchar(ea)) - 1, names=snp))
  coldata <- S4Vectors::DataFrame(Samples = length(name), row.names=name)

  hdr <- VariantAnnotation::VCFHeader(
    header = IRanges::DataFrameList(
      fileformat = S4Vectors::DataFrame(Value="VCFv4.2", row.names="fileformat")
    ),
    sample = name
  )
  VariantAnnotation::geno(hdr) <- S4Vectors::DataFrame(
    Number = c("A", "A", "A", "A", "A", "A", "A"),
    Type = c("Float", "Float", "Float", "Float", "Float", "Float", "String"),
    Description = c(
      "Effect size estimate relative to the alternative allele",
      "Standard error of effect size estimate",
      "-log10 p-value for effect estimate",
      "Alternate allele frequency in the association study",
      "Sample size used to estimate genetic effect",
      "Number of cases used to estimate genetic effect",
      "Study variant identifier"
    ),
    row.names=c("ES", "SE", "LP", "AF", "SS", "NC", "ID")
  )
  # Keep header entries only for the fields that were actually supplied
  VariantAnnotation::geno(hdr) <- subset(VariantAnnotation::geno(hdr), rownames(VariantAnnotation::geno(hdr)) %in% names(gen))

  vcf <- VariantAnnotation::VCF(
    rowRanges = gr,
    colData = coldata,
    exptData = list(
      header = hdr
    ),
    geno = gen
  )
  VariantAnnotation::alt(vcf) <- Biostrings::DNAStringSetList(as.list(ea))
  VariantAnnotation::ref(vcf) <- Biostrings::DNAStringSet(nea)
  VariantAnnotation::fixed(vcf)$FILTER <- "PASS"
  return(sort(vcf))
}

#' Merge two GWAS VCF objects
#'
#' Returns merged intersection of two VCF objects
#'
#' @param a VCF object
#' @param b VCF object
#'
#' @export
#' @return SimpleList of VCF objects
#' @importFrom rlang .data
merge_vcf <- function(a, b)
{
  a <- VariantAnnotation::expand(a)
  b <- VariantAnnotation::expand(b)
  # o <- SummarizedExperiment::findOverlaps(a, b)
  # Match variants between the two objects by row name
  o <- dplyr::tibble(
    from = which(names(a) %in% names(b)),
    to = match(names(a)[.data$from], names(b))
  )
  a <- a[o[["from"]],]
  b <- b[o[["to"]],]
  allele_match <- VariantAnnotation::ref(a) == VariantAnnotation::ref(b) & VariantAnnotation::alt(a) == VariantAnnotation::alt(b)
  switch <- VariantAnnotation::ref(a) == VariantAnnotation::alt(b) & VariantAnnotation::ref(b) == VariantAnnotation::alt(a)
  if(any(switch))
  {
    # Where REF/ALT are swapped between a and b, flip the sign of b's effect
    for(i in seq_len(ncol(VariantAnnotation::geno(b)[["ES"]])))
    {
      VariantAnnotation::geno(b)[["ES"]][,i][switch] <- lapply(VariantAnnotation::geno(b)[["ES"]][,i][switch], function(x) x * -1)
    }
  }
  a <- a[allele_match | switch, ]
  b <- b[allele_match | switch, ]

  ab <- a
  temp <- lapply(names(VariantAnnotation::geno(ab)), function(x) rbind(VariantAnnotation::geno(ab)[x], VariantAnnotation::geno(b)[x])) %>% S4Vectors::SimpleList
  names(temp) <- names(VariantAnnotation::geno(ab))
  VariantAnnotation::geno(ab) <- temp

  h <- VariantAnnotation::header(a)
  out <- VCFHeader(
    reference = VariantAnnotation::reference(h),
    samples = c(VariantAnnotation::samples(h), VariantAnnotation::samples(VariantAnnotation::header(b))),
    meta = VariantAnnotation::meta(h)
  )

  # NOTE(review): `ab` (the object with combined genotype fields) is built
  # above but never used again; only the combined header is returned. This
  # does not match the documented return value - confirm the intended output
  # before relying on this function.
  return(S4Vectors::SimpleList(out))
}



#' Convert vcf format to granges format
#'
#' Expands the VCF, copies the genotype fields of one sample into the
#' metadata columns of the row ranges, and adds an `id` column. If the header
#' SAMPLE metadata has `TotalCases` / `TotalControls`, `NC` and `SS` are
#' filled from those values.
#'
#' @param vcf Output from readVcf
#' @param id Only accepts one ID, so specify here if there are multiple GWAS datasets in the vcf
#'
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#'
#' @export
#' @return GRanges object, or NULL (with a message) if the VCF has length 0
vcf_to_granges <- function(vcf, id=NULL)
{
  stopifnot(inherits(vcf, c("ExpandedVCF", "CollapsedVCF")))
  if(length(vcf) == 0)
  {
    message("VCF has length 0")
    return(NULL)
  }
  if(is.null(id))
  {
    id <- VariantAnnotation::samples(VariantAnnotation::header(vcf))
  }
  stopifnot(length(id) == 1)
  vcf <- VariantAnnotation::expand(vcf)
  a <- SummarizedExperiment::rowRanges(vcf)
  a$`REF` <- as.character(a$`REF`)
  a$`ALT` <- as.character(a$`ALT`)

  if(length(VariantAnnotation::geno(vcf)) == 0)
  {
    return(a)
  } else {
    # One column per genotype field, restricted to the requested sample
    out <- VariantAnnotation::expand(vcf) %>%
      VariantAnnotation::geno() %>%
      as.list() %>%
      lapply(function(x) unlist(x[,id,drop=TRUE])) %>%
      dplyr::bind_cols()
    S4Vectors::values(a) <- cbind(S4Vectors::values(a), out)
    S4Vectors::values(a)[["id"]] <- id

    if("TotalCases" %in% names(VariantAnnotation::meta(VariantAnnotation::header(vcf))$SAMPLE))
    {
      S4Vectors::values(a)[["NC"]] <- as.numeric(VariantAnnotation::meta(VariantAnnotation::header(vcf))$SAMPLE$TotalCases) %>% rep(length(a))
      S4Vectors::values(a)[["SS"]] <- as.numeric(VariantAnnotation::meta(VariantAnnotation::header(vcf))$SAMPLE$TotalCases) + as.numeric(VariantAnnotation::meta(VariantAnnotation::header(vcf))$SAMPLE$TotalControls) %>% rep(length(a))
    } else if("TotalControls" %in% names(VariantAnnotation::meta(VariantAnnotation::header(vcf))$SAMPLE)) {
      S4Vectors::values(a)[["SS"]] <- as.numeric(VariantAnnotation::meta(VariantAnnotation::header(vcf))$SAMPLE$TotalControls) %>% rep(length(a))
    }
    return(a)
  }
}


#' Convert vcf format to tibble (data frame)
#'
#' Runs [vcf_to_granges()] and converts the result to a tibble, adding the
#' range names as an `rsid` column.
#'
#' @param vcf Output from readVcf
#' @param id Only accepts one ID, so specify here if there are multiple GWAS datasets in the vcf
#'
#' @export
#' @return tibble (empty if the VCF has length 0)
vcf_to_tibble <- function(vcf, id=NULL)
{
  a <- vcf_to_granges(vcf, id)
  if(is.null(a))
  {
    return(dplyr::tibble())
  }
  S4Vectors::values(a)[["rsid"]] <- names(a)
  return(dplyr::as_tibble(a, .name_repair = "minimal"))
}


#' Reduce list of VCFs to intersecting regions
#'
#' @param vcflist List of VCF objects, or list of VCF filenames, or mix of VCF objects and filenames
#' @param chrompos Either vector of chromosome and position ranges e.g. "1:1000" or "1:1000-2000", or data frame with columns `chrom`, `start`, `end`. If NULL (default) the VCFs are not subset by region before intersecting.
#'
#' @export
#' @return List of VCFs
vcflist_overlaps <- function(vcflist, chrompos=NULL)
{
  stopifnot(is.list(vcflist))
  if(!is.null(chrompos))
  {
    chrompos <- parse_chrompos(chrompos)
  }

  # seq_along() (rather than 1:length()) so an empty list yields an empty
  # result instead of a subscript error
  loaded <- lapply(seq_along(vcflist), function(i)
  {
    x <- vcflist[[i]]
    if(!is.null(chrompos))
    {
      return(query_gwas(x, chrompos))
    }
    if(inherits(x, c("CollapsedVCF", "ExpandedVCF")))
    {
      return(x)
    }
    if(is.character(x))
    {
      return(VariantAnnotation::readVcf(x))
    }
    stop("Item ", i, " in vcflist is neither VCF object nor path to VCF file")
  })
  names(loaded) <- names(vcflist)
  vcflist <- loaded

  # collapse indels for sorting purposes
  vcflist <- lapply(vcflist, function(x)
  {
    SummarizedExperiment::end(x) <- SummarizedExperiment::start(x)

    # Simple approach to avoid duplicate positions due to snps and indels
    x <- x[!duplicated(SummarizedExperiment::start(x))]
    return(x)
  })

  # Ranges present in every VCF, then subset each VCF to those ranges
  o <- Reduce(IRanges::subsetByOverlaps, lapply(vcflist, SummarizedExperiment::rowRanges))
  vcflist <- lapply(vcflist, function(x) IRanges::subsetByOverlaps(x, o))
  return(vcflist)
}
#' Find LD proxies for a set of SNPs
#'
#' Writes the target rsids (and optional search space) to temporary files,
#' runs plink to compute LD against the reference panel, and reads the
#' result back. Temporary files are removed when the function exits.
#'
#' @param rsid list of rs IDs
#' @param bfile ld reference panel
#' @param tag_kb =5000 Proxy parameter
#' @param tag_nsnp =5000 Proxy parameter
#' @param tag_r2 =0.6 Proxy parameter
#' @param searchspace Optional list of rs IDs to use as potential proxies
#' @param threads Number of threads to use (=1)
#' @param out temporary output file
#'
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#'
#' @export
#' @return data frame
get_ld_proxies <- function(rsid, bfile, searchspace=NULL, tag_kb=5000, tag_nsnp=5000, tag_r2=0.6, threads=1, out=tempfile())
{
  # Fail before writing files or launching plink: the command below relies
  # on /dev/null and gunzip.
  if (Sys.info()["sysname"] == "Windows") {
    stop("Currently, this function only works on macOS and Linux")
  }
  stopifnot(check_plink())
  searchspacename <- paste0(out, ".searchspace")
  targetsname <- paste0(out, ".targets")
  outname <- paste0(out, ".targets.ld.gz")
  # Clean up on every exit path, including errors and the early return taken
  # when plink produces no output.
  on.exit(unlink(c(searchspacename, targetsname, paste0(targetsname, c(".log", ".nosex")), outname)), add=TRUE)

  utils::write.table(rsid, file=targetsname, row.names = FALSE, col.names = FALSE, quote = FALSE)
  if(!is.null(searchspace))
  {
    stopifnot(is.character(searchspace))

    utils::write.table(unique(c(rsid, searchspace)), file=searchspacename, row.names = FALSE, col.names = FALSE, quote = FALSE)
    extract_param <- paste0(" --extract ", searchspacename)
  } else {
    extract_param <- " "
  }
  cmd <- paste0(options()[["tools_plink"]],
    " --bfile ", bfile,
    extract_param,
    " --keep-allele-order ",
    " --r in-phase with-freqs gz",
    " --ld-snp-list ", targetsname,
    " --ld-window-kb ", tag_kb,
    " --ld-window-r2 ", tag_r2,
    " --ld-window ", tag_nsnp,
    " --out ", targetsname,
    " --threads ", threads,
    " 2>&1 > /dev/null"
  )
  message("Finding proxies...")
  system(cmd)

  if (!file.exists(outname)) {
    ld <- data.frame(CHR_A = integer(), BP_A = integer(), SNP_A = character(), MAF_A = double(), CHR_B = integer(), BP_B = integer(),
      SNP_B = character(), PHASE = character(), MAF_B = double(), R = double())
    message("Index SNP not found in the reference panel")
    return(ld)
  }
  ld <- data.table::fread(cmd = paste0("gunzip -c ", outname), header = TRUE) %>%
    dplyr::as_tibble(.name_repair="minimal") %>%
    dplyr::filter(.data[["R"]]^2 > tag_r2) %>%
    dplyr::filter(.data[["SNP_A"]] != .data[["SNP_B"]]) %>%
    dplyr::mutate(PHASE=gsub("/", "", .data[["PHASE"]])) %>%
    dplyr::filter(nchar(.data[["PHASE"]]) == 4)
  if(nrow(ld) == 0)
  {
    message("No proxies found")
    return(ld)
  }
  # Split the 4-character phase string into one allele per column
  temp <- do.call(rbind, strsplit(ld[["PHASE"]], "")) %>% dplyr::as_tibble(.name_repair="minimal")
  names(temp) <- c("A1", "B1", "A2", "B2")
  ld <- cbind(ld, temp) %>% dplyr::as_tibble(.name_repair="minimal")
  # ld <- dplyr::arrange(ld, desc(abs(R))) %>%
  # 	dplyr::filter(!duplicated(SNP_A))
  ld <- dplyr::arrange(ld, dplyr::desc(abs(.data[["R"]])))
  message("Found ", nrow(ld), " proxies")
  return(ld)
}



#' Lookup LD proxies from sqlite database
#'
#' Queries the `tags` table for rows whose `SNP_A` matches the numeric part
#' of the supplied rsids. Ids that are not of the form `rs<number>` are
#' ignored with a warning, so they cannot end up in the SQL text.
#'
#' @param rsids List of rsids
#' @param dbfile path to dbfile
#' @param tag_r2 minimum r2 value
#'
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#'
#' @export
#' @return data frame
sqlite_ld_proxies <- function(rsids, dbfile, tag_r2)
{
  conn <- RSQLite::dbConnect(RSQLite::SQLite(), dbfile)
  # Close the connection even if the query or post-processing fails
  on.exit(RSQLite::dbDisconnect(conn), add=TRUE)

  numid <- gsub("rs", "", rsids)
  valid <- grepl("^[0-9]+$", numid)
  if(!all(valid))
  {
    warning(sum(!valid), " rsid(s) are not of the form rs<number> and were ignored", call.=FALSE)
    numid <- numid[valid]
  }
  query <- paste0("SELECT DISTINCT * FROM tags WHERE SNP_A IN (", paste(numid, collapse=","), ")")
  ld <- RSQLite::dbGetQuery(conn, query) %>%
    dplyr::as_tibble(.name_repair="minimal") %>%
    dplyr::filter(.data[["R"]]^2 > tag_r2) %>%
    dplyr::filter(.data[["SNP_A"]] != .data[["SNP_B"]]) %>%
    dplyr::mutate(PHASE=gsub("/", "", .data[["PHASE"]])) %>%
    dplyr::filter(nchar(.data[["PHASE"]]) == 4) %>%
    dplyr::mutate(SNP_A = paste0("rs", .data[["SNP_A"]]), SNP_B = paste0("rs", .data[["SNP_B"]]))

  # With no rows there is no phase string to split; return early instead of
  # failing when the allele column names are assigned.
  if(nrow(ld) == 0)
  {
    message("No proxies found")
    return(ld)
  }

  temp <- do.call(rbind, strsplit(ld[["PHASE"]], "")) %>% dplyr::as_tibble(.name_repair="minimal")
  names(temp) <- c("A1", "B1", "A2", "B2")
  ld <- cbind(ld, temp) %>% dplyr::as_tibble(.name_repair="minimal")
  ld <- dplyr::arrange(ld, dplyr::desc(abs(.data[["R"]])))
  message("Found ", nrow(ld), " proxies")
  return(ld)
}
Using dbfile.") 145 | } 146 | os <- Sys.info()[['sysname']] 147 | if(proxies=="yes") 148 | { 149 | message("Initial search...") 150 | o <- query_gwas(vcf, rsid=rsid, rsidx=rsidx) 151 | missing <- rsid[!rsid %in% names(o)] 152 | if(length(missing) != 0) 153 | { 154 | message("Extracted ", length(rsid) - length(missing), " out of ", length(rsid), " rsids") 155 | message("Searching for proxies for ", length(missing), " rsids") 156 | searchspacename <- tempfile() 157 | if(is.character(vcf)) 158 | { 159 | if(check_bcftools() & is.null(dbfile)) 160 | { 161 | message("Determining searchspace...") 162 | cmd <- paste0(options()[["tools_bcftools"]], " query -f'%ID\n' ", vcf, " > ", searchspacename) 163 | system(cmd) 164 | searchspace <- scan(searchspacename, what="character", quiet=TRUE) 165 | } else { 166 | searchspace <- NULL 167 | } 168 | } else { 169 | message("Determining searchspace...") 170 | searchspace <- names(SummarizedExperiment::rowRanges(vcf)) 171 | } 172 | message("Proxy lookup...") 173 | if(is.null(dbfile)) 174 | { 175 | ld <- get_ld_proxies(missing, bfile, searchspace=searchspace, tag_kb=tag_kb, tag_nsnp=tag_nsnp, tag_r2=tag_r2, threads=threads) 176 | } else { 177 | ld <- sqlite_ld_proxies(rsids=missing, dbfile=dbfile, tag_r2=tag_r2) 178 | } 179 | if(nrow(ld) == 0) 180 | { 181 | return(o) 182 | } 183 | } else { 184 | return(o) 185 | } 186 | } else if(proxies == "only") { 187 | searchspacename <- tempfile() 188 | if(is.character(vcf)) 189 | { 190 | if(check_bcftools() & is.null(dbfile)) 191 | { 192 | message("Determining searchspace...") 193 | cmd <- paste0(options()[["tools_bcftools"]], " query -f'%ID\n' ", vcf, " > ", searchspacename) 194 | system(cmd) 195 | searchspace <- scan(searchspacename, what="character") 196 | } else { 197 | searchspace <- NULL 198 | } 199 | } else { 200 | message("Determining searchspace...") 201 | searchspace <- names(SummarizedExperiment::rowRanges(vcf)) 202 | } 203 | message("Proxy lookup...") 204 | if(is.null(dbfile)) 205 | { 
206 | ld <- get_ld_proxies(rsid, bfile, searchspace=searchspace, tag_kb=tag_kb, tag_nsnp=tag_nsnp, tag_r2=tag_r2, threads=threads) 207 | } else { 208 | ld <- sqlite_ld_proxies(rsids=rsid, dbfile=dbfile, tag_r2=tag_r2) 209 | } 210 | if(nrow(ld) == 0) 211 | { 212 | return(VCF()) 213 | } 214 | } else if(proxies == "no") { 215 | o <- query_gwas(vcf, rsid=rsid, rsidx=rsidx) 216 | return(o) 217 | } else { 218 | stop('proxies must be "yes", "no" or "only"') 219 | } 220 | if(!is.null(searchspace)) 221 | { 222 | ld <- ld %>% dplyr::filter(!duplicated(.data[["SNP_A"]])) 223 | } 224 | message("Extrating proxies...") 225 | e <- query_gwas(vcf, rsid=ld[["SNP_B"]], rsidx=rsidx) 226 | 227 | if(is.null(searchspace)) 228 | { 229 | ld <- subset(ld, ld[["SNP_B"]] %in% names(e)) %>% 230 | dplyr::filter(!duplicated(.data[["SNP_A"]])) 231 | } 232 | e <- e[names(e) %in% ld[["SNP_B"]], ] 233 | message("Identified proxies for ", nrow(e), " of ", length(missing), " rsids") 234 | message("Aligning...") 235 | index <- match(names(e), ld[["SNP_B"]]) 236 | ld <- ld[index,] 237 | if(nrow(ld) == 0) 238 | { 239 | return(o) 240 | } 241 | stopifnot(all(ld[["SNP_B"]] == names(e))) 242 | sign_index <- GenomicRanges::mcols(SummarizedExperiment::rowRanges(e))[,"REF"] == ld[["B1"]] 243 | gr <- GenomicRanges::GRanges(ld[["CHR_A"]], IRanges::IRanges(start=ld[["BP_A"]], end=ld[["BP_A"]], names=ld[["SNP_A"]])) 244 | fixeddat <- S4Vectors::DataFrame( 245 | REF=Biostrings::DNAStringSet(ld[["A1"]]), 246 | ALT=Biostrings::DNAStringSetList(as.list(ld[["A2"]])), 247 | QUAL=as.numeric(NA), 248 | FILTER="PASS" 249 | ) 250 | prox <- VariantAnnotation::VCF( 251 | rowRanges = gr, 252 | colData = SummarizedExperiment::colData(e), 253 | fixed = fixeddat, 254 | info = VariantAnnotation::info(e), 255 | exptData = list( 256 | header = VariantAnnotation::header(e) 257 | ), 258 | geno = S4Vectors::SimpleList( 259 | lapply(VariantAnnotation::geno(e), `dimnames<-`, NULL) 260 | ) 261 | ) 262 | 
VariantAnnotation::geno(VariantAnnotation::header(prox)) <- rbind(VariantAnnotation::geno(VariantAnnotation::header(prox)), 263 | S4Vectors::DataFrame(row.names="PR", Number="1", Type="String", Description="Proxy rsid") 264 | ) 265 | VariantAnnotation::geno(prox)[["ES"]][!sign_index] <- {unlist(VariantAnnotation::geno(prox)[["ES"]][!sign_index]) * -1} %>% as.list 266 | VariantAnnotation::geno(prox)[["PR"]] <- matrix(ld[["SNP_B"]], length(ld[["SNP_B"]]), 1) 267 | 268 | if(proxies == "only") 269 | { 270 | return(prox) 271 | } else { 272 | VariantAnnotation::geno(VariantAnnotation::header(o)) <- rbind(VariantAnnotation::geno(VariantAnnotation::header(o)), S4Vectors::DataFrame(row.names="PR", Number="1", Type="String", Description="Proxy rsid")) 273 | VariantAnnotation::geno(o)[["PR"]] <- matrix(rep(NA, length(o)), length(o), 1) 274 | return(BiocGenerics::rbind(o, prox)) 275 | } 276 | } 277 | -------------------------------------------------------------------------------- /R/pval_index.r: -------------------------------------------------------------------------------- 1 | #' Create pval index from GWAS-VCF file 2 | #' 3 | #' Create a separate file called `.pvali` which is used to speed up p-value queries. 4 | #' 5 | #' @param vcffile VCF filename 6 | #' @param maximum_pval Maximum p-value to include. Default = 0.05 7 | #' @param indexname index file name to create. Deletes existing file if exists. 
#'
#' @export
#' @return NULL
create_pval_index_from_vcf <- function(vcffile, maximum_pval=0.05, indexname)
{
  stopifnot(!is.null(options()$tools_bcftools))
  if(!nzchar(Sys.which("sqlite3"))) stop("sqlite3 not installed")
  fn <- tempfile()
  # The extracted table and the sqlite script are only needed while building.
  on.exit(unlink(c(fn, paste0(fn, ".sql"))), add = TRUE)
  id <- VariantAnnotation::samples(VariantAnnotation::scanVcfHeader(vcffile))
  if(length(id) != 1)
  {
    stop("Not implemented for vcf files that don't have a single study")
  }
  # One "chrom,pos,LP" line per variant whose -log10 p-value exceeds the cutoff.
  cmd <- paste0(options()$tools_bcftools, " query -s ", id, " -i 'FORMAT/LP > ", -log10(maximum_pval), "' -f '%CHROM,%POS,[%LP]\n' ", vcffile, " | sort -r -k 3 > ", fn)
  message("Extracting pval info")
  system(cmd)
  cmd <- c(
    'CREATE TABLE pval_to_coord (chrom TEXT NOT NULL DEFAULT NULL, coord INTEGER NOT NULL DEFAULT NULL, LP REAL NOT NULL DEFAULT 0);',
    '.separator ,',
    paste0('.import ', fn, ' pval_to_coord'),
    'CREATE INDEX idx_LP ON pval_to_coord (LP)'
  )
  print(cmd)
  utils::write.table(cmd, file=paste0(fn, ".sql"), row.names = FALSE, col.names = FALSE, quote = FALSE)
  message("Generating index")
  cmd <- paste0("sqlite3 ", indexname, " < ", fn, ".sql")
  # Start from a clean file: sqlite3 would otherwise add to an existing index.
  unlink(indexname)
  system(cmd)
}

#' Query pval from file using pvali index
#'
#' See create_pvali_index
#'
#' Looks up the coordinates of all variants passing the threshold in the index
#' and then extracts those positions from the vcf file.
#'
#' @param pval pval threshold
#' @param vcffile Path to .vcf.gz GWAS summary data file
#' @param id If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter
#' @param pvali Path to pval index file
#'
#' @export
#' @return vcf object
query_pval_sqlite3 <- function(pval, vcffile, id=NULL, pvali)
{
  out <- query_pvali(pval, pvali)
  return(
    query_gwas(vcffile, chrompos=data.frame(chrom=out$chrom, start=out$coord, end=out$coord), id=id)
  )
}

#' Query pvali
#'
#' @param pval pval threshold
#' @param pvali 
Path to pval index file
#'
#' @export
#' @return data frame
query_pvali <- function(pval, pvali)
{
  # pval is pasted into the SQL text, so insist on a single number.
  stopifnot(is.numeric(pval), length(pval) == 1)
  conn <- RSQLite::dbConnect(RSQLite::SQLite(), pvali)
  # Release the connection even if the query fails.
  on.exit(RSQLite::dbDisconnect(conn), add = TRUE)
  # The index stores -log10(p), so the threshold is converted to that scale.
  query <- paste0("SELECT DISTINCT * FROM pval_to_coord WHERE lp >= ", -log10(pval))
  out <- RSQLite::dbGetQuery(conn, query)
  return(out)
}

--------------------------------------------------------------------------------
/R/query.r:
--------------------------------------------------------------------------------
#' Query data from vcf file
#'
#' Read in GWAS summary data with filters on datasets (if multiple datasets per file) and/or chromosome/position, rsids or pvalues. Chooses most optimal choice for the detected operating system. Typically chrompos searches are the fastest. On Windows, rsid or pvalue filters from a file will be slow.
#'
#' @param vcf Path or URL to GWAS-VCF file or VCF object e.g. output from [VariantAnnotation::readVcf()] or [create_vcf()]
#' @param chrompos Either vector of chromosome and position ranges e.g. "1:1000" or "1:1000-2000", or data frame with columns `chrom`, `start`, `end`.
#' @param rsid Vector of rsids
#' @param pval  P-value threshold (NOT -log10)
#' @param id If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter
#' @param rsidx Path to rsidx index file
#' @param pvali Path to pval index file
#' @param build ="GRCh37" Build of vcffile
#' @param os The operating system. Default is as detected. Determines the method used to perform query
#' @param proxies ="no" If SNPs are absent then look for proxies (yes) or not (no). Can also mask all target SNPs and only return proxies (only), for testing purposes. Currently only possible if querying rsid.
#' @param bfile =path to plink bed/bim/fam ld reference panel
#' @param dbfile =path to sqlite tag snp database
#' @param tag_kb =5000 Proxy parameter
#' @param tag_nsnp =5000 Proxy parameter
#' @param tag_r2 =0.6 Proxy parameter
#' @param threads =1 Number of threads
#'
#' @export
#' @return vcf object
query_gwas <- function(vcf, chrompos=NULL, rsid=NULL, pval=NULL, id=NULL, rsidx=NULL, pvali=NULL, build="GRCh37", os=Sys.info()[['sysname']], proxies="no", bfile=NULL, dbfile=NULL, tag_kb=5000, tag_nsnp=5000, tag_r2=0.6, threads=1)
{
  # fileflag: TRUE when `vcf` names a file/URL, FALSE when it is a VCF object.
  if(is.character(vcf))
  {
    if(!file.exists(vcf) && !RCurl::url.exists(vcf))
    {
      stop("vcf appears to be a string but doesn't refer to an existing URL or local file")
    }
    fileflag <- TRUE
  } else {
    stopifnot(inherits(vcf, c("CollapsedVCF", "ExpandedVCF")))
    fileflag <- FALSE
  }
  if(sum(c(!is.null(chrompos), !is.null(rsid), !is.null(pval))) != 1)
  {
    stop("Must specify filters only for one of chrompos, rsid or pval")
  }

  if(proxies != "no" && is.null(rsid))
  {
    stop("Proxies can only be searched for if rsid query specified")
  }

  # Exactly one of the three filters is set; each branch below returns.
  # For files the lookup order is: dedicated index (if supplied), then
  # bcftools (if available), then the VariantAnnotation-based reader.
  if(!is.null(chrompos))
  {
    if(!fileflag)
    {
      return(query_chrompos_vcf(chrompos, vcf, id))
    }
    if(!check_bcftools())
    {
      return(query_chrompos_file(chrompos, vcf, id, build))
    }
    return(query_chrompos_bcftools(chrompos, vcf, id))
  }

  if(!is.null(rsid))
  {
    stopifnot(proxies %in% c("yes", "no", "only"))
    if(proxies != "no")
    {
      return(proxy_match(vcf, rsid, bfile=bfile, dbfile=dbfile, proxies=proxies, tag_kb=tag_kb, tag_nsnp=tag_nsnp, tag_r2=tag_r2, threads=threads, rsidx=rsidx))
    }
    if(!fileflag)
    {
      return(query_rsid_vcf(rsid, vcf, id))
    }
    if(!is.null(rsidx))
    {
      return(query_rsid_rsidx(rsid, vcf, id, rsidx))
    }
    if(!check_bcftools())
    {
      return(query_rsid_file(rsid, vcf, id, build))
    }
    return(query_rsid_bcftools(rsid, vcf, id))
  }

  if(!is.null(pval))
  {
    if(!fileflag)
    {
      return(query_pval_vcf(pval, vcf, id))
    }
    if(!is.null(pvali))
    {
      message("Using pval index")
      return(query_pval_sqlite3(pval, vcf, id, pvali))
    }
    if(!check_bcftools())
    {
      return(query_pval_file(pval, vcf, id, build))
    }
    return(query_pval_bcftools(pval, vcf, id))
  }
}




# Build a GRanges object from a data frame with columns chrom, start and end.
df_to_granges <- function(df)
{
  GenomicRanges::GRanges(seqnames=df[["chrom"]], ranges=IRanges::IRanges(start=df[["start"]], end=df[["end"]]))
}






#' Parse chromosome:position
#'
#' Takes data frame or vector of chromosome position ranges and parses to granges object
#'
#' @param chrompos Either vector of chromosome and position ranges e.g. "1:1000" or "1:1000-2000", or data frame with columns `chrom`, `start`, `end`.
#' @param radius Add radius to the specified positions. 
Default = NULL
#'   The radius is applied to GRanges, data frame and character input alike;
#'   start positions are floored at 0.
#'
#' @export
#' @return GRanges object
parse_chrompos <- function(chrompos, radius=NULL)
{

  if(inherits(chrompos, "GRanges"))
  {
    if(!is.null(radius))
    {
      # Use the accessor generics: a GRanges object has no start/end slots of
      # its own, the coordinates live in its ranges.
      chrompos <- GenomicRanges::GRanges(
        seqnames = GenomeInfoDb::seqnames(chrompos),
        ranges = IRanges::IRanges(
          start = pmax(BiocGenerics::start(chrompos) - radius, 0),
          end = BiocGenerics::end(chrompos) + radius
        ),
        strand = BiocGenerics::strand(chrompos)
      )
    }
    return(chrompos)
  } else if(is.data.frame(chrompos)) {
    stopifnot(all(c("chrom", "start", "end") %in% names(chrompos)))
    if(!is.null(radius))
    {
      # Only the local copy is widened; the caller's data frame is untouched.
      chrompos[["start"]] <- pmax(0, chrompos[["start"]] - radius)
      chrompos[["end"]] <- chrompos[["end"]] + radius
    }
    return(df_to_granges(chrompos))
  } else if(!is.character(chrompos)) {
    stop('chrompos must be a GRanges object, a data frame with columns chrom, start, end, or a character vector of "chrom:pos" or "chrom:start-end"')
  }

  # Character input: split "chrom:pos" / "chrom:start-end".
  a <- stringr::str_split(chrompos, ":")
  chrom <- vapply(a, function(x) x[1], character(1))
  pos <- vapply(a, function(x) x[2], character(1))
  # Entries containing "-" are ranges; the others are single positions, whose
  # start and end are both the position itself.
  i <- grepl("-", pos)
  temp <- stringr::str_split(pos[i], "-")
  pos1 <- pos
  pos2 <- pos
  pos1[i] <- vapply(temp, function(x) x[1], character(1))
  pos2[i] <- vapply(temp, function(x) x[2], character(1))
  pos1 <- as.numeric(pos1)
  pos2 <- as.numeric(pos2)
  if(!is.null(radius))
  {
    pos1 <- pmax(0, pos1 - radius)
    pos2 <- pos2 + radius
  }
  return(df_to_granges(data.frame(chrom, start=pos1, end=pos2, stringsAsFactors=FALSE)))
}




#' Query vcf file, extracting by chromosome and position
#'
#'
#' @param chrompos Either vector of chromosome and position ranges e.g. "1:1000" or "1:1000-2000", or data frame with columns `chrom`, `start`, `end`.
#' @param vcffile Path to .vcf.gz GWAS summary data file
#' @param id If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter
#' @param build Default="GRCh37" Build of vcffile
#'
#' @export
#' @return VCF object
query_chrompos_file <- function(chrompos, vcffile, id=NULL, build="GRCh37")
{
  chrompos <- parse_chrompos(chrompos)
  if(!is.null(id))
  {
    param <- VariantAnnotation::ScanVcfParam(which=chrompos, samples=id)
  } else {
    param <- VariantAnnotation::ScanVcfParam(which=chrompos)
  }
  tab <- Rsamtools::TabixFile(vcffile)
  # Pass the ScanVcfParam (not the bare ranges) so that `id` is honoured.
  vcf <- VariantAnnotation::readVcf(tab, build, param=param)
  return(vcf)
}


#' Query vcf file, extracting by rsid
#'
#' The file is first pre-filtered on its raw text lines with a regular
#' expression built from `rsid`, and the (possibly over-inclusive) result is
#' then restricted to exact rsid matches.
#'
#' @param rsid Vector of rsids. Use DBSNP build (???)
#' @param vcffile Path to .vcf.gz GWAS summary data file
#' @param id If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter
#' @param build Default="GRCh37" Build of vcffile
#'
#' @export
#' @return VCF object
query_rsid_file <- function(rsid, vcffile, id=NULL, build="GRCh37")
{
  message("Note, this is much slower than searching by chromosome/position (e.g. see query_chrompos_file)")
  vcf <- Rsamtools::TabixFile(vcffile)
  pattern <- paste(rsid, collapse="|")
  fil <- function(x)
  {
    grepl(pattern, x)
  }

  tmp <- tempfile()
  # Remove the intermediate vcf on every exit path, including errors.
  on.exit(unlink(tmp), add = TRUE)
  VariantAnnotation::filterVcf(vcf, build, tmp, prefilters=S4Vectors::FilterRules(list(fil=fil)), verbose=TRUE)
  if(!is.null(id))
  {
    o <- VariantAnnotation::readVcf(tmp, param=VariantAnnotation::ScanVcfParam(samples=id))
  } else {
    o <- VariantAnnotation::readVcf(tmp)
  }

  # Grep isn't matching on exact word so do second pass here
  o <- query_rsid_vcf(rsid, o)
  return(o)
}


#' Query pval from vcf file
#'
#' @param pval P-value threshold (NOT -log10)
#' @param vcffile Path to tabix indexed vcf file
#' @param id If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter
#' @param build Default="GRCh37"
#'
#' @export
#' @return VCF object, restricted to the sample `id`
query_pval_file <- function(pval, vcffile, id=NULL, build="GRCh37")
{
  if(is.null(id))
  {
    id <- VariantAnnotation::samples(VariantAnnotation::scanVcfHeader(vcffile))
  }
  stopifnot(length(id) == 1)
  message("Filtering p-value based on id ", id)
  message("Note, this is much slower than searching by chromosome/position (e.g. see query_chrompos_file)")
  vcf <- Rsamtools::TabixFile(vcffile)
  # LP holds -log10(p), so the threshold is converted to the same scale.
  threshold <- -log10(pval)
  fil <- function(x)
  {
    VariantAnnotation::geno(x)[["LP"]][,id,drop=TRUE] > threshold
  }
  tmp <- tempfile()
  on.exit(unlink(tmp), add = TRUE)
  VariantAnnotation::filterVcf(vcf, build, tmp, filters=S4Vectors::FilterRules(list(fil=fil)), verbose=TRUE)
  # Return only the sample that was filtered on, as query_pval_vcf and
  # query_pval_bcftools do.
  o <- VariantAnnotation::readVcf(tmp, param=VariantAnnotation::ScanVcfParam(samples=id))
  return(o)
}


#' Query chrompos from vcf object
#'
#' @param chrompos Either vector of chromosome and position ranges e.g. "1:1000" or "1:1000-2000", or data frame with columns `chrom`, `start`, `end`.
#' @param vcf VCF object (e.g. from readVcf)
#' @param id If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter
#'
#' @export
#' @return VCF object
query_chrompos_vcf <- function(chrompos, vcf, id=NULL)
{
  samples <- VariantAnnotation::samples(VariantAnnotation::header(vcf))
  if(is.null(id))
  {
    id <- samples
  }
  # %in% rather than ==, so that any subset of sample names selects the right
  # columns without relying on recycling.
  colid <- which(samples %in% id)
  chrompos <- parse_chrompos(chrompos)
  # Rows of the vcf overlapping any requested range, in file order.
  i <- IRanges::findOverlaps(SummarizedExperiment::rowRanges(vcf), chrompos) %>% S4Vectors::queryHits() %>% unique %>% sort
  vcf[i,colid]
}


#' Query rsid from vcf object
#'
#' @param rsid Vector of rsids
#' @param vcf VCF object (e.g. from readVcf)
#' @param id If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter
#'
#' @export
#' @return VCF object
query_rsid_vcf <- function(rsid, vcf, id=NULL)
{
  samples <- VariantAnnotation::samples(VariantAnnotation::header(vcf))
  if(is.null(id))
  {
    id <- samples
  }
  colid <- which(samples %in% id)
  vcf[rownames(vcf) %in% rsid,colid]
}


#' Query based on p-value threshold from vcf
#'
#' @param pval P-value threshold (NOT -log10)
#' @param id If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter
#' @param vcf VCF object (e.g. 
from readVcf)
#'
#' @export
#' @return VCF object
query_pval_vcf <- function(pval, vcf, id=NULL)
{
  samples <- VariantAnnotation::samples(VariantAnnotation::header(vcf))
  if(is.null(id))
  {
    id <- samples
  }
  # The filter is defined on a single study only.
  stopifnot(length(id) == 1)
  colid <- which(samples == id)
  # LP holds -log10(p), so the threshold is converted to the same scale.
  vcf[VariantAnnotation::geno(vcf)[["LP"]][,colid,drop=TRUE] > -log10(pval),colid]
}


#' Query rsid using bcftools
#'
#' Writes the rsids to a temporary list and lets `bcftools view` select the
#' matching records into a temporary vcf, which is then read in.
#'
#' @param rsid Vector of rsids
#' @param vcffile Path to .vcf.gz GWAS summary data file
#' @param id If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter
#'
#' @export
#' @return VCF object
query_rsid_bcftools <- function(rsid, vcffile, id=NULL)
{
  stopifnot(check_bcftools())
  bcftools <- options()[["tools_bcftools"]]
  if(is.null(id))
  {
    id <- VariantAnnotation::samples(VariantAnnotation::scanVcfHeader(vcffile))
  }
  id <- paste(id, collapse=",")
  tmp <- tempfile()
  # Remove both intermediate files on every exit path, including errors.
  on.exit(unlink(paste0(tmp, c(".vcf", ".snplist"))), add = TRUE)
  utils::write.table(unique(rsid), file=paste0(tmp, ".snplist"), row.names = FALSE, col.names = FALSE, quote = FALSE)
  cmd <- sprintf("%s view -s %s -i'ID=@%s.snplist' %s > %s.vcf", bcftools, id, tmp, vcffile, tmp)
  system(cmd)
  o <- VariantAnnotation::readVcf(paste0(tmp, ".vcf"))
  return(o)
}

#' Query p-value using bcftools
#'
#' @param pval P-value threshold (NOT -log10)
#' @param vcffile Path to .vcf.gz GWAS summary data file
#' @param id If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter
#'
#' @export
#' @return vcf object
query_pval_bcftools <- function(pval, vcffile, id=NULL)
{
  stopifnot(check_bcftools())
  bcftools <- options()[["tools_bcftools"]]
  if(is.null(id))
  {
    id <- VariantAnnotation::samples(VariantAnnotation::scanVcfHeader(vcffile))
  }
  id <- paste(id, collapse=",")
  tmp <- tempfile()
  on.exit(unlink(paste0(tmp, ".vcf")), add = TRUE)
  # FORMAT/LP holds -log10(p), so the threshold is converted to that scale.
  cmd <- sprintf("%s view -s %s -i 'FORMAT/LP > %s' %s > %s.vcf", bcftools, id, -log10(pval), vcffile, tmp)
  system(cmd)
  o <- VariantAnnotation::readVcf(paste0(tmp, ".vcf"))
  return(o)
}

#' Query chromosome and position using bcftools
#'
#' @param chrompos Either vector of chromosome and position ranges e.g. "1:1000" or "1:1000-2000", or data frame with columns `chrom`, `start`, `end`.
#' @param vcffile Path to .vcf.gz GWAS summary data file
#' @param id If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter
#'
#' @export
#' @return vcf object
query_chrompos_bcftools <- function(chrompos, vcffile, id=NULL)
{
  stopifnot(check_bcftools())
  bcftools <- options()[["tools_bcftools"]]
  if(is.null(id))
  {
    id <- VariantAnnotation::samples(VariantAnnotation::scanVcfHeader(vcffile))
  }
  # Omit the -s option altogether when the file declares no samples.
  if(length(id) == 0)
  {
    idclause <- ""
  } else {
    idclause <- paste0("-s ", paste(id, collapse=","))
  }

  chrompos <- parse_chrompos(chrompos)
  tmp <- tempfile()
  on.exit(unlink(paste0(tmp, c(".vcf", ".snplist"))), add = TRUE)
  # The first three columns of the ranges (sequence name, start, end) form the
  # tab-separated regions file passed to bcftools -R.
  utils::write.table(as.data.frame(chrompos)[,1:3], file=paste0(tmp, ".snplist"), sep="\t", row.names = FALSE, col.names = FALSE, quote = FALSE)

  cmd <- sprintf("%s view %s -R %s.snplist %s > %s.vcf", bcftools, idclause, tmp, vcffile, tmp)
  system(cmd)
  o <- VariantAnnotation::readVcf(paste0(tmp, ".vcf"))
  return(o)
}


#' Query rsid from file using rsidx index
#'
#' See create_rsidx_index
#'
#' @param rsid Vector of rsids
#' @param vcffile Path to .vcf.gz GWAS summary data file
#' @param 
id If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter
#' @param rsidx Path to rsidx index file
#'
#' @export
#' @return vcf object
query_rsid_rsidx <- function(rsid, vcffile, id=NULL, rsidx)
{
  # Translate rsids to coordinates via the index, then extract by position.
  out <- query_rsidx(rsid, rsidx)
  return(
    query_gwas(vcffile, chrompos=data.frame(chrom=out$chrom, start=out$coord, end=out$coord), id=id)
  )
}

#' Query rsidx
#'
#' @param rsid Vector of rsids
#' @param rsidx Path to rsidx index file
#'
#' @export
#' @return data frame
query_rsidx <- function(rsid, rsidx)
{
  conn <- RSQLite::dbConnect(RSQLite::SQLite(), rsidx)
  # Release the connection even if the query fails.
  on.exit(RSQLite::dbDisconnect(conn), add = TRUE)
  # The index is queried on the numeric part of the rsid.
  numid <- gsub("rs", "", rsid) %>% paste(collapse=",")
  query <- paste0("SELECT DISTINCT * FROM rsid_to_coord WHERE rsid IN (", numid, ")")
  out <- RSQLite::dbGetQuery(conn, query)
  return(out)
}


#' Query pval from file using pvali index
#'
#' See create_pvali_index
#'
#' @param pval pval threshold
#' @param vcffile Path to .vcf.gz GWAS summary data file
#' @param id If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter
#' @param pvali Path to pval index file
#'
#' @export
#' @return vcf object
query_pval_sqlite3 <- function(pval, vcffile, id=NULL, pvali)
{
  out <- query_pvali(pval, pvali)
  message("Identified ", nrow(out), " variants passing threshold. Extracting...")
  return(
    query_gwas(vcffile, chrompos=data.frame(chrom=out$chrom, start=out$coord, end=out$coord), id=id)
  )
}

#' Query pvali
#'
#' @param pval pval threshold
#' @param pvali Path to pval index file
#'
#' @export
#' @return data frame
query_pvali <- function(pval, pvali)
{
  conn <- RSQLite::dbConnect(RSQLite::SQLite(), pvali)
  on.exit(RSQLite::dbDisconnect(conn), add = TRUE)
  # The index stores -log10(p), so the threshold is converted to that scale.
  query <- paste0("SELECT DISTINCT * FROM pval_to_coord WHERE lp >= ", -log10(pval))
  out <- RSQLite::dbGetQuery(conn, query)
  return(out)
}

--------------------------------------------------------------------------------
/R/rsid_index.r:
--------------------------------------------------------------------------------
#' Create RSID index from VCF
#'
#' @param vcf VCF filename
#' @param indexname index file name to create. Deletes existing file if exists.
#'
#' @export
#' @return NULL
create_rsidx_index_from_vcf <- function(vcf, indexname)
{
  if (Sys.info()["sysname"] == "Windows") {
    stop("Currently, this function only works on macOS and Linux")
  }
  fn <- tempfile()
  # The extracted table and the sqlite script are only needed while building.
  on.exit(unlink(paste0(fn, c(".txt", ".sql"))), add = TRUE)
  # Skip header lines only ('^#'); for the remaining records print the ID
  # without its first two characters, followed by chromosome and position.
  cmd <- paste0("gunzip -c ", vcf, " | grep -v '^#' | awk '{ print substr($3, 3), $1, $2 }' > ", fn, ".txt")
  message("Extracting position info")
  system(cmd)

  cmd <- c(
    'CREATE TABLE rsid_to_coord (rsid INTEGER PRIMARY KEY, chrom TEXT NULL DEFAULT NULL, coord INTEGER NOT NULL DEFAULT 0);',
    '.separator " "',
    paste0('.import ', fn, '.txt rsid_to_coord')
  )
  utils::write.table(cmd, file=paste0(fn, ".sql"), row.names = FALSE, col.names = FALSE, quote = FALSE)
  message("Generating index")
  cmd <- paste0("sqlite3 ", indexname, " < ", fn, ".sql")
  # Start from a clean file: sqlite3 would otherwise add to an existing index.
  unlink(indexname)
  system(cmd)
}

#' Create new index from existing index using a subset of rsids
#'
#' Note this requires a modified version of plink 
that allows ld-window-r2 flag for --r option.
#' Available here: https://github.com/explodecomputer/plink-ng
#'
#' @param rsid Vector of rsids
#' @param rsidx Existing index
#' @param newindex New index (Note: will delete existing file if exists)
#'
#' @export
#' @return NULL, creates new index file
create_rsidx_sub_index <- function(rsid, rsidx, newindex)
{
  out <- query_rsidx(rsid, rsidx)
  unlink(newindex)
  conn <- RSQLite::dbConnect(RSQLite::SQLite(), newindex)
  # Release the connection even if writing the table or the index fails.
  on.exit(RSQLite::dbDisconnect(conn), add = TRUE)
  RSQLite::dbWriteTable(conn, "rsid_to_coord", out)
  RSQLite::dbExecute(conn, "CREATE INDEX rsid on rsid_to_coord(rsid);")
}



#' Create LD reference sqlite database for tags
#'
#' This is used for looking up proxies
#'
#' The plink call below combines `--r` with `--ld-window-r2`; the note about a
#' modified plink build in the documentation of `create_rsidx_sub_index`
#' presumably applies to this function (to be confirmed).
#'
#' @param bfile path to plink file
#' @param dbname dbname to produce (overwrites existing if exists)
#' @param tag_r2 minimum tag r2
#'
#' @export
#' @return NULL
create_ldref_sqlite <- function(bfile, dbname, tag_r2=0.6)
{
  stopifnot(check_plink())
  # awk, gunzip and shell redirection are used below, so refuse to run on
  # Windows before any command is launched or file is written.
  if (Sys.info()["sysname"] == "Windows") {
    stop("Currently, this function only works on macOS and Linux")
  }
  message("identifying indels to remove")
  # Variants whose alleles (columns 5 and 6 of the .bim) are not single
  # characters are listed and excluded from the LD calculation.
  cmd <- paste0("awk '{ if (length($5) != 1 || length($6) != 1) { print $2 }}' ", bfile, ".bim > ", bfile, ".indels")
  system(cmd)

  message("calculating ld tags")
  cmd <- paste0(options()[["tools_plink"]], " --bfile ", bfile, " --keep-allele-order --exclude ", bfile, ".indels --r in-phase with-freqs gz --out ", bfile, " --ld-window-kb 250 --ld-window 1000 --ld-window-r2 ", tag_r2)
  system(cmd)

  message("formatting")
  # Drop the header line, strip the first two characters of both SNP ids and
  # reorder the columns to match the table definition below.
  cmd <- paste0("gunzip -c ", bfile, ".ld.gz | awk 'BEGIN {OFS=\",\"} { if(NR != 1) { print substr($3, 3), $1, $2, $4, substr($7, 3), $5, $6, $9, $8, $10 }}' > ", bfile, ".ld.tab")
  system(cmd)

  message("creating sqlite db")
  cmd <- c(
    'CREATE TABLE tags (',
    '  SNP_A INTEGER NOT NULL, ',
    '  CHR_A TEXT NULL DEFAULT NULL, ',
    '  BP_A INTEGER NOT NULL,',
    '  MAF_A REAL NOT NULL,',
    '  SNP_B INTEGER NOT NULL, ',
    '  CHR_B TEXT NULL DEFAULT NULL, ',
    '  BP_B INTEGER NOT NULL,',
    '  MAF_B REAL NOT NULL,',
    '  PHASE TEXT NOT NULL,',
    '  R REAL NOT NULL',
    ');',
    'CREATE INDEX SNP_A_INDEX ON tags(SNP_A);',
    '.separator ","',
    paste0(".import ", bfile, ".ld.tab tags")
  )
  unlink(paste0(bfile, ".ld.sqlite"))
  utils::write.table(cmd, file=paste0(bfile, ".ld.sqlite"), row.names = FALSE, col.names = FALSE, quote = FALSE)
  unlink(dbname)
  cmd <- paste0("sqlite3 ", dbname, " < ", bfile, ".ld.sqlite")
  system(cmd)
  unlink(paste0(bfile, ".ld.tab"))
  unlink(paste0(bfile, ".ld.gz"))
  unlink(paste0(bfile, ".ld.sqlite"))
  # unlink(paste0(bfile, ".indels"))
}

--------------------------------------------------------------------------------
/R/utils-pipe.R:
--------------------------------------------------------------------------------
#' Pipe operator
#'
#' See \code{magrittr::\link[magrittr]{\%>\%}} for details.
4 | #' 5 | #' @name %>% 6 | #' @rdname pipe 7 | #' @keywords internal 8 | #' @export 9 | #' @importFrom magrittr %>% 10 | #' @usage lhs \%>\% rhs 11 | NULL 12 | 13 | #' VariantAnnotation 14 | #' 15 | #' @name VariantAnnotation 16 | #' @import VariantAnnotation 17 | NULL 18 | -------------------------------------------------------------------------------- /R/zzz.r: -------------------------------------------------------------------------------- 1 | options(datatable.fread.input.cmd.message=FALSE) 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reading, querying and writing GWAS summary data in VCF format 2 | 3 | 4 | [![Lifecycle:experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html) 5 | [![R-CMD-check](https://github.com/MRCIEU/gwasvcf/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/MRCIEU/gwasvcf/actions/workflows/R-CMD-check.yaml) 6 | 7 | 8 | Complete GWAS summary datasets are now abundant. A large repository of curated, harmonised and QC'd datasets is available in the [IEU GWAS database](https://gwas.mrcieu.ac.uk/). They can be queried via the [API](https://api.opengwas.io/api/) directly, or through the [ieugwasr](https://github.com/mrcieu/ieugwasr) R package, or the [ieugwaspy](https://github.com/mrcieu/ieugwaspy) Python package. However, for faster querying that can be used in a HPC environment, accessing the data directly and not through cloud systems is advantageous. 9 | 10 | We developed a format for storing and harmonising GWAS summary data known as [GWAS VCF format](https://github.com/MRCIEU/gwas-vcf-specification/releases/tag/1.0.0) which can be created using [gwas2vcf](https://github.com/mrcieu/gwas2vcf). All the data in the [IEU GWAS database](https://gwas.mrcieu.ac.uk/) is available for download in this format. 
This R package provides fast and convenient functions for querying and creating GWAS summary data in GWAS VCF format (v1.0). See also [pygwasvcf](https://github.com/mrcieu/pygwasvcf) a Python3 parser for querying GWAS VCF files. 11 | 12 | This package includes: 13 | 14 | - a wrapper around the [bioconductor/VariantAnnotation](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html) package, providing functions tailored to GWAS VCF for reading, querying, creating and writing GWAS VCF format files 15 | - some LD related functions such as using a reference panel to extract proxies, create LD matrices and perform LD clumping 16 | - functions for harmonising a dataset against the reference genome and creating GWAS VCF files. 17 | 18 | See also the [gwasglue](https://github.com/MRCIEU/gwasglue) R package for methods to connect the VCF data to Mendelian randomization, colocalisation, fine mapping etc. 19 | 20 | ## Installation 21 | 22 | You can install a binary version from our [MRC IEU r-universe](https://mrcieu.r-universe.dev/builds) with 23 | 24 | ```r 25 | install.packages('gwasvcf', repos = c('https://mrcieu.r-universe.dev', 'https://cloud.r-project.org')) 26 | ``` 27 | 28 | or install from the GitHub repo 29 | 30 | ```r 31 | remotes::install_github("mrcieu/gwasvcf") 32 | ``` 33 | 34 | ## Usage 35 | 36 | See vignettes here: [https://mrcieu.github.io/gwasvcf/](https://mrcieu.github.io/gwasvcf/). 37 | 38 | ## Citation 39 | 40 | If using GWAS-VCF files please reference the studies that you use and the following paper: 41 | 42 | **The variant call format provides efficient and robust storage of GWAS summary statistics.** Matthew Lyon, Shea J Andrews, Ben Elsworth, Tom R Gaunt, Gibran Hemani, Edoardo Marcora. 
bioRxiv 2020.05.29.115824; doi: https://doi.org/10.1101/2020.05.29.115824 43 | 44 | 45 | ## Reference datasets 46 | 47 | Example GWAS VCF (GIANT 2010 BMI): 48 | 49 | - [http://fileserve.mrcieu.ac.uk/vcf/IEU-a-2.vcf.gz](http://fileserve.mrcieu.ac.uk/vcf/IEU-a-2.vcf.gz) 50 | - [http://fileserve.mrcieu.ac.uk/vcf/IEU-a-2.vcf.gz.tbi](http://fileserve.mrcieu.ac.uk/vcf/IEU-a-2.vcf.gz.tbi) 51 | 52 | 1000 genomes reference panels for LD for each superpopulation - used by default in OpenGWAS: 53 | 54 | - [http://fileserve.mrcieu.ac.uk/ld/1kg.v3.tgz](http://fileserve.mrcieu.ac.uk/ld/1kg.v3.tgz) 55 | 56 | RSID index for faster querying: 57 | 58 | - [http://fileserve.mrcieu.ac.uk/vcf/annotations.vcf.gz.rsidx](http://fileserve.mrcieu.ac.uk/vcf/annotations.vcf.gz.rsidx) 59 | 60 | 1000 genomes annotations in vcf format harmonised against human genome reference: 61 | 62 | - [http://fileserve.mrcieu.ac.uk/vcf/1kg_v3_nomult.vcf.gz](http://fileserve.mrcieu.ac.uk/vcf/1kg_v3_nomult.vcf.gz) 63 | - [http://fileserve.mrcieu.ac.uk/vcf/1kg_v3_nomult.vcf.gz.tbi](http://fileserve.mrcieu.ac.uk/vcf/1kg_v3_nomult.vcf.gz.tbi) 64 | 65 | --- 66 | 67 | ### Notes 68 | 69 | #### Example data 70 | 71 | data.vcf.gz and data.vcf.gz.tbi are the first few rows of the Speliotes 2010 BMI GWAS 72 | 73 | The eur.bed/bim/fam files are the same range as data.vcf.gz, from here http://fileserve.mrcieu.ac.uk/ld/data_maf0.01_rs_ref.tgz 74 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | template: 2 | bootstrap: 5 3 | light-switch: true 4 | url: https://mrcieu.github.io/gwasvcf/ 5 | -------------------------------------------------------------------------------- /gwasvcf.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | 
EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | 17 | BuildType: Package 18 | PackageUseDevtools: Yes 19 | PackageInstallArgs: --no-multiarch --with-keep.source 20 | PackageRoxygenize: rd,collate,namespace 21 | -------------------------------------------------------------------------------- /inst/extdata/data.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MRCIEU/gwasvcf/820267653ac7720926a13cac00b82c0a0ca840b6/inst/extdata/data.vcf.gz -------------------------------------------------------------------------------- /inst/extdata/data.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MRCIEU/gwasvcf/820267653ac7720926a13cac00b82c0a0ca840b6/inst/extdata/data.vcf.gz.tbi -------------------------------------------------------------------------------- /inst/extdata/eur.bed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MRCIEU/gwasvcf/820267653ac7720926a13cac00b82c0a0ca840b6/inst/extdata/eur.bed -------------------------------------------------------------------------------- /inst/extdata/eur.fam: -------------------------------------------------------------------------------- 1 | HG00097 HG00097 0 0 0 -9 2 | HG00099 HG00099 0 0 0 -9 3 | HG00100 HG00100 0 0 0 -9 4 | HG00101 HG00101 0 0 0 -9 5 | HG00102 HG00102 0 0 0 -9 6 | HG00103 HG00103 0 0 0 -9 7 | HG00105 HG00105 0 0 0 -9 8 | HG00106 HG00106 0 0 0 -9 9 | HG00107 HG00107 0 0 0 -9 10 | HG00108 HG00108 0 0 0 -9 11 | HG00109 HG00109 0 0 0 -9 12 | HG00110 HG00110 0 0 0 -9 13 | HG00111 HG00111 0 0 0 -9 14 | HG00112 HG00112 0 0 0 -9 15 | HG00113 HG00113 0 0 0 -9 16 | HG00114 HG00114 0 0 0 -9 17 | HG00115 HG00115 0 0 0 -9 18 | HG00116 HG00116 0 0 0 -9 19 | HG00117 HG00117 0 0 0 -9 
20 | HG00118 HG00118 0 0 0 -9 21 | HG00119 HG00119 0 0 0 -9 22 | HG00120 HG00120 0 0 0 -9 23 | HG00121 HG00121 0 0 0 -9 24 | HG00122 HG00122 0 0 0 -9 25 | HG00123 HG00123 0 0 0 -9 26 | HG00125 HG00125 0 0 0 -9 27 | HG00126 HG00126 0 0 0 -9 28 | HG00127 HG00127 0 0 0 -9 29 | HG00128 HG00128 0 0 0 -9 30 | HG00129 HG00129 0 0 0 -9 31 | HG00130 HG00130 0 0 0 -9 32 | HG00131 HG00131 0 0 0 -9 33 | HG00132 HG00132 0 0 0 -9 34 | HG00133 HG00133 0 0 0 -9 35 | HG00136 HG00136 0 0 0 -9 36 | HG00137 HG00137 0 0 0 -9 37 | HG00138 HG00138 0 0 0 -9 38 | HG00139 HG00139 0 0 0 -9 39 | HG00140 HG00140 0 0 0 -9 40 | HG00141 HG00141 0 0 0 -9 41 | HG00142 HG00142 0 0 0 -9 42 | HG00143 HG00143 0 0 0 -9 43 | HG00145 HG00145 0 0 0 -9 44 | HG00146 HG00146 0 0 0 -9 45 | HG00148 HG00148 0 0 0 -9 46 | HG00149 HG00149 0 0 0 -9 47 | HG00150 HG00150 0 0 0 -9 48 | HG00151 HG00151 0 0 0 -9 49 | HG00154 HG00154 0 0 0 -9 50 | HG00155 HG00155 0 0 0 -9 51 | HG00157 HG00157 0 0 0 -9 52 | HG00158 HG00158 0 0 0 -9 53 | HG00159 HG00159 0 0 0 -9 54 | HG00160 HG00160 0 0 0 -9 55 | HG00171 HG00171 0 0 0 -9 56 | HG00173 HG00173 0 0 0 -9 57 | HG00174 HG00174 0 0 0 -9 58 | HG00176 HG00176 0 0 0 -9 59 | HG00177 HG00177 0 0 0 -9 60 | HG00178 HG00178 0 0 0 -9 61 | HG00179 HG00179 0 0 0 -9 62 | HG00180 HG00180 0 0 0 -9 63 | HG00181 HG00181 0 0 0 -9 64 | HG00182 HG00182 0 0 0 -9 65 | HG00183 HG00183 0 0 0 -9 66 | HG00185 HG00185 0 0 0 -9 67 | HG00186 HG00186 0 0 0 -9 68 | HG00187 HG00187 0 0 0 -9 69 | HG00188 HG00188 0 0 0 -9 70 | HG00189 HG00189 0 0 0 -9 71 | HG00190 HG00190 0 0 0 -9 72 | HG00231 HG00231 0 0 0 -9 73 | HG00232 HG00232 0 0 0 -9 74 | HG00233 HG00233 0 0 0 -9 75 | HG00234 HG00234 0 0 0 -9 76 | HG00235 HG00235 0 0 0 -9 77 | HG00236 HG00236 0 0 0 -9 78 | HG00237 HG00237 0 0 0 -9 79 | HG00238 HG00238 0 0 0 -9 80 | HG00239 HG00239 0 0 0 -9 81 | HG00240 HG00240 0 0 0 -9 82 | HG00242 HG00242 0 0 0 -9 83 | HG00243 HG00243 0 0 0 -9 84 | HG00244 HG00244 0 0 0 -9 85 | HG00245 HG00245 0 0 0 -9 86 | HG00246 
HG00246 0 0 0 -9 87 | HG00250 HG00250 0 0 0 -9 88 | HG00251 HG00251 0 0 0 -9 89 | HG00252 HG00252 0 0 0 -9 90 | HG00253 HG00253 0 0 0 -9 91 | HG00254 HG00254 0 0 0 -9 92 | HG00255 HG00255 0 0 0 -9 93 | HG00256 HG00256 0 0 0 -9 94 | HG00257 HG00257 0 0 0 -9 95 | HG00258 HG00258 0 0 0 -9 96 | HG00259 HG00259 0 0 0 -9 97 | HG00260 HG00260 0 0 0 -9 98 | HG00261 HG00261 0 0 0 -9 99 | HG00262 HG00262 0 0 0 -9 100 | HG00263 HG00263 0 0 0 -9 101 | HG00264 HG00264 0 0 0 -9 102 | HG00265 HG00265 0 0 0 -9 103 | HG00266 HG00266 0 0 0 -9 104 | HG00267 HG00267 0 0 0 -9 105 | HG00268 HG00268 0 0 0 -9 106 | HG00269 HG00269 0 0 0 -9 107 | HG00271 HG00271 0 0 0 -9 108 | HG00272 HG00272 0 0 0 -9 109 | HG00273 HG00273 0 0 0 -9 110 | HG00274 HG00274 0 0 0 -9 111 | HG00275 HG00275 0 0 0 -9 112 | HG00276 HG00276 0 0 0 -9 113 | HG00277 HG00277 0 0 0 -9 114 | HG00278 HG00278 0 0 0 -9 115 | HG00280 HG00280 0 0 0 -9 116 | HG00281 HG00281 0 0 0 -9 117 | HG00282 HG00282 0 0 0 -9 118 | HG00284 HG00284 0 0 0 -9 119 | HG00285 HG00285 0 0 0 -9 120 | HG00288 HG00288 0 0 0 -9 121 | HG00290 HG00290 0 0 0 -9 122 | HG00304 HG00304 0 0 0 -9 123 | HG00306 HG00306 0 0 0 -9 124 | HG00308 HG00308 0 0 0 -9 125 | HG00309 HG00309 0 0 0 -9 126 | HG00310 HG00310 0 0 0 -9 127 | HG00311 HG00311 0 0 0 -9 128 | HG00313 HG00313 0 0 0 -9 129 | HG00315 HG00315 0 0 0 -9 130 | HG00318 HG00318 0 0 0 -9 131 | HG00319 HG00319 0 0 0 -9 132 | HG00320 HG00320 0 0 0 -9 133 | HG00321 HG00321 0 0 0 -9 134 | HG00323 HG00323 0 0 0 -9 135 | HG00324 HG00324 0 0 0 -9 136 | HG00325 HG00325 0 0 0 -9 137 | HG00326 HG00326 0 0 0 -9 138 | HG00327 HG00327 0 0 0 -9 139 | HG00328 HG00328 0 0 0 -9 140 | HG00329 HG00329 0 0 0 -9 141 | HG00330 HG00330 0 0 0 -9 142 | HG00331 HG00331 0 0 0 -9 143 | HG00332 HG00332 0 0 0 -9 144 | HG00334 HG00334 0 0 0 -9 145 | HG00335 HG00335 0 0 0 -9 146 | HG00336 HG00336 0 0 0 -9 147 | HG00337 HG00337 0 0 0 -9 148 | HG00338 HG00338 0 0 0 -9 149 | HG00339 HG00339 0 0 0 -9 150 | HG00341 HG00341 0 0 0 -9 151 | 
HG00342 HG00342 0 0 0 -9 152 | HG00343 HG00343 0 0 0 -9 153 | HG00344 HG00344 0 0 0 -9 154 | HG00345 HG00345 0 0 0 -9 155 | HG00346 HG00346 0 0 0 -9 156 | HG00349 HG00349 0 0 0 -9 157 | HG00350 HG00350 0 0 0 -9 158 | HG00351 HG00351 0 0 0 -9 159 | HG00353 HG00353 0 0 0 -9 160 | HG00355 HG00355 0 0 0 -9 161 | HG00356 HG00356 0 0 0 -9 162 | HG00357 HG00357 0 0 0 -9 163 | HG00358 HG00358 0 0 0 -9 164 | HG00360 HG00360 0 0 0 -9 165 | HG00361 HG00361 0 0 0 -9 166 | HG00362 HG00362 0 0 0 -9 167 | HG00364 HG00364 0 0 0 -9 168 | HG00365 HG00365 0 0 0 -9 169 | HG00366 HG00366 0 0 0 -9 170 | HG00367 HG00367 0 0 0 -9 171 | HG00368 HG00368 0 0 0 -9 172 | HG00369 HG00369 0 0 0 -9 173 | HG00371 HG00371 0 0 0 -9 174 | HG00372 HG00372 0 0 0 -9 175 | HG00373 HG00373 0 0 0 -9 176 | HG00375 HG00375 0 0 0 -9 177 | HG00376 HG00376 0 0 0 -9 178 | HG00378 HG00378 0 0 0 -9 179 | HG00379 HG00379 0 0 0 -9 180 | HG00380 HG00380 0 0 0 -9 181 | HG00381 HG00381 0 0 0 -9 182 | HG00382 HG00382 0 0 0 -9 183 | HG00383 HG00383 0 0 0 -9 184 | HG00384 HG00384 0 0 0 -9 185 | HG01334 HG01334 0 0 0 -9 186 | HG01500 HG01500 0 0 0 -9 187 | HG01501 HG01501 0 0 0 -9 188 | HG01503 HG01503 0 0 0 -9 189 | HG01504 HG01504 0 0 0 -9 190 | HG01506 HG01506 0 0 0 -9 191 | HG01507 HG01507 0 0 0 -9 192 | HG01509 HG01509 0 0 0 -9 193 | HG01510 HG01510 0 0 0 -9 194 | HG01512 HG01512 0 0 0 -9 195 | HG01513 HG01513 0 0 0 -9 196 | HG01515 HG01515 0 0 0 -9 197 | HG01516 HG01516 0 0 0 -9 198 | HG01518 HG01518 0 0 0 -9 199 | HG01519 HG01519 0 0 0 -9 200 | HG01521 HG01521 0 0 0 -9 201 | HG01522 HG01522 0 0 0 -9 202 | HG01524 HG01524 0 0 0 -9 203 | HG01525 HG01525 0 0 0 -9 204 | HG01527 HG01527 0 0 0 -9 205 | HG01528 HG01528 0 0 0 -9 206 | HG01530 HG01530 0 0 0 -9 207 | HG01531 HG01531 0 0 0 -9 208 | HG01536 HG01536 0 0 0 -9 209 | HG01537 HG01537 0 0 0 -9 210 | HG01602 HG01602 0 0 0 -9 211 | HG01603 HG01603 0 0 0 -9 212 | HG01605 HG01605 0 0 0 -9 213 | HG01606 HG01606 0 0 0 -9 214 | HG01607 HG01607 0 0 0 -9 215 | HG01608 HG01608 
0 0 0 -9 216 | HG01610 HG01610 0 0 0 -9 217 | HG01612 HG01612 0 0 0 -9 218 | HG01613 HG01613 0 0 0 -9 219 | HG01615 HG01615 0 0 0 -9 220 | HG01617 HG01617 0 0 0 -9 221 | HG01618 HG01618 0 0 0 -9 222 | HG01619 HG01619 0 0 0 -9 223 | HG01620 HG01620 0 0 0 -9 224 | HG01623 HG01623 0 0 0 -9 225 | HG01624 HG01624 0 0 0 -9 226 | HG01625 HG01625 0 0 0 -9 227 | HG01626 HG01626 0 0 0 -9 228 | HG01628 HG01628 0 0 0 -9 229 | HG01630 HG01630 0 0 0 -9 230 | HG01631 HG01631 0 0 0 -9 231 | HG01632 HG01632 0 0 0 -9 232 | HG01668 HG01668 0 0 0 -9 233 | HG01669 HG01669 0 0 0 -9 234 | HG01670 HG01670 0 0 0 -9 235 | HG01672 HG01672 0 0 0 -9 236 | HG01673 HG01673 0 0 0 -9 237 | HG01675 HG01675 0 0 0 -9 238 | HG01676 HG01676 0 0 0 -9 239 | HG01678 HG01678 0 0 0 -9 240 | HG01679 HG01679 0 0 0 -9 241 | HG01680 HG01680 0 0 0 -9 242 | HG01682 HG01682 0 0 0 -9 243 | HG01684 HG01684 0 0 0 -9 244 | HG01685 HG01685 0 0 0 -9 245 | HG01686 HG01686 0 0 0 -9 246 | HG01694 HG01694 0 0 0 -9 247 | HG01695 HG01695 0 0 0 -9 248 | HG01697 HG01697 0 0 0 -9 249 | HG01699 HG01699 0 0 0 -9 250 | HG01700 HG01700 0 0 0 -9 251 | HG01702 HG01702 0 0 0 -9 252 | HG01704 HG01704 0 0 0 -9 253 | HG01705 HG01705 0 0 0 -9 254 | HG01707 HG01707 0 0 0 -9 255 | HG01708 HG01708 0 0 0 -9 256 | HG01709 HG01709 0 0 0 -9 257 | HG01710 HG01710 0 0 0 -9 258 | HG01746 HG01746 0 0 0 -9 259 | HG01747 HG01747 0 0 0 -9 260 | HG01756 HG01756 0 0 0 -9 261 | HG01757 HG01757 0 0 0 -9 262 | HG01761 HG01761 0 0 0 -9 263 | HG01762 HG01762 0 0 0 -9 264 | HG01765 HG01765 0 0 0 -9 265 | HG01766 HG01766 0 0 0 -9 266 | HG01767 HG01767 0 0 0 -9 267 | HG01768 HG01768 0 0 0 -9 268 | HG01770 HG01770 0 0 0 -9 269 | HG01771 HG01771 0 0 0 -9 270 | HG01773 HG01773 0 0 0 -9 271 | HG01775 HG01775 0 0 0 -9 272 | HG01776 HG01776 0 0 0 -9 273 | HG01777 HG01777 0 0 0 -9 274 | HG01779 HG01779 0 0 0 -9 275 | HG01781 HG01781 0 0 0 -9 276 | HG01783 HG01783 0 0 0 -9 277 | HG01784 HG01784 0 0 0 -9 278 | HG01785 HG01785 0 0 0 -9 279 | HG01786 HG01786 0 0 0 -9 280 | 
HG01789 HG01789 0 0 0 -9 281 | HG01790 HG01790 0 0 0 -9 282 | HG01791 HG01791 0 0 0 -9 283 | HG02215 HG02215 0 0 0 -9 284 | HG02219 HG02219 0 0 0 -9 285 | HG02220 HG02220 0 0 0 -9 286 | HG02221 HG02221 0 0 0 -9 287 | HG02223 HG02223 0 0 0 -9 288 | HG02224 HG02224 0 0 0 -9 289 | HG02230 HG02230 0 0 0 -9 290 | HG02231 HG02231 0 0 0 -9 291 | HG02232 HG02232 0 0 0 -9 292 | HG02233 HG02233 0 0 0 -9 293 | HG02235 HG02235 0 0 0 -9 294 | HG02236 HG02236 0 0 0 -9 295 | HG02238 HG02238 0 0 0 -9 296 | HG02239 HG02239 0 0 0 -9 297 | NA06984 NA06984 0 0 0 -9 298 | NA06985 NA06985 0 0 0 -9 299 | NA06986 NA06986 0 0 0 -9 300 | NA06989 NA06989 0 0 0 -9 301 | NA06994 NA06994 0 0 0 -9 302 | NA07000 NA07000 0 0 0 -9 303 | NA07037 NA07037 0 0 0 -9 304 | NA07048 NA07048 0 0 0 -9 305 | NA07051 NA07051 0 0 0 -9 306 | NA07056 NA07056 0 0 0 -9 307 | NA07347 NA07347 0 0 0 -9 308 | NA07357 NA07357 0 0 0 -9 309 | NA10847 NA10847 0 0 0 -9 310 | NA10851 NA10851 0 0 0 -9 311 | NA11829 NA11829 0 0 0 -9 312 | NA11830 NA11830 0 0 0 -9 313 | NA11831 NA11831 0 0 0 -9 314 | NA11832 NA11832 0 0 0 -9 315 | NA11840 NA11840 0 0 0 -9 316 | NA11843 NA11843 0 0 0 -9 317 | NA11881 NA11881 0 0 0 -9 318 | NA11892 NA11892 0 0 0 -9 319 | NA11893 NA11893 0 0 0 -9 320 | NA11894 NA11894 0 0 0 -9 321 | NA11918 NA11918 0 0 0 -9 322 | NA11919 NA11919 0 0 0 -9 323 | NA11920 NA11920 0 0 0 -9 324 | NA11930 NA11930 0 0 0 -9 325 | NA11931 NA11931 0 0 0 -9 326 | NA11932 NA11932 0 0 0 -9 327 | NA11933 NA11933 0 0 0 -9 328 | NA11992 NA11992 0 0 0 -9 329 | NA11994 NA11994 0 0 0 -9 330 | NA11995 NA11995 0 0 0 -9 331 | NA12003 NA12003 0 0 0 -9 332 | NA12004 NA12004 0 0 0 -9 333 | NA12005 NA12005 0 0 0 -9 334 | NA12006 NA12006 0 0 0 -9 335 | NA12043 NA12043 0 0 0 -9 336 | NA12044 NA12044 0 0 0 -9 337 | NA12045 NA12045 0 0 0 -9 338 | NA12046 NA12046 0 0 0 -9 339 | NA12058 NA12058 0 0 0 -9 340 | NA12144 NA12144 0 0 0 -9 341 | NA12154 NA12154 0 0 0 -9 342 | NA12155 NA12155 0 0 0 -9 343 | NA12156 NA12156 0 0 0 -9 344 | NA12234 NA12234 
0 0 0 -9 345 | NA12249 NA12249 0 0 0 -9 346 | NA12272 NA12272 0 0 0 -9 347 | NA12273 NA12273 0 0 0 -9 348 | NA12275 NA12275 0 0 0 -9 349 | NA12282 NA12282 0 0 0 -9 350 | NA12283 NA12283 0 0 0 -9 351 | NA12286 NA12286 0 0 0 -9 352 | NA12287 NA12287 0 0 0 -9 353 | NA12340 NA12340 0 0 0 -9 354 | NA12341 NA12341 0 0 0 -9 355 | NA12342 NA12342 0 0 0 -9 356 | NA12347 NA12347 0 0 0 -9 357 | NA12348 NA12348 0 0 0 -9 358 | NA12383 NA12383 0 0 0 -9 359 | NA12399 NA12399 0 0 0 -9 360 | NA12400 NA12400 0 0 0 -9 361 | NA12413 NA12413 0 0 0 -9 362 | NA12414 NA12414 0 0 0 -9 363 | NA12489 NA12489 0 0 0 -9 364 | NA12546 NA12546 0 0 0 -9 365 | NA12716 NA12716 0 0 0 -9 366 | NA12717 NA12717 0 0 0 -9 367 | NA12718 NA12718 0 0 0 -9 368 | NA12748 NA12748 0 0 0 -9 369 | NA12749 NA12749 0 0 0 -9 370 | NA12750 NA12750 0 0 0 -9 371 | NA12751 NA12751 0 0 0 -9 372 | NA12760 NA12760 0 0 0 -9 373 | NA12761 NA12761 0 0 0 -9 374 | NA12762 NA12762 0 0 0 -9 375 | NA12763 NA12763 0 0 0 -9 376 | NA12775 NA12775 0 0 0 -9 377 | NA12776 NA12776 0 0 0 -9 378 | NA12777 NA12777 0 0 0 -9 379 | NA12778 NA12778 0 0 0 -9 380 | NA12812 NA12812 0 0 0 -9 381 | NA12813 NA12813 0 0 0 -9 382 | NA12814 NA12814 0 0 0 -9 383 | NA12815 NA12815 0 0 0 -9 384 | NA12827 NA12827 0 0 0 -9 385 | NA12828 NA12828 0 0 0 -9 386 | NA12829 NA12829 0 0 0 -9 387 | NA12830 NA12830 0 0 0 -9 388 | NA12842 NA12842 0 0 0 -9 389 | NA12843 NA12843 0 0 0 -9 390 | NA12872 NA12872 0 0 0 -9 391 | NA12873 NA12873 0 0 0 -9 392 | NA12874 NA12874 0 0 0 -9 393 | NA12878 NA12878 0 0 0 -9 394 | NA12889 NA12889 0 0 0 -9 395 | NA12890 NA12890 0 0 0 -9 396 | NA20502 NA20502 0 0 0 -9 397 | NA20503 NA20503 0 0 0 -9 398 | NA20504 NA20504 0 0 0 -9 399 | NA20505 NA20505 0 0 0 -9 400 | NA20506 NA20506 0 0 0 -9 401 | NA20507 NA20507 0 0 0 -9 402 | NA20508 NA20508 0 0 0 -9 403 | NA20509 NA20509 0 0 0 -9 404 | NA20510 NA20510 0 0 0 -9 405 | NA20511 NA20511 0 0 0 -9 406 | NA20512 NA20512 0 0 0 -9 407 | NA20513 NA20513 0 0 0 -9 408 | NA20514 NA20514 0 0 0 -9 409 | 
NA20515 NA20515 0 0 0 -9 410 | NA20516 NA20516 0 0 0 -9 411 | NA20517 NA20517 0 0 0 -9 412 | NA20518 NA20518 0 0 0 -9 413 | NA20519 NA20519 0 0 0 -9 414 | NA20520 NA20520 0 0 0 -9 415 | NA20521 NA20521 0 0 0 -9 416 | NA20522 NA20522 0 0 0 -9 417 | NA20524 NA20524 0 0 0 -9 418 | NA20525 NA20525 0 0 0 -9 419 | NA20527 NA20527 0 0 0 -9 420 | NA20528 NA20528 0 0 0 -9 421 | NA20529 NA20529 0 0 0 -9 422 | NA20530 NA20530 0 0 0 -9 423 | NA20531 NA20531 0 0 0 -9 424 | NA20532 NA20532 0 0 0 -9 425 | NA20533 NA20533 0 0 0 -9 426 | NA20534 NA20534 0 0 0 -9 427 | NA20535 NA20535 0 0 0 -9 428 | NA20536 NA20536 0 0 0 -9 429 | NA20538 NA20538 0 0 0 -9 430 | NA20539 NA20539 0 0 0 -9 431 | NA20540 NA20540 0 0 0 -9 432 | NA20541 NA20541 0 0 0 -9 433 | NA20542 NA20542 0 0 0 -9 434 | NA20543 NA20543 0 0 0 -9 435 | NA20544 NA20544 0 0 0 -9 436 | NA20581 NA20581 0 0 0 -9 437 | NA20582 NA20582 0 0 0 -9 438 | NA20585 NA20585 0 0 0 -9 439 | NA20586 NA20586 0 0 0 -9 440 | NA20587 NA20587 0 0 0 -9 441 | NA20588 NA20588 0 0 0 -9 442 | NA20589 NA20589 0 0 0 -9 443 | NA20752 NA20752 0 0 0 -9 444 | NA20753 NA20753 0 0 0 -9 445 | NA20754 NA20754 0 0 0 -9 446 | NA20755 NA20755 0 0 0 -9 447 | NA20756 NA20756 0 0 0 -9 448 | NA20757 NA20757 0 0 0 -9 449 | NA20758 NA20758 0 0 0 -9 450 | NA20759 NA20759 0 0 0 -9 451 | NA20760 NA20760 0 0 0 -9 452 | NA20761 NA20761 0 0 0 -9 453 | NA20762 NA20762 0 0 0 -9 454 | NA20763 NA20763 0 0 0 -9 455 | NA20764 NA20764 0 0 0 -9 456 | NA20765 NA20765 0 0 0 -9 457 | NA20766 NA20766 0 0 0 -9 458 | NA20767 NA20767 0 0 0 -9 459 | NA20768 NA20768 0 0 0 -9 460 | NA20769 NA20769 0 0 0 -9 461 | NA20770 NA20770 0 0 0 -9 462 | NA20771 NA20771 0 0 0 -9 463 | NA20772 NA20772 0 0 0 -9 464 | NA20773 NA20773 0 0 0 -9 465 | NA20774 NA20774 0 0 0 -9 466 | NA20775 NA20775 0 0 0 -9 467 | NA20778 NA20778 0 0 0 -9 468 | NA20783 NA20783 0 0 0 -9 469 | NA20785 NA20785 0 0 0 -9 470 | NA20786 NA20786 0 0 0 -9 471 | NA20787 NA20787 0 0 0 -9 472 | NA20790 NA20790 0 0 0 -9 473 | NA20792 NA20792 
0 0 0 -9 474 | NA20795 NA20795 0 0 0 -9 475 | NA20796 NA20796 0 0 0 -9 476 | NA20797 NA20797 0 0 0 -9 477 | NA20798 NA20798 0 0 0 -9 478 | NA20799 NA20799 0 0 0 -9 479 | NA20800 NA20800 0 0 0 -9 480 | NA20801 NA20801 0 0 0 -9 481 | NA20802 NA20802 0 0 0 -9 482 | NA20803 NA20803 0 0 0 -9 483 | NA20804 NA20804 0 0 0 -9 484 | NA20805 NA20805 0 0 0 -9 485 | NA20806 NA20806 0 0 0 -9 486 | NA20807 NA20807 0 0 0 -9 487 | NA20808 NA20808 0 0 0 -9 488 | NA20809 NA20809 0 0 0 -9 489 | NA20810 NA20810 0 0 0 -9 490 | NA20811 NA20811 0 0 0 -9 491 | NA20812 NA20812 0 0 0 -9 492 | NA20813 NA20813 0 0 0 -9 493 | NA20814 NA20814 0 0 0 -9 494 | NA20815 NA20815 0 0 0 -9 495 | NA20818 NA20818 0 0 0 -9 496 | NA20819 NA20819 0 0 0 -9 497 | NA20821 NA20821 0 0 0 -9 498 | NA20822 NA20822 0 0 0 -9 499 | NA20826 NA20826 0 0 0 -9 500 | NA20827 NA20827 0 0 0 -9 501 | NA20828 NA20828 0 0 0 -9 502 | NA20832 NA20832 0 0 0 -9 503 | -------------------------------------------------------------------------------- /inst/sandpit/bmi_example.r: -------------------------------------------------------------------------------- 1 | if(!require(gwasvcftools)) 2 | { 3 | if(!required(devtools)) install.packages("devtools") 4 | devtools::install_github("MRCIEU/gwasvcftools") 5 | } 6 | library(gwasvcftools) 7 | library(argparse) 8 | 9 | # create parser object 10 | parser <- ArgumentParser() 11 | parser$add_argument('--snplist', required=TRUE) 12 | parser$add_argument('--bcf-dir', required=TRUE) 13 | parser$add_argument('--gwas-id', required=TRUE) 14 | parser$add_argument('--out', required=TRUE) 15 | parser$add_argument('--bfile', required=TRUE) 16 | parser$add_argument('--get-proxies', action="store_true", default=FALSE) 17 | parser$add_argument('--vcf-ref', required=FALSE) 18 | parser$add_argument('--tag-r2', type="double", default=0.6) 19 | parser$add_argument('--tag-kb', type="double", default=5000) 20 | parser$add_argument('--tag-nsnp', type="double", default=5000) 21 | 
parser$add_argument('--palindrome-freq', type="double", default=0.4) 22 | parser$add_argument('--no-clean', action="store_true", default=FALSE) 23 | parser$add_argument('--rdsf-config', required=FALSE, default='') 24 | parser$add_argument('--instrument-list', required=FALSE) 25 | 26 | 27 | # args <- parser$parse_args() 28 | setwd("~/mr-eve/gwas-instrument-subsets/scripts") 29 | args <- parser$parse_args(c("--bfile", "../../vcf-reference-datasets/ukb/ukb_ref", "--gwas-id", "2", "--snplist", "temp1.txt", "--no-clean", "--out", "out", "--bcf-dir", "../../gwas-files", "--vcf-ref", "../../vcf-reference-datasets/1000g/1kg_v3_nomult.bcf", "--get-proxies")) 30 | print(args) 31 | tempname <- tempfile(pattern="extract", tmpdir=dirname(args[['out']])) 32 | bcf <- file.path(args[['bcf_dir']], args[['gwas_id']], "harmonised.bcf") 33 | snplist <- scan(args[['snplist']], what=character()) 34 | 35 | 36 | # Test Different proxy options 37 | 38 | o1 <- extract(bcf, snplist, tempname, "yes", args[["bfile"]], args[["vcf_ref"]]) 39 | dim(o1) 40 | o2 <- extract(bcf, snplist, tempname, "no", args[["bfile"]]) 41 | dim(o2) 42 | o3 <- extract(bcf, snplist, tempname, "only", args[["bfile"]], args[["vcf_ref"]]) 43 | dim(o3) 44 | 45 | 46 | # Check that proxies are correctly oriented 47 | # Expect to see that the proxies (o3) have effect sizes that strongly correlate with the true effect sizes (o2) 48 | 49 | a <- merge(o3, o2, by="ID") 50 | i <- a$ALT.x == a$ALT.y 51 | table(i) 52 | cor(a$B.x, a$B.y) 53 | plot(a$B.x, a$B.y) 54 | 55 | 56 | # Finally, check that the original elastic files are on the same strand as the harmonised data 57 | 58 | o <- fread("gunzip -c ../../gwas-files/2/elastic.gz", he=FALSE) 59 | temp <- merge(o3, o, by.x="ID", by.y="V1") 60 | dim(temp) 61 | i <- temp$REF != temp$V2 62 | table(i) 63 | cor(temp$B, temp$V5) 64 | temp$B[i] <- temp$B[i] * -1 65 | cor(temp$B, temp$V5) 66 | plot(temp$B, temp$V5) 67 | 68 | 69 | 
-------------------------------------------------------------------------------- /inst/sandpit/bmi_example_cp.r: -------------------------------------------------------------------------------- 1 | if(!require(gwasvcftools)) 2 | { 3 | if(!required(devtools)) install.packages("devtools") 4 | devtools::install_github("MRCIEU/gwasvcftools") 5 | } 6 | library(gwasvcftools) 7 | library(argparse) 8 | 9 | # create parser object 10 | parser <- ArgumentParser() 11 | parser$add_argument('--snplist', required=TRUE) 12 | parser$add_argument('--bcf-dir', required=TRUE) 13 | parser$add_argument('--gwas-id', required=TRUE) 14 | parser$add_argument('--out', required=TRUE) 15 | parser$add_argument('--bfile', required=TRUE) 16 | parser$add_argument('--get-proxies', action="store_true", default=FALSE) 17 | parser$add_argument('--vcf-ref', required=FALSE) 18 | parser$add_argument('--tag-r2', type="double", default=0.6) 19 | parser$add_argument('--tag-kb', type="double", default=5000) 20 | parser$add_argument('--tag-nsnp', type="double", default=5000) 21 | parser$add_argument('--palindrome-freq', type="double", default=0.4) 22 | parser$add_argument('--no-clean', action="store_true", default=FALSE) 23 | parser$add_argument('--rdsf-config', required=FALSE, default='') 24 | parser$add_argument('--instrument-list', required=FALSE) 25 | 26 | 27 | # args <- parser$parse_args() 28 | setwd("~/mr-eve/gwas-instrument-subsets/scripts") 29 | args <- parser$parse_args(c("--bfile", "../../vcf-reference-datasets/ukb/ukb_ref", "--gwas-id", "2", "--snplist", "temp2.txt", "--no-clean", "--out", "out", "--bcf-dir", "../../gwas-files", "--vcf-ref", "../../vcf-reference-datasets/1000g/1kg_v3_nomult.bcf", "--get-proxies")) 30 | print(args) 31 | tempname <- tempfile(pattern="extract", tmpdir=dirname(args[['out']])) 32 | bcf <- file.path(args[['bcf_dir']], args[['gwas_id']], "harmonised.bcf") 33 | snplist <- fread(args[['snplist']], header=FALSE, sep="\t") 34 | 35 | 36 | # Test Different proxy options 37 | 38 
| o1 <- extract(bcf, snplist, tempname, "yes", args[["bfile"]], args[["vcf_ref"]]) 39 | dim(o1) 40 | o2 <- extract(bcf, snplist, tempname, "no", args[["bfile"]]) 41 | dim(o2) 42 | o3 <- extract(bcf, snplist, tempname, "only", args[["bfile"]], args[["vcf_ref"]]) 43 | dim(o3) 44 | 45 | 46 | # Check that proxies are correctly oriented 47 | # Expect to see that the proxies (o3) have effect sizes that strongly correlate with the true effect sizes (o2) 48 | 49 | a <- merge(o3, o2, by="ID") 50 | i <- a$ALT.x == a$ALT.y 51 | table(i) 52 | cor(a$B.x, a$B.y) 53 | plot(a$B.x, a$B.y) 54 | 55 | 56 | # Finally, check that the original elastic files are on the same strand as the harmonised data 57 | 58 | o <- fread("gunzip -c ../../gwas-files/2/elastic.gz", he=FALSE) 59 | temp <- merge(o3, o, by.x="ID", by.y="V1") 60 | dim(temp) 61 | i <- temp$REF != temp$V2 62 | table(i) 63 | cor(temp$B, temp$V5) 64 | temp$B[i] <- temp$B[i] * -1 65 | cor(temp$B, temp$V5) 66 | plot(temp$B, temp$V5) 67 | 68 | 69 | -------------------------------------------------------------------------------- /inst/sandpit/harmonise_against_ref.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | suppressPackageStartupMessages({ 3 | library(argparse) 4 | library(dplyr) 5 | library(TwoSampleMR) 6 | # library(gwasvcftools) 7 | library(unixtools) 8 | library(devtools) 9 | load_all() 10 | }) 11 | 12 | # create parser object 13 | parser <- ArgumentParser() 14 | parser$add_argument('--ref_file', required=TRUE) 15 | parser$add_argument('--ref_build', required=TRUE) 16 | parser$add_argument('--ref_info', required=TRUE) 17 | parser$add_argument('--mrbase_id', required=TRUE) 18 | parser$add_argument('--gwas_file', required=TRUE) 19 | parser$add_argument('--gzipped', required=TRUE, type="integer", default=1) 20 | parser$add_argument('--delimiter', default="\t", required=TRUE) 21 | parser$add_argument('--skip', required=TRUE, type="integer", default=0) 22 | 
parser$add_argument('--dbsnp_field', type="integer", required=TRUE) 23 | parser$add_argument('--ea_field', type="integer", required=TRUE) 24 | parser$add_argument('--nea_field', type="integer", required=FALSE, default=0) 25 | parser$add_argument('--ea_af_field', type="integer", required=FALSE, default=0) 26 | parser$add_argument('--effect_field', type="integer", required=FALSE, default=0) 27 | parser$add_argument('--se_field', type="integer", required=FALSE, default=0) 28 | parser$add_argument('--pval_field', type="integer", required=FALSE, default=0) 29 | parser$add_argument('--n_field', type="integer", required=FALSE, default=0) 30 | parser$add_argument('--info_field', type="integer", required=FALSE, default=0) 31 | parser$add_argument('--z_field', type="integer", required=FALSE, default=0) 32 | parser$add_argument('--out_type', required=TRUE, default="bcf") 33 | parser$add_argument('--out', required=TRUE) 34 | args <- parser$parse_args() 35 | str(args) 36 | 37 | 38 | # Read in GWAS data 39 | set.tempdir("tmp") 40 | gwas <- read_gwas( 41 | args[["gwas_file"]], 42 | skip=args[["skip"]], 43 | snp=args[["dbsnp_field"]], 44 | gzipped=args[["gzipped"]], 45 | delimiter=args[["delimiter"]], 46 | ea=args[["ea_field"]], 47 | nea=args[["nea_field"]], 48 | ea_af=args[["ea_af_field"]], 49 | effect=args[["effect_field"]], 50 | se=args[["se_field"]], 51 | pval=args[["pval_field"]], 52 | n=args[["n_field"]], 53 | info=args[["info_field"]], 54 | z=args[["z_field"]] 55 | ) 56 | 57 | 58 | # Read in ref 59 | ref <- read_reference(args[["ref_file"]], gwas$SNP, args[["out"]]) 60 | 61 | # Harmonise 62 | harmonised <- harmonise_against_ref(gwas, ref) 63 | save(harmonised, file="temp.rdata") 64 | q() 65 | 66 | # Gather metadata 67 | metadata.input <- args 68 | names(metadata.input) <- paste0("input.", names(metadata.input)) 69 | 70 | ao <- TwoSampleMR::available_outcomes(NULL) 71 | metadata.gwas <- as.list(subset(ao, id == args[["mrbase_id"]])) 72 | metadata.gwas[['path']] <- NULL 73 | 
metadata.gwas[['filename']] <- NULL 74 | names(metadata.gwas) <- paste0("gwas.", names(metadata.gwas)) 75 | 76 | metadata.counts <- as.list(attr(harmonised, "log")) 77 | metadata.counts[['id.exposure']] <- NULL 78 | metadata.counts[['id.outcome']] <- NULL 79 | names(metadata.counts) <- paste0("counts.", names(metadata.counts)) 80 | 81 | metadata <- c(metadata.input, metadata.gwas, metadata.counts) 82 | str(metadata) 83 | 84 | # Create vcf format 85 | vcf <- gwasvcftools::make_vcf( 86 | ID = harmonised$ID, 87 | ALT = harmonised$ALT, 88 | REF = harmonised$REF, 89 | B = harmonised$BETA, 90 | SE = harmonised$SE, 91 | PVAL = harmonised$PVALUE, 92 | N = harmonised$N, 93 | CHROM = harmonised$CHROM, 94 | POS = harmonised$POS, 95 | AF = harmonised$AF, 96 | QUAL = harmonised$INFO, 97 | FILTER = rep("PASS", nrow(harmonised)), 98 | ZVALUE = harmonised$ZVALUE, 99 | build = args[["ref_build"]], 100 | meta_data = metadata 101 | ) 102 | 103 | # Write vcf 104 | gwasvcftools::write_vcf(vcf, paste0(args[["out"]], ".", args[["out_type"]])) 105 | 106 | -------------------------------------------------------------------------------- /inst/sandpit/misc/create_ref.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Download 1000 genomes 4 | 5 | wget ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz 6 | wget ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi 7 | wget ALL.chr10.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz 8 | wget ALL.chr10.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi 9 | wget ALL.chr11.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz 10 | wget ALL.chr11.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi 11 | wget ALL.chr12.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz 12 | wget ALL.chr12.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi 13 | wget 
ALL.chr13.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz 14 | wget ALL.chr13.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi 15 | wget ALL.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz 16 | wget ALL.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi 17 | wget ALL.chr15.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz 18 | wget ALL.chr15.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi 19 | wget ALL.chr16.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz 20 | wget ALL.chr16.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi 21 | wget ALL.chr17.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz 22 | wget ALL.chr17.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi 23 | wget ALL.chr18.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz 24 | wget ALL.chr18.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi 25 | wget ALL.chr19.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz 26 | wget ALL.chr19.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi 27 | wget ALL.chr2.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz 28 | wget ALL.chr2.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi 29 | wget ALL.chr20.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz 30 | wget ALL.chr20.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi 31 | wget ALL.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz 32 | wget ALL.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi 33 | wget ALL.chr22.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz 34 | wget ALL.chr22.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi 35 | wget ALL.chr3.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz 36 | wget 
ALL.chr3.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi 37 | wget ALL.chr4.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz 38 | wget ALL.chr4.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi 39 | wget ALL.chr5.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz 40 | wget ALL.chr5.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi 41 | wget ALL.chr6.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz 42 | wget ALL.chr6.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi 43 | wget ALL.chr7.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz 44 | wget ALL.chr7.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi 45 | wget ALL.chr8.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz 46 | wget ALL.chr8.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi 47 | wget ALL.chr9.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz 48 | wget ALL.chr9.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi 49 | wget ALL.chrMT.phase3_callmom-v0_4.20130502.genotypes.vcf.gz 50 | wget ALL.chrMT.phase3_callmom-v0_4.20130502.genotypes.vcf.gz.tbi 51 | wget ALL.chrX.phase3_shapeit2_mvncall_integrated_v1b.20130502.genotypes.vcf.gz 52 | wget ALL.chrX.phase3_shapeit2_mvncall_integrated_v1b.20130502.genotypes.vcf.gz.tbi 53 | wget ALL.chrY.phase3_integrated_v2a.20130502.genotypes.vcf.gz 54 | wget ALL.chrY.phase3_integrated_v2a.20130502.genotypes.vcf.gz.tbi 55 | 56 | 57 | for f in *vcf.gz 58 | do 59 | echo $f 60 | i=`echo $f | cut -d "." -f 2` 61 | echo $i 62 | bcftools view -G -Ob $f > 1kg_v3_$i.bcf 63 | bcftools index 1kg_v3_$i.bcf 64 | done 65 | 66 | bcftools concat 1kg_v3_*.bcf -Ob > 1kg_v3.bcf 67 | 68 | # Multi allelic SNPs are problematic for harmonisation because we would have to update TwoSampleMR functions to allow this. 
69 | # Most GWASs drop multi-allelic SNPs so we can probably try to ignore them, though it remains to be seen how much we lose due to this strategy 70 | # Split and keep first biallelic version of each variant only 71 | # Is there a better way to handle multi-allelic SNPs? 72 | bcftools norm -m- 1kg_v3.bcf | bcftools norm -d all -Ob > 1kg_v3_nomult.bcf 73 | 74 | # Note we can also use dbSNP which has ALT allele frequencies 75 | # ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/All_20180418.vcf.gz 76 | 77 | # Or dbSNP filter of only common variants 78 | # ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/common_all_20180418.vcf.gz 79 | # Though this is slightly problematic because they have a CAF column instead of a standard AF column, 80 | # so handling multiallelic SNPs won't automatically handle this column 81 | -------------------------------------------------------------------------------- /inst/sandpit/misc/harmonise.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | library(TwoSampleMR) 4 | library(data.table) 5 | library(dplyr) 6 | library(argparse) 7 | library(magrittr) 8 | 9 | parser <- ArgumentParser() 10 | parser$add_argument('--ref', required=TRUE) 11 | parser$add_argument('--gwas', required=FALSE) 12 | parser$add_argument('--out', required=TRUE) 13 | 14 | args <- parser$parse_args() 15 | 16 | 17 | 18 | # 1. Read in the GWAS 19 | # 2. Read in the reference data 20 | # 3. Harmonise GWAS against the reference 21 | # 4.
Write to VCF 22 | 23 | 24 | # Read the GWAS 25 | # Just assuming the format used for uploading to elastic 26 | gwas <- data.table::fread(paste0("gunzip -c ", args[["gwas"]])) 27 | names(gwas) <- c("snp_col", "ea_col", "oa_col", "eaf_col", "beta_col", "se_col", "pval_col", "ncontrol_col") 28 | # This is a continuous GWAS so no ncase column 29 | gwas$ncase_col <- NA 30 | 31 | 32 | # Read the reference 33 | ref <- data.table::fread(paste0("gunzip -c ", args[["ref"]])) 34 | stopifnot(c("CHROM", "ID", "REF", "ALT", "MAF", "POS") %in% names(ref)) 35 | 36 | # For simplicity just keeping SNP Ids that are in common 37 | ref <- subset(ref, ID %in% gwas$snp_col) 38 | 39 | # Put in some dummy variables for the reference for harmonising 40 | ref$beta <- 1 41 | ref$se <- 0.1 42 | ref$pval <- 0.1 43 | a <- TwoSampleMR::format_data( 44 | ref, 45 | type="exposure", 46 | snp_col="ID", 47 | effect_allele_col="ALT", 48 | other_allele_col="REF", 49 | eaf_col="MAF" 50 | ) 51 | 52 | b <- TwoSampleMR::format_data(gwas, type="outcome", 53 | snp_col="snp_col", 54 | beta_col="beta_col", 55 | se_col="se_col", 56 | effect_allele_col="ea_col", 57 | other_allele_col="oa_col", 58 | eaf_col="eaf_col", 59 | ncase_col="ncase_col", 60 | ncontrol_col="ncontrol_col", 61 | pval_col="pval_col" 62 | ) 63 | 64 | # Is the gwas on the forward strand? 
65 | action <- is_forward_strand(gwas$snp_col, gwas$ea_col, gwas$oa_col, ref$ID, ref$ALT, ref$REF) 66 | 67 | # Harmonise the gwas according to the reference panel 68 | ab <- TwoSampleMR::harmonise_data(a, b, action=action) 69 | 70 | gwas_h <- ab %$% 71 | dplyr::data_frame( 72 | ID=SNP, 73 | ALT=effect_allele.exposure, 74 | REF=other_allele.exposure, 75 | BETA=beta.outcome, 76 | SE=se.outcome, 77 | PVALUE=pval.outcome, 78 | AF=eaf.outcome, 79 | N=samplesize.outcome, 80 | NCASE=ncase.outcome, 81 | NCONTROL=ncontrol.outcome) %>% 82 | dplyr::inner_join(subset(ref, select=c(ID,REF,ALT,CHROM,POS,MAF)), by=c("ID", "REF", "ALT")) 83 | 84 | save(gwas_h, file=args[["out"]]) 85 | 86 | -------------------------------------------------------------------------------- /inst/sandpit/misc/harmonise_against_ref.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library(argparse)) 4 | suppressPackageStartupMessages(library(dplyr)) 5 | suppressPackageStartupMessages(library(TwoSampleMR)) 6 | suppressPackageStartupMessages(library(data.table)) 7 | suppressPackageStartupMessages(library(vcfR)) 8 | library(methods) 9 | library(utils) 10 | 11 | # create parser object 12 | parser <- ArgumentParser() 13 | 14 | parser$add_argument('--ref-file', required=TRUE) 15 | parser$add_argument('--ref-build', required=TRUE, default="b37") 16 | parser$add_argument('--gwas-file', required=TRUE) 17 | parser$add_argument('--gwas-header', required=TRUE, type="logical", default=FALSE) 18 | parser$add_argument('--gwas-snp', type="integer", required=TRUE) 19 | parser$add_argument('--gwas-ref', type="integer", required=FALSE) 20 | parser$add_argument('--gwas-alt', type="integer", required=TRUE) 21 | parser$add_argument('--gwas-af', type="integer", required=FALSE) 22 | parser$add_argument('--gwas-beta', type="integer", required=FALSE) 23 | parser$add_argument('--gwas-se', type="integer", required=FALSE) 24 | 
parser$add_argument('--gwas-pval', type="integer", required=FALSE) 25 | parser$add_argument('--gwas-n0', type="integer", required=FALSE) 26 | parser$add_argument('--gwas-n1', type="integer", required=FALSE) 27 | parser$add_argument('--out', required=TRUE) 28 | 29 | args <- parser$parse_args() 30 | 31 | print(args) 32 | 33 | read_dat <- function(filename, type, header, snp, ref, alt, af, beta, se, pval, n0, n1) 34 | { 35 | if(grepl("gz$", filename)) 36 | { 37 | dat <- data.table::fread(paste0("gunzip -c ", filename), header=header) 38 | } else { 39 | dat <- data.table::fread(filename, header=header) 40 | } 41 | nc <- ncol(dat) 42 | if(snp == 0) 43 | { 44 | dat[[paste0("V", ncol(dat)+1)]] <- rep(NA, nrow(dat)) 45 | snp <- ncol(dat) 46 | } 47 | if(ref == 0) 48 | { 49 | dat[[paste0("V", ncol(dat)+1)]] <- rep(NA, nrow(dat)) 50 | ref <- ncol(dat) 51 | } 52 | if(alt == 0) 53 | { 54 | dat[[paste0("V", ncol(dat)+1)]] <- rep(NA, nrow(dat)) 55 | alt <- ncol(dat) 56 | } 57 | if(af == 0) 58 | { 59 | dat[[paste0("V", ncol(dat)+1)]] <- rep(NA, nrow(dat)) 60 | af <- ncol(dat) 61 | } 62 | if(beta == 0) 63 | { 64 | dat[[paste0("V", ncol(dat)+1)]] <- rep(NA, nrow(dat)) 65 | beta <- ncol(dat) 66 | } 67 | if(se == 0) 68 | { 69 | dat[[paste0("V", ncol(dat)+1)]] <- rep(NA, nrow(dat)) 70 | se <- ncol(dat) 71 | } 72 | if(pval == 0) 73 | { 74 | dat[[paste0("V", ncol(dat)+1)]] <- rep(NA, nrow(dat)) 75 | pval <- ncol(dat) 76 | } 77 | if(n0 == 0) 78 | { 79 | dat[[paste0("V", ncol(dat)+1)]] <- rep(NA, nrow(dat)) 80 | n0 <- ncol(dat) 81 | } 82 | if(n1 == 0) 83 | { 84 | dat[[paste0("V", ncol(dat)+1)]] <- rep(NA, nrow(dat)) 85 | n1 <- ncol(dat) 86 | } 87 | 88 | o <- format_data( 89 | dat, 90 | type=type, 91 | phenotype_col=type, 92 | snp_col=names(dat)[snp], 93 | beta_col=names(dat)[beta], 94 | se_col=names(dat)[se], 95 | effect_allele_col=names(dat)[alt], 96 | other_allele_col=names(dat)[ref], 97 | eaf_col=names(dat)[af], 98 | pval_col=names(dat)[pval], 99 | ncase_col=names(dat)[n1], 100 | 
ncontrol_col=names(dat)[n0] 101 | ) 102 | return(o) 103 | } 104 | 105 | 106 | 107 | # Read in gwas data 108 | gwas <- read_dat( 109 | args[["gwas_file"]], 110 | type="outcome", 111 | header=args[["gwas_header"]], 112 | snp=args[["gwas_snp"]], 113 | ref=args[["gwas_ref"]], 114 | alt=args[["gwas_alt"]], 115 | af=args[["gwas_af"]], 116 | beta=args[["gwas_beta"]], 117 | se=args[["gwas_se"]], 118 | pval=args[["gwas_pval"]], 119 | n0=args[["gwas_n0"]], 120 | n1=args[["gwas_n1"]] 121 | ) 122 | 123 | 124 | # Read in ref 125 | 126 | ref <- data.table::fread(paste0("gunzip -c ", args[["ref_file"]])) 127 | stopifnot(all(c("CHROM", "ID", "REF", "ALT", "AF", "POS") %in% names(ref))) 128 | 129 | # For simplicity just keeping SNP Ids that are in common 130 | ref <- subset(ref, ID %in% gwas$SNP) 131 | 132 | # Put in some dummy variables for the reference for harmonising 133 | ref$beta <- 1 134 | ref$se <- 0.1 135 | ref$pval <- 0.1 136 | a <- TwoSampleMR::format_data( 137 | ref, 138 | type="exposure", 139 | snp_col="ID", 140 | effect_allele_col="ALT", 141 | other_allele_col="REF", 142 | eaf_col="AF" 143 | ) 144 | 145 | # Check strand 146 | action <- TwoSampleMR::is_forward_strand(gwas$SNP, gwas$effect_allele.outcome, gwas$other_allele.outcome, ref$ID, ref$ALT, ref$REF, threshold=0.9) 147 | 148 | # Harmonise 149 | dat <- TwoSampleMR::harmonise_data(a, gwas, action) 150 | 151 | 152 | gwas_h <- dat %$% 153 | dplyr::data_frame( 154 | ID=SNP, 155 | ALT=effect_allele.exposure, 156 | REF=other_allele.exposure, 157 | BETA=beta.outcome, 158 | SE=se.outcome, 159 | PVALUE=pval.outcome, 160 | AF=eaf.outcome, 161 | N=samplesize.outcome, 162 | NCASE=ncase.outcome, 163 | NCONTROL=ncontrol.outcome) %>% 164 | dplyr::inner_join(subset(ref, select=c(ID,REF,ALT,CHROM,POS)), by=c("ID", "REF", "ALT")) 165 | 166 | 167 | # Create vcf format 168 | vcf <- TwoSampleMR::make_vcf( 169 | ID = gwas_h$ID, 170 | ALT = gwas_h$ALT, 171 | REF = gwas_h$REF, 172 | B = gwas_h$BETA, 173 | SE = gwas_h$SE, 174 | PVAL = 
gwas_h$PVALUE, 175 | N0 = gwas_h$NCONTROL, 176 | N1 = gwas_h$NCASE, 177 | CHROM = gwas_h$CHROM, 178 | POS = gwas_h$POS, 179 | AF = gwas_h$AF, 180 | QUAL = rep(NA, nrow(gwas_h)), 181 | FILTER = rep("PASS", nrow(gwas_h)), 182 | build = args[["ref_build"]] 183 | ) 184 | 185 | # Write vcf 186 | TwoSampleMR::write_vcf(vcf, args[["out"]]) 187 | 188 | 189 | -------------------------------------------------------------------------------- /inst/sandpit/misc/query_times.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Timing for different methods to query vcf files 3 | author: Gibran Hemani 4 | date: 2/10/2019 5 | --- 6 | 7 | 8 | Setup 9 | 10 | ```{r} 11 | library(knitr) 12 | opts_chunk$set(warning=FALSE, message=FALSE, cache=TRUE) 13 | library(devtools) 14 | load_all() 15 | fn <- system.file("data","IEU-a-2.vcf.gz", package="gwasvcftools") 16 | chrompos <- "20:800000-4000000" 17 | pval <- 5e-8 18 | rsid <- c("rs3128126", "rs3121561", "rs3813193") 19 | ``` 20 | 21 | Reading in the vcf file 22 | 23 | ```{r} 24 | system.time({ 25 | v <- readVcf(fn) 26 | }) 27 | ``` 28 | 29 | Different methods for searching for rsid: 30 | 31 | ```{r} 32 | system.time({ 33 | query_rsid_vcf(rsid, v) 34 | }) 35 | system.time({ 36 | query_rsid_file(rsid, fn) 37 | }) 38 | system.time({ 39 | query_rsid_bcftools(rsid, fn) 40 | }) 41 | ``` 42 | 43 | Different methods for searching by p-value: 44 | 45 | ```{r} 46 | system.time({ 47 | query_pval_vcf(pval, v) 48 | }) 49 | system.time({ 50 | query_pval_file(pval, fn) 51 | }) 52 | system.time({ 53 | query_pval_bcftools(pval, fn) 54 | }) 55 | ``` 56 | 57 | Different methods for searching by chrompos: 58 | 59 | ```{r} 60 | system.time({ 61 | query_chrompos_vcf(chrompos, v) 62 | }) 63 | system.time({ 64 | query_chrompos_file(chrompos, fn) 65 | }) 66 | system.time({ 67 | query_chrompos_bcftools(chrompos, fn) 68 | }) 69 | ``` 70 | 
-------------------------------------------------------------------------------- /inst/sandpit/misc/skeleton.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | 5 | # BCF file with: 6 | # - correct REF and ALT alleles 7 | # - AF field in INFO that is the ALT allele frequency 8 | ref="../../../reference/1000g/1kg_v3_nomult.bcf" 9 | 10 | 11 | # GWAS file: 12 | gwas="~/mr-eve/gwas-instrument-subsets/studies/2/elastic.gz" 13 | 14 | 15 | # 0. Clean GWAS 16 | 17 | 18 | 19 | # STILL TO DO 20 | 21 | 22 | 23 | # 1. check for name merges against sqlite 24 | # This file has rs ID merges 25 | # https://www.ncbi.nlm.nih.gov/projects/SNP/snp_db_table_description.cgi?t=RsMergeArch 26 | # Step 1 is to update any rs IDs in the GWAS based on this file 27 | 28 | 29 | 30 | # STILL TO DO 31 | 32 | 33 | 34 | # 2. Convert any chr:pos SNPs in the GWAS to rs IDs 35 | # Do this by extracting variants that are missing rs IDs in GWAS 36 | # and find the rs ID in the reference 37 | # and update the GWAS file 38 | 39 | 40 | 41 | # STILL TO DO 42 | 43 | 44 | 45 | # 3. Get subset of reference in tab format 46 | 47 | 48 | gunzip -c elastic.gz | cut -f 1 > snplist.txt 49 | wc -l snplist.txt 50 | 51 | time bcftools view -i'ID=@temp' $ref | bcftools query -f'%CHROM\t%POS\t%ID\t%REF\t%ALT\t%AF\n' | sed '1 i\ 52 | CHROM\tPOS\tID\tREF\tALT\tAF 53 | ' | gzip -c > ref_extract.txt.gz 54 | 55 | 56 | 57 | # 4. Harmonise the GWAS file against the reference 58 | # This needs to retain indels so use the TwoSampleMR::harmonise_data 59 | # function. It will: 60 | # - switch effect alleles 61 | # - handle sequence coded indels 62 | # - convert D/I indels to sequence coding (as in the reference) 63 | # - check for forward strand and flip if necessary 64 | # - tries to harmonise with only effect allele if other allele not available 65 | # 4b. 
Write out to bcf format 66 | # After harmonising can use the TwoSampleMR::write_vcf function 67 | # It will create file based on extension and index. 68 | 69 | 70 | 71 | Rscript harmonise_against_ref.r \ 72 | --ref-file ref_extract.txt.gz \ 73 | --ref-build b37 \ 74 | --gwas-file $gwas \ 75 | --gwas-header FALSE \ 76 | --gwas-snp 1 \ 77 | --gwas-ref 3 \ 78 | --gwas-alt 2 \ 79 | --gwas-af 4 \ 80 | --gwas-beta 5 \ 81 | --gwas-se 6 \ 82 | --gwas-pval 7 \ 83 | --gwas-n0 8 \ 84 | --gwas-n1 NA \ 85 | --out harmonised.bcf 86 | 87 | 88 | 89 | # 5. Create report and json document of harmonising stats 90 | # This could be included above 91 | 92 | -------------------------------------------------------------------------------- /inst/sandpit/misc/vcf.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Using VCF to handle GWAS 3 | author: Gibran Hemani 4 | date: 18/11/2018 5 | --- 6 | 7 | ## Background 8 | 9 | Using VCF as a format for storing GWAS summary data. Possible benefits - 10 | 11 | * Well known standard format 12 | * Potentially smaller after compression 13 | * Very fast tools already exist 14 | * Standardised tools can run implicit checks 15 | * Indexing will help with regional lookups 16 | * Standardised way to represent variant info including indels and multi-allelic variants 17 | * Easy to update build 18 | 19 | 20 | ## Specification 21 | 22 | VCF has a detailed specification here [http://samtools.github.io/hts-specs/VCFv4.3.pdf](http://samtools.github.io/hts-specs/VCFv4.3.pdf). We need an agreed way to apply the specification to GWAS summary data. Current implementation: 23 | 24 | 1. Use only the first 8 fixed fields. 25 | 2. QUAL will be set to missing (.) unless an obvious way to use it can be identified. 26 | 3. ALT allele is always the effect allele. Ideally this is matched to a reference dataset. REF allele is always the non-effect allele 27 | 4.
For binary traits we want to store the number of cases and number of controls 28 | 5. For continuous traits we use 0 for number of cases, and number of controls is the total sample size 29 | 6. The INFO column will have fields describing the genetic association, as follows: 30 | * B, Type = Float, Description = Effect size estimate relative to the alternative allele(s) 31 | * SE, Type = Float, Description = Standard error of effect size estimate 32 | * P, Type = Float, Description = P-value for effect estimate 33 | * AF, Type = Float, Description = Alternate allele frequency 34 | * N1, Type = Integer, Description = Number of cases. 0 if continuous trait 35 | * N0, Type = Integer, Description = Number of controls. Total sample size if continuous trait 36 | 7. FILTER is always PASS unless the variant does not meet some QC parameter. 37 | 38 | The VCF header encapsulating this info will look like this: 39 | 40 | ``` 41 | ##INFO=<ID=B,Number=A,Type=Float,Description="Effect size estimate relative to the alternative allele(s)"> 42 | ##INFO=<ID=SE,Number=A,Type=Float,Description="Standard error of effect size estimate"> 43 | ##INFO=<ID=P,Number=A,Type=Float,Description="P-value for effect estimate"> 44 | ##INFO=<ID=AF,Number=A,Type=Float,Description="Alternate allele frequency"> 45 | ##INFO=<ID=N1,Number=A,Type=Integer,Description="Number of cases. 0 if continuous trait"> 46 | ##INFO=<ID=N0,Number=A,Type=Integer,Description="Number of controls. Total sample size if continuous trait"> 47 | ``` 48 | 49 | Missing values throughout are specified as ".", as standard for VCF. 50 | 51 | Custom annotations can be added, e.g. ##gwas=casecontrol, which are lowercase by convention 52 | 53 | ## Reference FASTA 54 | 55 | The reference fasta should be downloaded from the GATK bundle: 56 | [b38/hg38](https://console.cloud.google.com/storage/browser/genomics-public-data/resources/broad/hg38/v0) 57 | [b37/hg19](ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/b37) 58 | 59 | ## Generating the data 60 | 61 | To run the following, first clone the TwoSampleMR repository 62 | 63 | ```bash 64 | git clone git@github.com:MRCIEU/TwoSampleMR.git 65 | ``` 66 | 67 | Then navigate to here: 68 | 69 | ```bash 70 | cd TwoSampleMR/vignettes/vcf 71 | ``` 72 | 73 | 74 | ### Download some example datasets 75 | 76 | Start with two datasets, a reference (for example 1000 genomes) and a GWAS summary dataset (e.g. Locke et al 2015 BMI analysis).
First we will convert the GWAS dataset to be harmonised against the reference dataset 77 | 78 | Download the example GWAS dataset: 79 | 80 | ```{r engine='bash'} 81 | wget -q -O bmi.txt.gz https://www.dropbox.com/s/ph7in04w6dki2tv/bmi.txt.gz?dl=0 82 | gunzip -c bmi.txt.gz | head 83 | gunzip -c bmi.txt.gz | wc -l 84 | ``` 85 | 86 | Download the reference dataset: 87 | 88 | ```{r engine='bash'} 89 | wget -q -O ref.txt.gz https://www.dropbox.com/s/8vgg08zip2wkayk/ref.txt.gz?dl=0 90 | gunzip -c ref.txt.gz | head 91 | gunzip -c ref.txt.gz | wc -l 92 | ``` 93 | 94 | ### Harmonise the GWAS against the reference 95 | 96 | For simplicity I will just use the `harmonise_data` function in the `R/TwoSampleMR` package. This has limitations in that it throws away indels. The scripts that Denis is writing to harmonise against SNP-Base are going to be more appropriate, but this is just here for illustration. 97 | 98 | ```{r engine='bash'} 99 | Rscript harmonise.r --gwas bmi.txt.gz --ref ref.txt.gz --out harmonised.rdata 100 | ``` 101 | 102 | 103 | ### Create VCF files from the harmonised object 104 | 105 | Now that we have a file that has all the required columns: 106 | * CHROM 107 | * POS 108 | * ID (rs ID) 109 | * REF allele 110 | * ALT allele 111 | * BETA 112 | * SE 113 | * PVAL 114 | * NCASE 115 | * NCONTROL 116 | 117 | And they are all harmonised to a reference dataset, we can produce a vcf file using a couple of functions in the `TwoSampleMR` package 118 | 119 | ```{r} 120 | library(TwoSampleMR) 121 | library(dplyr) 122 | library(vcfR) 123 | library(methods) 124 | library(utils) 125 | 126 | # This loads in the harmonised object that we just created - `gwas_h` 127 | load("harmonised.rdata") 128 | str(gwas_h) 129 | 130 | vcf <- TwoSampleMR::make_vcf( 131 | ID = gwas_h$ID, 132 | ALT = gwas_h$ALT, 133 | REF = gwas_h$REF, 134 | B = gwas_h$BETA, 135 | SE = gwas_h$SE, 136 | PVAL = gwas_h$PVALUE, 137 | N0 = gwas_h$NCONTROL, 138 | N1 = gwas_h$NCASE, 139 | CHROM = gwas_h$CHROM, 140 | 
POS = gwas_h$POS, 141 | AF = gwas_h$MAF, 142 | QUAL = rep(NA, nrow(gwas_h)), 143 | FILTER = rep('PASS', nrow(gwas_h)), 144 | build = "b37" 145 | ) 146 | ``` 147 | 148 | We can see some basic stats about the file we just made using the `R/vcfR` package: 149 | 150 | 151 | ```{r} 152 | vcf 153 | ``` 154 | 155 | Finally, we can write the correctly formatted data to file: 156 | 157 | ```{r} 158 | TwoSampleMR::write_vcf(vcf, "bmi.vcf.gz") 159 | TwoSampleMR::write_vcf(vcf, "bmi.bcf") 160 | ``` 161 | 162 | ## Testing the VCF files 163 | 164 | All VCF files should be validated before use with [gatk](https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_variantutils_ValidateVariants.php). 165 | 166 | ```gatk ValidateVariants \ 167 | -R <reference.fasta> \ 168 | -V <input.vcf> \ 169 | --dbsnp <dbsnp.vcf> 170 | ``` 171 | 172 | We can use [bgzip](https://vcf.iobio.io/help.html), [tabix](https://vcf.iobio.io/help.html) and [bcftools](https://samtools.github.io/bcftools/) to work with VCF. 173 | 174 | Examples of how to compress and index (though the R functions previously run have done this already by calling these tools). 175 | 176 | Compress a `.vcf` with bgzip, then index: 177 | 178 | ```bash 179 | bgzip -c bmi.vcf > bmi.vcf.gz 180 | bcftools index bmi.vcf.gz 181 | ``` 182 | 183 | Convert to `.bcf` which is a binary version of the text file: 184 | 185 | ```bash 186 | bcftools view bmi.vcf.gz -Ob -o bmi.bcf 187 | bcftools index bmi.bcf 188 | ``` 189 | 190 | ### Compare the sizes 191 | 192 | ```{r engine='bash'} 193 | # bcf and index 194 | du -sh bmi.bcf bmi.bcf.csi 195 | ``` 196 | 197 | ```{r engine='bash'} 198 | # vcf.gz and index 199 | du -sh bmi.vcf.gz bmi.vcf.gz.csi 200 | ``` 201 | 202 | ```{r engine='bash'} 203 | # original gzip file 204 | du -sh bmi.txt.gz 205 | ``` 206 | 207 | The original gzip file is smallest, but it doesn't contain chromosome and position info.
Surprisingly, bcf format is almost double the size of the gzip format, and vcf.gz is somewhere in between. 208 | 209 | 210 | ### Speed to extract by p-value 211 | 212 | ```{r engine='bash'} 213 | # bcf 214 | time bcftools query -i'PVAL<5e-8' -f'%ID\n' bmi.bcf > extract.txt && wc -l extract.txt 215 | ``` 216 | 217 | ```{r engine='bash'} 218 | # vcf.gz 219 | time bcftools query -i'PVAL<5e-8' -f'%ID\n' bmi.vcf.gz > extract.txt && wc -l extract.txt 220 | ``` 221 | 222 | ```{r engine='bash'} 223 | # for comparison - original gzip file 224 | time gunzip -c bmi.txt.gz | awk -F '\t' '$7 < 5e-8 {print $1}' > extract.txt && wc -l extract.txt 225 | ``` 226 | 227 | Extracting using awk is very slow, bcf format is extremely fast, though how this compares to elastic is not clear. 228 | 229 | ### Speed to extract by rs ID 230 | 231 | ```{r engine='bash'} 232 | # bcf 233 | time bcftools view -i'ID=@extract.txt' -Ob bmi.bcf > extract.bcf 234 | ``` 235 | 236 | ```{r engine='bash'} 237 | # vcf.gz 238 | time bcftools view -i'ID=@extract.txt' -Oz bmi.vcf.gz > extract.vcf.gz 239 | ``` 240 | 241 | ```{r engine='bash'} 242 | # For comparison we can just try grepping from the original file. 
243 | # time zfgrep -wf extract.txt bmi.txt.gz | gzip -c > test.txt.gz 244 | ## Not running this because it takes several minutes 245 | ``` 246 | 247 | ### Speed to extract by chromosome and position 248 | 249 | Using chrom and position is even faster than extracting by rs ID 250 | 251 | ```{r engine='bash'} 252 | # Extract top hits again but save chrom and position 253 | time bcftools query -i'PVAL<5e-8' -f'%CHROM\t%POS\n' bmi.bcf > extract.txt && wc -l extract.txt 254 | ``` 255 | 256 | ```{r engine='bash'} 257 | # extract from bcf 258 | time bcftools filter -R extract.txt bmi.bcf > extract.bcf 259 | ``` 260 | 261 | ```{r engine='bash'} 262 | # extract from vcf.gz 263 | time bcftools filter -R extract.txt bmi.vcf.gz > extract.bcf 264 | ``` 265 | 266 | 267 | ## Create format used for elastic 268 | 269 | This is the tab delimited file being uploaded to elastic search db: 270 | 271 | ```{r engine='bash'} 272 | time bcftools query -f'%ID\t%ALT\t%REF\t%AF\t%B\t%SE\t%PVAL\t%N1\t%N0\n' bmi.bcf | sed 's@\t\.@\t@g' | grep -v '$\.' > elastic.txt 273 | head elastic.txt 274 | ``` 275 | 276 | Ideally would create it like this - 277 | - no alleles 278 | - no total sample size (N) 279 | - for case/control N1 and N0 are number of cases and number of controls 280 | - for continuous N1 is 0 and N0 is total sample size 281 | 282 | ```{r engine='bash'} 283 | time bcftools query -f'%ID\t%AF\t%B\t%SE\t%PVAL\t%N1\t%N0\n' bmi.bcf | sed 's@\t\.@\t@g' | grep -v '$\.' 
> elastic.txt 284 | head elastic.txt 285 | ``` 286 | 287 | 288 | 289 | -------------------------------------------------------------------------------- /inst/sandpit/test_extract.r: -------------------------------------------------------------------------------- 1 | load_all() 2 | s <- fread("~/mr-eve/mr-eve/instruments.txt") 3 | a <- extract( 4 | bcf="~/mr-eve/gwas-files/7/data.bcf", 5 | snplist=s, 6 | tempname="temp", 7 | proxies="yes", 8 | bfile="~/mr-eve/vcf-reference-datasets/1000g_filtered/data_maf0.01_rs_snps", 9 | vcf="~/mr-eve/vcf-reference-datasets/1000g/1kg_v3_nomult.bcf" 10 | ) 11 | 12 | a <- extract( 13 | bcf="~/mr-eve/gwas-files/7/data.bcf", 14 | snplist=s[[6]], 15 | tempname="temp", 16 | proxies="yes", 17 | bfile="~/mr-eve/vcf-reference-datasets/1000g_filtered/data_maf0.01_rs_snps", 18 | vcf="~/mr-eve/vcf-reference-datasets/1000g/1kg_v3_nomult.bcf" 19 | ) 20 | 21 | 22 | a <- extract( 23 | bcf="~/mr-eve/gwas-files/2/data.bcf", 24 | snplist=s[[6]], 25 | tempname="temp", 26 | proxies="yes", 27 | bfile="~/mr-eve/vcf-reference-datasets/1000g_filtered/data_maf0.01_rs_snps", 28 | vcf="~/mr-eve/vcf-reference-datasets/1000g/1kg_v3_nomult.bcf" 29 | ) 30 | 31 | a <- extract( 32 | bcf="~/mr-eve/gwas-files/2/data.bcf", 33 | snplist=s, 34 | tempname="temp", 35 | proxies="yes", 36 | bfile="~/mr-eve/vcf-reference-datasets/1000g_filtered/data_maf0.01_rs_snps", 37 | vcf="~/mr-eve/vcf-reference-datasets/1000g/1kg_v3_nomult.bcf" 38 | ) 39 | 40 | 41 | 42 | 43 | library(devtools) 44 | load_all() 45 | a <- TwoSampleMR::extract_instruments(2) 46 | fn <- system.file("data","IEU-a-2.vcf.gz", package="gwasvcftools") 47 | ldref <- "~/repo/mr-base-api/app/ld_files/data_maf0.01_rs" 48 | 49 | o <- get_ld_proxies(a$SNP, fn, ldref, tempfile()) 50 | -------------------------------------------------------------------------------- /man/VariantAnnotation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 
2 | % Please edit documentation in R/utils-pipe.R 3 | \name{VariantAnnotation} 4 | \alias{VariantAnnotation} 5 | \title{VariantAnnotation} 6 | \description{ 7 | VariantAnnotation 8 | } 9 | -------------------------------------------------------------------------------- /man/check_bcftools.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/binaries.r 3 | \name{check_bcftools} 4 | \alias{check_bcftools} 5 | \title{Check if the tools_bcftools option is set} 6 | \usage{ 7 | check_bcftools() 8 | } 9 | \value{ 10 | TRUE or FALSE 11 | } 12 | \description{ 13 | See set_bcftools() for more information 14 | } 15 | -------------------------------------------------------------------------------- /man/check_plink.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/binaries.r 3 | \name{check_plink} 4 | \alias{check_plink} 5 | \title{Check if the tools_plink option is set} 6 | \usage{ 7 | check_plink() 8 | } 9 | \value{ 10 | TRUE or FALSE 11 | } 12 | \description{ 13 | See set_plink() for more information 14 | } 15 | -------------------------------------------------------------------------------- /man/create_ldref_sqlite.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rsid_index.r 3 | \name{create_ldref_sqlite} 4 | \alias{create_ldref_sqlite} 5 | \title{Create LD reference sqlite database for tags} 6 | \usage{ 7 | create_ldref_sqlite(bfile, dbname, tag_r2 = 0.6) 8 | } 9 | \arguments{ 10 | \item{bfile}{path to plink file} 11 | 12 | \item{dbname}{dbname to produce (overwrites existing if exists)} 13 | 14 | \item{tag_r2}{minimum tag r2} 15 | } 16 | \description{ 17 | This is used for looking up proxies 18 | } 19 | 
-------------------------------------------------------------------------------- /man/create_pval_index_from_vcf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pval_index.r 3 | \name{create_pval_index_from_vcf} 4 | \alias{create_pval_index_from_vcf} 5 | \title{Create pval index from GWAS-VCF file} 6 | \usage{ 7 | create_pval_index_from_vcf(vcffile, maximum_pval, indexname) 8 | } 9 | \arguments{ 10 | \item{vcffile}{VCF filename} 11 | 12 | \item{maximum_pval}{Maximum p-value to include. Default = 0.05} 13 | 14 | \item{indexname}{index file name to create. Deletes existing file if exists.} 15 | } 16 | \description{ 17 | Create a separate file called \verb{.pvali} which is used to speed up p-value queries. 18 | } 19 | -------------------------------------------------------------------------------- /man/create_rsidx_index_from_vcf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rsid_index.r 3 | \name{create_rsidx_index_from_vcf} 4 | \alias{create_rsidx_index_from_vcf} 5 | \title{Create RSID index from VCF} 6 | \usage{ 7 | create_rsidx_index_from_vcf(vcf, indexname) 8 | } 9 | \arguments{ 10 | \item{vcf}{VCF filename} 11 | 12 | \item{indexname}{index file name to create. 
Deletes existing file if exists.} 13 | } 14 | \description{ 15 | Create RSID index from VCF 16 | } 17 | -------------------------------------------------------------------------------- /man/create_rsidx_sub_index.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rsid_index.r 3 | \name{create_rsidx_sub_index} 4 | \alias{create_rsidx_sub_index} 5 | \title{Create new index from existing index using a subset of rsids} 6 | \usage{ 7 | create_rsidx_sub_index(rsid, rsidx, newindex) 8 | } 9 | \arguments{ 10 | \item{rsid}{Vector of rsids} 11 | 12 | \item{rsidx}{Existing index} 13 | 14 | \item{newindex}{New index (Note: will delete existing file if exists)} 15 | } 16 | \value{ 17 | NULL, creates new index file 18 | } 19 | \description{ 20 | Note this requires a modified version of plink that allows ld-window-r2 flag for --r option. 21 | Available here: https://github.com/explodecomputer/plink-ng 22 | } 23 | -------------------------------------------------------------------------------- /man/create_vcf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/manipulate.r 3 | \name{create_vcf} 4 | \alias{create_vcf} 5 | \title{Create GWAS vcf} 6 | \usage{ 7 | create_vcf( 8 | chrom, 9 | pos, 10 | nea, 11 | ea, 12 | snp = NULL, 13 | ea_af = NULL, 14 | effect = NULL, 15 | se = NULL, 16 | pval = NULL, 17 | n = NULL, 18 | ncase = NULL, 19 | name = NULL 20 | ) 21 | } 22 | \arguments{ 23 | \item{chrom}{chrom vector} 24 | 25 | \item{pos}{pos vector} 26 | 27 | \item{nea}{nea vector} 28 | 29 | \item{ea}{ea vector} 30 | 31 | \item{snp}{Optional vector} 32 | 33 | \item{ea_af}{Optional vector} 34 | 35 | \item{effect}{Optional vector} 36 | 37 | \item{se}{Optional vector} 38 | 39 | \item{pval}{Optional vector} 40 | 41 | \item{n}{Optional vector} 42 | 43 | 
\item{ncase}{Optional vector} 44 | 45 | \item{name}{Optional vector} 46 | } 47 | \value{ 48 | vcf object 49 | } 50 | \description{ 51 | Create GWAS vcf 52 | } 53 | -------------------------------------------------------------------------------- /man/get_ld_proxies.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/proxy.r 3 | \name{get_ld_proxies} 4 | \alias{get_ld_proxies} 5 | \title{Find LD proxies for a set of SNPs} 6 | \usage{ 7 | get_ld_proxies( 8 | rsid, 9 | bfile, 10 | searchspace = NULL, 11 | tag_kb = 5000, 12 | tag_nsnp = 5000, 13 | tag_r2 = 0.6, 14 | threads = 1, 15 | out = tempfile() 16 | ) 17 | } 18 | \arguments{ 19 | \item{rsid}{list of rs IDs} 20 | 21 | \item{bfile}{ld reference panel} 22 | 23 | \item{searchspace}{Optional list of rs IDs to use as potential proxies} 24 | 25 | \item{tag_kb}{=5000 Proxy parameter} 26 | 27 | \item{tag_nsnp}{=5000 Proxy parameter} 28 | 29 | \item{tag_r2}{=0.6 Proxy parameter} 30 | 31 | \item{threads}{Number of threads to use (=1)} 32 | 33 | \item{out}{temporary output file} 34 | } 35 | \value{ 36 | data frame 37 | } 38 | \description{ 39 | Find LD proxies for a set of SNPs 40 | } 41 | -------------------------------------------------------------------------------- /man/gwasvcf_to_summaryset.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gwasglue.R 3 | \name{gwasvcf_to_summaryset} 4 | \alias{gwasvcf_to_summaryset} 5 | \title{Create a SummarySet} 6 | \usage{ 7 | gwasvcf_to_summaryset(vcf) 8 | } 9 | \arguments{ 10 | \item{vcf}{Path or URL to GWAS-VCF file or VCF object e.g. 
output from \code{\link[VariantAnnotation:readVcf-methods]{VariantAnnotation::readVcf()}}, \code{\link[=create_vcf]{create_vcf()}} or \code{\link[=query_gwas]{query_gwas()}}} 11 | } 12 | \description{ 13 | Returns a gwasglue2 SummarySet object 14 | } 15 | -------------------------------------------------------------------------------- /man/merge_vcf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/manipulate.r 3 | \name{merge_vcf} 4 | \alias{merge_vcf} 5 | \title{Merge two GWAS VCF objects} 6 | \usage{ 7 | merge_vcf(a, b) 8 | } 9 | \arguments{ 10 | \item{a}{VCF object} 11 | 12 | \item{b}{VCF object} 13 | } 14 | \value{ 15 | SimpleList of VCF objects 16 | } 17 | \description{ 18 | Returns merged intersection of two VCF objects 19 | } 20 | -------------------------------------------------------------------------------- /man/parse_chrompos.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/query.r 3 | \name{parse_chrompos} 4 | \alias{parse_chrompos} 5 | \title{Parse chromosome:position} 6 | \usage{ 7 | parse_chrompos(chrompos, radius = NULL) 8 | } 9 | \arguments{ 10 | \item{chrompos}{Either vector of chromosome and position ranges e.g. "1:1000" or "1:1000-2000", or data frame with columns \code{chrom}, \code{start}, \code{end}.} 11 | 12 | \item{radius}{Add radius to the specified positions. 
Default = NULL} 13 | } 14 | \value{ 15 | GRanges object 16 | } 17 | \description{ 18 | Takes data frame or vector of chromosome position ranges and parses to granges object 19 | } 20 | -------------------------------------------------------------------------------- /man/pipe.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils-pipe.R 3 | \name{\%>\%} 4 | \alias{\%>\%} 5 | \title{Pipe operator} 6 | \usage{ 7 | lhs \%>\% rhs 8 | } 9 | \description{ 10 | See \code{magrittr::\link[magrittr]{\%>\%}} for details. 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /man/proxy_match.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/proxy.r 3 | \name{proxy_match} 4 | \alias{proxy_match} 5 | \title{Extract SNPs from vcf file} 6 | \usage{ 7 | proxy_match( 8 | vcf, 9 | rsid, 10 | bfile = NULL, 11 | proxies = "yes", 12 | tag_kb = 5000, 13 | tag_nsnp = 5000, 14 | tag_r2 = 0.6, 15 | threads = 1, 16 | rsidx = NULL, 17 | dbfile = NULL 18 | ) 19 | } 20 | \arguments{ 21 | \item{vcf}{vcf file name} 22 | 23 | \item{rsid}{list of rs IDs} 24 | 25 | \item{bfile}{ld reference panel (plink)} 26 | 27 | \item{proxies}{="yes" If SNPs are absent then look for proxies (yes) or not (no). 
Can also mask all target SNPs and only return proxies (only), for testing purposes} 28 | 29 | \item{tag_kb}{=5000 Proxy parameter} 30 | 31 | \item{tag_nsnp}{=5000 Proxy parameter} 32 | 33 | \item{tag_r2}{=0.6 Proxy parameter} 34 | 35 | \item{threads}{Number of threads to use (=1)} 36 | 37 | \item{rsidx}{Path to rsidx index} 38 | 39 | \item{dbfile}{ld tag database (sqlite)} 40 | } 41 | \value{ 42 | data frame 43 | } 44 | \description{ 45 | Finds proxies if necessary 46 | } 47 | -------------------------------------------------------------------------------- /man/query_chrompos_bcftools.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/query.r 3 | \name{query_chrompos_bcftools} 4 | \alias{query_chrompos_bcftools} 5 | \title{Query chromosome and position using bcftools} 6 | \usage{ 7 | query_chrompos_bcftools(chrompos, vcffile, id = NULL) 8 | } 9 | \arguments{ 10 | \item{chrompos}{Either vector of chromosome and position ranges e.g. 
"1:1000" or "1:1000-2000", or data frame with columns \code{chrom}, \code{start}, \code{end}.} 11 | 12 | \item{vcffile}{Path to .vcf.gz GWAS summary data file} 13 | 14 | \item{id}{If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter} 15 | } 16 | \value{ 17 | vcf object 18 | } 19 | \description{ 20 | Query chromosome and position using bcftools 21 | } 22 | -------------------------------------------------------------------------------- /man/query_chrompos_file.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/query.r 3 | \name{query_chrompos_file} 4 | \alias{query_chrompos_file} 5 | \title{Query vcf file, extracting by chromosome and position} 6 | \usage{ 7 | query_chrompos_file(chrompos, vcffile, id = NULL, build = "GRCh37") 8 | } 9 | \arguments{ 10 | \item{chrompos}{Either vector of chromosome and position ranges e.g. "1:1000" or "1:1000-2000", or data frame with columns \code{chrom}, \code{start}, \code{end}.} 11 | 12 | \item{vcffile}{Path to .vcf.gz GWAS summary data file} 13 | 14 | \item{id}{If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter} 15 | 16 | \item{build}{Default="GRCh37" Build of vcffile} 17 | } 18 | \value{ 19 | VCF object 20 | } 21 | \description{ 22 | Query vcf file, extracting by chromosome and position 23 | } 24 | -------------------------------------------------------------------------------- /man/query_chrompos_vcf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/query.r 3 | \name{query_chrompos_vcf} 4 | \alias{query_chrompos_vcf} 5 | \title{Query chrompos from vcf object} 6 | \usage{ 7 | query_chrompos_vcf(chrompos, vcf, id = NULL) 8 | } 9 | \arguments{ 10 | \item{chrompos}{Either vector of chromosome and position 
ranges e.g. "1:1000" or "1:1000-2000", or data frame with columns \code{chrom}, \code{start}, \code{end}.} 11 | 12 | \item{vcf}{VCF object (e.g. from readVcf)} 13 | 14 | \item{id}{If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter} 15 | } 16 | \value{ 17 | VCF object 18 | } 19 | \description{ 20 | Query chrompos from vcf object 21 | } 22 | -------------------------------------------------------------------------------- /man/query_gwas.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/query.r 3 | \name{query_gwas} 4 | \alias{query_gwas} 5 | \title{Query data from vcf file} 6 | \usage{ 7 | query_gwas( 8 | vcf, 9 | chrompos = NULL, 10 | rsid = NULL, 11 | pval = NULL, 12 | id = NULL, 13 | rsidx = NULL, 14 | pvali = NULL, 15 | build = "GRCh37", 16 | os = Sys.info()[["sysname"]], 17 | proxies = "no", 18 | bfile = NULL, 19 | dbfile = NULL, 20 | tag_kb = 5000, 21 | tag_nsnp = 5000, 22 | tag_r2 = 0.6, 23 | threads = 1 24 | ) 25 | } 26 | \arguments{ 27 | \item{vcf}{Path or URL to GWAS-VCF file or VCF object e.g. output from \code{\link[VariantAnnotation:readVcf-methods]{VariantAnnotation::readVcf()}} or \code{\link[=create_vcf]{create_vcf()}}} 28 | 29 | \item{chrompos}{Either vector of chromosome and position ranges e.g. "1:1000" or "1:1000-2000", or data frame with columns \code{chrom}, \code{start}, \code{end}.} 30 | 31 | \item{rsid}{Vector of rsids} 32 | 33 | \item{pval}{P-value threshold (NOT -log10)} 34 | 35 | \item{id}{If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter} 36 | 37 | \item{rsidx}{Path to rsidx index file} 38 | 39 | \item{pvali}{Path to pval index file} 40 | 41 | \item{build}{="GRCh37" Build of vcffile} 42 | 43 | \item{os}{The operating system. Default is as detected. 
Determines the method used to perform query} 44 | 45 | \item{proxies}{="no" If SNPs are absent then look for proxies (yes) or not (no). Can also mask all target SNPs and only return proxies (only), for testing purposes. Currently only possible if querying rsid.} 46 | 47 | \item{bfile}{=path to plink bed/bim/fam ld reference panel} 48 | 49 | \item{dbfile}{=path to sqlite tag snp database} 50 | 51 | \item{tag_kb}{=5000 Proxy parameter} 52 | 53 | \item{tag_nsnp}{=5000 Proxy parameter} 54 | 55 | \item{tag_r2}{=0.6 Proxy parameter} 56 | 57 | \item{threads}{=1 Number of threads} 58 | } 59 | \value{ 60 | vcf object 61 | } 62 | \description{ 63 | Read in GWAS summary data with filters on datasets (if multiple datasets per file) and/or chromosome/position, rsids or pvalues. Chooses the optimal query method for the detected operating system. Typically chrompos searches are the fastest. On Windows, rsid or pvalue filters from a file will be slow. 64 | } 65 | -------------------------------------------------------------------------------- /man/query_pval_bcftools.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/query.r 3 | \name{query_pval_bcftools} 4 | \alias{query_pval_bcftools} 5 | \title{Query p-value using bcftools} 6 | \usage{ 7 | query_pval_bcftools(pval, vcffile, id = NULL) 8 | } 9 | \arguments{ 10 | \item{pval}{P-value threshold (NOT -log10)} 11 | 12 | \item{vcffile}{Path to .vcf.gz GWAS summary data file} 13 | 14 | \item{id}{If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter} 15 | } 16 | \value{ 17 | vcf object 18 | } 19 | \description{ 20 | Query p-value using bcftools 21 | } 22 | -------------------------------------------------------------------------------- /man/query_pval_file.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit
by hand 2 | % Please edit documentation in R/query.r 3 | \name{query_pval_file} 4 | \alias{query_pval_file} 5 | \title{Query pval from vcf file} 6 | \usage{ 7 | query_pval_file(pval, vcffile, id = NULL, build = "GRCh37") 8 | } 9 | \arguments{ 10 | \item{pval}{P-value threshold (NOT -log10)} 11 | 12 | \item{vcffile}{Path to tabix indexed vcf file} 13 | 14 | \item{id}{If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter} 15 | 16 | \item{build}{Default="GRCh37"} 17 | } 18 | \value{ 19 | VCF object 20 | } 21 | \description{ 22 | Query pval from vcf file 23 | } 24 | -------------------------------------------------------------------------------- /man/query_pval_sqlite3.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pval_index.r, R/query.r 3 | \name{query_pval_sqlite3} 4 | \alias{query_pval_sqlite3} 5 | \title{Query pval from file using pvali index} 6 | \usage{ 7 | query_pval_sqlite3(pval, vcffile, id = NULL, pvali) 8 | 9 | query_pval_sqlite3(pval, vcffile, id = NULL, pvali) 10 | } 11 | \arguments{ 12 | \item{pval}{pval threshold} 13 | 14 | \item{vcffile}{Path to .vcf.gz GWAS summary data file} 15 | 16 | \item{id}{If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter} 17 | 18 | \item{pvali}{Path to pval index file} 19 | } 20 | \value{ 21 | vcf object 22 | 23 | vcf object 24 | } 25 | \description{ 26 | See create_pval_index_from_vcf 27 | 28 | See create_pval_index_from_vcf 29 | } 30 | -------------------------------------------------------------------------------- /man/query_pval_vcf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/query.r 3 | \name{query_pval_vcf} 4 | \alias{query_pval_vcf} 5 | \title{Query based on p-value threshold from vcf} 6 | \usage{ 7 |
query_pval_vcf(pval, vcf, id = NULL) 8 | } 9 | \arguments{ 10 | \item{pval}{P-value threshold (NOT -log10)} 11 | 12 | \item{vcf}{VCF object (e.g. from readVcf)} 13 | 14 | \item{id}{If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter} 15 | } 16 | \value{ 17 | VCF object 18 | } 19 | \description{ 20 | Query based on p-value threshold from vcf 21 | } 22 | -------------------------------------------------------------------------------- /man/query_pvali.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pval_index.r, R/query.r 3 | \name{query_pvali} 4 | \alias{query_pvali} 5 | \title{Query pvali} 6 | \usage{ 7 | query_pvali(pval, pvali) 8 | 9 | query_pvali(pval, pvali) 10 | } 11 | \arguments{ 12 | \item{pval}{pval threshold} 13 | 14 | \item{pvali}{Path to pval index file} 15 | } 16 | \value{ 17 | data frame 18 | 19 | data frame 20 | } 21 | \description{ 22 | Query pvali 23 | 24 | Query pvali 25 | } 26 | -------------------------------------------------------------------------------- /man/query_rsid_bcftools.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/query.r 3 | \name{query_rsid_bcftools} 4 | \alias{query_rsid_bcftools} 5 | \title{Query rsid using bcftools} 6 | \usage{ 7 | query_rsid_bcftools(rsid, vcffile, id = NULL) 8 | } 9 | \arguments{ 10 | \item{rsid}{Vector of rsids} 11 | 12 | \item{vcffile}{Path to .vcf.gz GWAS summary data file} 13 | 14 | \item{id}{If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter} 15 | } 16 | \value{ 17 | VCF object 18 | } 19 | \description{ 20 | Query rsid using bcftools 21 | } 22 | -------------------------------------------------------------------------------- /man/query_rsid_file.Rd:
-------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/query.r 3 | \name{query_rsid_file} 4 | \alias{query_rsid_file} 5 | \title{Query vcf file, extracting by rsid} 6 | \usage{ 7 | query_rsid_file(rsid, vcffile, id = NULL, build = "GRCh37") 8 | } 9 | \arguments{ 10 | \item{rsid}{Vector of rsids. Use DBSNP build (???)} 11 | 12 | \item{vcffile}{Path to .vcf.gz GWAS summary data file} 13 | 14 | \item{id}{If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter} 15 | 16 | \item{build}{Default="GRCh37" Build of vcffile} 17 | } 18 | \value{ 19 | VCF object 20 | } 21 | \description{ 22 | Query vcf file, extracting by rsid 23 | } 24 | -------------------------------------------------------------------------------- /man/query_rsid_rsidx.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/query.r 3 | \name{query_rsid_rsidx} 4 | \alias{query_rsid_rsidx} 5 | \title{Query rsid from file using rsidx index} 6 | \usage{ 7 | query_rsid_rsidx(rsid, vcffile, id = NULL, rsidx) 8 | } 9 | \arguments{ 10 | \item{rsid}{Vector of rsids} 11 | 12 | \item{vcffile}{Path to .vcf.gz GWAS summary data file} 13 | 14 | \item{id}{If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter} 15 | 16 | \item{rsidx}{Path to rsidx index file} 17 | } 18 | \value{ 19 | vcf object 20 | } 21 | \description{ 22 | See create_rsidx_index_from_vcf 23 | } 24 | -------------------------------------------------------------------------------- /man/query_rsid_vcf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/query.r 3 | \name{query_rsid_vcf} 4 | \alias{query_rsid_vcf} 5 | \title{Query rsid from vcf
object} 6 | \usage{ 7 | query_rsid_vcf(rsid, vcf, id = NULL) 8 | } 9 | \arguments{ 10 | \item{rsid}{Vector of rsids} 11 | 12 | \item{vcf}{VCF object (e.g. from readVcf)} 13 | 14 | \item{id}{If multiple GWAS datasets in the vcf file, the name (sample ID) from which to perform the filter} 15 | } 16 | \value{ 17 | VCF object 18 | } 19 | \description{ 20 | Query rsid from vcf object 21 | } 22 | -------------------------------------------------------------------------------- /man/query_rsidx.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/query.r 3 | \name{query_rsidx} 4 | \alias{query_rsidx} 5 | \title{Query rsidx} 6 | \usage{ 7 | query_rsidx(rsid, rsidx) 8 | } 9 | \arguments{ 10 | \item{rsid}{Vector of rsids} 11 | 12 | \item{rsidx}{Path to rsidx index file} 13 | } 14 | \value{ 15 | data frame 16 | } 17 | \description{ 18 | Query rsidx 19 | } 20 | -------------------------------------------------------------------------------- /man/set_bcftools.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/binaries.r 3 | \name{set_bcftools} 4 | \alias{set_bcftools} 5 | \title{Set bcftools binary location} 6 | \usage{ 7 | set_bcftools(path = "") 8 | } 9 | \arguments{ 10 | \item{path}{If "" (default), then will use the MRCIEU/genetics.binaRies to get binaries that are appropriate for the detected operating system. Otherwise, provide the path to the bcftools binary. 
If NULL then will set the option to NULL.} 11 | } 12 | \value{ 13 | NULL, sets option 'tools_bcftools' 14 | } 15 | \description{ 16 | Set bcftools binary location 17 | } 18 | -------------------------------------------------------------------------------- /man/set_plink.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/binaries.r 3 | \name{set_plink} 4 | \alias{set_plink} 5 | \title{Set plink binary location} 6 | \usage{ 7 | set_plink(path = "") 8 | } 9 | \arguments{ 10 | \item{path}{If "" (default), then will use the MRCIEU/genetics.binaRies to get binaries that are appropriate for the detected operating system. Otherwise, provide the path to the plink binary. If NULL then will set the option to NULL.} 11 | } 12 | \value{ 13 | NULL, sets option 'tools_plink' 14 | } 15 | \description{ 16 | Set plink binary location 17 | } 18 | -------------------------------------------------------------------------------- /man/sqlite_ld_proxies.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/proxy.r 3 | \name{sqlite_ld_proxies} 4 | \alias{sqlite_ld_proxies} 5 | \title{Lookup LD proxies from sqlite database} 6 | \usage{ 7 | sqlite_ld_proxies(rsids, dbfile, tag_r2) 8 | } 9 | \arguments{ 10 | \item{rsids}{List of rsids} 11 | 12 | \item{dbfile}{path to dbfile} 13 | 14 | \item{tag_r2}{minimum r2 value} 15 | } 16 | \value{ 17 | data frame 18 | } 19 | \description{ 20 | Lookup LD proxies from sqlite database 21 | } 22 | -------------------------------------------------------------------------------- /man/vcf_to_granges.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/manipulate.r 3 | \name{vcf_to_granges} 4 | 
\alias{vcf_to_granges} 5 | \title{Convert vcf format to granges format} 6 | \usage{ 7 | vcf_to_granges(vcf, id = NULL) 8 | } 9 | \arguments{ 10 | \item{vcf}{Output from readVcf} 11 | 12 | \item{id}{Only accepts one ID, so specify here if there are multiple GWAS datasets in the vcf} 13 | } 14 | \value{ 15 | GRanges object 16 | } 17 | \description{ 18 | Convert vcf format to granges format 19 | } 20 | -------------------------------------------------------------------------------- /man/vcf_to_tibble.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/manipulate.r 3 | \name{vcf_to_tibble} 4 | \alias{vcf_to_tibble} 5 | \title{Convert vcf format to tibble (data frame)} 6 | \usage{ 7 | vcf_to_tibble(vcf, id = NULL) 8 | } 9 | \arguments{ 10 | \item{vcf}{Output from readVcf} 11 | 12 | \item{id}{Only accepts one ID, so specify here if there are multiple GWAS datasets in the vcf} 13 | } 14 | \value{ 15 | tibble (data frame) 16 | } 17 | \description{ 18 | Convert vcf format to tibble (data frame) 19 | } 20 | -------------------------------------------------------------------------------- /man/vcflist_overlaps.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/manipulate.r 3 | \name{vcflist_overlaps} 4 | \alias{vcflist_overlaps} 5 | \title{Reduce list of VCFs to intersecting regions} 6 | \usage{ 7 | vcflist_overlaps(vcflist, chrompos) 8 | } 9 | \arguments{ 10 | \item{vcflist}{List of VCF objects, or list of VCF filenames, or mix of VCF objects and filenames} 11 | 12 | \item{chrompos}{Either vector of chromosome and position ranges e.g.
"1:1000" or "1:1000-2000", or data frame with columns \code{chrom}, \code{start}, \code{end}.} 13 | } 14 | \value{ 15 | List of VCFs 16 | } 17 | \description{ 18 | Reduce list of VCFs to intersecting regions 19 | } 20 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(gwasvcf) 3 | 4 | test_check("gwasvcf") 5 | -------------------------------------------------------------------------------- /tests/testthat/test_manipulate.r: -------------------------------------------------------------------------------- 1 | context("VCF manipulations") 2 | library(gwasvcf) 3 | library(magrittr) 4 | library(dplyr) 5 | 6 | fn <- system.file("extdata","data.vcf.gz", package="gwasvcf") 7 | vcf1 <- VariantAnnotation::readVcf(fn)[1:70,] 8 | vcf2 <- VariantAnnotation::readVcf(fn)[40:90,] 9 | vcf3 <- VariantAnnotation::readVcf(fn)[60:92,] 10 | vcf4 <- VariantAnnotation::readVcf(fn)[65:92,] 11 | l <- list(vcf1, vcf2, vcf3, vcf4) 12 | if (Sys.info()["sysname"] != "Windows") set_bcftools() 13 | 14 | # Need to check what happens with multiallelic variants 15 | 16 | test_that("vcflist_overlaps", { 17 | skip_on_os("windows") 18 | 19 | o <- vcflist_overlaps(vcflist=list(vcf1, vcf2), chrompos=NULL) 20 | expect_true(all(sapply(o, length) == 31) & length(o) == 2) 21 | 22 | o <- vcflist_overlaps(vcflist=list(vcf1, vcf2, vcf3, fn), chrompos="1:1-10000000") 23 | expect_true(all(sapply(o, length) == 11) & length(o) == 4) 24 | 25 | o <- vcflist_overlaps(vcflist=list(vcf1, vcf2, vcf3, vcf4), chrompos="1:1-10000000") 26 | expect_true(all(sapply(o, length) == 6) & length(o) == 4) 27 | 28 | o <- vcflist_overlaps(vcflist=list(fn, fn), chrompos="1:1-10000000") 29 | expect_true(all(sapply(o, length) == 92) & length(o) == 2) 30 | 31 | o <- vcflist_overlaps(vcflist=list(fn, fn), chrompos="2:1-10000000") 32 | expect_true(all(sapply(o, length) == 0) & 
length(o) == 2) 33 | }) 34 | 35 | 36 | fn <- system.file("extdata","data.vcf.gz", package="gwasvcf") 37 | V <- VariantAnnotation::readVcf(fn) 38 | vv <- V %>% vcf_to_granges %>% dplyr::as_tibble() 39 | 40 | test_that("create vcf", { 41 | out <- vv %$% create_vcf(chrom=seqnames, pos=start, nea=REF, ea=ALT, snp=ID, ea_af=AF, effect=ES, se=SE, pval=10^-LP, n=SS, name="a") 42 | VariantAnnotation::writeVcf(out, file="temp.vcf") 43 | expect_true(file.exists("temp.vcf")) 44 | }) 45 | -------------------------------------------------------------------------------- /tests/testthat/test_proxy.R: -------------------------------------------------------------------------------- 1 | context("Getting LD proxies") 2 | library(gwasvcf) 3 | library(genetics.binaRies) 4 | 5 | vcffile <- system.file("extdata","data.vcf.gz", package="gwasvcf") 6 | vcf <- VariantAnnotation::readVcf(vcffile) 7 | bfile <- system.file("extdata","eur.bed", package="gwasvcf") %>% gsub(".bed", "", .) 8 | 9 | set_plink() 10 | 11 | test_that("query native", { 12 | skip_on_os("windows") 13 | 14 | set_bcftools(NULL) 15 | a <- query_gwas(vcffile, rsid="rs4970420") 16 | expect_equal(nrow(a), 1) 17 | 18 | a <- query_gwas(vcf, rsid="rs4970420") 19 | expect_equal(nrow(a), 1) 20 | 21 | a <- query_gwas(vcffile, rsid="rs4442317") 22 | expect_equal(nrow(a), 0) 23 | 24 | a <- query_gwas(vcf, rsid="rs4442317") 25 | expect_equal(nrow(a), 0) 26 | 27 | a <- query_gwas(vcffile, rsid="rs4442317", proxies="yes", bfile=bfile, tag_r2=0.05) 28 | expect_equal(nrow(a), 1) 29 | 30 | a <- query_gwas(vcf, rsid="rs4442317", proxies="yes", bfile=bfile, tag_r2=0.05) 31 | expect_equal(nrow(a), 1) 32 | 33 | a <- query_gwas(vcffile, rsid="rs9729550", proxies="only", bfile=bfile, tag_r2=0.05) 34 | expect_equal(nrow(a), 1) 35 | 36 | a <- query_gwas(vcf, rsid="rs9729550", proxies="only", bfile=bfile, tag_r2=0.05) 37 | expect_equal(nrow(a), 1) 38 | 39 | }) 40 | 41 | 42 | test_that("query bcftools", { 43 | skip_on_os("windows") 44 | 45 | 
set_bcftools() 46 | a <- query_gwas(vcffile, rsid="rs4970420") 47 | expect_equal(nrow(a), 1) 48 | 49 | a <- query_gwas(vcf, rsid="rs4970420") 50 | expect_equal(nrow(a), 1) 51 | 52 | a <- query_gwas(vcffile, rsid="rs4442317") 53 | expect_equal(nrow(a), 0) 54 | 55 | a <- query_gwas(vcf, rsid="rs4442317") 56 | expect_equal(nrow(a), 0) 57 | 58 | a <- query_gwas(vcffile, rsid="rs4442317", proxies="yes", bfile=bfile, tag_r2=0.05) 59 | expect_equal(nrow(a), 1) 60 | 61 | a <- query_gwas(vcffile, rsid=c("rs12565286","rs4442317"), proxies="yes", bfile=bfile, tag_r2=0.05) 62 | expect_equal(nrow(a), 2) 63 | 64 | a <- query_gwas(vcf, rsid="rs4442317", proxies="yes", bfile=bfile, tag_r2=0.05) 65 | expect_equal(nrow(a), 1) 66 | 67 | a <- query_gwas(vcf, rsid=c("rs12565286","rs4442317"), proxies="yes", bfile=bfile, tag_r2=0.05) 68 | expect_equal(nrow(a), 2) 69 | 70 | a <- query_gwas(vcffile, rsid="rs9729550", proxies="only", bfile=bfile, tag_r2=0.05) 71 | expect_equal(nrow(a), 1) 72 | 73 | a <- query_gwas(vcf, rsid="rs9729550", proxies="only", bfile=bfile, tag_r2=0.05) 74 | expect_equal(nrow(a), 1) 75 | 76 | }) 77 | 78 | test_that("alignment native", { 79 | skip_on_os("windows") 80 | 81 | set_bcftools(NULL) 82 | rsid <- names(SummarizedExperiment::rowRanges(vcf)) 83 | a <- proxy_match(vcf, rsid, bfile, proxies="only") 84 | b <- query_gwas(vcf, rsid=rsid) 85 | index <- match(names(b), names(a)) 86 | names(b) == names(a)[index] 87 | expect_true(cor(vcf_to_granges(b)$ES, vcf_to_granges(a)$ES[index], use="pair") > 0.5) 88 | }) 89 | 90 | test_that("alignment bcftools", { 91 | skip_on_os("windows") 92 | 93 | set_bcftools() 94 | rsid <- names(SummarizedExperiment::rowRanges(vcf)) 95 | a <- proxy_match(vcf, rsid, bfile, proxies="only") 96 | b <- query_gwas(vcf, rsid=rsid) 97 | index <- match(names(b), names(a)) 98 | names(b) == names(a)[index] 99 | expect_true(cor(vcf_to_granges(b)$ES, vcf_to_granges(a)$ES[index], use="pair") > 0.5) 100 | }) 101 | 
-------------------------------------------------------------------------------- /tests/testthat/test_pvali.r: -------------------------------------------------------------------------------- 1 | context("Querying vcf files with pval index") 2 | library(gwasvcf) 3 | 4 | fn <- system.file("extdata","data.vcf.gz", package="gwasvcf") 5 | vcf <- VariantAnnotation::readVcf(fn) 6 | if (Sys.info()["sysname"] != "Windows") set_bcftools() 7 | 8 | indexname <- tempfile() 9 | 10 | test_that("create index", { 11 | skip_on_os(c("windows", "linux")) 12 | create_pval_index_from_vcf(fn, 0.4, indexname) 13 | expect_true(file.exists(indexname)) 14 | }) 15 | 16 | test_that("read in", { 17 | skip_on_os(c("windows", "linux")) 18 | out <- query_pvali(0.05, indexname) 19 | expect_equal(nrow(out), 7) 20 | }) 21 | 22 | test_that("query with pvali", { 23 | skip_on_os(c("windows", "linux")) 24 | b <- query_gwas(fn, pval=0.05, pvali=indexname) 25 | expect_equal(nrow(b), 7) 26 | }) 27 | 28 | test_that("query with pvali", { 29 | skip_on_os("windows") 30 | b <- query_gwas(fn, pval=0.05) 31 | expect_equal(nrow(b), 7) 32 | }) 33 | -------------------------------------------------------------------------------- /tests/testthat/test_query.r: -------------------------------------------------------------------------------- 1 | context("Querying vcf files") 2 | library(gwasvcf) 3 | 4 | 5 | fn <- system.file("extdata","data.vcf.gz", package="gwasvcf") 6 | vcf <- VariantAnnotation::readVcf(fn) 7 | 8 | 9 | 10 | test_that("query_gwas", { 11 | chrompos<- c("1:800000-1000000") 12 | rsid <- c("rs3128126", "rs3121561", "rs3813193") 13 | id <- "IEU-a-2" 14 | pval <- 0.2 15 | 16 | expect_true({ 17 | a = query_gwas(fn, chrompos=chrompos, os="Darwin") 18 | b = query_gwas(fn, chrompos=chrompos, os="Windows") 19 | c = query_gwas(fn, chrompos=chrompos, id=id, os="Darwin") 20 | d = query_gwas(fn, chrompos=chrompos, id=id, os="Windows") 21 | all(a == b) && all(b == c) && all(c == d) 22 | }) 23 | 24 | expect_true({ 25 | 
a = query_gwas(fn, rsid=rsid, os="Darwin") 26 | b = query_gwas(fn, rsid=rsid, os="Windows") 27 | c = query_gwas(fn, rsid=rsid, id=id, os="Darwin") 28 | d = query_gwas(fn, rsid=rsid, id=id, os="Windows") 29 | all(a == b) && all(b == c) && all(c == d) 30 | }) 31 | 32 | expect_true({ 33 | a = query_gwas(fn, pval=pval, os="Darwin") 34 | b = query_gwas(fn, pval=pval, os="Windows") 35 | c = query_gwas(fn, pval=pval, id=id, os="Darwin") 36 | d = query_gwas(fn, pval=pval, id=id, os="Windows") 37 | all(a == b) && all(b == c) && all(c == d) 38 | }) 39 | 40 | expect_true({ 41 | a = query_gwas(vcf, chrompos=chrompos, os="Darwin") 42 | b = query_gwas(vcf, chrompos=chrompos, os="Windows") 43 | c = query_gwas(vcf, chrompos=chrompos, id=id, os="Darwin") 44 | d = query_gwas(vcf, chrompos=chrompos, id=id, os="Windows") 45 | all(a == b) && all(b == c) && all(c == d) 46 | }) 47 | 48 | expect_true({ 49 | a = query_gwas(vcf, rsid=rsid, os="Darwin") 50 | b = query_gwas(vcf, rsid=rsid, os="Windows") 51 | c = query_gwas(vcf, rsid=rsid, id=id, os="Darwin") 52 | d = query_gwas(vcf, rsid=rsid, id=id, os="Windows") 53 | all(a == b) && all(b == c) && all(c == d) 54 | }) 55 | 56 | expect_true({ 57 | a = query_gwas(vcf, pval=pval, os="Darwin") 58 | b = query_gwas(vcf, pval=pval, os="Windows") 59 | c = query_gwas(vcf, pval=pval, id=id, os="Darwin") 60 | d = query_gwas(vcf, pval=pval, id=id, os="Windows") 61 | all(a == b) && all(b == c) && all(c == d) 62 | }) 63 | }) 64 | 65 | 66 | test_that("parse_chrompos", { 67 | expect_equal(parse_chrompos("1:10000") %>% length, 1) 68 | expect_equal(parse_chrompos("1:10000-100000") %>% length, 1) 69 | expect_equal(parse_chrompos(c("1:10000-10000", "2:100-200")) %>% length, 2) 70 | expect_equal(parse_chrompos(dplyr::tibble(chrom=c(1,2),start=c(10000,100), end=c(10000,200))) %>% length, 2) 71 | }) 72 | 73 | 74 | test_that("vcf_to_granges", { 75 | g <- vcf_to_granges(vcf) 76 | expect_equal(length(g), length(vcf)) 77 | }) 78 | 79 | 80 | 
# The tests below rely on two fixtures created at the top of this test file:
#   fn  - path to the bundled data.vcf.gz
#   vcf - the same file read in full with VariantAnnotation::readVcf()
# The bundled dataset holds 92 variants, so any p-value filter is expected to
# return strictly fewer records than that.

# ---- Queries against the file on disk ----

test_that("query_chrompos_file", {
  region <- parse_chrompos("1:800000-1000000")
  v <- query_chrompos_file(region, fn)
  expect_equal(length(v), 3)
})


test_that("query_rsid_file", {
  v <- query_rsid_file(c("rs3128126", "rs3121561", "rs3813193"), fn)
  expect_equal(length(v), 3)
})


test_that("query_pval_file", {
  v <- query_pval_file(0.2, fn)
  # expect_lt()/expect_gt() report the observed length on failure, unlike
  # expect_true(length(v) < 92).
  expect_lt(length(v), 92)
  expect_gt(length(v), 5)
})


# ---- Queries against an in-memory VCF object ----

test_that("query_chrompos_vcf", {
  v <- query_chrompos_vcf("1:800000-1000000", vcf)
  expect_equal(length(v), 3)
})


test_that("query_rsid_vcf", {
  v <- query_rsid_vcf(c("rs3128126", "rs3121561", "rs3813193"), vcf)
  expect_equal(length(v), 3)
})


test_that("query_pval_vcf", {
  v <- query_pval_vcf(0.2, vcf)
  expect_lt(length(v), 92)
  expect_gt(length(v), 5)
})


# ---- Queries delegated to bcftools (skipped on Windows) ----

test_that("query_rsid_bcftools", {
  skip_on_os("windows")
  set_bcftools()
  v <- query_rsid_bcftools(c("rs3128126", "rs3121561", "rs3813193"), fn)
  expect_equal(length(v), 3)
})


test_that("query_pval_bcftools", {
  skip_on_os("windows")
  set_bcftools()
  v <- query_pval_bcftools(0.2, fn)
  expect_lt(length(v), 92)
  expect_gt(length(v), 5)
})


# Previously labelled "query_chrompos_vcf", which duplicated the description of
# the in-memory test above and hid which function was actually under test.
test_that("query_chrompos_bcftools", {
  skip_on_os("windows")
  set_bcftools()
  v <- query_chrompos_bcftools("1:800000-1000000", fn)
  expect_equal(length(v), 3)
})


# Remote-file variant. The skip_on_os() call disables it on mac, windows and
# linux, and skip_on_ci() disables it on CI.
test_that("query_chrompos_bcftools url", {
  skip_on_ci()
  skip_on_os(c("mac", "windows", "linux"))
  set_bcftools()
  u <- "https://objectstorage.us-ashburn-1.oraclecloud.com/n/idrvm4tkz2a8/b/OpenGWAS/o/ieu-a/ieu-a-2/ieu-a-2.vcf.gz"
  # The reachability check used to be evaluated and discarded; use it to skip
  # cleanly instead of failing inside bcftools when the URL is unavailable.
  skip_if_not(RCurl::url.exists(u), "remote VCF is not reachable")
  v <- query_chrompos_bcftools("1:800000-1000000", u)
  expect_equal(length(v), 3)
})

test_that("query_chrompos_vcf url2", { 155 | skip_on_ci() 156 | skip_on_os(c("mac", "windows", "linux")) 157 | set_bcftools() 158 | u <- "https://objectstorage.us-ashburn-1.oraclecloud.com/n/idrvm4tkz2a8/b/OpenGWAS/o/ieu-a/ieu-a-2/ieu-a-2.vcf.gz" 159 | # RCurl::url.exists(u) 160 | v <- query_gwas(u, "1:800000-1000000") 161 | expect_equal(length(v), 3) 162 | }) 163 | 164 | test_that("query_chrompos_vcf url2", { 165 | skip_on_ci() 166 | skip_on_os(c("mac", "windows", "linux")) 167 | set_bcftools() 168 | u <- "https://objectstorage.us-ashburn-1.oraclecloud.com/n/idrvm4tkz2a8/b/OpenGWAS/o/ieu-a/ieu-a-2/ieu-a-2.vcf.gz" 169 | v <- query_gwas(u, pval=5e-8) 170 | expect_equal(length(v), 2041) 171 | }) 172 | -------------------------------------------------------------------------------- /tests/testthat/test_rsidx.r: -------------------------------------------------------------------------------- 1 | context("Querying vcf files with rsidx") 2 | library(gwasvcf) 3 | 4 | 5 | fn <- system.file("extdata","data.vcf.gz", package="gwasvcf") 6 | vcf <- VariantAnnotation::readVcf(fn) 7 | 8 | indexname <- tempfile() 9 | 10 | test_that("create index", { 11 | skip_on_os(c("windows", "linux")) 12 | create_rsidx_index_from_vcf(fn, indexname) 13 | expect_true(file.exists(indexname)) 14 | }) 15 | 16 | test_that("read in", { 17 | skip_on_os(c("windows", "linux")) 18 | out <- query_rsidx(head(names(vcf)), indexname) 19 | expect_true(nrow(out) == 6) 20 | }) 21 | 22 | test_that("create sub index", { 23 | skip_on_os(c("windows", "linux")) 24 | newname <- tempfile() 25 | create_rsidx_sub_index(head(names(vcf)), indexname, newname) 26 | expect_true(file.exists(newname)) 27 | }) 28 | 29 | test_that("query with rsidx", { 30 | skip_on_os(c("windows", "linux")) 31 | a <- query_gwas(fn, rsid=head(names(vcf))) 32 | b <- query_gwas(fn, rsid=head(names(vcf)), rsidx=indexname) 33 | expect_true(all(names(a) == names(b))) 34 | }) 35 | 36 | 37 | 38 | fn <- system.file("extdata", "eur.bed", package="gwasvcf") 
%>% gsub("eur.bed", "eur", .) 39 | dbfile <- tempfile() 40 | 41 | set_plink() 42 | 43 | test_that("tag db", { 44 | skip_on_os(c("windows", "linux")) 45 | create_ldref_sqlite(fn, dbfile, 0.04) 46 | expect_true(file.exists(dbfile)) 47 | }) 48 | 49 | test_that("sqlite_ld_proxies", { 50 | skip("TODO: check this test") 51 | m <- data.table::fread(paste0(fn, ".bim")) %>% {sample(.$V2, 100, replace=FALSE)} 52 | ld <- sqlite_ld_proxies(m, dbfile, 0.2) 53 | # Requires an expect_* condition here 54 | }) 55 | 56 | test_that("sqlite proxy", { 57 | skip("TODO: check this test") 58 | vcffile <- system.file("extdata","data.vcf.gz", package="gwasvcf") 59 | set_bcftools() 60 | a <- query_gwas(vcffile, rsid="rs4442317", proxies="yes", dbfile=dbfile, tag_r2=0.05) 61 | expect_equal(nrow(a), 1) 62 | }) 63 | 64 | test_that("sqlite proxy", { 65 | skip("TODO: check this test") 66 | vcffile <- system.file("extdata","data.vcf.gz", package="gwasvcf") 67 | set_bcftools() 68 | a <- query_gwas(vcffile, rsid=c("rs12565286","rs4442317"), proxies="yes", dbfile=dbfile, tag_r2=0.05) 69 | expect_equal(nrow(a), 2) 70 | }) 71 | 72 | test_that("sqlite proxy only", { 73 | skip("TODO: check this test") 74 | vcffile <- system.file("extdata","data.vcf.gz", package="gwasvcf") 75 | set_bcftools() 76 | a <- query_gwas(vcffile, rsid=c("rs12565286","rs4442317"), proxies="only", dbfile=dbfile, tag_r2=0.05) 77 | expect_equal(nrow(a), 2) 78 | b <- a %>% vcf_to_tibble 79 | expect_true(all(! 
b$ID %in% c("rs12565286","rs4442317"))) 80 | }) 81 | 82 | test_that("sqlite proxy no result", { 83 | skip_on_os(c("windows", "linux")) 84 | vcffile <- system.file("extdata","data.vcf.gz", package="gwasvcf") 85 | set_bcftools() 86 | a <- query_gwas(vcffile, rsid="rs4442317", proxies="yes", dbfile=dbfile, tag_r2=0.5) 87 | expect_equal(nrow(a), 0) 88 | }) 89 | 90 | unlink(dbfile) 91 | -------------------------------------------------------------------------------- /vignettes/figure/target-effects-plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MRCIEU/gwasvcf/820267653ac7720926a13cac00b82c0a0ca840b6/vignettes/figure/target-effects-plot-1.png -------------------------------------------------------------------------------- /vignettes/guide.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Reading, querying and writing GWAS summary data in VCF format" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{Reading, querying and writing GWAS summary data in VCF format} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | --- 9 | 10 | 11 | 12 | We developed a format for storing and harmonising GWAS summary data known as [GWAS VCF format](https://github.com/MRCIEU/gwas-vcf-specification). This format is effective for being very fast when querying chromosome and position ranges, handling multiallelic variants and indels. 13 | 14 | All the data in the [IEU GWAS database](https://gwas.mrcieu.ac.uk/) is available for download in the GWAS VCF format. This R package provides fast and convenient functions for querying and creating GWAS summary data in GWAS VCF format. 
The package builds on the [VariantAnnotation](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html) Bioconductor package, which itself is based on the widely used [SummarizedExperiment](https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html) Bioconductor package. 15 | 16 | 17 | ## External tools 18 | 19 | For some VCF querying functions it is faster to optionally use [bcftools](https://samtools.github.io/bcftools/bcftools.html), and when available the R package will use that strategy. To set a location for the bcftools package, use 20 | 21 | ```r 22 | library(gwasvcf) 23 | set_bcftools('/path/to/bcftools') 24 | ``` 25 | 26 | Note that there is no bcftools binary available for Windows, so some querying options will be slower on Windows. 27 | 28 | For LD related functions the package uses [plink 1.90](https://www.cog-genomics.org/plink/1.9). You can specify the location of your plink installation by running 29 | 30 | ```r 31 | set_plink('/path/to/plink') 32 | ``` 33 | 34 | Alternatively you can automatically use the binaries bundled here: https://github.com/mrcieu/genetics.binaRies 35 | 36 | ```r 37 | remotes::install_github('mrcieu/genetics.binaRies') 38 | set_plink() 39 | set_bcftools() 40 | ``` 41 | 42 | To unset a path: 43 | 44 | ```r 45 | set_plink(NULL) 46 | set_bcftools(NULL) 47 | ``` 48 | 49 | For this vignette we will use the bundled binaries in `genetics.binaRies`. 50 | 51 | 52 | ``` r 53 | suppressWarnings(suppressPackageStartupMessages({ 54 | library(gwasvcf) 55 | library(VariantAnnotation) 56 | library(dplyr) 57 | library(magrittr) 58 | })) 59 | ``` 60 | 61 | ``` r 62 | set_bcftools() 63 | #> Path not provided, using binaries in the MRCIEU/genetics.binaRies package 64 | ``` 65 | 66 | ## Reading in everything 67 | 68 | To read an entire dataset use the `readVcf` function. As an example we'll use the bundled data which is a small subset of the Speliotes et al 2010 BMI GWAS. 
69 | 70 | 71 | ``` r 72 | vcffile <- system.file("extdata", "data.vcf.gz", package="gwasvcf") 73 | vcf <- readVcf(vcffile) 74 | class(vcf) 75 | #> [1] "CollapsedVCF" 76 | #> attr(,"package") 77 | #> [1] "VariantAnnotation" 78 | ``` 79 | 80 | Please refer to the `VariantAnnotation` package documentation for full details about the `CollapsedVCF` object. A brief summary follows. 81 | 82 | General info about the dataset can be obtained by calling it: 83 | 84 | 85 | ``` r 86 | vcf 87 | #> class: CollapsedVCF 88 | #> dim: 92 1 89 | #> rowRanges(vcf): 90 | #> GRanges with 5 metadata columns: paramRangeID, REF, ALT, QUAL, FILTER 91 | #> info(vcf): 92 | #> DataFrame with 1 column: AF 93 | #> info(header(vcf)): 94 | #> Number Type Description 95 | #> AF A Float Allele Frequency 96 | #> geno(vcf): 97 | #> List of length 9: ES, SE, LP, AF, SS, EZ, SI, NC, ID 98 | #> geno(header(vcf)): 99 | #> Number Type Description 100 | #> ES A Float Effect size estimate relative to the alternative allele 101 | #> SE A Float Standard error of effect size estimate 102 | #> LP A Float -log10 p-value for effect estimate 103 | #> AF A Float Alternate allele frequency in the association study 104 | #> SS A Float Sample size used to estimate genetic effect 105 | #> EZ A Float Z-score provided if it was used to derive the EFFECT and SE f... 106 | #> SI A Float Accuracy score of summary data imputation 107 | #> NC A Float Number of cases used to estimate genetic effect 108 | #> ID 1 String Study variant identifier 109 | ``` 110 | 111 | There are 92 rows and 1 column which means 92 SNPs and one GWAS. See the header information: 112 | 113 | 114 | ``` r 115 | header(vcf) 116 | #> class: VCFHeader 117 | #> samples(1): IEU-a-2 118 | #> meta(4): fileformat META SAMPLE contig 119 | #> fixed(1): FILTER 120 | #> info(1): AF 121 | #> geno(9): ES SE ... 
NC ID 122 | ``` 123 | 124 | See the names of the GWAS datasets (in this case just one, and it refers to the IEU GWAS database ID name): 125 | 126 | 127 | ``` r 128 | samples(header(vcf)) 129 | #> [1] "IEU-a-2" 130 | ``` 131 | 132 | In this case you can obtain information about this study through the `ieugwasr` package e.g. `ieugwasr::gwasinfo("IEU-a-2")`. 133 | 134 | There are a few components within the object: 135 | 136 | - `header` which has the meta data describing the dataset, including the association result variables 137 | - `rowRanges` which is information about each variant 138 | - `info` which is further metadata about each variant 139 | - `geno` which is the actual association results for each GWAS 140 | 141 | the `rowRanges` object is a `GenomicRanges` class, which is useful for performing fast operations on chromosome position information. 142 | 143 | 144 | ``` r 145 | rowRanges(vcf) 146 | #> GRanges object with 92 ranges and 5 metadata columns: 147 | #> seqnames ranges strand | paramRangeID REF 148 | #> | 149 | #> rs12565286 1 721290 * | NA G 150 | #> rs11804171 1 723819 * | NA T 151 | #> rs2977670 1 723891 * | NA G 152 | #> rs3094315 1 752566 * | NA G 153 | #> rs2073813 1 753541 * | NA G 154 | #> ... ... ... ... . ... ... 155 | #> rs715643 1 1172907 * | NA C 156 | #> rs6675798 1 1176597 * | NA T 157 | #> rs6603783 1 1181751 * | NA T 158 | #> rs6603785 1 1186502 * | NA A 159 | #> rs6603787 1 1188225 * | NA G 160 | #> ALT QUAL FILTER 161 | #> 162 | #> rs12565286 C NA PASS 163 | #> rs11804171 A NA PASS 164 | #> rs2977670 C NA PASS 165 | #> rs3094315 A NA PASS 166 | #> rs2073813 A NA PASS 167 | #> ... ... ... ... 
168 | #> rs715643 T NA PASS 169 | #> rs6675798 C NA PASS 170 | #> rs6603783 C NA PASS 171 | #> rs6603785 T NA PASS 172 | #> rs6603787 T NA PASS 173 | #> ------- 174 | #> seqinfo: 84 sequences from GRCh37 genome 175 | ``` 176 | 177 | ## Converting to simple dataframes 178 | 179 | The VCF object is somewhat complex and you can read more about it in the [VariantAnnotation package documentation](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html). You can create various other formats that might be easier to use from it. For example, create a `GRanges` object which is great for fast chromosome-position operations 180 | 181 | 182 | ``` r 183 | vcf_to_granges(vcf) 184 | #> GRanges object with 92 ranges and 15 metadata columns: 185 | #> seqnames ranges strand | paramRangeID REF ALT 186 | #> | 187 | #> rs12565286 1 721290 * | NA G C 188 | #> rs11804171 1 723819 * | NA T A 189 | #> rs2977670 1 723891 * | NA G C 190 | #> rs3094315 1 752566 * | NA G A 191 | #> rs2073813 1 753541 * | NA G A 192 | #> ... ... ... ... . ... ... ... 193 | #> rs715643 1 1172907 * | NA C T 194 | #> rs6675798 1 1176597 * | NA T C 195 | #> rs6603783 1 1181751 * | NA T C 196 | #> rs6603785 1 1186502 * | NA A T 197 | #> rs6603787 1 1188225 * | NA G T 198 | #> QUAL FILTER ES SE LP AF SS 199 | #> 200 | #> rs12565286 NA PASS -0.0067 0.0145 0.1930060 0.93220 109823.0 201 | #> rs11804171 NA PASS -0.0146 0.0175 0.3935110 0.96296 84828.0 202 | #> rs2977670 NA PASS 0.0044 0.0184 0.0909791 0.07143 68458.9 203 | #> rs3094315 NA PASS 0.0060 0.0065 0.4485500 0.15520 131544.0 204 | #> rs2073813 NA PASS 0.0035 0.0102 0.1357860 NA 64351.3 205 | #> ... ... ... ... ... ... ... ... 
206 | #> rs715643 NA PASS 0.0019 0.0118 0.0594337 0.90833 121822 207 | #> rs6675798 NA PASS -0.0013 0.0067 0.0725270 0.89170 223475 208 | #> rs6603783 NA PASS -0.0002 0.0069 0.0101499 0.90000 220022 209 | #> rs6603785 NA PASS 0.0075 0.0104 0.3271640 0.91667 165964 210 | #> rs6603787 NA PASS 0.0025 0.0089 0.1085740 NA 199099 211 | #> EZ SI NC ID id 212 | #> 213 | #> rs12565286 NA NA NA rs12565286 IEU-a-2 214 | #> rs11804171 NA NA NA rs11804171 IEU-a-2 215 | #> rs2977670 NA NA NA rs2977670 IEU-a-2 216 | #> rs3094315 NA NA NA rs3094315 IEU-a-2 217 | #> rs2073813 NA NA NA rs2073813 IEU-a-2 218 | #> ... ... ... ... ... ... 219 | #> rs715643 NA NA NA rs715643 IEU-a-2 220 | #> rs6675798 NA NA NA rs6675798 IEU-a-2 221 | #> rs6603783 NA NA NA rs6603783 IEU-a-2 222 | #> rs6603785 NA NA NA rs6603785 IEU-a-2 223 | #> rs6603787 NA NA NA rs6603787 IEU-a-2 224 | #> ------- 225 | #> seqinfo: 84 sequences from GRCh37 genome 226 | ``` 227 | 228 | Create a data frame: 229 | 230 | 231 | ``` r 232 | vcf_to_granges(vcf) %>% dplyr::as_tibble() 233 | #> # A tibble: 92 × 20 234 | #> seqnames start end width strand paramRangeID REF ALT QUAL FILTER ES 235 | #> 236 | #> 1 1 721290 721290 1 * G C NA PASS -0.0067 237 | #> 2 1 723819 723819 1 * T A NA PASS -0.0146 238 | #> 3 1 723891 723891 1 * G C NA PASS 0.0044 239 | #> 4 1 752566 752566 1 * G A NA PASS 0.006 240 | #> 5 1 753541 753541 1 * G A NA PASS 0.0035 241 | #> 6 1 754192 754192 1 * A G NA PASS 0.0077 242 | #> 7 1 768448 768448 1 * G A NA PASS -0.0027 243 | #> 8 1 775659 775659 1 * A G NA PASS 0.0029 244 | #> 9 1 777122 777122 1 * A T NA PASS 0.0031 245 | #> 10 1 779322 779322 1 * A G NA PASS -0.0062 246 | #> # ℹ 82 more rows 247 | #> # ℹ 9 more variables: SE , LP , AF , SS , EZ , SI , 248 | #> # NC , ID , id 249 | ``` 250 | 251 | The direct conversion to formats for tools such as TwoSampleMR, coloc, and many others can also be made using the [https://github.com/mrcieu/gwasglue](https://github.com/mrcieu/gwasglue) R package. 
252 | 253 | ## Reading in with filters 254 | 255 | The `query_gwas()` function takes either a filename to a vcf file, or vcf object as the main argument. You can then query on `rsid`, `pval` or `chrompos`. For example 256 | 257 | 258 | ``` r 259 | vcfsubset <- query_gwas(vcffile, chrompos=c("1:1097291-1099437")) 260 | ``` 261 | 262 | and 263 | 264 | 265 | ``` r 266 | vcf <- readVcf(vcffile) 267 | vcfsubset <- query_gwas(vcf, chrompos=c("1:1097291-1099437")) 268 | ``` 269 | 270 | are each identical, but the former saves time and memory because it is querying the file using an index and only reading in what is required. 271 | 272 | Examples of other filters are here: 273 | 274 | 275 | ``` r 276 | vcf <- query_gwas(vcffile, rsid=c("rs3128126", "rs3121561", "rs3813193")) 277 | vcf 278 | #> class: CollapsedVCF 279 | #> dim: 3 1 280 | #> rowRanges(vcf): 281 | #> GRanges with 5 metadata columns: paramRangeID, REF, ALT, QUAL, FILTER 282 | #> info(vcf): 283 | #> DataFrame with 3 columns: AF, AC, AN 284 | #> info(header(vcf)): 285 | #> Number Type Description 286 | #> AF A Float Allele Frequency 287 | #> AC A Integer Allele count in genotypes 288 | #> AN 1 Integer Total number of alleles in called genotypes 289 | #> geno(vcf): 290 | #> List of length 9: ES, SE, LP, AF, SS, EZ, SI, NC, ID 291 | #> geno(header(vcf)): 292 | #> Number Type Description 293 | #> ES A Float Effect size estimate relative to the alternative allele 294 | #> SE A Float Standard error of effect size estimate 295 | #> LP A Float -log10 p-value for effect estimate 296 | #> AF A Float Alternate allele frequency in the association study 297 | #> SS A Float Sample size used to estimate genetic effect 298 | #> EZ A Float Z-score provided if it was used to derive the EFFECT and SE f... 
299 | #> SI A Float Accuracy score of summary data imputation 300 | #> NC A Float Number of cases used to estimate genetic effect 301 | #> ID 1 String Study variant identifier 302 | ``` 303 | 304 | 305 | ``` r 306 | vcf <- query_gwas(vcffile, pval=0.5) 307 | vcf 308 | #> class: CollapsedVCF 309 | #> dim: 45 1 310 | #> rowRanges(vcf): 311 | #> GRanges with 5 metadata columns: paramRangeID, REF, ALT, QUAL, FILTER 312 | #> info(vcf): 313 | #> DataFrame with 3 columns: AF, AC, AN 314 | #> info(header(vcf)): 315 | #> Number Type Description 316 | #> AF A Float Allele Frequency 317 | #> AC A Integer Allele count in genotypes 318 | #> AN 1 Integer Total number of alleles in called genotypes 319 | #> geno(vcf): 320 | #> List of length 9: ES, SE, LP, AF, SS, EZ, SI, NC, ID 321 | #> geno(header(vcf)): 322 | #> Number Type Description 323 | #> ES A Float Effect size estimate relative to the alternative allele 324 | #> SE A Float Standard error of effect size estimate 325 | #> LP A Float -log10 p-value for effect estimate 326 | #> AF A Float Alternate allele frequency in the association study 327 | #> SS A Float Sample size used to estimate genetic effect 328 | #> EZ A Float Z-score provided if it was used to derive the EFFECT and SE f... 
329 | #> SI A Float Accuracy score of summary data imputation 330 | #> NC A Float Number of cases used to estimate genetic effect 331 | #> ID 1 String Study variant identifier 332 | ``` 333 | 334 | 335 | ``` r 336 | vcf <- query_gwas(vcffile, chrompos=c("1:1097291-1099437")) 337 | vcf 338 | #> class: CollapsedVCF 339 | #> dim: 2 1 340 | #> rowRanges(vcf): 341 | #> GRanges with 5 metadata columns: paramRangeID, REF, ALT, QUAL, FILTER 342 | #> info(vcf): 343 | #> DataFrame with 3 columns: AF, AC, AN 344 | #> info(header(vcf)): 345 | #> Number Type Description 346 | #> AF A Float Allele Frequency 347 | #> AC A Integer Allele count in genotypes 348 | #> AN 1 Integer Total number of alleles in called genotypes 349 | #> geno(vcf): 350 | #> List of length 9: ES, SE, LP, AF, SS, EZ, SI, NC, ID 351 | #> geno(header(vcf)): 352 | #> Number Type Description 353 | #> ES A Float Effect size estimate relative to the alternative allele 354 | #> SE A Float Standard error of effect size estimate 355 | #> LP A Float -log10 p-value for effect estimate 356 | #> AF A Float Alternate allele frequency in the association study 357 | #> SS A Float Sample size used to estimate genetic effect 358 | #> EZ A Float Z-score provided if it was used to derive the EFFECT and SE f... 359 | #> SI A Float Accuracy score of summary data imputation 360 | #> NC A Float Number of cases used to estimate genetic effect 361 | #> ID 1 String Study variant identifier 362 | ``` 363 | 364 | It's possible to chain filters together e.g. 
365 | 366 | 367 | ``` r 368 | vcf <- query_gwas(vcffile, rsid=c("rs3128126", "rs3121561", "rs3813193")) %>% 369 | query_gwas(pval=0.5) 370 | vcf 371 | #> class: CollapsedVCF 372 | #> dim: 1 1 373 | #> rowRanges(vcf): 374 | #> GRanges with 5 metadata columns: paramRangeID, REF, ALT, QUAL, FILTER 375 | #> info(vcf): 376 | #> DataFrame with 3 columns: AF, AC, AN 377 | #> info(header(vcf)): 378 | #> Number Type Description 379 | #> AF A Float Allele Frequency 380 | #> AC A Integer Allele count in genotypes 381 | #> AN 1 Integer Total number of alleles in called genotypes 382 | #> geno(vcf): 383 | #> List of length 9: ES, SE, LP, AF, SS, EZ, SI, NC, ID 384 | #> geno(header(vcf)): 385 | #> Number Type Description 386 | #> ES A Float Effect size estimate relative to the alternative allele 387 | #> SE A Float Standard error of effect size estimate 388 | #> LP A Float -log10 p-value for effect estimate 389 | #> AF A Float Alternate allele frequency in the association study 390 | #> SS A Float Sample size used to estimate genetic effect 391 | #> EZ A Float Z-score provided if it was used to derive the EFFECT and SE f... 392 | #> SI A Float Accuracy score of summary data imputation 393 | #> NC A Float Number of cases used to estimate genetic effect 394 | #> ID 1 String Study variant identifier 395 | ``` 396 | 397 | It's possible to have multiple GWAS studies per vcf. You can specify specific GWAS studies to read in using e.g. 398 | 399 | 400 | ``` r 401 | vcf <- query_gwas(vcffile, rsid=c("rs3128126", "rs3121561", "rs3813193"), id="IEU-a-2") 402 | ``` 403 | 404 | Note that querying by chrompos is the fastest way to deal with VCFs, use this over rsid where possible when speed is an issue. 405 | 406 | ## Indexing rsid values 407 | 408 | Querying by rsid is slow. If a large number of queries by rsid are to be performed then it could be worth generating an index which would speed up the querying. 
This approach uses [SQLite](https://www.sqlite.org/index.html) to create a local database, linking rsid to chromosome and position. It strips out the 'rs' from the rs identifiers to allow fast searches by integer. The concept is based on that developed here: [bioforensics/rsidx](https://github.com/bioforensics/rsidx). 409 | 410 | To create the index: 411 | 412 | 413 | ``` r 414 | create_rsidx_index_from_vcf(vcffile, "index.rsidx") 415 | #> Extracting position info 416 | #> Generating index 417 | ``` 418 | 419 | To query using the index: 420 | 421 | 422 | ``` r 423 | vcf <- query_gwas(vcffile, rsid=c("rs3128126", "rs3121561", "rs3813193"), rsidx="index.rsidx") 424 | ``` 425 | 426 | ## Indexing p-values 427 | 428 | Querying by p-value is slow. It could be worth generating an index file for p-values to speed this up. Similar to rsid queries, it uses an sqlite database linking -log10 pvalues to chromosome and position. 429 | 430 | To create the index: 431 | 432 | 433 | ``` r 434 | create_pval_index_from_vcf(vcffile, maximum_pval=0.05, "index.pvali") 435 | #> Extracting pval info 436 | #> [1] "CREATE TABLE pval_to_coord (chrom TEXT NOT NULL DEFAULT NULL, coord INTEGER NOT NULL DEFAULT NULL, LP REAL NOT NULL DEFAULT 0);" 437 | #> [2] ".separator ," 438 | #> [3] ".import /var/folders/9j/bw4vdrw94yndry3z9cv8ms1m0000gn/T//RtmpFnAKs1/file22f4fdc3a4d pval_to_coord" 439 | #> [4] "CREATE INDEX idx_LP ON pval_to_coord (LP)" 440 | #> Generating index 441 | ``` 442 | 443 | To query using the index: 444 | 445 | 446 | ``` r 447 | vcf <- query_gwas(vcffile, pval=0.05, pvali="index.pvali") 448 | #> Using pval index 449 | #> Identified 7 variants passing threshold. Extracting... 450 | ``` 451 | 452 | ## A note about chrompos 453 | 454 | The fastest way to query VCFs is by specifying chromosome and position. You can specify single positions or ranges, e.g. 
455 | 456 | 457 | ``` r 458 | cp <- c("1:10000", "2:10000-20000") 459 | ``` 460 | 461 | or as a data frame 462 | 463 | 464 | ``` r 465 | cp <- dplyr::tibble(chrom=c(1,2), start=c(10000,10000), end=c(10000, 20000)) 466 | ``` 467 | 468 | You can check what will be parsed out with: 469 | 470 | 471 | ``` r 472 | parse_chrompos(cp) 473 | #> GRanges object with 2 ranges and 0 metadata columns: 474 | #> seqnames ranges strand 475 | #> 476 | #> [1] 1 10000 * 477 | #> [2] 2 10000-20000 * 478 | #> ------- 479 | #> seqinfo: 2 sequences from an unspecified genome; no seqlengths 480 | ``` 481 | 482 | Querying by p-value or rsid is also possible but is slower as only chrompos is indexed. On Mac and Linux, rsid and p-value queries are performed by calls to bcftools. On Windows it uses VariantAnnotation directly, because bcftools binaries are not available. This is unfortunately somewhat slower. If many operations are being performed it might be faster to read in the whole dataset and perform queries that way. 483 | 484 | ## LD proxies 485 | 486 | If a set of rsids are requested from a vcf but some are absent, a reference panel can be used to search for LD proxies, extract them, and align the effects and alleles against the original variants that were requested. 487 | 488 | There are two ways to perform the LD proxy search: 489 | 490 | - using a set of genotyped samples as an LD reference panel (e.g. 1000 genomes data) - this is slow but relatively convenient 491 | - compiling an LD tag list from an LD reference panel - once generated this is very fast 492 | 493 | ### Using an LD reference panel 494 | 495 | An LD reference panel can be obtained from here: [http://fileserve.mrcieu.ac.uk/ld/data_maf0.01_rs_ref.tgz](http://fileserve.mrcieu.ac.uk/ld/data_maf0.01_rs_ref.tgz). This dataset comprises Europeans from the 1000 genomes project, in plink format, and including only SNPs with MAF > 0.01, and with the reference alleles aligned to the human genome reference sequence. 
For this vignette we can use a small subset of that dataset: 496 | 497 | 498 | ``` r 499 | ldfile <- system.file("extdata", "eur.bed", package="gwasvcf") %>% 500 | gsub(".bed", "", .) 501 | ``` 502 | 503 | We also need to provide a path to the plink binary used to generate LD calculations. This can be done through the `genetics.binaRies` package as with bcftools 504 | 505 | 506 | ``` r 507 | set_plink() 508 | #> Path not provided, using binaries in the MRCIEU/genetics.binaRies package 509 | ``` 510 | 511 | The rs4442317 variant is not present in the vcf file, i.e. if we query that variant: 512 | 513 | 514 | ``` r 515 | query_gwas(vcffile, rsid="rs4442317") %>% nrow 516 | #> [1] 0 517 | ``` 518 | 519 | 520 | ``` r 521 | vcf <- query_gwas(vcffile, rsid="rs4442317", proxies="yes", bfile=ldfile, tag_r2=0.05) 522 | #> Initial search... 523 | #> Extracted 0 out of 1 rsids 524 | #> Searching for proxies for 1 rsids 525 | #> Determining searchspace... 526 | #> Proxy lookup... 527 | #> Finding proxies... 528 | #> Found 10 proxies 529 | #> Extrating proxies... 530 | #> Identified proxies for 1 of 1 rsids 531 | #> Aligning... 532 | vcf %>% vcf_to_granges() 533 | #> GRanges object with 1 range and 15 metadata columns: 534 | #> seqnames ranges strand | REF ALT QUAL 535 | #> | 536 | #> rs4442317 1 1106784 * | T C NA 537 | #> FILTER ES SE LP AF SS EZ 538 | #> 539 | #> rs4442317 PASS 0.0059 0.0071 0.391474 0.8559 138001 NA 540 | #> SI NC ID PR id 541 | #> 542 | #> rs4442317 NA NA rs4970420 rs4970420 IEU-a-2 543 | #> ------- 544 | #> seqinfo: 1 sequence from an unspecified genome; no seqlengths 545 | ``` 546 | 547 | Here we see that the proxy variant is rs4970420. 548 | 549 | You may also extract only the best available proxies even if the requested rsids are present, by using `proxies="only"`. 
An example of this shows that the effect size estimates for the proxy variants are aligned to the effect alleles of the target variants: 550 | 551 | 552 | 553 | ``` r 554 | # Read vcf 555 | a <- readVcf(vcffile) 556 | 557 | # Obtain the best LD proxy for each of the rsids 558 | b <- query_gwas(vcffile, rsid=names(a), proxies="only", bfile=ldfile, tag_r2=0.6) 559 | #> Determining searchspace... 560 | #> Proxy lookup... 561 | #> Finding proxies... 562 | #> Found 270 proxies 563 | #> Extrating proxies... 564 | #> Identified proxies for 52 of 1 rsids 565 | #> Aligning... 566 | 567 | # Match the target data to the proxy data 568 | index <- match(names(b), names(a)) 569 | 570 | # Plot the target data effects against the proxy data effects 571 | plot(vcf_to_granges(b)$ES, vcf_to_granges(a)$ES[index]) 572 | ``` 573 | 574 |
575 | ![Plot of the target data effects against the proxy data effects](figure/target-effects-plot-1.png) 576 | 

Plot of the target data effects against the proxy data effects

577 | 
578 | 579 | ### Compiling a list of tagging variants 580 | 581 | Using the LD reference panel described above, it is possible to create a sqlite tag reference panel using the following commands. First get an example LD reference panel: 582 | 583 | 584 | ``` r 585 | ldfile <- system.file("extdata", "eur.bed", package="gwasvcf") %>% 586 | gsub(".bed", "", .) 587 | ``` 588 | 589 | We also need to provide a path to the plink binary used to generate LD calculations. This can be done through the `genetics.binaRies` package as with bcftools 590 | 591 | 592 | ``` r 593 | set_plink() 594 | #> Path not provided, using binaries in the MRCIEU/genetics.binaRies package 595 | ``` 596 | 597 | Now generate the tagging database 598 | 599 | 600 | ``` r 601 | dbfile <- tempfile() 602 | create_ldref_sqlite(ldfile, dbfile, tag_r2 = 0.05) 603 | #> identifying indels to remove 604 | #> calculating ld tags 605 | #> formatting 606 | #> creating sqlite db 607 | ``` 608 | 609 | Perform the query 610 | 611 | 612 | ``` r 613 | vcf <- query_gwas(vcffile, rsid="rs4442317", proxies="yes", dbfile=dbfile, tag_r2=0.05) 614 | #> Initial search... 615 | #> Extracted 0 out of 1 rsids 616 | #> Searching for proxies for 1 rsids 617 | #> Proxy lookup... 618 | #> Found 168 proxies 619 | #> Extrating proxies... 620 | #> Identified proxies for 1 of 1 rsids 621 | #> Aligning... 
622 | vcf %>% vcf_to_granges() 623 | #> GRanges object with 1 range and 15 metadata columns: 624 | #> seqnames ranges strand | REF ALT QUAL 625 | #> | 626 | #> rs4442317 1 1106784 * | T C NA 627 | #> FILTER ES SE LP AF SS EZ 628 | #> 629 | #> rs4442317 PASS -4e-04 0.0066 0.0214999 0.9 233073 NA 630 | #> SI NC ID PR id 631 | #> 632 | #> rs4442317 NA NA rs10907175 rs10907175 IEU-a-2 633 | #> ------- 634 | #> seqinfo: 1 sequence from an unspecified genome; no seqlengths 635 | ``` 636 | 637 | 638 | 639 | ## Creating the VCF object from a data frame 640 | 641 | If you have GWAS summary data in a text file or data frame, this can be converted to a VCF object. 642 | 643 | 644 | ``` r 645 | vcf <- readVcf(vcffile) 646 | vv <- vcf_to_granges(vcf) %>% dplyr::as_tibble() 647 | out <- vv %$% create_vcf(chrom=seqnames, pos=start, nea=REF, ea=ALT, snp=ID, ea_af=AF, effect=ES, se=SE, pval=10^-LP, n=SS, name="a") 648 | out 649 | #> class: CollapsedVCF 650 | #> dim: 92 1 651 | #> rowRanges(vcf): 652 | #> GRanges with 4 metadata columns: REF, ALT, QUAL, FILTER 653 | #> info(vcf): 654 | #> DataFrame with 0 columns: 655 | #> geno(vcf): 656 | #> List of length 6: AF, ES, SE, LP, SS, ID 657 | #> geno(header(vcf)): 658 | #> Number Type Description 659 | #> AF A Float Alternate allele frequency in the association study 660 | #> ES A Float Effect size estimate relative to the alternative allele 661 | #> SE A Float Standard error of effect size estimate 662 | #> LP A Float -log10 p-value for effect estimate 663 | #> SS A Float Sample size used to estimate genetic effect 664 | #> ID A String Study variant identifier 665 | ``` 666 | 667 | It's possible to write the vcf file: 668 | 669 | 670 | ``` r 671 | writeVcf(out, file="temp.vcf") 672 | ``` 673 | 674 | You may want to first harmonise the data so that all the non-effect alleles are aligned to the human genome reference. See the [gwasglue](https://github.com/MRCIEU/gwasglue) package on some functions to do this. 
675 | 676 | ## Creating a gwasglue2 SummarySet object from a vcf file 677 | 678 | Although still under development, if compared with its predecessor, the [gwasglue2](https://mrcieu.github.io/gwasglue2/) package has several new features, including the use of S4 R objects. 679 | 680 | It is possible to create a `SummarySet` object from a GWAS-VCF file or VCF object e.g. output from `VariantAnnotation::readVcf()`, `create_vcf()` or `query_gwas()` using the `gwasvcf_to_summaryset()` function. 681 | 682 | For example: 683 | 684 | 685 | ``` r 686 | summaryset <- readVcf(vcffile) %>% 687 | gwasvcf_to_summaryset() 688 | ``` 689 | 690 | Once the `SummarySet` objects are created, it is possible to use `gwasglue2` to harmonise data, harmonise against a LD matrix, remap genomic coordinates to a different genome assembly, convert to other formats and more. 691 | -------------------------------------------------------------------------------- /vignettes/guide.Rmd.orig: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Reading, querying and writing GWAS summary data in VCF format" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{Reading, querying and writing GWAS summary data in VCF format} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | --- 9 | 10 | ```{r, include = FALSE} 11 | knitr::opts_chunk$set( 12 | collapse = TRUE, 13 | comment = "#>" 14 | ) 15 | ``` 16 | 17 | We developed a format for storing and harmonising GWAS summary data known as [GWAS VCF format](https://github.com/MRCIEU/gwas-vcf-specification). This format is effective for being very fast when querying chromosome and position ranges, handling multiallelic variants and indels. 18 | 19 | All the data in the [IEU GWAS database](https://gwas.mrcieu.ac.uk/) is available for download in the GWAS VCF format. 
This R package provides fast and convenient functions for querying and creating GWAS summary data in GWAS VCF format. The package builds on the [VariantAnnotation](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html) Bioconductor package, which itself is based on the widely used [SummarizedExperiment](https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html) Bioconductor package. 20 | 21 | 22 | ## External tools 23 | 24 | For some VCF querying functions it is faster to optionally use [bcftools](https://samtools.github.io/bcftools/bcftools.html), and when available the R package will use that strategy. To set a location for the bcftools package, use 25 | 26 | ```r 27 | library(gwasvcf) 28 | set_bcftools('/path/to/bcftools') 29 | ``` 30 | 31 | Note that there is no bcftools binary available for Windows, so some querying options will be slower on Windows. 32 | 33 | For LD related functions the package uses [plink 1.90](https://www.cog-genomics.org/plink/1.9). You can specify the location of your plink installation by running 34 | 35 | ```r 36 | set_plink('/path/to/plink') 37 | ``` 38 | 39 | Alternatively you can automatically use the binaries bundled here: https://github.com/mrcieu/genetics.binaRies 40 | 41 | ```r 42 | remotes::install_github('mrcieu/genetics.binaRies') 43 | set_plink() 44 | set_bcftools() 45 | ``` 46 | 47 | To unset a path: 48 | 49 | ```r 50 | set_plink(NULL) 51 | set_bcftools(NULL) 52 | ``` 53 | 54 | For this vignette we will use the bundled binaries in `genetics.binaRies`. 55 | 56 | ```{r} 57 | suppressWarnings(suppressPackageStartupMessages({ 58 | library(gwasvcf) 59 | library(VariantAnnotation) 60 | library(dplyr) 61 | library(magrittr) 62 | })) 63 | ``` 64 | ```{r eval=Sys.info()["sysname"] != "Windows"} 65 | set_bcftools() 66 | ``` 67 | 68 | ## Reading in everything 69 | 70 | To read an entire dataset use the `readVcf` function.
As an example we'll use the bundled data which is a small subset of the Speliotes et al 2010 BMI GWAS. 71 | 72 | ```{r} 73 | vcffile <- system.file("extdata", "data.vcf.gz", package="gwasvcf") 74 | vcf <- readVcf(vcffile) 75 | class(vcf) 76 | ``` 77 | 78 | Please refer to the `VariantAnnotation` package documentation for full details about the `CollapsedVCF` object. A brief summary follows. 79 | 80 | General info about the dataset can be obtained by calling it: 81 | 82 | ```{r} 83 | vcf 84 | ``` 85 | 86 | There are 92 rows and 1 column which means 92 SNPs and one GWAS. See the header information: 87 | 88 | ```{r} 89 | header(vcf) 90 | ``` 91 | 92 | See the names of the GWAS datasets (in this case just one, and it refers to the IEU GWAS database ID name): 93 | 94 | ```{r} 95 | samples(header(vcf)) 96 | ``` 97 | 98 | In this case you can obtain information about this study through the `ieugwasr` package e.g. `ieugwasr::gwasinfo("IEU-a-2")`. 99 | 100 | There are a few components within the object: 101 | 102 | - `header` which has the meta data describing the dataset, including the association result variables 103 | - `rowRanges` which is information about each variant 104 | - `info` which is further metadata about each variant 105 | - `geno` which is the actual association results for each GWAS 106 | 107 | the `rowRanges` object is a `GenomicRanges` class, which is useful for performing fast operations on chromosome position information. 108 | 109 | ```{r} 110 | rowRanges(vcf) 111 | ``` 112 | 113 | ## Converting to simple dataframes 114 | 115 | The VCF object is somewhat complex and you can read more about it in the [VariantAnnotation package documentation](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html). You can create various other formats that might be easier to use from it. 
For example, create a `GRanges` object which is great for fast chromosome-position operations 116 | 117 | ```{r} 118 | vcf_to_granges(vcf) 119 | ``` 120 | 121 | Create a data frame: 122 | 123 | ```{r} 124 | vcf_to_granges(vcf) %>% dplyr::as_tibble() 125 | ``` 126 | 127 | The direct conversion to formats for tools such as TwoSampleMR, coloc, and many others can also be made using the [https://github.com/mrcieu/gwasglue](https://github.com/mrcieu/gwasglue) R package. 128 | 129 | ## Reading in with filters 130 | 131 | The `query_gwas()` function takes either a filename to a vcf file, or vcf object as the main argument. You can then query on `rsid`, `pval` or `chrompos`. For example 132 | 133 | ```{r} 134 | vcfsubset <- query_gwas(vcffile, chrompos=c("1:1097291-1099437")) 135 | ``` 136 | 137 | and 138 | 139 | ```{r} 140 | vcf <- readVcf(vcffile) 141 | vcfsubset <- query_gwas(vcf, chrompos=c("1:1097291-1099437")) 142 | ``` 143 | 144 | are each identical, but the former saves time and memory because it is querying the file using an index and only reading in what is required. 145 | 146 | Examples of other filters are here: 147 | 148 | ```{r} 149 | vcf <- query_gwas(vcffile, rsid=c("rs3128126", "rs3121561", "rs3813193")) 150 | vcf 151 | ``` 152 | 153 | ```{r} 154 | vcf <- query_gwas(vcffile, pval=0.5) 155 | vcf 156 | ``` 157 | 158 | ```{r} 159 | vcf <- query_gwas(vcffile, chrompos=c("1:1097291-1099437")) 160 | vcf 161 | ``` 162 | 163 | It's possible to chain filters together e.g. 164 | 165 | ```{r} 166 | vcf <- query_gwas(vcffile, rsid=c("rs3128126", "rs3121561", "rs3813193")) %>% 167 | query_gwas(pval=0.5) 168 | vcf 169 | ``` 170 | 171 | It's possible to have multiple GWAS studies per vcf. You can specify specific GWAS studies to read in using e.g. 
172 | 173 | ```{r} 174 | vcf <- query_gwas(vcffile, rsid=c("rs3128126", "rs3121561", "rs3813193"), id="IEU-a-2") 175 | ``` 176 | 177 | Note that querying by chrompos is the fastest way to deal with VCFs; use this over rsid where possible when speed is an issue. 178 | 179 | ## Indexing rsid values 180 | 181 | Querying by rsid is slow. If a large number of queries by rsid are to be performed then it could be worth generating an index which would speed up the querying. This approach uses [SQLite](https://www.sqlite.org/index.html) to create a local database, linking rsid to chromosome and position. It strips out the 'rs' from the rs identifiers to make fast searches by integer. The concept is based on that developed here: [bioforensics/rsidx](https://github.com/bioforensics/rsidx). 182 | 183 | To create the index: 184 | 185 | ```{r eval=Sys.info()["sysname"] != "Windows"} 186 | create_rsidx_index_from_vcf(vcffile, "index.rsidx") 187 | ``` 188 | 189 | To query using the index: 190 | 191 | ```{r eval=Sys.info()["sysname"] != "Windows"} 192 | vcf <- query_gwas(vcffile, rsid=c("rs3128126", "rs3121561", "rs3813193"), rsidx="index.rsidx") 193 | ``` 194 | 195 | ## Indexing p-values 196 | 197 | Querying by p-value is slow. It could be worth generating an index file for p-values to speed this up. Similar to rsid queries, it uses an sqlite database linking -log10 pvalues to chromosome and position. 198 | 199 | To create the index: 200 | 201 | ```{r eval=Sys.info()["sysname"] != "Windows"} 202 | create_pval_index_from_vcf(vcffile, maximum_pval=0.05, "index.pvali") 203 | ``` 204 | 205 | To query using the index: 206 | 207 | ```{r eval=Sys.info()["sysname"] != "Windows"} 208 | vcf <- query_gwas(vcffile, pval=0.05, pvali="index.pvali") 209 | ``` 210 | 211 | ## A note about chrompos 212 | 213 | The fastest way to query VCFs is by specifying chromosome and position. You can specify individual positions or ranges, e.g.
214 | 215 | ```{r} 216 | cp <- c("1:10000", "2:10000-20000") 217 | ``` 218 | 219 | or as a data frame 220 | 221 | ```{r} 222 | cp <- dplyr::tibble(chrom=c(1,2), start=c(10000,10000), end=c(10000, 20000)) 223 | ``` 224 | 225 | You can check what will be parsed out with: 226 | 227 | ```{r} 228 | parse_chrompos(cp) 229 | ``` 230 | 231 | Querying by p-value or rsid is also possible but is slower as only chrompos is indexed. On Mac and Linux, rsid and p-value queries are performed by calls to bcftools. On Windows it uses VariantAnnotation directly, because bcftools binaries are not available. This is unfortunately somewhat slower. If many operations are being performed it might be faster to read in the whole dataset and perform queries that way. 232 | 233 | ## LD proxies 234 | 235 | If a set of rsids are requested from a vcf but some are absent, a reference panel can be used to search for LD proxies, extract them, and align the effects and alleles against the original variants that were requested. 236 | 237 | There are two ways to perform the LD proxy search: 238 | 239 | - using a set of genotyped samples as an LD reference panel (e.g. 1000 genomes data) - this is slow but relatively convenient 240 | - compiling an LD tag list from an LD reference panel - once generated this is very fast 241 | 242 | ### Using an LD reference panel 243 | 244 | An LD reference panel can be obtained from here: [http://fileserve.mrcieu.ac.uk/ld/data_maf0.01_rs_ref.tgz](http://fileserve.mrcieu.ac.uk/ld/data_maf0.01_rs_ref.tgz). This dataset comprises Europeans from the 1000 genomes project, in plink format, and including only SNPs with MAF > 0.01, and with the reference alleles aligned to the human genome reference sequence. For this vignette we can use a small subset of that dataset: 245 | 246 | ```{r} 247 | ldfile <- system.file("extdata", "eur.bed", package="gwasvcf") %>% 248 | gsub(".bed", "", .) 
249 | ``` 250 | 251 | We also need to provide a path to the plink binary used to generate LD calculations. This can be done through the `genetics.binaRies` package as with bcftools 252 | 253 | ```{r} 254 | set_plink() 255 | ``` 256 | 257 | The rs4442317 variant is not present in the vcf file, i.e. if we query that variant: 258 | 259 | ```{r} 260 | query_gwas(vcffile, rsid="rs4442317") %>% nrow 261 | ``` 262 | 263 | ```{r eval=Sys.info()["sysname"] != "Windows"} 264 | vcf <- query_gwas(vcffile, rsid="rs4442317", proxies="yes", bfile=ldfile, tag_r2=0.05) 265 | vcf %>% vcf_to_granges() 266 | ``` 267 | 268 | Here we see that the proxy variant is `r vcf_to_granges(vcf)$PR`. 269 | 270 | You may also extract only the best available proxies even if the requested rsids are present, by using `proxies="only"`. An example of this shows that the effect size estimates for the proxy variants are aligned to the effect alleles of the target variants: 271 | 272 | 273 | ```{r target-effects-plot, eval=Sys.info()["sysname"] != "Windows", fig.alt="Plot of the target data effects against the proxy data effects", fig.cap="Plot the target data effects against the proxy data effects"} 274 | # Read vcf 275 | a <- readVcf(vcffile) 276 | 277 | # Obtain the best LD proxy for each of the rsids 278 | b <- query_gwas(vcffile, rsid=names(a), proxies="only", bfile=ldfile, tag_r2=0.6) 279 | 280 | # Match the target data to the proxy data 281 | index <- match(names(b), names(a)) 282 | 283 | # Plot the target data effects against the proxy data effects 284 | plot(vcf_to_granges(b)$ES, vcf_to_granges(a)$ES[index]) 285 | ``` 286 | 287 | ### Compiling a list of tagging variants 288 | 289 | Using the LD reference panel described above, it is possible to create a sqlite tag reference panel using the following commands. First get an example LD reference panel: 290 | 291 | ```{r} 292 | ldfile <- system.file("extdata", "eur.bed", package="gwasvcf") %>% 293 | gsub(".bed", "", .) 
294 | ``` 295 | 296 | We also need to provide a path to the plink binary used to generate LD calculations. This can be done through the `genetics.binaRies` package as with bcftools 297 | 298 | ```{r} 299 | set_plink() 300 | ``` 301 | 302 | Now generate the tagging database 303 | 304 | ```{r eval=Sys.info()["sysname"] != "Windows"} 305 | dbfile <- tempfile() 306 | create_ldref_sqlite(ldfile, dbfile, tag_r2 = 0.05) 307 | ``` 308 | 309 | Perform the query 310 | 311 | ```{r eval=Sys.info()["sysname"] != "Windows"} 312 | vcf <- query_gwas(vcffile, rsid="rs4442317", proxies="yes", dbfile=dbfile, tag_r2=0.05) 313 | vcf %>% vcf_to_granges() 314 | ``` 315 | 316 | ```{r, echo=FALSE, eval=Sys.info()["sysname"] != "Windows"} 317 | unlink(dbfile) 318 | ``` 319 | 320 | ## Creating the VCF object from a data frame 321 | 322 | If you have GWAS summary data in a text file or data frame, this can be converted to a VCF object. 323 | 324 | ```{r} 325 | vcf <- readVcf(vcffile) 326 | vv <- vcf_to_granges(vcf) %>% dplyr::as_tibble() 327 | out <- vv %$% create_vcf(chrom=seqnames, pos=start, nea=REF, ea=ALT, snp=ID, ea_af=AF, effect=ES, se=SE, pval=10^-LP, n=SS, name="a") 328 | out 329 | ``` 330 | 331 | It's possible to write the vcf file: 332 | 333 | ```{r, eval=FALSE} 334 | writeVcf(out, file="temp.vcf") 335 | ``` 336 | 337 | You may want to first harmonise the data so that all the non-effect alleles are aligned to the human genome reference. See the [gwasglue](https://github.com/MRCIEU/gwasglue) package on some functions to do this. 338 | 339 | ## Creating a gwasglue2 SummarySet object from a vcf file 340 | 341 | Although still under development, if compared with its predecessor, the [gwasglue2](https://mrcieu.github.io/gwasglue2/) package has several new features, including the use of S4 R objects. 342 | 343 | It is possible to create a `SummarySet` object from a GWAS-VCF file or VCF object e.g. 
output from `VariantAnnotation::readVcf()`, `create_vcf()` or `query_gwas()` using the `gwasvcf_to_summaryset()` function. 344 | 345 | For example: 346 | 347 | ```{r, eval=FALSE} 348 | summaryset <- readVcf(vcffile) %>% 349 | gwasvcf_to_summaryset() 350 | ``` 351 | 352 | Once the `SummarySet` objects are created, it is possible to use `gwasglue2` to harmonise data, harmonise against a LD matrix, remap genomic coordinates to a different genome assembly, convert to other formats and more. 353 | -------------------------------------------------------------------------------- /vignettes/precompile.R: -------------------------------------------------------------------------------- 1 | # Execute the code from the vignette 2 | knitr::knit("vignettes/guide.Rmd.orig", output = "vignettes/guide.Rmd") 3 | file.rename("figure/target-effects-plot-1.png", "vignettes/figure/target-effects-plot-1.png") 4 | --------------------------------------------------------------------------------