├── vignettes ├── .gitignore ├── spark_functions.Rmd ├── label_fields.Rmd └── ascertain_diagnoses.Rmd ├── .Rbuildignore ├── R ├── sysdata.rda ├── get_emr_spark.R ├── upload_to_rap.R ├── get_cancer_registry.R ├── get_selfrep_illness.R ├── get_rap_phenos.R ├── fields_to_phenos.R ├── label_ukb_field.R ├── get_selfrep_illness_spark.R ├── export_tables.R ├── get_diagnoses.R └── get_df.R ├── man ├── figures │ └── ukbrapR.png ├── load_bed.Rd ├── download_from_rap.Rd ├── upload_to_rap.Rd ├── get_emr_spark.Rd ├── make_imputed_bed.Rd ├── make_dragen_bed.Rd ├── label_ukb_field.Rd ├── fields_to_phenos.Rd ├── label_ukb_fields.Rd ├── get_rap_phenos.Rd ├── extract_variants.Rd ├── create_pgs.Rd ├── get_diagnoses.Rd ├── get_selfrep_illness_spark.Rd ├── export_tables.Rd └── get_df.Rd ├── inst └── files │ ├── dragen_pvcf_coordinates.csv.gz │ ├── pgs_liver_cirrhosis.txt │ └── pgs_liver_cirrhosis.README ├── NAMESPACE ├── .gitignore ├── _pkgdown.yml ├── DESCRIPTION ├── data-raw └── internal_files.R ├── README.md └── NEWS.md /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^_pkgdown\.yml$ 2 | ^docs$ 3 | ^pkgdown$ 4 | -------------------------------------------------------------------------------- /R/sysdata.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcpilling/ukbrapR/HEAD/R/sysdata.rda -------------------------------------------------------------------------------- /man/figures/ukbrapR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcpilling/ukbrapR/HEAD/man/figures/ukbrapR.png -------------------------------------------------------------------------------- /inst/files/dragen_pvcf_coordinates.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcpilling/ukbrapR/HEAD/inst/files/dragen_pvcf_coordinates.csv.gz -------------------------------------------------------------------------------- /inst/files/pgs_liver_cirrhosis.txt: -------------------------------------------------------------------------------- 1 | rsID CHR POS effect_allele other_allele effect_weight locus_name 2 | rs2642438 1 220796686 A G -0.177 MARC1 3 | rs11925835 3 56831417 T C -0.235 ARHGEF3 4 | rs72613567 4 87310241 TA T -0.166 HSD17B13 5 | rs2954038 8 125495147 C A 0.16 TRIB1 6 | rs11065384 12 120985482 T C 0.275 HNF1A 7 | rs28929474 14 94378610 T C 0.561 SERPINA1 8 | rs10401969 19 19296909 C T 0.678 SUGP1 9 | rs15052 19 41307470 C T 0.222 HNRNPUL1 10 | rs738408 22 43928850 T C 0.734 PNPLA3 11 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(create_pgs) 4 | export(download_from_rap) 5 | export(export_tables) 6 | export(extract_variants) 7 | export(fields_to_phenos) 8 | export(get_df) 9 | export(get_diagnoses) 10 | export(get_emr) 11 | export(get_emr_spark) 12 | export(get_rap_phenos) 13 | export(get_selfrep_illness_spark) 14 | export(label_ukb_field) 15 | export(label_ukb_fields) 16 | export(load_bed) 17 | export(make_dragen_bed) 18 | export(make_imputed_bed) 19 | export(upload_to_rap) 20 | -------------------------------------------------------------------------------- /inst/files/pgs_liver_cirrhosis.README: -------------------------------------------------------------------------------- 1 | ###PGS CATALOG SCORING FILE - see https://www.pgscatalog.org/downloads/#dl_ftp_scoring for additional information 2 | #format_version=2.0 3 | ##POLYGENIC SCORE (PGS) INFORMATION 4 | #pgs_id=PGS000776 5 | #pgs_name=GRS9_Cirr 6 | #trait_reported=Cirrhosis 7 | #trait_mapped=cirrhosis of liver 8 | #trait_efo=EFO_0001422 9 | #genome_build=NR 10 | #variants_number=9 11 | #weight_type=beta 12 | ##SOURCE INFORMATION 13 | #pgp_id=PGP000180 14 | #citation=Innes H et al. Gastroenterology (2020). doi:10.1053/j.gastro.2020.06.014 15 | ##HARMONIZATION DETAILS 16 | #HmPOS_build=GRCh38 17 | #HmPOS_date=2022-07-29 18 | #HmPOS_match_chr={"True": null, "False": null} 19 | #HmPOS_match_pos={"True": null, "False": null} -------------------------------------------------------------------------------- /man/load_bed.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/extract_variants.R 3 | \name{load_bed} 4 | \alias{load_bed} 5 | \title{Load BED file into memory} 6 | \usage{ 7 | load_bed(in_bed, verbose = FALSE, very_verbose = FALSE) 8 | } 9 | \arguments{ 10 | \item{in_bed}{A string. BED prefix} 11 | 12 | \item{verbose}{Logical. Be verbose (show individual steps), 13 | \code{default=FALSE}} 14 | 15 | \item{very_verbose}{Logical. Be very verbose (show individual steps & show terminal output from Plink etc), 16 | \code{default=FALSE}} 17 | } 18 | \value{ 19 | A data frame 20 | } 21 | \description{ 22 | Use Plink to convert BED to RAW then easily load it 23 | } 24 | \examples{ 25 | 26 | liver_variants <- load_bed(in_bed="liver_cirrhosis.imputed.variants") 27 | 28 | } 29 | \author{ 30 | Luke Pilling 31 | } 32 | -------------------------------------------------------------------------------- /man/download_from_rap.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/upload_to_rap.R 3 | \name{download_from_rap} 4 | \alias{download_from_rap} 5 | \title{Use R to download a file to the UK Biobank RAP} 6 | \usage{ 7 | download_from_rap(file, verbose = FALSE) 8 | } 9 | \arguments{ 10 | \item{file}{A string. Filename of the file to be downloaded (character)} 11 | 12 | \item{verbose}{Logical. Be verbose, 13 | \code{default=FALSE}} 14 | } 15 | \value{ 16 | NA 17 | } 18 | \description{ 19 | Use R to download a file to the UK Biobank RAP (really just a wrapper for `dx download`) 20 | } 21 | \examples{ 22 | 23 | readr::write_tsv(data.frame(x=1:10,y=11:20), "ukbrap.dummy.20231114.txt.gz") 24 | 25 | # download file to RAP storage 26 | download_from_rap(file="ukbrap.dummy.20231114.txt.gz", dir="extracts/") 27 | 28 | } 29 | \author{ 30 | Luke Pilling 31 | } 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | .RDataTmp 8 | 9 | # User-specific files 10 | .Ruserdata 11 | 12 | # Example code in package build process 13 | *-Ex.R 14 | 15 | # Output files from R CMD build 16 | /*.tar.gz 17 | 18 | # Output files from R CMD check 19 | /*.Rcheck/ 20 | 21 | # RStudio files 22 | .Rproj.user/ 23 | 24 | # produced vignettes 25 | vignettes/*.html 26 | vignettes/*.pdf 27 | 28 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 29 | .httr-oauth 30 | 31 | # knitr and R markdown default cache directories 32 | *_cache/ 33 | /cache/ 34 | 35 | # Temporary files created by R markdown 36 | *.utf8.md 37 | *.knit.md 38 | 39 | # R Environment Variables 40 | .Renviron 41 | 42 | # pkgdown site 43 | docs/ 44 | 45 | # translation temp files 46 | po/*~ 47 | 48 | # RStudio Connect folder 49 | rsconnect/ 50 | docs 51 | inst/doc 52 | -------------------------------------------------------------------------------- /man/upload_to_rap.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/upload_to_rap.R 3 | \name{upload_to_rap} 4 | \alias{upload_to_rap} 5 | \title{Use R to upload a file to the UK Biobank RAP} 6 | \usage{ 7 | upload_to_rap(file, dir = "FALSE", verbose = FALSE) 8 | } 9 | \arguments{ 10 | \item{file}{A string. Filename of the file to be uploaded (character)} 11 | 12 | \item{dir}{A string. Target directory in the RAP space. If blank, the current working directory (character)} 13 | 14 | \item{verbose}{Logical. Be verbose, 15 | \code{default=FALSE}} 16 | } 17 | \value{ 18 | NA 19 | } 20 | \description{ 21 | Use R to upload a file to the UK Biobank RAP (really just a wrapper for `dx upload`) 22 | } 23 | \examples{ 24 | 25 | readr::write_tsv(data.frame(x=1:10,y=11:20), "ukbrap.dummy.20231114.txt.gz") 26 | 27 | # upload file to RAP storage 28 | upload_to_rap(file="ukbrap.dummy.20231114.txt.gz", dir="extracts/") 29 | 30 | } 31 | \author{ 32 | Luke Pilling 33 | } 34 | -------------------------------------------------------------------------------- /man/get_emr_spark.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_emr_spark.R 3 | \name{get_emr_spark} 4 | \alias{get_emr_spark} 5 | \title{Get UK Biobank participant Electronic Medical Records (EMR) data in a RAP Spark environment} 6 | \usage{ 7 | get_emr_spark( 8 | codes_df = NULL, 9 | spark_master = "spark://master:41000", 10 | verbose = FALSE 11 | ) 12 | } 13 | \arguments{ 14 | \item{codes_df}{A data frame. Contains two columns: `code` and `vocab_id` i.e., a list of diagnostic codes, and an indicator of the vocabulary. Other columns are ignored.} 15 | 16 | \item{spark_master}{A string. The `master` argmuent passed to `sparklyr::spark_connect()`. 17 | \code{default='spark://master:41000'}} 18 | 19 | \item{verbose}{Logical. Be verbose, 20 | \code{default=FALSE}} 21 | } 22 | \value{ 23 | Returns nothing 24 | } 25 | \description{ 26 | This function is completely removed. Better to use `get_diagnoses()`. Use a historic release of this package if you really need it https://github.com/lcpilling/ukbrapR/releases 27 | } 28 | \author{ 29 | Luke Pilling 30 | } 31 | -------------------------------------------------------------------------------- /R/get_emr_spark.R: -------------------------------------------------------------------------------- 1 | #' Get UK Biobank participant Electronic Medical Records (EMR) data in a RAP Spark environment 2 | #' 3 | #' @description 4 | #' 5 | #' This function is completely removed. Better to use `get_diagnoses()`. Use a historic release of this package if you really need it https://github.com/lcpilling/ukbrapR/releases 6 | #' 7 | #' @return Returns nothing 8 | #' 9 | #' @author Luke Pilling 10 | #' 11 | #' @name get_emr_spark 12 | #' 13 | #' @param codes_df A data frame. Contains two columns: `code` and `vocab_id` i.e., a list of diagnostic codes, and an indicator of the vocabulary. Other columns are ignored. 14 | #' @param spark_master A string. The `master` argmuent passed to `sparklyr::spark_connect()`. 15 | #' \code{default='spark://master:41000'} 16 | #' @param verbose Logical. Be verbose, 17 | #' \code{default=FALSE} 18 | #' 19 | #' @export 20 | #' 21 | get_emr_spark <- function( 22 | codes_df=NULL, 23 | spark_master = "spark://master:41000", 24 | verbose=FALSE 25 | ) { 26 | 27 | lifecycle::deprecate_warn("0.3.0", "get_emr_spark()", "get_diagnoses()", details="This function is now completely removed. Use a historic release of this package if you really need it https://github.com/lcpilling/ukbrapR/releases") 28 | 29 | } 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: https://lcpilling.github.io/ukbrapR/ 2 | template: 3 | bootstrap: 5 4 | light-switch: true 5 | 6 | authors: 7 | Luke Pilling: 8 | href: "https://lcpilling.github.io" 9 | 10 | reference: 11 | - title: Export required data 12 | - contents: 13 | - export_tables 14 | - title: Extract genetic variants 15 | - contents: 16 | - extract_variants 17 | - make_imputed_bed 18 | - make_dragen_bed 19 | - load_bed 20 | - title: Create polygenic score 21 | - contents: 22 | - create_pgs 23 | - title: Ascertain diagnoses and determine date first diagnosed 24 | - contents: 25 | - get_diagnoses 26 | - get_df 27 | - title: Working with UK Biobank data fields 28 | - contents: 29 | - fields_to_phenos 30 | - label_ukb_fields 31 | - label_ukb_field 32 | - title: Upload/download from RAP/worker storage 33 | - contents: 34 | - upload_to_rap 35 | - download_from_rap 36 | - title: Spark 37 | des: These functions do not require intermediate files but must be executed in a JupyterLab Spark environment. 38 | - contents: 39 | - get_rap_phenos 40 | - get_selfrep_illness_spark 41 | - get_emr_spark 42 | 43 | articles: 44 | - title: Get started 45 | navbar: ~ 46 | contents: 47 | - ascertain_diagnoses 48 | - label_fields 49 | - spark_functions 50 | 51 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: ukbrapR 2 | Title: R functions to use in the UK Biobank Research Analysis Platform (RAP) 3 | Version: 0.3.9 4 | Authors@R: c(person("Luke", "Pilling", 5 | email = "L.Pilling@exeter.ac.uk", 6 | role = c("aut", "cre"), 7 | comment = c(ORCID = "0000-0002-3332-8454"))) 8 | Description: R functions to use in the UK Biobank Research Analysis Platform (RAP). The aim is to make working in the RAP quicker, easier, and more reproducible. The package is designed to work in a "normal" cluster using RStudio, and raw UK Biobank data from the table-exporter (rather than on a Spark cluster). 9 | License: GPL-3 10 | URL: https://lcpilling.github.io/ukbrapR, https://github.com/lcpilling/ukbrapR 11 | Depends: 12 | R (>= 4.1.1) 13 | Imports: 14 | dplyr (>= 1.1.0), 15 | readr (>= 2.0.0), 16 | stringr (>= 1.5.0), 17 | purrr (>= 1.0.0), 18 | lubridate (>= 1.9.0), 19 | tidyr (>= 1.3.0), 20 | rlang (>= 1.0.0), 21 | cli (>= 3.6.1), 22 | prettyunits (>= 1.0.0), 23 | lifecycle (>= 1.0.0), 24 | haven (>= 2.5.0) 25 | Encoding: UTF-8 26 | LazyData: true 27 | RoxygenNote: 7.3.2 28 | BugReports: https://github.com/lcpilling/ukbrapR/issues 29 | Suggests: 30 | knitr, 31 | rmarkdown 32 | VignetteBuilder: knitr 33 | -------------------------------------------------------------------------------- /man/make_imputed_bed.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/extract_variants.R 3 | \name{make_imputed_bed} 4 | \alias{make_imputed_bed} 5 | \title{Extract variants from imputed genotype file(s) into single BED file} 6 | \usage{ 7 | make_imputed_bed( 8 | in_file, 9 | out_bed, 10 | progress = TRUE, 11 | verbose = FALSE, 12 | very_verbose = FALSE 13 | ) 14 | } 15 | \arguments{ 16 | \item{in_file}{A data frame or file path. Contains at least two columns: `rsID` and `CHR`. Other columns are ignored.} 17 | 18 | \item{out_bed}{A string.} 19 | 20 | \item{progress}{Logical. Show progress through each individual file, 21 | \code{default=TRUE}} 22 | 23 | \item{verbose}{Logical. Be verbose (show individual steps), 24 | \code{default=FALSE}} 25 | 26 | \item{very_verbose}{Logical. Be very verbose (show individual steps & show terminal output from Plink etc), 27 | \code{default=FALSE}} 28 | } 29 | \value{ 30 | A single merged BED file (and BIM and FAM files) 31 | } 32 | \description{ 33 | For a given set of genomic coordinates extract from the UK Biobank imputed genotypes (BGEN files, field 22828) into a single BED file. 34 | } 35 | \examples{ 36 | 37 | make_imputed_bed(in_file=system.file("files", "pgs_liver_cirrhosis.txt", package="ukbrapR"), out_bed="liver_cirrhosis.imputed.variants") 38 | 39 | } 40 | \author{ 41 | Luke Pilling 42 | } 43 | -------------------------------------------------------------------------------- /man/make_dragen_bed.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/extract_variants.R 3 | \name{make_dragen_bed} 4 | \alias{make_dragen_bed} 5 | \title{Extract variants from DRAGEN BGEN file(s) into single BED file} 6 | \usage{ 7 | make_dragen_bed( 8 | in_file, 9 | out_bed, 10 | progress = TRUE, 11 | verbose = FALSE, 12 | very_verbose = FALSE 13 | ) 14 | } 15 | \arguments{ 16 | \item{in_file}{A data frame or file path. Contains at least two columns: `chr` and `pos` (in build 38). Other columns are ignored.} 17 | 18 | \item{out_bed}{A string.} 19 | 20 | \item{progress}{Logical. Show progress through each individual file, 21 | \code{default=TRUE}} 22 | 23 | \item{verbose}{Logical. Be verbose (show individual steps), 24 | \code{default=FALSE}} 25 | 26 | \item{very_verbose}{Logical. Be very verbose (show individual steps & show terminal output from Plink etc), 27 | \code{default=FALSE}} 28 | } 29 | \value{ 30 | A single merged BED file (and BIM and FAM files) 31 | } 32 | \description{ 33 | For a given set of genomic coordinates extract the UK Biobank WGS DRAGEN variant calls (from the BGEN format, field 24309) into a single BED file. 34 | 35 | This assumes your project has access to the WGS BGEN files released April 2025. If not, run `ukbrapR:::make_dragen_bed_from_pvcfs()` to use [tabix] and [plink] to subset the [DRAGEN WGS pVCF files]. 36 | } 37 | \examples{ 38 | 39 | make_dragen_bed(in_file=system.file("files", "pgs_liver_cirrhosis.txt", package="ukbrapR"), out_bed="liver_cirrhosis.dragen.variants") 40 | 41 | } 42 | \author{ 43 | Luke Pilling 44 | } 45 | -------------------------------------------------------------------------------- /man/label_ukb_field.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/label_ukb_field.R 3 | \name{label_ukb_field} 4 | \alias{label_ukb_field} 5 | \title{Update UK Biobank field with `title` and `label` from the schema} 6 | \usage{ 7 | label_ukb_field(d, field, field_id = NULL, verbose = FALSE) 8 | } 9 | \arguments{ 10 | \item{d}{A data frame. The data frame containing the `field` to update.} 11 | 12 | \item{field}{A string. The field (e.g., `p54_i0`) in the provided data frame to update.} 13 | 14 | \item{field_id}{A string. If the field has been renamed (to e.g.,"assessment_centre") provide the field id here (e.g., "54"). 15 | \code{default=NULL}} 16 | 17 | \item{verbose}{Logical. Be verbose, 18 | \code{default=FALSE}} 19 | } 20 | \value{ 21 | Returns a data frame. 22 | } 23 | \description{ 24 | Variables such as education and ethnicity are provided as integers but have specific codes. 25 | 26 | The UK Biobank schema are machine-readable dictionaries and mappings defining the internal structure of the online Showcase. https://biobank.ctsu.ox.ac.uk/crystal/schema.cgi 27 | 28 | This function updates a field in a data frame of UK Biobank with information from the Schema. 29 | } 30 | \examples{ 31 | 32 | # update the Assessment Centre variable 33 | ukb <- ukbrapR::label_ukb_field(ukb, field="p54_i0") 34 | 35 | table(ukb$p54_i0) # tabulates the values 36 | table(haven::as_factor(ukb$p54_i0)) # tabulates the labels 37 | haven::print_labels(ukb$p54_i0) # show the value:label mapping for this variable 38 | 39 | # if the variable has been renamed, provide the "field" and "field_id" e.g., 40 | ukb <- ukbrapR::label_ukb_field(ukb, field="assessment_centre", field_id="54") 41 | 42 | } 43 | \author{ 44 | Luke Pilling 45 | } 46 | -------------------------------------------------------------------------------- /man/fields_to_phenos.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fields_to_phenos.R 3 | \name{fields_to_phenos} 4 | \alias{fields_to_phenos} 5 | \title{Check UK Biobank field IDs} 6 | \usage{ 7 | fields_to_phenos(fields, filename = "", abort = TRUE, verbose = FALSE) 8 | } 9 | \arguments{ 10 | \item{fields}{A vector of character strings. The field IDs to check if valid.} 11 | 12 | \item{filename}{A string. If provided, will save as a fieldname file ready for the table-exporter (including "eid"). 13 | \code{default=""}} 14 | 15 | \item{abort}{Logical. Abort if a field is missing?, 16 | \code{default=TRUE}} 17 | 18 | \item{verbose}{Logical. Be verbose, 19 | \code{default=FALSE}} 20 | } 21 | \value{ 22 | Returns a vector of strings (valid phenotypes). 23 | } 24 | \description{ 25 | Check if provided field IDs are valid and return all possible phenotype names in the UK Biobank RAP 26 | } 27 | \examples{ 28 | 29 | # not instanced, not arrayed 30 | fields_to_phenos("31") # sex 31 | 32 | # instanced, not arrayed 33 | fields_to_phenos("53") # assessment date 34 | 35 | # instanced and arrayed 36 | fields_to_phenos("93") # systolic blood pressure, manual reading 37 | 38 | # instanced and arrayed, MRI assessments only 39 | fields_to_phenos("12673") # Heart rate recorded during vicorder run (Heart MRI) 40 | 41 | # check multiple simultaneously 42 | fields_to_phenos(c("31","93")) 43 | 44 | # only warn if an invalid field is provided (default is to abort) 45 | fields_to_phenos(c("31","notafield","93")) 46 | fields_to_phenos(c("31","notafield","93"), abort=FALSE) 47 | 48 | # save as fieldname file for the table-exporter (don't forget to upload to the RAP) 49 | fields_to_phenos(c("31","93"), filename="fieldnames.txt") 50 | 51 | } 52 | \author{ 53 | Luke Pilling 54 | } 55 | -------------------------------------------------------------------------------- /man/label_ukb_fields.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/label_ukb_field.R 3 | \name{label_ukb_fields} 4 | \alias{label_ukb_fields} 5 | \title{Update a data frame containing UK Biobank fields with `title` and `label` from the schema} 6 | \usage{ 7 | label_ukb_fields(d, verbose = FALSE) 8 | } 9 | \arguments{ 10 | \item{d}{A data frame. The data frame containing UK Biobank fields to update.} 11 | 12 | \item{verbose}{Logical. Be verbose, 13 | \code{default=FALSE}} 14 | } 15 | \value{ 16 | Returns a data frame. 17 | } 18 | \description{ 19 | Variables such as education and ethnicity are provided as integers but have specific codes. 20 | 21 | The UK Biobank schema are machine-readable dictionaries and mappings defining the internal structure of the online Showcase. https://biobank.ctsu.ox.ac.uk/crystal/schema.cgi 22 | 23 | This function updates a data frame of UK Biobank data field with information from the Schema. 24 | 25 | It is in effect a wrapper to apply ukbrapR::label_ukb_field() to each variable in a data frame that looks like a UK Biobank field. 26 | 27 | Only recognised fields are modified (variables named things like "p54_i0"). Other variables are ignored. 28 | } 29 | \examples{ 30 | 31 | # say the below data frame contains 4 variables: `eid`, `p54_i0`, `p50_i0` and `age_at_assessment` 32 | names(ukb) 33 | 34 | # update the variables that looks like UK Biobank fields with titles and, where cateogrical, labels 35 | # i.e., `p54_i0` and `p50_i0` only -- `eid` and `age_at_assessment` are ignored 36 | ukb <- ukbrapR::label_ukb_fields(ukb) 37 | 38 | table(ukb$p54_i0) # tabulates the values 39 | table(haven::as_factor(ukb$p54_i0)) # tabulates the labels 40 | haven::print_labels(ukb$p54_i0) # show the value:label mapping for this variable 41 | 42 | } 43 | \author{ 44 | Luke Pilling 45 | } 46 | -------------------------------------------------------------------------------- /man/get_rap_phenos.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_rap_phenos.R 3 | \name{get_rap_phenos} 4 | \alias{get_rap_phenos} 5 | \title{Get UK Biobank participant phenotype data} 6 | \usage{ 7 | get_rap_phenos( 8 | names, 9 | value_coding = "replace", 10 | names_are_titles = FALSE, 11 | record = NULL, 12 | verbose = FALSE 13 | ) 14 | } 15 | \arguments{ 16 | \item{names}{A string or vector of strings. The variable name(s) required. e.g., c("eid","p31","p21003_i0")} 17 | 18 | \item{value_coding}{A string. How to handle coded fields. "replace" if a coding value exists, replace the raw value with the code; "raw" export the raw values of the field; "exclude" if a coding value exists, do not export the value (most commonly used with sparse fields). 19 | \code{default="replace"}} 20 | 21 | \item{names_are_titles}{Logical. Passing DNAnexus variable "titles" e.g., c("Age at recruitment", "Standing height | Instance 0"). 22 | \code{default=FALSE}} 23 | 24 | \item{record}{A string. The `dnanexus_link` file descriptor of the .dataset to use. Default (if left as NULL) is to use the most recently dispensed dataset. 25 | \code{default=most recent dataset}} 26 | 27 | \item{verbose}{Logical. Be verbose, 28 | \code{default=FALSE}} 29 | } 30 | \value{ 31 | Returns a data.frame (the participant data for the requested variables) 32 | } 33 | \description{ 34 | Using a Spark node/cluster on the UK Biobank Research Analysis Platform (DNAnexus), use R to extract a provided set of variables. Using code from the UK Biobank DNAnexus team https://github.com/UK-Biobank/UKB-RAP-Notebooks/blob/main/NBs_Prelim/105_export_participant_data_to_r.ipynb 35 | } 36 | \examples{ 37 | # get phenotype data 38 | ukb <- get_rap_phenos(c("eid","p31","p21003_i0","p53_i0")) 39 | 40 | # save to file on the RAP worker node 41 | readr::write_tsv(ukb, "ukbrap.phenos.20231114.txt.gz") 42 | 43 | # upload data to RAP storage 44 | upload_to_rap(file="ukbrap.phenos.20231114.txt.gz", dir="") 45 | 46 | } 47 | \author{ 48 | Luke Pilling 49 | } 50 | -------------------------------------------------------------------------------- /vignettes/spark_functions.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Spark functions" 3 | description: > 4 | Pull phenotype data from Spark environment. 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{Spark functions} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ```{r, include = FALSE} 13 | knitr::opts_chunk$set( 14 | collapse = TRUE, 15 | comment = "#>" 16 | ) 17 | library(ukbrapR) 18 | ``` 19 | 20 | 21 | ## Pull phenotype data from Spark environment to an R data frame 22 | 23 | **Needs to be run in an Apache Spark environment on the UK Biobank DNAnexus RAP.** 24 | 25 | Recommend launching a Spark cluster with at least `mem1_hdd1_v2_x16` and **2 nodes** otherwise this can fail with error "...ensure that workers...have sufficient resources" 26 | 27 | The underlying code is mostly from the [UK Biobank GitHub](https://github.com/UK-Biobank/UKB-RAP-Notebooks/blob/main/NBs_Prelim/105_export_participant_data_to_r.ipynb). 28 | 29 | ```{r, eval=FALSE, echo=TRUE} 30 | # get phenotype data (participant ID, sex, baseline age, and baseline assessment date) 31 | ukb <- get_rap_phenos(c("eid", "p31", "p21003_i0", "p53_i0")) 32 | #> 48.02 sec elapsed 33 | 34 | # summary of data 35 | table(ukb$p31) 36 | #> Female Male 37 | #> 273297 229067 38 | summary(ukb$p21003_i0) 39 | #> Min. 1st Qu. Median Mean 3rd Qu. Max. 40 | #> 37.00 50.00 58.00 56.53 63.00 73.00 41 | ``` 42 | 43 | ### No more updates... 44 | 45 | I am moving away from using Spark as the default environment, mostly due to the cost implications; it is significantly cheaper (and quicker!) to store and search exported raw text files in the RAP persistant storage than do everything in a Spark environment (plus the added benefit that the RStudio interface is available in "normal" instances). 46 | 47 | The Spark functions are available as before but all updates are to improve functionality in "normal" instances using RStudio, as we move to the new era of RAP-only UK Biobank analysis. 48 | 49 | If you need to see the previous release documentation follow the tags to the version required: https://github.com/lcpilling/ukbrapR/tree/v0.1.7 50 | 51 | -------------------------------------------------------------------------------- /man/extract_variants.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/extract_variants.R 3 | \name{extract_variants} 4 | \alias{extract_variants} 5 | \title{Extract variants from bulk data and load to memory} 6 | \usage{ 7 | extract_variants( 8 | in_file, 9 | out_bed = "tmp", 10 | source = "imputed", 11 | overwrite = FALSE, 12 | progress = FALSE, 13 | verbose = FALSE, 14 | very_verbose = FALSE 15 | ) 16 | } 17 | \arguments{ 18 | \item{in_file}{A data frame or file path. Contains rsid, chr, and pos. For imputed genos pos is build 37. For DRAGEN pos is build 38. Other columns are ignored.} 19 | 20 | \item{out_bed}{A string. Prefix for output files (optional) 21 | \code{default="tmp"}} 22 | 23 | \item{source}{A string. Either "imputed" or "dragen" - indicating whether the variants should be from "UKB imputation from genotype" (field 22828) or "DRAGEN population level WGS variants, PLINK format [500k release]" (field 24308) 24 | \code{default="imputed"}} 25 | 26 | \item{overwrite}{Logical. Overwrite output BED files? (If output prefix is left as 'tmp' overwrite is set to TRUE), 27 | \code{default=FALSE}} 28 | 29 | \item{progress}{Logical. Show progress through each individual file, 30 | \code{default=FALSE}} 31 | 32 | \item{verbose}{Logical. Be verbose (show individual steps), 33 | \code{default=FALSE}} 34 | 35 | \item{very_verbose}{Logical. Be very verbose (show individual steps & show terminal output from Plink etc), 36 | \code{default=FALSE}} 37 | } 38 | \value{ 39 | A data frame 40 | } 41 | \description{ 42 | Use user-provided list of genetic variants to extract from imputed BGEN files (field 22828) or WGS DRAGEN BGEN files (field 24309) data and load as data.frame 43 | 44 | If selecting the DRAGEN data as the source, this assumes your project has access to the WGS BGEN files released April 2025. If not, run `ukbrapR:::make_dragen_bed_from_pvcfs()` to use [tabix] and [plink] to subset the [DRAGEN WGS pVCF files]. 45 | } 46 | \examples{ 47 | 48 | liver_variants <- extract_variants(in_file=system.file("files", "pgs_liver_cirrhosis.txt", package="ukbrapR"), out_bed="liver_cirrhosis.imputed.variants") 49 | 50 | } 51 | \author{ 52 | Luke Pilling 53 | } 54 | -------------------------------------------------------------------------------- /vignettes/label_fields.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Label fields" 3 | description: > 4 | Assign categorical UK Biobank fields the labels from the showcase schema. 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{Label fields} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ```{r, include = FALSE} 13 | knitr::opts_chunk$set( 14 | collapse = TRUE, 15 | comment = "#>" 16 | ) 17 | library(ukbrapR) 18 | ``` 19 | 20 | Categorical fields are exported as integers but are encoded with labels. 21 | 22 | For example [20116 "Smoking status"](https://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=20116): 23 | 24 | | Coding | Meaning | 25 | |--------|----------------------| 26 | | -3 | Prefer not to answer | 27 | | 0 | Never | 28 | | 1 | Previous | 29 | | 2 | Current | 30 | 31 | This package includes two functions to label a single UK Biobank field or a data frame of them using the [UK Biobank encoding schema](https://biobank.ctsu.ox.ac.uk/crystal/schema.cgi). Examples: 32 | 33 | ```{r, eval=FALSE, echo=TRUE} 34 | # update the Smoking status field 35 | ukb <- label_ukb_field(ukb, field="p20116_i0") 36 | 37 | table(ukb$p20116_i0) # tabulates the values 38 | #> -3 0 1 2 39 | #> 2057 273405 172966 52949 40 | 41 | table(haven::as_factor(ukb$p20116_i0)) # tabulates the labels 42 | #> Prefer not to answer Never Previous Current 43 | #> 2057 273405 172966 52949 44 | 45 | haven::print_labels(ukb$p20116_i0) # show the value:label mapping for this variable 46 | #> Labels: 47 | #> value label 48 | #> -3 Prefer not to answer 49 | #> 0 Never 50 | #> 1 Previous 51 | #> 2 Current 52 | 53 | # 54 | # if you have a whole data frame of exported fields, you can use the wrapper function label_ukb_fields() 55 | 56 | # say the `ukb` data frame contains 4 variables: `eid`, `p54_i0`, `p31` and `age_at_assessment` 57 | 58 | # update the variables that looks like UK Biobank fields with titles and, where cateogrical, labels 59 | # i.e., `p54_i0` and `p31` only -- `eid` and `age_at_assessment` are ignored 60 | ukb <- label_ukb_fields(ukb) 61 | 62 | table(ukb$p31) # tabulates the values 63 | #> 0 1 64 | #> 273238 229031 65 | 66 | table(haven::as_factor(ukb$p31)) # tabulates the labels 67 | #> Female Male 68 | #> 273238 229031 69 | ``` 70 | 71 | -------------------------------------------------------------------------------- /man/create_pgs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/extract_variants.R 3 | \name{create_pgs} 4 | \alias{create_pgs} 5 | \title{Create a polygenic score} 6 | \usage{ 7 | create_pgs( 8 | in_file, 9 | out_file = "tmp", 10 | pgs_name = "pgs", 11 | source = "imputed", 12 | is_bed = FALSE, 13 | overwrite = FALSE, 14 | progress = FALSE, 15 | verbose = FALSE, 16 | very_verbose = FALSE 17 | ) 18 | } 19 | \arguments{ 20 | \item{in_file}{A data frame or file path. Must contain rsid, chr, pos, effect_allele, other_allele, beta. For imputed genos pos is build 37. For DRAGEN pos is build 38. Other columns are ignored.} 21 | 22 | \item{out_file}{A string. Prefix for output files (optional) 23 | \code{default="tmp"}} 24 | 25 | \item{pgs_name}{A string. Variable name for created PGS (optional) 26 | \code{default="pgs"}} 27 | 28 | \item{source}{A string. Either "imputed" or "dragen" - indicating whether the variants should be from "UKB imputation from genotype" (field 22828) or "DRAGEN population level WGS variants, PLINK format [500k release]" (field 24308). Can instead be a path to a local BED file, if `is_bed=TRUE`. 29 | \code{default="imputed"}} 30 | 31 | \item{is_bed}{Logical. If you already have a BED file containing the required variants set this to TRUE and provide a path to the BED file in the `source` option, 32 | \code{default=FALSE}} 33 | 34 | \item{overwrite}{Logical. Overwrite output BED files? (If out_file is left as 'tmp' overwrite is set to TRUE), 35 | \code{default=FALSE}} 36 | 37 | \item{progress}{Logical. Show progress through each individual file, 38 | \code{default=FALSE}} 39 | 40 | \item{verbose}{Logical. Be verbose (show individual steps), 41 | \code{default=FALSE}} 42 | 43 | \item{very_verbose}{Logical. Be very verbose (show individual steps & show terminal output from Plink etc), 44 | \code{default=FALSE}} 45 | } 46 | \value{ 47 | A data frame 48 | } 49 | \description{ 50 | Use user-provided list of genetic variants with weights for a trait to create a polygenic score. Uses the imputed BGEN files (field 22828) or WGS DRAGEN BGEN files (field 24309) data and load as data.frame 51 | 52 | If selecting the DRAGEN data as the source, this assumes your project has access to the WGS BGEN files released April 2025. If not, run `ukbrapR:::make_dragen_bed_from_pvcfs()` to use [tabix] and [plink] to subset the [DRAGEN WGS pVCF files]. 53 | } 54 | \examples{ 55 | 56 | liver_pgs <- create_pgs(in_file=system.file("files", "pgs_liver_cirrhosis.txt", package="ukbrapR"), out_file="liver_cirrhosis.imputed.pgs", pgs_name="liver_cirrhosis_pgs") 57 | 58 | } 59 | \author{ 60 | Luke Pilling 61 | } 62 | -------------------------------------------------------------------------------- /R/upload_to_rap.R: -------------------------------------------------------------------------------- 1 | #' Use R to upload a file to the UK Biobank RAP 2 | #' 3 | #' @description Use R to upload a file to the UK Biobank RAP (really just a wrapper for `dx upload`) 4 | #' 5 | #' @return NA 6 | #' 7 | #' @author Luke Pilling 8 | #' 9 | #' @name upload_to_rap 10 | #' 11 | #' @param file A string. Filename of the file to be uploaded (character) 12 | #' @param dir A string. Target directory in the RAP space. If blank, the current working directory (character) 13 | #' @param verbose Logical. Be verbose, 14 | #' \code{default=FALSE} 15 | #' 16 | #' @examples 17 | #' 18 | #' readr::write_tsv(data.frame(x=1:10,y=11:20), "ukbrap.dummy.20231114.txt.gz") 19 | #' 20 | #' # upload file to RAP storage 21 | #' upload_to_rap(file="ukbrap.dummy.20231114.txt.gz", dir="extracts/") 22 | #' 23 | #' @export 24 | #' 25 | upload_to_rap <- function( 26 | file, 27 | dir = "FALSE", 28 | verbose = FALSE 29 | ) { 30 | 31 | if (verbose) cat("Check input & options\n") 32 | if (! is.character(file) ) stop("`file` needs to be a string of the filename to upload") 33 | if (! file.exists(file) ) stop("`file` not found") 34 | 35 | if (dir == "FALSE") { 36 | if (verbose) cat("Uploading to current working directory\n") 37 | system(paste0("dx upload ", file)) 38 | } else { 39 | if (verbose) cat(paste0("Uploading to `dir`: ", dir, "\n")) 40 | if (substr(dir, nchar(dir), nchar(dir)) != "/") dir <- paste0(dir, "/") 41 | system(paste0("dx mkdir -p ", dir)) 42 | system(paste0("dx upload ", file, " --path ", dir)) 43 | } 44 | 45 | } 46 | 47 | 48 | #' Use R to download a file to the UK Biobank RAP 49 | #' 50 | #' @description Use R to download a file to the UK Biobank RAP (really just a wrapper for `dx download`) 51 | #' 52 | #' @return NA 53 | #' 54 | #' @author Luke Pilling 55 | #' 56 | #' @name download_from_rap 57 | #' 58 | #' @param file A string. Filename of the file to be downloaded (character) 59 | #' @param verbose Logical. Be verbose, 60 | #' \code{default=FALSE} 61 | #' 62 | #' @examples 63 | #' 64 | #' readr::write_tsv(data.frame(x=1:10,y=11:20), "ukbrap.dummy.20231114.txt.gz") 65 | #' 66 | #' # download file to RAP storage 67 | #' download_from_rap(file="ukbrap.dummy.20231114.txt.gz", dir="extracts/") 68 | #' 69 | #' @export 70 | #' 71 | download_from_rap <- function( 72 | file, 73 | verbose = FALSE 74 | ) { 75 | 76 | if (verbose) cat("Check input & options\n") 77 | if (! is.character(file) ) stop("`file` needs to be a string of the filename to download") 78 | if (! file.exists(file) ) stop("`file` not found") 79 | 80 | if (verbose) cat("downloading to current working directory\n") 81 | system(paste0("dx download ", file)) 82 | 83 | } 84 | 85 | 86 | -------------------------------------------------------------------------------- /man/get_diagnoses.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_diagnoses.R 3 | \name{get_diagnoses} 4 | \alias{get_diagnoses} 5 | \title{Get UK Biobank participant diagnosis data} 6 | \usage{ 7 | get_diagnoses(codes_df, file_paths = NULL, verbose = FALSE) 8 | } 9 | \arguments{ 10 | \item{codes_df}{A data frame. Contains two columns: `code` and `vocab_id` i.e., a list of diagnostic codes, and an indicator of the vocabulary (ICD10, Read2, CTV3, OPCS3, OPCS4, ukb_cancer, and ukb_noncancer are recognised). Other columns are ignored.} 11 | 12 | \item{file_paths}{A data frame. Columns must be `object` and `path` containing paths to required files. Default assumes you have the tables exported in the RAP environment from 13 | ukbrapR::export_tables() 14 | \code{default=ukbrapR:::ukbrapr_paths}} 15 | 16 | \item{verbose}{Logical. Be verbose, 17 | \code{default=FALSE}} 18 | } 19 | \value{ 20 | Returns a list of data frames (the participant data for the requested diagnosis codes: `death_cause`, `hesin_diag`, `hesin_oper`, `gp_clinical`, `cancer_registry` and `selfrep_illness`. Also includes the original codes list) 21 | } 22 | \description{ 23 | For a list of diagnostic codes get the HES, GP, cancer registry, operations, and self-reported illness data, matching the provided codes. 24 | 25 | Valid code vocabularies are: 26 | 27 | - ICD10 (for `hesin`, `death_cause` and `cancer_registry` searches) - fuzzy matching 28 | 29 | - ICD9 (for `hesin` searches) - fuzzy matching 30 | 31 | - Read2 / CTV3 (for `gp_clinical`) - exact matches on first 5 characters 32 | 33 | - OPCS3 / OPCS4 (for `hesin_oper`) - fuzzy matching 34 | 35 | - ukb_cancer / ukb_noncancer (for self-reported illness at UK Biobank assessments - all available will be searched) - exact matches 36 | 37 | This function relies on exported raw data files and thus does not need to be run in a Spark cluster. If the files are not in the default locations for the package you will need to specify the `file_paths` to exported tables. Recommend to run `export_tables()` once in your project to export the tables to the default paths for the package. 38 | } 39 | \examples{ 40 | # example diagnostic codes for CKD from GEMINI multimorbidity project 41 | codes_df_ckd <- ukbrapR:::codes_df_ckd 42 | head(codes_df_ckd) 43 | 44 | # Get diagnosis data - returns list of data frames (one per source) 45 | # -- Requires exported tables - see `export_tables()` 46 | diagnosis_list <- get_diagnoses(codes_df_ckd) 47 | 48 | # don't forget to save and upload data to RAP persistent storage! 49 | save(diagnosis_list, "ukbrap.CKD.emr.20231114.RDat") 50 | upload_to_rap(file="ukbrap.CKD.*", dir="") 51 | 52 | } 53 | \author{ 54 | Luke Pilling 55 | } 56 | -------------------------------------------------------------------------------- /man/get_selfrep_illness_spark.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_selfrep_illness_spark.R 3 | \name{get_selfrep_illness_spark} 4 | \alias{get_selfrep_illness_spark} 5 | \title{Get UK Biobank participant self-reported illness/year data for specific codes} 6 | \usage{ 7 | get_selfrep_illness_spark( 8 | codes_df, 9 | vocab_col = "vocab_id", 10 | codes_col = "code", 11 | ukb_dat = NULL, 12 | n_cancer_arrays = 5, 13 | n_noncancer_arrays = 30, 14 | verbose = FALSE 15 | ) 16 | } 17 | \arguments{ 18 | \item{codes_df}{A data frame. Contains the `vocab_col` and `codes_col` i.e., an indicator of the vocabulary and the diagnostic codes.} 19 | 20 | \item{vocab_col}{A string. Column name in `codes_df` that contains the vocabulary indicator for the code (for self-reported it needs to be either "ukb_cancer" or "ukb_noncancer"). 21 | \code{default='vocab_id'}} 22 | 23 | \item{codes_col}{A string. Column name in `codes_df` that contains the self-reported disease code (e.g., 1507). 24 | \code{default='code'}} 25 | 26 | \item{ukb_dat}{A data frame. Optional. If not provided, will get the phenotypes from the RAP. Contains the self-reported illness fields e.g., `p20008_i0_a0`. 27 | \code{default=NULL}} 28 | 29 | \item{n_cancer_arrays}{An integer. It is not trivial to determine the max number of arrays to request from Spark for the self-reported illnesses. The defaults match the currently (Feb 2024) available data but may need increasing in the future. 30 | \code{default=5}} 31 | 32 | \item{n_noncancer_arrays}{An integer. It is not trivial to determine the max number of arrays to request from Spark for the self-reported illnesses. The defaults match the currently (Feb 2024) available data but may need increasing in the future. 33 | \code{default=30}} 34 | 35 | \item{verbose}{Logical. Be verbose, 36 | \code{default=FALSE}} 37 | } 38 | \value{ 39 | Returns a data frame with four variables: eid, selfrep [binary, codes identified?], selfrep_df [date of reported illness], selfrep_i [instance the illness was first reported] 40 | } 41 | \description{ 42 | For a specific self-reported illness code or codes, identify whether the participant has self-reported at any visit, and identify the year. 43 | Intended for use on the UK Biobank DNnexus Research Analysis Platform, but if the user provides their own dataframe of UK Biobank self-reported fields this works on any system. 44 | } 45 | \examples{ 46 | # example diagnostic codes for haemochromatosis 47 | print(codes_df_hh) 48 | 49 | # get self-reported data - a data frame 50 | selfrep_df <- get_selfrep_illness_spark(codes_df_hh) 51 | 52 | # inspect variables 53 | table(selfrep_df$selfrep) 54 | summary(selfrep_df$selfrep_df) 55 | table(selfrep_df$selfrep_i) 56 | 57 | } 58 | \author{ 59 | Luke Pilling 60 | } 61 | -------------------------------------------------------------------------------- /man/export_tables.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/export_tables.R 3 | \name{export_tables} 4 | \alias{export_tables} 5 | \title{Export diagnosis files to RAP persistent storage} 6 | \usage{ 7 | export_tables( 8 | submit = FALSE, 9 | ignore_warnings = FALSE, 10 | file_paths = ukbrapR:::ukbrapr_paths, 11 | dataset = NULL, 12 | verbose = FALSE 13 | ) 14 | } 15 | \arguments{ 16 | \item{submit}{Logical. Actually submit `dx` commands. Default is FALSE i.e., just check inputs & file paths, then print the commands, 17 | \code{default=FALSE}} 18 | 19 | \item{ignore_warnings}{Logical. If an exported table already exists do not submit the table-exporter command unless this is TRUE, 20 | \code{default=FALSE}} 21 | 22 | \item{file_paths}{A data frame. Columns must be `object` and `path` containing paths to outputted files. If blank, will use the default paths, 23 | \code{default=ukbrapR:::ukbrapr_paths}} 24 | 25 | \item{dataset}{A string. If you wish to specify dataset. If blank, will use the most recently dispensed dataset in the main project directory. 26 | \code{default=app#####_#####.dataset}} 27 | 28 | \item{verbose}{Logical. Be verbose, 29 | \code{default=FALSE}} 30 | } 31 | \value{ 32 | NA 33 | } 34 | \description{ 35 | In the UK Biobank RAP export tables for HES, GP, death, and cancer registry data, plus self-reported illness fields, using the table-exporter. This is essentially a wrapper function to submit jobs to the table exporter. 36 | 37 | Suggest executing in an RStudio session. ~10Gb of text files are created. This will cost ~£0.15 per month to store in the RAP standard storage. 38 | } 39 | \examples{ 40 | 41 | # To keep files organised this package assumes the following file structure 42 | # This object is not actually required but illstrates the defaults to be 43 | # created in your RAP space (override by providing a new `file_paths`): 44 | ukbrapr_paths = data.frame( 45 | object=c("death","death_cause","hesin","hesin_diag","hesin_oper","gp_clinical","gp_scripts","selfrep_illness","cancer_registry","baseline_dates"), 46 | path=c( 47 | "ukbrapr_data/death.tsv", 48 | "ukbrapr_data/death_cause.tsv", 49 | "ukbrapr_data/hesin.tsv", 50 | "ukbrapr_data/hesin_diag.tsv", 51 | "ukbrapr_data/hesin_oper.tsv", 52 | "ukbrapr_data/gp_clinical.tsv", 53 | "ukbrapr_data/gp_scripts.tsv", 54 | "ukbrapr_data/selfrep_illness.tsv", 55 | "ukbrapr_data/cancer_registry.tsv", 56 | "ukbrapr_data/baseline_dates.tsv" 57 | ) 58 | ) 59 | ukbrapr_paths 60 | 61 | # test run to see `dx run table-exporter` commands - but will not submit jobs 62 | export_tables() 63 | 64 | # Submit all `dx run table-exporter` commands. ~10Gb of text files are created. This will cost ~£0.15 per month to store in the RAP standard storage. 65 | export_tables(submit=TRUE) 66 | 67 | } 68 | \author{ 69 | Luke Pilling 70 | } 71 | -------------------------------------------------------------------------------- /R/get_cancer_registry.R: -------------------------------------------------------------------------------- 1 | #' Get cancer registry data for specific codes 2 | #' 3 | #' @author Luke Pilling 4 | #' 5 | #' @name get_cancer_registry 6 | #' 7 | #' @noRd 8 | get_cancer_registry <- function( 9 | codes, 10 | ukb_dat, 11 | verbose = FALSE 12 | ) { 13 | 14 | start_time <- Sys.time() 15 | 16 | # Check input 17 | if (verbose) cli::cli_alert_info("Searching cancer registry data for {length(unique(codes))} ICD10 codes") 18 | 19 | # remove rows where participant has no cancer data 20 | ukb_dat = ukb_dat |> dplyr::filter( 21 | dplyr::if_any( 22 | dplyr::starts_with("p"), 23 | ~!is.na(.) 24 | ) 25 | ) 26 | 27 | # check all visits for participant - create `canreg` (binary, ever), `canreg_df` (date first) and `canreg_i` (the "instance" i.e., visit) 28 | # https://biobank.ctsu.ox.ac.uk/crystal/label.cgi?id=100092 29 | # date vars = 40005 30 | # cancer vars = 40006 31 | # age vars = 40008 32 | # histology vars = 40011 33 | # behaviour vars = 40012 34 | 35 | # variable prefix 36 | v_icd10 <- "p40006_" 37 | v_date <- "p40005_" 38 | v_age <- "p40008_" 39 | v_histology <- "p40011_" 40 | v_behaviour <- "p40012_" 41 | 42 | # use `tidyr::pivot_longer` to reduce the number of columns and increase the number of rows 43 | if (verbose) cli::cli_alert("Pivot cancer registry data") 44 | pivot_cancer <- function(d, v, n) { 45 | d |> 46 | dplyr::select(eid, dplyr::contains(v)) |> 47 | tidyr::pivot_longer(!eid, names_to = "instance", names_prefix = v, values_to = n) 48 | } 49 | ukb_dat_icd10 <- pivot_cancer(ukb_dat, v_icd10, "icd10") 50 | ukb_dat_date <- pivot_cancer(ukb_dat, v_date, "date") 51 | ukb_dat_age <- pivot_cancer(ukb_dat, v_age, "age") 52 | ukb_dat_histology <- pivot_cancer(ukb_dat, v_histology, "histology") 53 | ukb_dat_behaviour <- pivot_cancer(ukb_dat, v_behaviour, "behaviour") 54 | 55 | # join tables 56 | if (verbose) cli::cli_alert("Join cancer registry data") 57 | ukb_dat_cr = purrr::reduce(list(ukb_dat_icd10, ukb_dat_date, ukb_dat_age, ukb_dat_histology, ukb_dat_behaviour), dplyr::full_join, by = c("eid"="eid", "instance"="instance")) 58 | 59 | # remove rows where participant has no cancer data 60 | ukb_dat_cr = ukb_dat_cr |> dplyr::filter( 61 | dplyr::if_any( 62 | c("icd10","date","age","histology","behaviour"), 63 | ~!is.na(.) 64 | ) 65 | ) 66 | 67 | # subset to ICD10s in provided codes 68 | if (verbose) cli::cli_alert("Identify matching codes") 69 | ukb_dat_cr = ukb_dat_cr |> 70 | dplyr::filter( 71 | stringr::str_detect( 72 | icd10, 73 | stringr::str_flatten(codes, collapse = "|") 74 | ) 75 | ) 76 | 77 | # finish 78 | #if (verbose) cli::cli_alert_success(c("Finished cancer registry: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}.")) 79 | 80 | # Return data 81 | return(ukb_dat_cr) 82 | 83 | } 84 | 85 | -------------------------------------------------------------------------------- /data-raw/internal_files.R: -------------------------------------------------------------------------------- 1 | 2 | # 3 | # file paths ################################################ 4 | # 5 | 6 | ukbrapr_paths = data.frame( 7 | object=c( 8 | "hesin", 9 | "hesin_diag", 10 | "hesin_oper", 11 | "gp_clinical", 12 | "gp_scripts", 13 | "death", 14 | "death_cause", 15 | "selfrep_illness", 16 | "cancer_registry", 17 | "baseline_dates"), 18 | path=c( 19 | "ukbrapr_data/hesin.tsv", 20 | "ukbrapr_data/hesin_diag.tsv", 21 | "ukbrapr_data/hesin_oper.tsv", 22 | "ukbrapr_data/gp_clinical.tsv", 23 | "ukbrapr_data/gp_scripts.tsv", 24 | "ukbrapr_data/death.tsv", 25 | "ukbrapr_data/death_cause.tsv", 26 | "ukbrapr_data/selfrep_illness.tsv", 27 | "ukbrapr_data/cancer_registry.tsv", 28 | "ukbrapr_data/baseline_dates.tsv" 29 | ) 30 | ) 31 | 32 | 33 | # 34 | # codes lists ################################################### 35 | # 36 | 37 | library(tidyverse) 38 | 39 | # CKD -- from https://github.com/GEMINI-multimorbidity/GEMINI-LTC-code-list-Public 40 | ICD10 <- c("N18.3", "N18.4", "N18.5", "N18.6", "N18.9", "N19", "Z94.0") 41 | Read2 <- c("1Z12.", "1Z13.", "1Z14.", "1Z15.", "1Z16.", "1Z1a.", "1Z1B.", "1Z1b.", "1Z1C.", "1Z1c.", "1Z1D.", "1Z1d.", "1Z1E.", "1Z1e.", "1Z1F.", "1Z1f.", "1Z1G.", "1Z1H.", "1Z1J.", "1Z1K.", "1Z1L.", "1Z1T.", "1Z1V.", "1Z1W.", "1Z1X.", "1Z1Y.", "1Z1Z.", "K053.", "K054.", "K055.", "1Z1..", "K0E..", "K05..", "Kyu21", "D2150", "K06..", "6AA..", "66i..", "661M2", "661N2") 42 | CTV3 <- c("X30In", "XaLHI", "XaLHJ", "XaLHK", "XaNbn", "XaNbo", "XacAb", "XacAd", "XaO3t", "XacAe", "XaO3u", "XacAf", "XaO3v", "XacAh", "XaO3w", "XacAi", "XaO3x", "XaO3y", "XaO3z", "XaO40", "XaO41", "XaO42", "XacAM", "XacAN", "XacAO", "XacAV", "XacAW", "XacAX", "XaYb9", "XaYZW", "XaMJD", "XaMGE", "XaCLy", "XE0df", "XE0dg", "X30Iz", "Kyu21") 43 | codes_df_ckd <- data.frame( 44 | vocab_id = c( 45 | rep("ICD10", length(ICD10)), 46 | rep("Read2", length(Read2)), 47 | rep("CTV3", length(CTV3)) 48 | ), 49 | code = c(ICD10, Read2, CTV3) 50 | ) 51 | codes_df_ckd$condition = "ckd" 52 | codes_df_ckd = codes_df_ckd[,c("condition","vocab_id","code")] 53 | 54 | # Haemochromatosis (include self-reported) 55 | selfrep <- c("1507") 56 | ICD10 <- c("E83.1") 57 | ICD9 <- c("275.01") 58 | Read2 <- c("126A.","4L41.","677C0","C350.","C3500") 59 | CTV3 <- c("C3500","X40QQ","XaIyI","XaIyx","XaXHI","XE13K","X307o","X307p") 60 | codes_df_hh <- data.frame( 61 | vocab_id = c( 62 | rep("ukb_noncancer", length(selfrep)), 63 | rep("ICD10", length(ICD10)), 64 | rep("ICD9", length(ICD9)), 65 | rep("Read2", length(Read2)), 66 | rep("CTV3", length(CTV3)) 67 | ), 68 | code = c(selfrep, ICD10, ICD9, Read2, CTV3) 69 | ) 70 | codes_df_hh$condition = "hh" 71 | codes_df_hh = codes_df_hh[,c("condition","vocab_id","code")] 72 | 73 | # synthetic - all types of code! 74 | cancers = c("C22") 75 | OPCS3 = c("502","509") 76 | OPCS4 = c("J01") 77 | ICD9 = c("280","28243","440.2") 78 | codes_df_test = rbind( 79 | codes_df_hh, 80 | data.frame( 81 | condition="test", 82 | vocab_id = c( 83 | rep("ICD10", length(cancers)), 84 | rep("OPCS3", length(OPCS3)), 85 | rep("OPCS4", length(OPCS4)), 86 | rep("ICD9", length(ICD9)) 87 | ), 88 | code = c(cancers, OPCS3, OPCS4, ICD9) 89 | ) 90 | ) 91 | codes_df_test$condition = "test" 92 | 93 | 94 | # 95 | # UK Biobank field label schema ################################# 96 | # 97 | 98 | # from https://biobank.ctsu.ox.ac.uk/crystal/download.cgi 99 | ukb_schema <- NULL 100 | ukb_schema[["field"]] <- readr::read_tsv("http://biobank.ctsu.ox.ac.uk/ukb/scdown.cgi?fmt=txt&id=1", progress=FALSE, show_col_types=FALSE) 101 | ukb_schema[["time"]] <- readr::read_tsv("http://biobank.ctsu.ox.ac.uk/ukb/scdown.cgi?fmt=txt&id=20", progress=FALSE, show_col_types=FALSE) 102 | ukb_schema[["real"]] <- readr::read_tsv("http://biobank.ctsu.ox.ac.uk/ukb/scdown.cgi?fmt=txt&id=7", progress=FALSE, show_col_types=FALSE) 103 | ukb_schema[["date"]] <- readr::read_tsv("http://biobank.ctsu.ox.ac.uk/ukb/scdown.cgi?fmt=txt&id=8", progress=FALSE, show_col_types=FALSE) 104 | ukb_schema[["int"]] <- readr::read_tsv("http://biobank.ctsu.ox.ac.uk/ukb/scdown.cgi?fmt=txt&id=5", progress=FALSE, show_col_types=FALSE) 105 | ukb_schema[["string"]] <- readr::read_tsv("http://biobank.ctsu.ox.ac.uk/ukb/scdown.cgi?fmt=txt&id=6", progress=FALSE, show_col_types=FALSE) 106 | 107 | 108 | 109 | # 110 | # save as internal ########################################### 111 | # 112 | usethis::use_data( 113 | ukbrapr_paths, 114 | codes_df_ckd, codes_df_hh, codes_df_test, 115 | ukb_schema, 116 | internal = TRUE, overwrite = TRUE, compress = 'xz') 117 | 118 | 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /man/get_df.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_df.R 3 | \name{get_df} 4 | \alias{get_df} 5 | \title{Get UK Biobank participant Date First (DF) diagnosis} 6 | \usage{ 7 | get_df( 8 | diagnosis_list, 9 | prefix = NULL, 10 | group_by = NULL, 11 | include_selfrep_illness = TRUE, 12 | include_death_cause = TRUE, 13 | include_gp_clinical = TRUE, 14 | include_hesin_diag = TRUE, 15 | include_hesin_oper = TRUE, 16 | include_cancer_registry = TRUE, 17 | use_baseline_dates = TRUE, 18 | file_paths = NULL, 19 | censoring_date = "30-10-2022", 20 | verbose = FALSE 21 | ) 22 | } 23 | \arguments{ 24 | \item{diagnosis_list}{A list of data frames. The output of `get_diagnoses()` i.e., the raw diagnosis and self-reported illness data that matched the provided codes list.} 25 | 26 | \item{prefix}{String. Prefix to add to variable names (e.g., if prefix="chd" the output variables would be "chd_gp_df", "chd_hes_df", "chd_df" etc.) 27 | \code{default=NULL}} 28 | 29 | \item{group_by}{String. If the codes list provided to `get_diagnoses()` (i.e., in diagnosis_list$codes_df) contained a grouping/condition variable, indicate the variable name here. 30 | "Date first" variables will be created for each prefix in the grouping variable. The `prefix` option is ignored, in favour of the names in the grouping variable. 31 | \code{default=NULL}} 32 | 33 | \item{include_selfrep_illness}{logical. Include self-reported diagnosesin the combined Date First output? If present in `diagnosis_list` will still provide a separate `_df` variable 34 | \code{default=TRUE}} 35 | 36 | \item{include_death_cause}{logical. Include the cause of death in the combined Date First output? If present in `diagnosis_list` will still provide a separate `_df` variable 37 | \code{default=TRUE}} 38 | 39 | \item{include_gp_clinical}{logical. Include the GP data in the combined Date First output? If present in `diagnosis_list` will still provide a separate `_df` variable 40 | \code{default=TRUE}} 41 | 42 | \item{include_hesin_diag}{logical. Include the HES diagnosis data in the combined Date First output? If present in `diagnosis_list` will still provide a separate `_df` variable 43 | \code{default=TRUE}} 44 | 45 | \item{include_hesin_oper}{logical. Include the HES OPCS (operations) data in the combined Date First output? If present in `diagnosis_list` will still provide a separate `_df` variable 46 | \code{default=TRUE}} 47 | 48 | \item{include_cancer_registry}{logical. Include the cancer registry data in the combined Date First output? If present in `diagnosis_list` will still provide a separate `_df` variable 49 | \code{default=TRUE}} 50 | 51 | \item{use_baseline_dates}{logical. If `baseline_dates` available in file paths, produce a binary 0/1 variable, indicating the controls (people without a diagnosis) and setting the date first `_df` field to the date of censoring (currently see `censoring_date` option). 52 | \code{default=TRUE}} 53 | 54 | \item{file_paths}{A data frame. Columns must be `object` and `path` containing paths to outputted files. If not provided will use those in `ukbrapr_paths` 55 | \code{default=NULL}} 56 | 57 | \item{censoring_date}{A string. If using baseline data to infer control participants, include a censoring date (set to NA if not desired). Use dd-mm-yyyy format. Default is the (current) HES date. 58 | \code{default="30-10-2022"}} 59 | 60 | \item{verbose}{Logical. Be verbose, 61 | \code{default=FALSE}} 62 | } 63 | \value{ 64 | Returns a single, "wide" data frame: the participant data for the requested diagnosis codes with "date first" `_df` variables. One for each source of data, and a combined variable. 65 | } 66 | \description{ 67 | For each participant identify the date of first diagnosis from all available electronic medical records & self-reported data. 68 | 69 | If `use_baseline_dates=TRUE` (the default) then will also produce a binary 0/1 variable, indicating the controls (people without a diagnosis) and setting the date first `_df` field to the date of censoring (currently 30 October 2022). 70 | } 71 | \examples{ 72 | 73 | ############################################### 74 | # example 1. haemochromatosis 75 | 76 | # get diagnosis data - returns list of data frames (one per source) 77 | diagnosis_list <- get_diagnoses(ukbrapR:::codes_df_hh) 78 | 79 | # for each participant, get Date First diagnosed with the condition 80 | diagnosis_df <- get_df(diagnosis_list, prefix="hh") 81 | 82 | ############################################### 83 | # example 2. get multiple diseases at once 84 | # don't have to all have the same code types/data sources 85 | 86 | codes = rbind(ukbrapR:::codes_df_hh, ukbrapR:::codes_df_ckd) 87 | print(codes) 88 | 89 | # get diagnosis data - returns list of data frames (one per source) 90 | diagnosis_list <- get_diagnoses(codes) 91 | 92 | # for each participant, get Date First diagnosed with the condition 93 | diagnosis_df <- get_df(diagnosis_list, group_by="condition") 94 | 95 | } 96 | \author{ 97 | Luke Pilling 98 | } 99 | -------------------------------------------------------------------------------- /R/get_selfrep_illness.R: -------------------------------------------------------------------------------- 1 | #' Get UK Biobank participant self-reported illness/year data for specific codes 2 | #' 3 | #' @author Luke Pilling 4 | #' 5 | #' @name get_selfrep_illness 6 | #' 7 | #' @noRd 8 | get_selfrep_illness <- function( 9 | codes_df, 10 | ukb_dat, 11 | verbose = FALSE 12 | ) { 13 | 14 | start_time <- Sys.time() 15 | 16 | vocab_col = "vocab_id" 17 | codes_col = "code" 18 | 19 | # Check input 20 | if (verbose) cat("Check inputs\n") 21 | if (! any(class(codes_df) %in% c("data.frame","tbl_df"))) stop("Codelist needs to be provided as a data frame") 22 | codes_df = as.data.frame(codes_df) # in case a tibble 23 | 24 | # check `codes_df` -- does it contain "ukb_cancer" or "ukb_noncancer" codes? Cannot have both. Makes no sense. 25 | if (! any(c("ukb_cancer","ukb_noncancer") %in% codes_df[,vocab_col])) stop("`vocab_col` column in `codes_df` needs to contain either 'ukb_cancer' or 'ukb_noncancer'") 26 | #if (all(c("ukb_cancer","ukb_noncancer") %in% codes_df[,vocab_col])) stop("`vocab_col` column in `codes_df` needs to contain either 'ukb_cancer' or 'ukb_noncancer' (not both)") 27 | 28 | # get codes 29 | codes_cancer <- codes_df[ codes_df[,vocab_col] == "ukb_cancer", codes_col] 30 | codes_noncancer <- codes_df[ codes_df[,vocab_col] == "ukb_noncancer" , codes_col] 31 | if (verbose) cat(" - ", length(unique(codes_cancer)), " cancer codes\n") 32 | if (verbose) cat(" - ", length(unique(codes_noncancer)), " non-cancer codes\n") 33 | 34 | # which are we getting? 35 | get_noncancer <- get_cancer <- FALSE 36 | if (length(codes_cancer)>0) get_cancer <- TRUE 37 | if (length(codes_noncancer)>0) get_noncancer <- TRUE 38 | 39 | # remove rows where participant has no data 40 | ukb_dat = ukb_dat |> dplyr::filter( 41 | dplyr::if_any( 42 | dplyr::starts_with("p"), 43 | ~!is.na(.) 44 | ) 45 | ) 46 | 47 | # check all visits for participant - create `selfrep` (binary, ever), `selfrep_df` (the "Date First" variable) and `selfrep_i` (the "instance" i.e., visit) 48 | # https://biobank.ctsu.ox.ac.uk/crystal/label.cgi?id=100074 49 | # will use 20001 (cancer code) and 20002 (non-cancer code) 50 | # will use 20006-9 the interpolated year/age (6 = cancer year, 8 = non-cancer year) 51 | # interpolated: 52 | # - If the participant gave a calendar year, then the best-fit time is half-way through that year. For example if the year was given as 1970, then the value presented is 1970.5 53 | # - If the participant gave their age then the value presented is the fractional year corresponding to the mid-point of that age. For example, if the participant said they were 30 years old then the value is the date at which they were 30years+6months. 54 | # - Interpolated values before the date of birth were truncated forwards to that time. 55 | # - Interpolated values after the time of data acquisition were truncated back to that time. 56 | # [-1] = "Date uncertain or unknown" and [-3] = "Preferred not to answer" (exclude both from DF) 57 | 58 | # use `tidyr::pivot_longer` to reduce the number of columns and increase the number of rows 59 | if (verbose) cli::cli_alert("Pivot self-report data") 60 | pivot_table <- function(d, v, n) { 61 | d |> 62 | dplyr::select(eid, dplyr::contains(v)) |> 63 | tidyr::pivot_longer(!eid, names_to = "instance", names_prefix = v, values_to = n) |> 64 | dplyr::filter(!is.na(!!rlang::sym(n))) 65 | } 66 | 67 | # create tmp empty objects 68 | ukb_dat_sr <- ukb_dat_noncancer_long <- ukb_dat_cancer_long <- NULL 69 | 70 | if (get_cancer) { 71 | ukb_dat_code <- pivot_table(ukb_dat, "p20001_", "cancer_code") 72 | ukb_dat_year <- pivot_table(ukb_dat, "p20006_", "cancer_year") 73 | ukb_dat_cancer_long <- dplyr::full_join(ukb_dat_code, ukb_dat_year, by = c("eid"="eid", "instance"="instance")) 74 | } 75 | if (get_noncancer) { 76 | ukb_dat_code <- pivot_table(ukb_dat, "p20002_", "noncancer_code") 77 | ukb_dat_year <- pivot_table(ukb_dat, "p20008_", "noncancer_year") 78 | ukb_dat_noncancer_long <- dplyr::full_join(ukb_dat_code, ukb_dat_year, by = c("eid"="eid", "instance"="instance")) 79 | } 80 | 81 | # subset to ukb-codes in provided list 82 | if (verbose) cli::cli_alert("Identify matching codes") 83 | if (get_cancer) ukb_dat_cancer_long = ukb_dat_cancer_long |> dplyr::filter(cancer_code %in% codes_cancer) 84 | if (get_noncancer) ukb_dat_noncancer_long = ukb_dat_noncancer_long |> dplyr::filter(noncancer_code %in% codes_noncancer) 85 | 86 | # join tables if both types of codes provided, otherwise just use one 87 | if (verbose) cli::cli_alert("Joining tables") 88 | if (!is.null(ukb_dat_cancer_long) & !is.null(ukb_dat_noncancer_long)) { 89 | # both code types provided 90 | ukb_dat_sr <- dplyr::full_join(ukb_dat_cancer_long, ukb_dat_noncancer_long, by = c("eid"="eid", "instance"="instance")) 91 | } else if (!is.null(ukb_dat_noncancer_long)) { 92 | # just non-cancer 93 | ukb_dat_sr <- ukb_dat_noncancer_long 94 | } else { 95 | # just cancer 96 | ukb_dat_sr <- ukb_dat_cancer_long 97 | } 98 | 99 | # finish 100 | #if (verbose) cli::cli_alert_success(c("Finished cancer registry: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}.")) 101 | 102 | # Return data 103 | return(ukb_dat_sr) 104 | 105 | } 106 | 107 | -------------------------------------------------------------------------------- /R/get_rap_phenos.R: -------------------------------------------------------------------------------- 1 | #' Get UK Biobank participant phenotype data 2 | #' 3 | #' @description Using a Spark node/cluster on the UK Biobank Research Analysis Platform (DNAnexus), use R to extract a provided set of variables. Using code from the UK Biobank DNAnexus team https://github.com/UK-Biobank/UKB-RAP-Notebooks/blob/main/NBs_Prelim/105_export_participant_data_to_r.ipynb 4 | #' 5 | #' @return Returns a data.frame (the participant data for the requested variables) 6 | #' 7 | #' @author Luke Pilling 8 | #' 9 | #' @name get_rap_phenos 10 | #' 11 | #' @param names A string or vector of strings. The variable name(s) required. e.g., c("eid","p31","p21003_i0") 12 | #' @param value_coding A string. How to handle coded fields. "replace" if a coding value exists, replace the raw value with the code; "raw" export the raw values of the field; "exclude" if a coding value exists, do not export the value (most commonly used with sparse fields). 13 | #' \code{default="replace"} 14 | #' @param names_are_titles Logical. Passing DNAnexus variable "titles" e.g., c("Age at recruitment", "Standing height | Instance 0"). 15 | #' \code{default=FALSE} 16 | #' @param record A string. The `dnanexus_link` file descriptor of the .dataset to use. Default (if left as NULL) is to use the most recently dispensed dataset. 17 | #' \code{default=most recent dataset} 18 | #' @param verbose Logical. Be verbose, 19 | #' \code{default=FALSE} 20 | #' 21 | #' @examples 22 | #' # get phenotype data 23 | #' ukb <- get_rap_phenos(c("eid","p31","p21003_i0","p53_i0")) 24 | #' 25 | #' # save to file on the RAP worker node 26 | #' readr::write_tsv(ukb, "ukbrap.phenos.20231114.txt.gz") 27 | #' 28 | #' # upload data to RAP storage 29 | #' upload_to_rap(file="ukbrap.phenos.20231114.txt.gz", dir="") 30 | #' 31 | #' @export 32 | #' 33 | get_rap_phenos <- function( 34 | names, 35 | value_coding = "replace", 36 | names_are_titles = FALSE, 37 | record = NULL, 38 | verbose = FALSE 39 | ) { 40 | 41 | start_time <- Sys.time() 42 | 43 | # install packages if required 44 | install.packages(setdiff(c("reticulate", "arrow", "sparklyr"), rownames(installed.packages())), dependencies = TRUE, quiet = TRUE) 45 | 46 | # Check input & options 47 | if (verbose) cat("Check input & options\n") 48 | if (! is.character(names) ) stop("`names` needs to be a character vector of UK Biobank RAP phenotype names, e.g., c(\"eid\",\"p31\",\"p21003_i0\")") 49 | 50 | # Check `eid` is included 51 | if (verbose) cat("Check `eid` is included\n") 52 | if (! "eid" %in% names) { 53 | cat("Adding `eid` to the requested variable names\n") 54 | names <- c("eid",names) 55 | } 56 | 57 | cat("Getting data for ", length(names), " phenotypes\n") 58 | 59 | # Import dxdata package and initialize Spark (dxdata) engine 60 | if (verbose) cat("Import dxdata package and initialize Spark (dxdata) engine\n") 61 | cat("*** If asked about creating a python environment, choose 'no' ****\n") 62 | dxdata <- reticulate::import("dxdata") 63 | 64 | # Connect to the dataset 65 | if (verbose) cat("Connect to the dataset\n") 66 | project <- system("dx env | grep project- | awk -F '\t' '{print $2}'", intern = TRUE) 67 | if (is.null(record)) { 68 | record <- system("dx describe *dataset | grep record- | awk -F ' ' '{print $2}' | sort | tail -n 1" , intern = TRUE) 69 | } 70 | DATASET_ID <- paste0(project, ":", record) 71 | dataset <- dxdata$load_dataset(id=DATASET_ID) 72 | if (verbose) cat(" - [", DATASET_ID, "]\n") 73 | 74 | # Select participant table 75 | if (verbose) cat("Select participant table\n") 76 | pheno <- dataset$entities_by_name[['participant']] 77 | 78 | # Select fields from participant table 79 | if (verbose) cat("Select fields from participant table\n") 80 | if (names_are_titles) { 81 | fld = purrr::map(names, function(x) pheno$find_field(title=x)) 82 | } else { 83 | fld = purrr::map(names, function(x) pheno$find_field(name=x) ) 84 | } 85 | 86 | # Define the Spark engine 87 | if (verbose) cat("Define the Spark engine\n") 88 | engine <- dxdata$connect(dialect="hive+pyspark") 89 | 90 | # Retrieve the fields defined in fld list 91 | if (verbose) cat("Retrieve the fields defined in fld list\n") 92 | df <- pheno$retrieve_fields(engine=engine, fields=fld, coding_values=value_coding) 93 | 94 | # Write the data to a temporary parquet file 95 | if (verbose) cat("Write the data to a temporary parquet file\n") 96 | uid_parquet <- runif(n=1, min=1e6, max=9e6) 97 | parquet_name <- paste0('tmpdf.',uid_parquet,'.parquet') 98 | system(paste0('hadoop fs -rm -r -f ', parquet_name), intern = TRUE) 99 | df$write$parquet(parquet_name) 100 | if (verbose) cat(paste0("Witten to ", parquet_name, "\n")) 101 | 102 | # Copy the temporary parquet file from distributed to the local file system 103 | if (verbose) cat("Copy the temporary parquet file from distributed to the local file system\n") 104 | if(dir.exists(parquet_name)) unlink(paste0('tmpdf.',uid_parquet,'.parquet'), recursive=TRUE) 105 | system(paste0('hadoop fs -copyToLocal ', parquet_name), intern = TRUE) 106 | 107 | # Read the dataset information R using Apache arrow package 108 | if (verbose) cat("Read the dataset information R using Apache arrow package\n") 109 | ds <- arrow::open_dataset(parquet_name) 110 | 111 | # Collect the data from the dataset to R memory 112 | if (verbose) cat("Collect the data from the dataset to R memory\n") 113 | tbl <- ds |> dplyr::collect() 114 | 115 | if (verbose) cat("Done. Time taken:", Sys.time() - start_time, "\n") 116 | 117 | tbl 118 | 119 | } 120 | 121 | -------------------------------------------------------------------------------- /vignettes/ascertain_diagnoses.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Ascertain diagnoses" 3 | description: > 4 | Ascertain UK Biobank participant diagnoses from all sources (medical records and self-report data). 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{Ascertain diagnoses} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ```{r, include = FALSE} 13 | knitr::opts_chunk$set( 14 | collapse = TRUE, 15 | comment = "#>" 16 | ) 17 | library(ukbrapR) 18 | ``` 19 | 20 | Diagnosis of conditions in UK Biobank participants come from multiple data sources: 21 | 22 | * Self-report during assessment 23 | 24 | * Hospital inpatient records (HES) 25 | 26 | * Primary care (GP) 27 | 28 | * Cancer registry 29 | 30 | * Cause of death 31 | 32 | The {ukbrapR} package makes it fast and easy to ascertain diagnoses from multiple UK Biobank data sources in the DNAnexus Research Analysis Platform (RAP). 33 | 34 | 35 | ## Requires exported files 36 | 37 | This only needs to happen once per project. Running `export_tables()` will submit the necessary `table-exporter` jobs to save the raw medical records files to the RAP persistent storage for the project. ~10Gb of text files are created. This will cost ~£0.15 per month to store in the RAP standard storage. 38 | 39 | Once the files are exported (~15mins) these can then be used by the below functions to extract diagnoses based on codes lists. 40 | 41 | 42 | ## Input 43 | 44 | Depending on the data source different coding vocabularies are required: 45 | 46 | * `ICD10` (for searching HES diagnoses, cause of death, and cancer registry) 47 | 48 | * `ICD9` (for searching older HES diagnosis data) 49 | 50 | * `Read2` and `CTV3` (for GP clinical events) 51 | 52 | * `OPCS3` and `OPCS4` (for HES operations) 53 | 54 | * `ukb_cancer` and `ukb_noncancer` (for self-reported illness at UK Biobank assessments - all instances will be searched) 55 | 56 | Ascertaining diagnoses typically takes two steps: 57 | 58 | 59 | ## 1. Get medical records and self-reported illness data for provided codes 60 | 61 | For a given set of diagnostic codes get the participant medical events and self-reported data. Returns a list of 6 data frames: the subset of the long clinical files with matched codes. 62 | 63 | Codes need to be provided as a data frame with two fields: `vocab_id` and `code`. Valid code vocabularies are listed above. Other cols (such as condition and description) are ignored. 64 | 65 | ```{r} 66 | # example diagnostic codes for Chronic Kidney Disease 67 | codes_df_ckd <- ukbrapR:::codes_df_ckd 68 | head(codes_df_ckd) 69 | 70 | # get diagnosis data - returns list of data frames (one per source) 71 | diagnosis_list <- get_diagnoses(codes_df_ckd) 72 | 73 | # N records for each source 74 | nrow(diagnosis_list$gp_clinical) 75 | nrow(diagnosis_list$hesin_diag) 76 | nrow(diagnosis_list$death_cause) 77 | ``` 78 | 79 | If providing primary care codes for measures (BMI etc) these are also returned (the `gp_clinical` object in the returned list contains all cols for matched codes). 80 | 81 | 82 | ## 2. Get date first diagnosed 83 | 84 | Usually the user is interested in combining the separate data sources into a combined phenotype: the date first diagnosed for each participant from the data/codes in step 1 (cause of death, HES diagnoses, GP clinical, cancer registry, HES operations, and self-reported illness fields). 85 | 86 | In addition to the "date first" `df` field are: 87 | 88 | - a `src` field indicating the source of the date of first diagnosis. 89 | - a `bin` field indicating the cases [1] and controls [0]. This relies on a small number of baseline fields also exported. The `df` field for the controls is the date of censoring (currently 30 October 2022). 90 | - a `bin_prev` field indicating whether the case was before the UK Biobank baseline assessment 91 | 92 | ```{r} 93 | # for each participant, get Date First diagnosed with the condition 94 | diagnosis_df <- get_df(diagnosis_list) 95 | 96 | names(diagnosis_df) 97 | summary(diagnosis_df) 98 | ``` 99 | 100 | You can add a prefix to all the variable names by specifying the "prefix" option: 101 | 102 | ```{r} 103 | diagnosis_df <- get_df(diagnosis_list, prefix="ckd") 104 | 105 | # how many cases ascertained? 106 | table(diagnosis_df$ckd_bin) 107 | 108 | # source of earliest diagnosis date 109 | table(diagnosis_df$ckd_src) 110 | 111 | # date of diagnosis for prevalent cases (i.e., before UK Biobank baseline assessment) 112 | summary(diagnosis_df$ckd_df[ diagnosis_df$ckd_bin_prev == 1 ]) 113 | ``` 114 | 115 | ## Ascertaining multiple conditions at once 116 | 117 | The default `get_df()` behaviour is to use all available codes. However, the most time-efficient way to get multiple conditions is to run `get_diagnoses()` once for all codes for the conditions you wish to ascertain, then get the "date first diagnosed" for each condition separately. In the codes data frame you just need a field indicating the condition name, that will become the variable prefixes. 118 | 119 | ```{r} 120 | # combine haemochromatosis and CKD codes together 121 | # each contain there columns: condition, vocab_id, and code 122 | # where `condition` is either "hh" or "ckd" and will become the variable prefix 123 | codes_df_combined <- rbind(ukbrapR:::codes_df_hh, ukbrapR:::codes_df_ckd) 124 | 125 | # get diagnosis data - returns list of data frames (one per source) 126 | diagnosis_list <- get_diagnoses(codes_df_combined) 127 | 128 | # for each participant, get Date First diagnosed with the condition 129 | diagnosis_df <- get_df(diagnosis_list, group_by="condition") 130 | 131 | # each condition has full set of output 132 | table(diagnosis_df$hh_bin) 133 | 134 | table(diagnosis_df$ckd_bin) 135 | ``` 136 | 137 | In the above example we also included a UK Biobank self-reported illness code for haemochromatosis, that was also ascertained (the Date First is run on each condition separately, they do not all need to have the same data sources). 138 | 139 | 140 | -------------------------------------------------------------------------------- /R/fields_to_phenos.R: -------------------------------------------------------------------------------- 1 | #' Check UK Biobank field IDs 2 | #' 3 | #' @description Check if provided field IDs are valid and return all possible phenotype names in the UK Biobank RAP 4 | #' 5 | #' @return Returns a vector of strings (valid phenotypes). 6 | #' 7 | #' @author Luke Pilling 8 | #' 9 | #' @name fields_to_phenos 10 | #' 11 | #' @param fields A vector of character strings. The field IDs to check if valid. 12 | #' @param filename A string. If provided, will save as a fieldname file ready for the table-exporter (including "eid"). 13 | #' \code{default=""} 14 | #' @param abort Logical. Abort if a field is missing?, 15 | #' \code{default=TRUE} 16 | #' @param verbose Logical. Be verbose, 17 | #' \code{default=FALSE} 18 | #' 19 | #' @examples 20 | #' 21 | #' # not instanced, not arrayed 22 | #' fields_to_phenos("31") # sex 23 | #' 24 | #' # instanced, not arrayed 25 | #' fields_to_phenos("53") # assessment date 26 | #' 27 | #' # instanced and arrayed 28 | #' fields_to_phenos("93") # systolic blood pressure, manual reading 29 | #' 30 | #' # instanced and arrayed, MRI assessments only 31 | #' fields_to_phenos("12673") # Heart rate recorded during vicorder run (Heart MRI) 32 | #' 33 | #' # check multiple simultaneously 34 | #' fields_to_phenos(c("31","93")) 35 | #' 36 | #' # only warn if an invalid field is provided (default is to abort) 37 | #' fields_to_phenos(c("31","notafield","93")) 38 | #' fields_to_phenos(c("31","notafield","93"), abort=FALSE) 39 | #' 40 | #' # save as fieldname file for the table-exporter (don't forget to upload to the RAP) 41 | #' fields_to_phenos(c("31","93"), filename="fieldnames.txt") 42 | #' 43 | #' @export 44 | #' 45 | fields_to_phenos <- function( 46 | fields, 47 | filename="", 48 | abort=TRUE, 49 | verbose=FALSE 50 | ) { 51 | 52 | # Check if 'field' is a character string of length 1 53 | if (class(fields)[1] != "character") { 54 | cli::cli_abort("Provided fields need to be a vector of character strings") # Abort if field is not a character string 55 | } 56 | 57 | # If filename provided, check it is valid 58 | save_file <- FALSE 59 | if (filename != "") { 60 | if (! is.character(filename)) cli::cli_abort("Provided filename need to be a character string") 61 | if (! length(filename) == 1) cli::cli_abort("Provided filename need to be a character string of length 1") 62 | save_file <- TRUE 63 | } 64 | 65 | # Apply field_to_phenos() to each field 66 | phenos <- purrr::map(fields, \(x) ukbrapR:::field_to_phenos(field=x, abort=abort, verbose=verbose)) |> purrr::list_c() 67 | 68 | # Save file, or return vector? 69 | if (save_file) { 70 | data.frame(c("eid",phenos)) |> readr::write_tsv(filename, col_names=FALSE, progress=FALSE) 71 | cli::cli_alert_success(stringr::str_c("Saved fields and phenos to {.file ", filename, "}")) 72 | } else { 73 | return(phenos) 74 | } 75 | 76 | } 77 | 78 | 79 | #' Function to check individual field_id 80 | #' 81 | #' @return Returns a vector of strings (valid phenotypes). 82 | #' 83 | #' @author Luke Pilling 84 | #' 85 | #' @name field_to_phenos 86 | #' 87 | #' @param field A string. The field ID (e.g., `31`) to check if valid. 88 | #' @param abort Logical. Abort if field is missing?, 89 | #' \code{default=TRUE} 90 | #' @param verbose Logical. Be verbose, 91 | #' \code{default=FALSE} 92 | #' 93 | #' @noRd 94 | #' 95 | field_to_phenos <- function( 96 | field, 97 | abort=TRUE, 98 | verbose=FALSE 99 | ) { 100 | 101 | # Check if 'field' is a character string of length 1 102 | if (class(field)[1] == "character") { 103 | if (length(field) > 1) { 104 | cli::cli_abort("field needs to be length 1") # Abort if field length is greater than 1 105 | } 106 | } else { 107 | cli::cli_abort("field needs to be a character string") # Abort if field is not a character string 108 | } 109 | 110 | # Extract the field ID, in case it's passed as 'p4080_i0_a0' or similar 111 | field_id <- field |> stringr::str_remove("p") |> stringr::str_split_i("_", 1) 112 | if (verbose) cli::cli_alert(c("Field ID: ", field_id)) 113 | 114 | # Get information for this field from the schema 115 | field_info <- ukbrapR:::ukb_schema[["field"]] |> dplyr::filter(field_id == !!field_id) 116 | 117 | # Was the provided field ID in the schema? 118 | if (nrow(field_info)==0) { 119 | if (abort) cli::cli_abort(stringr::str_c("Field ID [", field_id, "] not present in the UK Biobank schema (https://biobank.ctsu.ox.ac.uk/ukb/schema.cgi?id=1)")) 120 | if (!abort) { 121 | cli::cli_warn(stringr::str_c("Field ID [", field_id, "] not present in the UK Biobank schema (https://biobank.ctsu.ox.ac.uk/ukb/schema.cgi?id=1)")) 122 | return(NULL) 123 | } 124 | } 125 | 126 | # Initialize the basic field ID and valid fields list 127 | p_field_id <- stringr::str_c("p", field_id) 128 | valid_fields <- NULL 129 | 130 | # Check if the field is instanced and generate instances if true 131 | if (field_info$instanced == 1) { 132 | instances <- seq(field_info$instance_min, field_info$instance_max, 1) 133 | if (verbose) cli::cli_alert(stringr::str_c("Is instaced [", stringr::str_c(instances, collapse=","), "]")) 134 | } 135 | 136 | # Check if the field is arrayed and generate arrays if true 137 | # If the field is "multiple choice" value type then it is *not* arrayed 138 | if (field_info$arrayed == 1 & field_info$value_type != 22) { 139 | arrays <- seq(field_info$array_min, field_info$array_max, 1) 140 | if (verbose) cli::cli_alert(stringr::str_c("Is arrayed [", stringr::str_c(arrays, collapse=","), "]")) 141 | 142 | # Generate valid fields for non-instanced arrayed fields 143 | if (field_info$instanced == 0) { 144 | for (aa in 1:length(arrays)) { 145 | valid_fields <- c(valid_fields, stringr::str_c(p_field_id, "_a", arrays[aa])) 146 | } 147 | } 148 | 149 | # Generate valid fields for instanced arrayed fields 150 | if (field_info$instanced == 1) { 151 | for (ii in 1:length(instances)) { 152 | for (aa in 1:length(arrays)) { 153 | valid_fields <- c(valid_fields, stringr::str_c(p_field_id, "_i", instances[ii], "_a", arrays[aa])) 154 | } 155 | } 156 | } 157 | 158 | } else { 159 | 160 | # Generate valid fields for instanced non-arrayed fields 161 | if (field_info$instanced == 1) { 162 | for (ii in 1:length(instances)) { 163 | valid_fields <- c(valid_fields, stringr::str_c(p_field_id, "_i", instances[ii])) 164 | } 165 | } else { 166 | 167 | # not instanced or arrayed! 168 | valid_fields <- p_field_id 169 | 170 | } 171 | 172 | } 173 | 174 | # Return the list of valid fields for this field ID 175 | return(valid_fields) 176 | 177 | } 178 | -------------------------------------------------------------------------------- /R/label_ukb_field.R: -------------------------------------------------------------------------------- 1 | #' Update UK Biobank field with `title` and `label` from the schema 2 | #' 3 | #' @description Variables such as education and ethnicity are provided as integers but have specific codes. 4 | #' 5 | #' The UK Biobank schema are machine-readable dictionaries and mappings defining the internal structure of the online Showcase. https://biobank.ctsu.ox.ac.uk/crystal/schema.cgi 6 | #' 7 | #' This function updates a field in a data frame of UK Biobank with information from the Schema. 8 | #' 9 | #' @return Returns a data frame. 10 | #' 11 | #' @author Luke Pilling 12 | #' 13 | #' @name label_ukb_field 14 | #' 15 | #' @param d A data frame. The data frame containing the `field` to update. 16 | #' @param field A string. The field (e.g., `p54_i0`) in the provided data frame to update. 17 | #' @param field_id A string. If the field has been renamed (to e.g.,"assessment_centre") provide the field id here (e.g., "54"). 18 | #' \code{default=NULL} 19 | #' @param verbose Logical. Be verbose, 20 | #' \code{default=FALSE} 21 | #' 22 | #' @examples 23 | #' 24 | #' # update the Assessment Centre variable 25 | #' ukb <- ukbrapR::label_ukb_field(ukb, field="p54_i0") 26 | #' 27 | #' table(ukb$p54_i0) # tabulates the values 28 | #' table(haven::as_factor(ukb$p54_i0)) # tabulates the labels 29 | #' haven::print_labels(ukb$p54_i0) # show the value:label mapping for this variable 30 | #' 31 | #' # if the variable has been renamed, provide the "field" and "field_id" e.g., 32 | #' ukb <- ukbrapR::label_ukb_field(ukb, field="assessment_centre", field_id="54") 33 | #' 34 | #' @export 35 | #' 36 | label_ukb_field <- function( 37 | d, # the data frame 38 | field, # the variable name in the data 39 | field_id=NULL, # if not in UKB format, provide the field ID 40 | verbose=FALSE 41 | ) { 42 | 43 | # get symbol to use as variable name later 44 | if (verbose) cat(" - ", field, "\n") 45 | field_sym <- rlang::sym(field) 46 | 47 | # get field ID 48 | if (is.null(field_id)) field_id <- field |> stringr::str_remove("p") |> stringr::str_split_i("_", 1) 49 | if (verbose) print(" -- ", field_id, "\n") 50 | 51 | # get encoding ID & variable title 52 | field_eid <- ukbrapR:::ukb_schema[["field"]] |> dplyr::filter(field_id==!!field_id) |> dplyr::select(encoding_id) |> dplyr::pull() 53 | field_title <- ukbrapR:::ukb_schema[["field"]] |> dplyr::filter(field_id==!!field_id) |> dplyr::select(title) |> dplyr::pull() 54 | 55 | # search schema for coding 56 | index <- 0 57 | for (i in 2:6) if (field_eid %in% ukbrapR:::ukb_schema[[i]]$encoding_id) index <- i 58 | 59 | # if index was found, include labels, otherwise just update title 60 | if (index != 0) { 61 | 62 | # subset and convert to named vector 63 | field_coding <- ukbrapR:::ukb_schema[[index]] |> 64 | dplyr::filter(encoding_id == field_eid) |> 65 | dplyr::select(meaning, value) |> 66 | tibble::deframe() 67 | 68 | # skip labelling if values contain the "pipe" (implies collapsed array variables) 69 | if ( ! any(stringr::str_detect(na.omit(d[[field]]), stringr::fixed("|"))) ) { 70 | 71 | # update variable label 72 | d <- d |> dplyr::mutate( 73 | !! field_sym := haven::labelled( 74 | !! field_sym, 75 | labels = !! field_coding, 76 | label = !! field_title 77 | ) 78 | ) 79 | 80 | } 81 | 82 | } else { 83 | 84 | # update variable title 85 | d <- d |> dplyr::mutate( 86 | !! field_sym := haven::labelled( 87 | !! field_sym, 88 | label = !! field_title 89 | ) 90 | ) 91 | 92 | } 93 | 94 | return(d) 95 | 96 | } 97 | 98 | 99 | #' Update a data frame containing UK Biobank fields with `title` and `label` from the schema 100 | #' 101 | #' @description Variables such as education and ethnicity are provided as integers but have specific codes. 102 | #' 103 | #' The UK Biobank schema are machine-readable dictionaries and mappings defining the internal structure of the online Showcase. https://biobank.ctsu.ox.ac.uk/crystal/schema.cgi 104 | #' 105 | #' This function updates a data frame of UK Biobank data field with information from the Schema. 106 | #' 107 | #' It is in effect a wrapper to apply ukbrapR::label_ukb_field() to each variable in a data frame that looks like a UK Biobank field. 108 | #' 109 | #' Only recognised fields are modified (variables named things like "p54_i0"). Other variables are ignored. 110 | #' 111 | #' @return Returns a data frame. 112 | #' 113 | #' @author Luke Pilling 114 | #' 115 | #' @name label_ukb_fields 116 | #' 117 | #' @param d A data frame. The data frame containing UK Biobank fields to update. 118 | #' @param verbose Logical. Be verbose, 119 | #' \code{default=FALSE} 120 | #' 121 | #' @examples 122 | #' 123 | #' # say the below data frame contains 4 variables: `eid`, `p54_i0`, `p50_i0` and `age_at_assessment` 124 | #' names(ukb) 125 | #' 126 | #' # update the variables that looks like UK Biobank fields with titles and, where cateogrical, labels 127 | #' # i.e., `p54_i0` and `p50_i0` only -- `eid` and `age_at_assessment` are ignored 128 | #' ukb <- ukbrapR::label_ukb_fields(ukb) 129 | #' 130 | #' table(ukb$p54_i0) # tabulates the values 131 | #' table(haven::as_factor(ukb$p54_i0)) # tabulates the labels 132 | #' haven::print_labels(ukb$p54_i0) # show the value:label mapping for this variable 133 | #' 134 | #' @export 135 | #' 136 | label_ukb_fields <- function( 137 | d, 138 | verbose=FALSE 139 | ) { 140 | 141 | v <- packageVersion("ukbrapR") 142 | cli::cli_alert_info("ukbrapR v{v}") 143 | 144 | start_time <- Sys.time() 145 | 146 | # identify variables matching UK Biobank field format: start with "p", then have an integer before any "_" 147 | fields <- colnames(d) 148 | fields_int <- fields |> 149 | stringr::str_subset("^p") |> 150 | stringr::str_replace("p", "") |> 151 | stringr::str_split_i("_", 1) 152 | int_check <- function(vect) { 153 | vect <- as.character(vect) 154 | vect_int <- purrr::map(vect, \(x) all(unlist(stringr::str_split(x, "")) %in% 0:9)) |> purrr::list_c() 155 | return(vect[vect_int]) 156 | } 157 | fields_int <- int_check(fields_int) 158 | fields <- fields[ stringr::str_detect( fields, stringr::str_c( stringr::str_c("p", fields_int), collapse="|") ) ] 159 | 160 | # any matched? 161 | n_fields <- length(fields) 162 | if (n_fields==0) cli::cli_abort(c("x" = "No UK Biobank fields (e.g., 'p54_i0') identified in the provided data frame.")) 163 | 164 | # label each field identified 165 | cli::cli_alert("Labelling {n_fields} field{?s}") 166 | for (f in fields) d <- ukbrapR::label_ukb_field(d=d, field=f, verbose=verbose) 167 | 168 | # finished! 169 | cli::cli_alert_success(c("Finished. Time taken: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}.")) 170 | 171 | # return it 172 | return(d) 173 | 174 | } -------------------------------------------------------------------------------- /R/get_selfrep_illness_spark.R: -------------------------------------------------------------------------------- 1 | #' Get UK Biobank participant self-reported illness/year data for specific codes 2 | #' 3 | #' @description For a specific self-reported illness code or codes, identify whether the participant has self-reported at any visit, and identify the year. 4 | #' Intended for use on the UK Biobank DNnexus Research Analysis Platform, but if the user provides their own dataframe of UK Biobank self-reported fields this works on any system. 5 | #' 6 | #' @return Returns a data frame with four variables: eid, selfrep [binary, codes identified?], selfrep_df [date of reported illness], selfrep_i [instance the illness was first reported] 7 | #' 8 | #' @author Luke Pilling 9 | #' 10 | #' @name get_selfrep_illness_spark 11 | #' 12 | #' @param codes_df A data frame. Contains the `vocab_col` and `codes_col` i.e., an indicator of the vocabulary and the diagnostic codes. 13 | #' @param vocab_col A string. Column name in `codes_df` that contains the vocabulary indicator for the code (for self-reported it needs to be either "ukb_cancer" or "ukb_noncancer"). 14 | #' \code{default='vocab_id'} 15 | #' @param codes_col A string. Column name in `codes_df` that contains the self-reported disease code (e.g., 1507). 16 | #' \code{default='code'} 17 | #' @param ukb_dat A data frame. Optional. If not provided, will get the phenotypes from the RAP. Contains the self-reported illness fields e.g., `p20008_i0_a0`. 18 | #' \code{default=NULL} 19 | #' @param n_cancer_arrays An integer. It is not trivial to determine the max number of arrays to request from Spark for the self-reported illnesses. The defaults match the currently (Feb 2024) available data but may need increasing in the future. 20 | #' \code{default=5} 21 | #' @param n_noncancer_arrays An integer. It is not trivial to determine the max number of arrays to request from Spark for the self-reported illnesses. The defaults match the currently (Feb 2024) available data but may need increasing in the future. 22 | #' \code{default=30} 23 | #' @param verbose Logical. Be verbose, 24 | #' \code{default=FALSE} 25 | #' 26 | #' @examples 27 | #' # example diagnostic codes for haemochromatosis 28 | #' print(codes_df_hh) 29 | #' 30 | #' # get self-reported data - a data frame 31 | #' selfrep_df <- get_selfrep_illness_spark(codes_df_hh) 32 | #' 33 | #' # inspect variables 34 | #' table(selfrep_df$selfrep) 35 | #' summary(selfrep_df$selfrep_df) 36 | #' table(selfrep_df$selfrep_i) 37 | #' 38 | #' @export 39 | #' 40 | get_selfrep_illness_spark <- function( 41 | codes_df, 42 | vocab_col = "vocab_id", 43 | codes_col = "code", 44 | ukb_dat = NULL, 45 | n_cancer_arrays = 5, 46 | n_noncancer_arrays = 30, 47 | verbose = FALSE 48 | ) { 49 | 50 | start_time <- Sys.time() 51 | 52 | # Check input 53 | if (verbose) cat("Check inputs\n") 54 | if (! any(class(codes_df) %in% c("data.frame","tbl_df"))) stop("Codelist needs to be provided as a data frame") 55 | codes_df = as.data.frame(codes_df) # in case a tibble 56 | 57 | # check `codes_df` -- does it contain "ukb_cancer" or "ukb_noncancer" codes? Cannot have both. Makes no sense. 58 | if (! any(c("ukb_cancer","ukb_noncancer") %in% codes_df[,vocab_col])) stop("`vocab_col` column in `codes_df` needs to contain either 'ukb_cancer' or 'ukb_noncancer'") 59 | if (all(c("ukb_cancer","ukb_noncancer") %in% codes_df[,vocab_col])) stop("`vocab_col` column in `codes_df` needs to contain either 'ukb_cancer' or 'ukb_noncancer' (not both)") 60 | 61 | # get codes 62 | if (verbose) cat("Get codes, determine if cancer\n") 63 | codes <- codes_df[ codes_df[,vocab_col] %in% c("ukb_cancer","ukb_noncancer") ,codes_col] 64 | cat("Getting data on ", length(unique(codes)), " codes\n") 65 | 66 | # is cancer? 67 | is_cancer <- TRUE 68 | if ("ukb_noncancer" %in% codes_df[,vocab_col]) is_cancer <- FALSE 69 | 70 | # if `ukb_dat` is NULL use `get_rap_phenos()` to get phenotype data - all instances, all arrays 71 | # need: eid, 20001, 20002, 20006, 20007, 20008, 20009 72 | if (is.null(ukb_dat)) { 73 | 74 | if (verbose) cat("No `ukb_dat` object provided - will get from RAP\n") 75 | 76 | # Determine variable names needed (depends if cancer or non-cancer) 77 | # will use 20001 (cancer code) and 20002 (non-cancer code) 78 | # will use the interpolated year (20006 = cancer year, 20008 = non-cancer year) 79 | 80 | # RAP stores arrays as separate variables 81 | 82 | # get field names 83 | if (verbose) cat("Determine field names to request\n") 84 | names = "eid" 85 | 86 | # if cancer or non-cancer code: 87 | if (is_cancer) { 88 | # cancer code = 20001 89 | for (a in c(0:n_cancer_arrays)) names <- c(names, stringr::str_c("p20001_i", 0, "_a", a)) 90 | for (a in c(0:n_cancer_arrays)) names <- c(names, stringr::str_c("p20001_i", 1, "_a", a)) 91 | for (a in c(0:n_cancer_arrays)) names <- c(names, stringr::str_c("p20001_i", 2, "_a", a)) 92 | for (a in c(0:n_cancer_arrays)) names <- c(names, stringr::str_c("p20001_i", 3, "_a", a)) 93 | # cancer year = 20006 94 | for (a in c(0:n_cancer_arrays)) names <- c(names, stringr::str_c("p20006_i", 0, "_a", a)) 95 | for (a in c(0:n_cancer_arrays)) names <- c(names, stringr::str_c("p20006_i", 1, "_a", a)) 96 | for (a in c(0:n_cancer_arrays)) names <- c(names, stringr::str_c("p20006_i", 2, "_a", a)) 97 | for (a in c(0:n_cancer_arrays)) names <- c(names, stringr::str_c("p20006_i", 3, "_a", a)) 98 | } else { 99 | # non-cancer illness code = 20002 100 | for (a in c(0:n_noncancer_arrays)) names <- c(names, stringr::str_c("p20002_i", 0, "_a", a)) 101 | for (a in c(0:n_noncancer_arrays)) names <- c(names, stringr::str_c("p20002_i", 1, "_a", a)) 102 | for (a in c(0:n_noncancer_arrays)) names <- c(names, stringr::str_c("p20002_i", 2, "_a", a)) 103 | for (a in c(0:n_noncancer_arrays)) names <- c(names, stringr::str_c("p20002_i", 3, "_a", a)) 104 | # non-cancer illness year = 20008 105 | for (a in c(0:n_noncancer_arrays)) names <- c(names, stringr::str_c("p20008_i", 0, "_a", a)) 106 | for (a in c(0:n_noncancer_arrays)) names <- c(names, stringr::str_c("p20008_i", 1, "_a", a)) 107 | for (a in c(0:n_noncancer_arrays)) names <- c(names, stringr::str_c("p20008_i", 2, "_a", a)) 108 | for (a in c(0:n_noncancer_arrays)) names <- c(names, stringr::str_c("p20008_i", 3, "_a", a)) 109 | } 110 | 111 | if (verbose) print(names) 112 | 113 | # get fields from the RAP 114 | if (verbose) cat("Get fields from the RAP\n") 115 | ukb_dat <- ukbrapR::get_rap_phenos(names, value_coding = "raw", verbose = verbose) 116 | 117 | } else { 118 | cat("User is providing own `ukb_dat` with self-reported illness fields\n") 119 | } 120 | 121 | # remove any cols with all missing values 122 | ukb_dat = ukb_dat |> dplyr::select_if(~ !all(is.na(.))) 123 | 124 | # check all visits for participant - create `selfrep` (binary, ever), `selfrep_df` (the "Date First" variable) and `selfrep_i` (the "instance" i.e., visit) 125 | # https://biobank.ctsu.ox.ac.uk/crystal/label.cgi?id=100074 126 | # will use 20001 (cancer code) and 20002 (non-cancer code) 127 | # will use 20006-9 the interpolated year/age (6 = cancer year, 8 = non-cancer year) 128 | # interpolated: 129 | # - If the participant gave a calendar year, then the best-fit time is half-way through that year. For example if the year was given as 1970, then the value presented is 1970.5 130 | # - If the participant gave their age then the value presented is the fractional year corresponding to the mid-point of that age. For example, if the participant said they were 30 years old then the value is the date at which they were 30years+6months. 131 | # - Interpolated values before the date of birth were truncated forwards to that time. 132 | # - Interpolated values after the time of data acquisition were truncated back to that time. 133 | # [-1] = "Date uncertain or unknown" and [-3] = "Preferred not to answer" (exclude both from DF) 134 | 135 | # create empty vars in ukb_dat to modify 136 | ukb_dat$selfrep <- 0 137 | ukb_dat$selfrep_df <- NA 138 | ukb_dat$selfrep_i <- NA 139 | 140 | # for this instance, check if participant self-reported this code and record which array 141 | get_selfrep_i <- function(ukb_dat, codes, i, verbose) { 142 | 143 | if (verbose) cat("Get data from instance ", i, "\n") 144 | 145 | # variable prefix depends on cancer or not: 146 | if (is_cancer) { 147 | v_diag <- paste0("p20001_i", i) 148 | v_year <- paste0("p20006_i", i) 149 | } 150 | if (!is_cancer) { 151 | v_diag <- paste0("p20002_i", i) 152 | v_year <- paste0("p20008_i", i) 153 | } 154 | 155 | # Number of diagnosis columns 156 | n_cols <- sum(stringr::str_detect(names(ukb_dat), v_diag)) 157 | 158 | # Iterate through each diagnosis column 159 | for (a in 0:(n_cols-1)) { 160 | 161 | if (verbose) cat("Get data from instance ", i, " array ", a, "\n") 162 | 163 | diag_col <- rlang::sym(paste0(v_diag, "_a", a)) 164 | year_col <- rlang::sym(paste0(v_year, "_a", a)) 165 | 166 | # Update where the code matches 167 | ukb_dat <- ukb_dat |> dplyr::mutate( 168 | selfrep_i = dplyr::if_else(selfrep == 0 & !!diag_col %in% codes, i, selfrep_i, selfrep_i), 169 | selfrep_df = dplyr::if_else(selfrep == 0 & !!diag_col %in% codes, !!year_col, selfrep_df, selfrep_df), 170 | selfrep = dplyr::if_else(selfrep == 0 & !!diag_col %in% codes, 1, selfrep, selfrep) 171 | ) 172 | } 173 | 174 | return(ukb_dat) 175 | 176 | } 177 | 178 | # update selfrep variables with data from each instance 179 | ukb_dat = get_selfrep_i(ukb_dat, codes, 0, verbose) 180 | ukb_dat = get_selfrep_i(ukb_dat, codes, 1, verbose) 181 | ukb_dat = get_selfrep_i(ukb_dat, codes, 2, verbose) 182 | ukb_dat = get_selfrep_i(ukb_dat, codes, 3, verbose) 183 | 184 | if (verbose) cat("Format date first variable\n") 185 | 186 | # replace unknown or missing years with NA 187 | ukb_dat = dplyr::mutate(ukb_dat, selfrep_df = dplyr::if_else(selfrep_df < 0, NA, selfrep_df)) 188 | 189 | # convert decimal to date 190 | ukb_dat = dplyr::mutate(ukb_dat, selfrep_df = lubridate::as_date(lubridate::date_decimal(selfrep_df))) 191 | 192 | # finish 193 | if (verbose) cat("Done. Time taken:", Sys.time() - start_time, "\n") 194 | 195 | # Return data 196 | return(ukb_dat[,c("eid", "selfrep", "selfrep_df", "selfrep_i")]) 197 | 198 | } 199 | 200 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ukbrapR 2 | 3 | 4 | [![](https://img.shields.io/badge/version-0.3.9-informational.svg)](https://github.com/lcpilling/ukbrapR) 5 | [![](https://img.shields.io/github/last-commit/lcpilling/ukbrapR.svg)](https://github.com/lcpilling/ukbrapR/commits/main) 6 | [![](https://img.shields.io/badge/lifecycle-experimental-orange)](https://www.tidyverse.org/lifecycle/#experimental) 7 | [![DOI](https://zenodo.org/badge/709765135.svg)](https://zenodo.org/doi/10.5281/zenodo.11517716) 8 | 9 | 10 | ukbrapR (phonetically: 'U-K-B-wrapper') is an R package for working in the UK Biobank Research Analysis Platform (RAP). The aim is to make it quicker, easier, and more reproducible. 11 | 12 | > Since `v0.2.0` ukbrapR works best on a "normal" cluster using RStudio and raw data from the table-exporter. Old Spark functions are still available but are not updated. 13 | 14 | Wrapped server icon by DALL-E 15 | 16 | ## Installation 17 | 18 | In the DNAnexus Tools menu launch Posit Workbench and start an RStudio environment. 19 | 20 | ```r 21 | # install current version 22 | remotes::install_github("lcpilling/ukbrapR") 23 | ``` 24 | 25 | ## Features 26 | 27 | There are three main groups of functions: 28 | 29 | * :dna: [Genetics](#genetic-variants): extract genotypes from Bulk data, create polygenic score 30 | * :clipboard: [Diagnoses](#ascertain-diagnoses): ascertain from health records and self-reported illness data, determine date first diagnosed 31 | * :hammer_and_wrench: [Utilities](#utility-functions): Check field names, label data fields, upload/download files from RAP, and pull phenotypes from Spark 32 | 33 | ## Genetic variants 34 | 35 | Bulk imputed genotypes and variant calls from Whole Genome Sequencing are available and can be easily accessed in an RStudio instance. 36 | 37 | ### Extract variants 38 | 39 | `extract_variants()` by default uses [bgenix](https://enkre.net/cgi-bin/code/bgen) and [plink](https://www.cog-genomics.org/plink/) to subset the [imputed BGEN files](https://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=22828) and read it quickly and easily into R. 40 | 41 | The only required input is a data frame (or path to file) containing "rsid" and "chr" variables. See function documentation for further details/options (including the available `make_imputed_bed()` and `load_bed()` internal functions). 42 | 43 | ```r 44 | varlist <- data.frame(rsid=c("rs1800562","rs429358"), chr=c(6,19)) 45 | 46 | imputed_genotypes <- extract_variants(varlist) 47 | #> ~10 seconds 48 | 49 | dim(imputed_genotypes) 50 | #> [1] 487409 3 51 | ``` 52 | 53 | By setting option `source="dragen"` the function will instead subset the [DRAGEN WGS BGEN files](https://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=24309). This requires "pos" in the input data frame (build 38). 54 | 55 | ```r 56 | varlist_b38 <- data.frame(rsid=c("rs1800562","rs429358"), chr=c(6,19), pos=c(26092913,44908684)) 57 | 58 | dragen_genotypes <- extract_variants(varlist_b38, source="dragen") 59 | #> ~15 seconds 60 | ``` 61 | 62 | The highlight of developing this feature was naming the internal function `make_dragen_bed()` :dragon: :bed: 63 | 64 | > This assumes your project has access to the WGS BGEN files released April 2025. If not, run `ukbrapR:::make_dragen_bed_from_pvcfs()` to use [tabix](https://www.htslib.org/doc/tabix.html) and [plink](https://www.cog-genomics.org/plink/) to subset the [DRAGEN WGS pVCF files](https://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=24310). Much slower. 65 | 66 | 67 | ### Create polygenic score 68 | 69 | `create_pgs()` takes a data frame containing a list of variant associations with a trait and creates a weighted allele score using [plink](https://www.cog-genomics.org/plink/1.9/score). By default it uses the [imputed](https://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=22828) genotypes. 70 | 71 | The only required input is a data frame (or path to file) containing rsid, chr, pos, effect_allele, other_allele, beta. For DRAGEN pos is build 38. 72 | 73 | ```r 74 | # weights from GWAS of liver cirrhosis (Innes 2020 Gastroenterology doi:10.1053/j.gastro.2020.06.014) 75 | varlist_pgs <- readr::read_tsv(system.file("files", "pgs_liver_cirrhosis.txt", package="ukbrapR")) 76 | head(varlist_pgs) 77 | #> rsID CHR POS effect_allele other_allele effect_weight locus_name 78 | #> 79 | #> 1 rs2642438 1 220796686 A G -0.177 MARC1 80 | #> 2 rs11925835 3 56831417 T C -0.235 ARHGEF3 81 | #> 3 rs72613567 4 87310241 TA T -0.166 HSD17B13 82 | #> 4 rs2954038 8 125495147 C A 0.16 TRIB1 83 | #> 5 rs11065384 12 120985482 T C 0.275 HNF1A 84 | #> 6 rs28929474 14 94378610 T C 0.561 SERPINA1 85 | 86 | liver_pgs <- create_pgs( 87 | in_file=varlist_pgs, # can be a data frame or file path 88 | out_file="liver_cirrhosis.imputed.pgs", # {optional} prefix for created .bed and .tsv files 89 | pgs_name="liver_cirrhosis_pgs") # {optional} variable name 90 | #> → Extracting 9 variants from 8 imputed files 91 | #> ✔ PGS created! See file liver_cirrhosis.imputed.pgs.tsv 92 | #> ~1 minute 93 | 94 | summary(liver_pgs$liver_cirrhosis_pgs) 95 | #> Min. 1st Qu. Median Mean 3rd Qu. Max. 96 | #> 0.00000 0.06006 0.08200 0.08589 0.10722 0.26639 97 | ``` 98 | 99 | ## Ascertain diagnoses 100 | 101 | Diagnosis of conditions in UK Biobank participants come from multiple data sources. {ukbrapR} makes it fast and easy to ascertain diagnoses from multiple UK Biobank data sources in the DNAnexus Research Analysis Platform (RAP). Follow the below steps. See the website article for more details. 102 | 103 | ### 1. Export tables of raw data 104 | 105 | This only needs to happen once per project. Run `export_tables()` to submit the `table-exporter` jobs to save the required files to the RAP persistent storage. ~10Gb of text files are created, costing ~£0.15 per month to store. 106 | 107 | ### 2. Get diagnoses from all data sources 108 | 109 | For a given set of diagnostic codes get the participant Electronic Medical Records (EMR) and self-reported illess data. Returns a list containing up to 6 data frames: the subset of the clinical files with matched codes. 110 | 111 | Codes need to be provided as a data frame with two fields: `vocab_id` and `code`. Valid code vocabularies are: 112 | 113 | - `ICD10` (for searching HES diagnoses, cause of death, and cancer registry) 114 | - `ICD9` (for searching older HES diagnosis data) 115 | - `Read2` and `CTV3` (for GP clinical events) 116 | - `OPCS3` and `OPCS4` (for HES operations) 117 | - `ukb_cancer` and `ukb_noncancer` (for self-reported illness at UK Biobank assessments - all instances will be searched) 118 | 119 | ```r 120 | # example diagnostic codes for CKD 121 | codes_df_ckd <- ukbrapR:::codes_df_ckd 122 | head(codes_df_ckd) 123 | #> condition vocab_id code 124 | #> 1 ckd ICD10 N18.3 125 | #> 2 ckd ICD10 N18.4 126 | #> 3 ckd ICD10 N18.5 127 | #> ... 128 | 129 | # get diagnosis data - returns list of data frames (one per source) 130 | diagnosis_list <- get_diagnoses(codes_df_ckd) 131 | #> 7 ICD10 codes, 40 Read2 codes, 37 CTV3 codes 132 | #> ~2 minutes 133 | 134 | # N records for each source 135 | nrow(diagnosis_list$gp_clinical) # 29,083 136 | nrow(diagnosis_list$hesin_diag) # 206,390 137 | nrow(diagnosis_list$death_cause) # 1,962 138 | ``` 139 | 140 | ### 3. Get date first diagnosed 141 | 142 | Identify the date first diagnosed for each participant from any of datasets searched with `get_diagnoses()` (cause of death, HES diagnoses, GP clinical, cancer registry, HES operations, and self-reported illness fields). 143 | 144 | Also included are: 145 | 146 | - a `src` field indicating the source of the date of first diagnosis. 147 | - a `bin` field indicating the cases [1] and controls [0]. This relies on a small number of baseline fields also exported. The `df` field for the controls is the date of censoring (currently 30 October 2022). 148 | - a `bin_prev` field indicating whether the case was before the UK Biobank baseline assessment 149 | 150 | ```r 151 | # for each participant, get Date First diagnosed with the condition 152 | # {optional} add a prefix to the variable names with "prefix" 153 | diagnosis_df <- get_df(diagnosis_list, prefix="ckd") 154 | #> ~2 seconds 155 | 156 | # how many cases ascertained? 157 | table(diagnosis_df$ckd_bin) 158 | #> 0 1 159 | #> 470334 31935 160 | 161 | # source of earliest diagnosis date 162 | table(diagnosis_df$ckd_src) 163 | #> death gp hes selfrep_i0 selfrep_i1 selfrep_i2 selfrep_i3 164 | #> 224 12394 19310 85 16 63 3 165 | 166 | # date of diagnosis for prevalent cases (i.e., before UK Biobank baseline assessment) 167 | summary(diagnosis_df$ckd_df[ diagnosis_df$ckd_bin_prev == 1 ]) 168 | #> Min. 1st Qu. Median Mean 3rd Qu. Max. 169 | #> "1958-01-01" "2006-06-21" "2007-01-12" "2006-06-24" "2007-11-19" "2010-06-16" 170 | ``` 171 | 172 | ### Ascertaining multiple conditions at once 173 | 174 | The default `get_df()` behaviour is to use all available codes. However the most time-efficient way to get multiple conditions is to run `get_diagnoses()` once for all codes for the conditions you wish to ascertain, then get the "date first diagnosed" for each condition separately. In the codes data frame you just need a field indicating the condition name, that will become the variable prefixes. 175 | 176 | ```r 177 | # combine haemochromatosis and CKD codes together 178 | # each contain there columns: condition, vocab_id, and code 179 | # where `condition` is either "hh" or "ckd" and will become the variable prefix 180 | codes_df_combined = rbind(ukbrapR:::codes_df_hh, ukbrapR:::codes_df_ckd) 181 | 182 | # get diagnosis data - returns list of data frames (one per source) 183 | diagnosis_list <- get_diagnoses(codes_df_combined) 184 | 185 | # for each participant, get Date First diagnosed with the condition 186 | diagnosis_df = get_df(diagnosis_list, group_by="condition") 187 | 188 | # each condition has full set of output 189 | table(diagnosis_df$hh_bin) 190 | #> 0 1 191 | #> 500254 2015 192 | 193 | table(diagnosis_df$ckd_bin) 194 | #> 0 1 195 | #> 470334 31935 196 | ``` 197 | 198 | In the above example we also included a UK Biobank self-reported illness code for haemochromatosis, that was also ascertained (the Date First is run on each condition separately, they do not all need to have the same data sources). 199 | 200 | ## Utility functions 201 | 202 | * Check if field IDs are valid and return phenotype names with `fields_to_phenos()` 203 | * Label UK Biobank data fields with `label_ukb_fields()` 204 | * Upload/download files between worker and RAP with `upload_to_rap()` and `download_from_rap()` 205 | * Pull phenotypes from Spark instance with `get_rap_phenos()` 206 | 207 | ## Questions and comments 208 | 209 | Please report any bugs or [issues](https://github.com/lcpilling/ukbrapR/issues), and feel free to suggest changes as [pull requests](https://github.com/lcpilling/ukbrapR/pulls). Alternatively, feel free to contact me via e-mail L.Pilling@exeter.ac.uk 210 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # ukbrapR v0.3.9 (4th Dec 2025) 2 | 3 | ### Changes 4 | - Fix issue with `dx` paths when downloading exported data in `/ukbrapr_data`. Previous method worked fine in RStudio but not Cloud Workstation, and highlighted a dependency on relative paths for `$ dx download` internal commands. 5 | - Add {tidyr} package to requirements. Has always been a requirement, but now it is explicit! 6 | 7 | 8 | # ukbrapR v0.3.8 (2nd Dec 2025) 9 | 10 | ### Changes 11 | - Implement suggestion by @nsandau (issue #34) to use `--brief --yes` flags when exporting tables with `export_tables()` (if running from terminal user is prompted for each job). 12 | 13 | 14 | # ukbrapR v0.3.7 (20th June 2025) 15 | 16 | ### Changes 17 | - `make_XXX_bed()` functions now give a more accurate ETA. Previously only considered the number of CHRs to search. Now considers the number of variants also. 18 | 19 | ### Bug fixes 20 | - Fix issue #31 where `fields_to_phenos()` incorrectly arrayed some "multiple choice" questions 21 | 22 | 23 | # ukbrapR v0.3.6 (29th May 2025) 24 | 25 | ### Change 26 | - `make_dragen_bed()` returns a more informative error if DRAGEN BGENs not found (suggests user is in a Project not updated since April 2025). 27 | - `make_XXX_bed()` functions now give a more accurate ETA. Previously only considered the number of CHRs to search. Now considers the number of variants also. 28 | - Removed indy/snow paths (i.e., my local servers) because we are now fully RAP working. 29 | - Removed Windows functionality. The RAP is always UNIX. 30 | 31 | 32 | # ukbrapR v0.3.5 (16th April 2025) 33 | 34 | ### New features 35 | - `create_pgs()` can now use a local BED file of already-extracted variants. Before, it assumed the user wanted to extract the variants prior to creating the PGS. 36 | 37 | 38 | # ukbrapR v0.3.4 (6th April 2025) 39 | 40 | ### New features 41 | - `make_dragen_bed()` now uses the BGEN files released in April 2025 (in prior versions used the pVCFs). 42 | - Means when `extract_variants()` or `create_pgs()` are called with option `source="dragen"` this is the default. 43 | - If your project does not have the new BGEN files available you can use `ukbrapR:::make_dragen_bed_from_pvcfs()` to extract from the pVCFs (slow). 44 | 45 | ### Changes 46 | - Updated internal schema to include new/updated fields. 47 | - Added "filename" option to `fields_to_phenos()` so that phenotype names can be written straight to a file ready for table-exporter. 48 | 49 | 50 | # ukbrapR v0.3.3 (18th March 2025) 51 | 52 | ### New features 53 | - New function `fields_to_phenos()` takes a vector of field IDs, check they are valid, and returns all possible UK Biobank RAP phenotypes from the schema 54 | 55 | 56 | # ukbrapR v0.3.2 (19th February 2025) 57 | 58 | ### Changes 59 | - Remove bundled plink, plink2 and bgenix files. Instead, download only if needed. 60 | - Add more consistent progress updates for `make_dragen_bed()` and `make_imputed_bed()` 61 | - Add "progress" options to `extract_variants()` and `create_pgs()` (default is FALSE). Default is TRUE if you directly call `make_dragen_bed()` or `make_imputed_bed()` 62 | 63 | ### Bug fixes 64 | - Fix `make_dragen_bed()` so it doesn't crash if the pVCF subset is empty (i.e., a searched-for chr:pos was missing) 65 | 66 | 67 | # ukbrapR v0.3.1 (10th February 2025) 68 | 69 | ### Bug fixes 70 | - Fix `make_dragen_bed()` position awk search, plink call 71 | - Fix `create_pgs()` when using WGS - needed to use chr:pos:a1:a2 not rsid 72 | - Fix `make_imputed_bed()` so it doesn't crash if the BGEN subset is empty (i.e., a searched-for rsid was missing) 73 | 74 | 75 | # ukbrapR v0.3.0 (29th January 2025) 76 | 77 | ### New features 78 | Suite of functions to extract and load genetic variants. Main ones of interest will be: 79 | 1. `extract_variants()` takes a list of variant rsIDs as input and extracts the imputed genotypes, loading to memory. This is really a wrapper around two other new functions: `make_imputed_bed()` and `load_bed()`. Also available in `make_dragen_bed()` to extract from whole genome sequence VCF files but this is pretty slow so usually user wants imputed variants. 80 | 2. `create_pgs()` creates a polygenic score (weighted allele score) using user-provided variants and weights. Loaded to memory but also saves a nicely formatted .tsv 81 | 82 | ### Breaking changes 83 | - Removing dependencies: reticulate, arrow, sparklyr. These take a few previous seconds to install every time and are rarely needed. Instead will be installed if user tries to use `get_rap_phenos()` 84 | - `get_emr_spark()` removed entirely. Much better to use `get_diagnoses()` which has had a *lot* of updates to functionality ad bud fixes. 85 | 86 | 87 | 88 | # ukbrapR v0.2.9 (12th January 2025) 89 | 90 | ### Bug fixes 91 | - Fixes for issue #19 (thanks to @nsandau for the help): 92 | 1. Where OPCS searches were not always performed correctly if only OPCS3/4 codes were provided. 93 | 2. When using "group_by" in `get_df()` some diagnoses were incorrectly carried over between groups when different vocabs were provided for each group (condition). 94 | 95 | ### Updates 96 | - Additional checking of `get_diagnoses()` input to abort if "blank" codes are provided to the grep. 97 | - When getting date first from self-reported illness data exclude "year" if < 1936 (earliest birth year for any participant) 98 | 99 | 100 | # ukbrapR v0.2.8 (05 October 2024) 101 | 102 | ### Bug fixes 103 | - Baseline dates TSV is now correctly located even if user changes working directory 104 | - HES operations dates were sometimes parsed as character - this is now fixed to parse as dates 105 | 106 | ### Updates 107 | - Warnings relating to parsing issues during grepping that are safe to ignore are now suppressed 108 | - Updates to documentation / examples / pkgdown site 109 | - New website articles to `ascertain_diagnoses`, `label_fields` and for `spark_functions` 110 | 111 | 112 | # ukbrapR v0.2.7 (30 September 2024) 113 | 114 | ### Updates 115 | - New function `label_ukb_field()` allows user to add titles and labels to UK Biobank fields provided as integers but are categorical. 116 | - New function `label_ukb_fields()` is a wrapper for the above. User just provides a data frame containing UK Biobank fields, and they all get formatted with titles (and labels if categorical). 117 | - Data from the UK Biobank schema (https://biobank.ctsu.ox.ac.uk/crystal/schema.cgi) are stored internally in `ukbrapR:::ukb_schema` 118 | - {haven} dependency added for labelling 119 | - Exported `baseline_dates.tsv` now also includes the assessment centres for completeness (but keeps the same filename to avoid any issues for current projects relying on already-exported files) 120 | 121 | 122 | # ukbrapR v0.2.6 (16 September 2024) 123 | 124 | ### Bug fix 125 | - Fix for issue #10. Grep issues if user provided only Read2 or CTV3 codes, if Read2 or CTV3 were <5 characters, or if Read2/CTV3 codes contained a hyphen. Thanks to @Simon-Leyss for highlighting. 126 | - Fix for issue #11. When getting self-reported illness codes there was a problem joining the tables if user only provided cancer codes. Thanks to @LauricF for highlighting. 127 | - Fix for when both types self-reported illness codes were provided. (Incorrect subsetting to just those codes provided after pivoting the long object.) 128 | 129 | 130 | # ukbrapR v0.2.5 (07 September 2024) 131 | 132 | ### Bug fix 133 | - When getting the date first cancer registry diagnosis, some rows were duplicated. This is now fixed so only one row per participant (the date first for any matched cancer ICD10) is returned. 134 | 135 | 136 | # ukbrapR v0.2.4 (05 September 2024) 137 | 138 | ### Changes 139 | - Updated internal paths for my servers `indy` and `snow` (for ongoing projects whilst we can still use local files...) 140 | - Updated how `get_diagnoses()` and `get_df()` handle a user-provided `file_paths` object 141 | 142 | 143 | # ukbrapR v0.2.3 (22 August 2024) 144 | 145 | ### Bug fixes 146 | - Fix for issue #8. In moving the HES ICD10 code block below the cancer registry code I acctidently put it within the `if (get_canreg) { }` condition. Thanks to @LauricF for highlighting. 147 | - Fix bullet points in pkgdown version of docs 148 | 149 | 150 | # ukbrapR v0.2.2 (21 August 2024) 151 | 152 | ### Update 153 | - The HESIN diagnosis search can now also include ICD9 codes in the provided codes data frame. These use fuzzy matching (similar to the ICD10s) so that searching for "280" also returns "2809" etc 154 | 155 | 156 | # ukbrapR v0.2.1 (10 August 2024) 157 | 158 | ### Bug fix 159 | - Fix for issue #5. The file paths for exported tables were not correctly specified in later calls of `get_diagnoses()` when the working directory is not the home directory. Thanks to @LauricF for highlighting. 160 | 161 | 162 | # ukbrapR v0.2.0 (30 July 2024) 163 | 164 | This is a major update as I move away from using Spark as the default environment, mostly due to the cost implications; it is significantly cheaper (and quicker!) to store and search exported raw text files in the RAP persistant storage than do everything in a Spark environment (plus the added benefit that the RStudio interface is available in "normal" instances). 165 | 166 | The Spark functions are available as before but all updates are to improve functionality in "normal" instances using RStudio, as we move to the new era of RAP-only UK Biobank analysis. 167 | 168 | ### Changes 169 | - Added internal data frame containing default paths for exported files in a RAP project (view with `ukbrapR:::ukbrapr_paths`) 170 | - Added function `export_tables()` which only needs to be run once when a new project is created. This submits the required table exporter commands to extract each of the tables in `ukbrapR:::ukbrapr_paths`. This can take ~15 minutes to export all the tables. ~10Gb of text files are created. This will cost ~£0.15 per month to store in the RAP standard storage. 171 | - `get_emr()` is split into two primary underlying functions: `get_emr_spark()` which has not changed, and `get_emr()` which is the "new way" (i.e., `get_emr_local()` is entirely removed) 172 | - Added functionality for `hesin_oper` (HES OPCS operations) searching for ICD10 codes in `get_emr()` 173 | - New/updated internal functions `get_cancer_registry()` asceratains cases using ICD10s in the `cancer_registry` data, and works much the same as `get_selfrep_illness()` 174 | - New function `get_diagnoses()` is a wrapper to get HES diagnosis, operations, cause of death, GP, cancer registry, and self-reported illness data -- i.e., once function to provide all codes to, and return all health-related data 175 | - `get_df()` takes all output from `get_diagnoses()` i.e., now also identifies date of first in matched `cancer_registry` and `hesin_oper` entries, in addition to `hes_diag`, `gp_clinical`, `death_cause` and `selfrep_illness` as before. 176 | - When getting "date first" using `get_df()` the baseline data is used to create binary case/control variables (for ever and prevalent), and for controls the censoring date is included in the overall `_df` variable (default is 30-10-2022). 177 | 178 | **To make it absolutely clear:** the Spark function `get_emr_spark()` has not been updated but I am no longer focussed on doing things this way. If you want to submit Pull Requests to improve functions please do. The below changes are to substantially improve the experience of using exported tables in the RAP environment only (if you have all the data on a local system already it will work, assuming you format correctly and provide the paths, but the RAP is the future). 179 | 180 | 181 | # ukbrapR v0.1.7 (28 July 2024) 182 | 183 | ### Bug fixes 184 | - Fix Spark database error when >1 dataset file is available. Fixes issue #3 185 | 186 | 187 | # ukbrapR v0.1.6 (03 July 2024) 188 | 189 | ### Bug fixes 190 | - Fix `get_df()` error when ascertaining GP diagnoses if 7-character codes were provided rather than 5 191 | 192 | ### Changes 193 | - `get_emr()` now accepts option "file_paths" - if not provided, attempts to get from Spark 194 | - Improve documentation and examples 195 | 196 | 197 | # ukbrapR v0.1.5 (01 July 2024) 198 | 199 | ### Bug fixes 200 | - Fix `get_df()` error occurring when not all sources are desired 201 | 202 | ### Changes 203 | - `get_emr_local()` option "local_paths" is now "file_paths" 204 | - Improve documentation and examples 205 | 206 | 207 | # ukbrapR v0.1.4 (12 June 2024) 208 | 209 | ### Bug fixes 210 | - Fix problem identifying ICD10 column name in RAP HESIN 211 | - Fix problem getting date first for GP data (excluding missing dates before summarizing) 212 | 213 | 214 | # ukbrapR v0.1.3 (8 June 2024) 215 | 216 | ### New feature 217 | - It is quicker/easier to ascertain multiple conditions at once to supply `get_emr()` with all the codes (as before), but now can use `get_df()` with option "group_by" to indicate the condition names in the `codes_df` object provided. See documentation. 218 | 219 | ### Changes 220 | - It is no longer possible to provide custom names for the `codes_df` to `get_emr()` -- these now must be `vocab_id` and `code` -- makes things much simpler. 221 | - Remove ICD9 code from `codes_df_hh` example as these are not currently used 222 | 223 | 224 | # ukbrapR v0.1.2 (6 June 2024) 225 | 226 | ### New features 227 | - New function `get_emr_local()`. If the user has text files for `hesin_diag` and `gp_clinical` etc. these can be searched (rather than Apache Spark queries). This therefore can work on "normal" DNAnexus nodes, or local servers. Most downstream functions also do not rely on Spark clusters if data extracts are available. 228 | 229 | ### Changes 230 | - Change URL to reflect my GitHub username change from `lukepilling` to `lcpilling` to be more consistent between different logins, websites, and social media 231 | -- https://lcpilling.github.io/ukbrapR 232 | -- https://github.com/lcpilling/ukbrapR 233 | - Added dependency {cli} for improved alert/error reporting 234 | 235 | 236 | # ukbrapR v0.1.1 (6 March 2024) 237 | 238 | ### New features 239 | - New argument "prefix" for `get_df()` - user can provide a string to prefix to the output variable names 240 | 241 | 242 | # ukbrapR v0.1.0 (21 Feb 2024) 243 | 244 | ### New features 245 | - `get_selfrep_illness()` - gets illness information from self-report fields. Derives a "date first" from the age/year reported, incorporating all visits for the participant 246 | - Two example code lists are incuded: `codes_df_ckd` (GEMINI CKD), and `codes_df_hh` (haemochromatosis, with self-report) 247 | 248 | ### Changes 249 | - `get_emr_df()` is re-named `get_df()` to reflect it can now include information from self-reported illness 250 | - `get_emr_diagnoses()` is re-named `get_emr()` to reflect it actually retrieves any record in `gp_clinical` not just diagnoses (e.g., BMI if appropriate codes provided) 251 | 252 | ### Bug fixes 253 | - So many 254 | 255 | 256 | # ukbrapR v0.0.2 (14 Nov 2023) 257 | 258 | ### New features 259 | - `get_emr_diagnoses()` - function to get electronic medical records diagnoses from Spark-based death records, hospital episode statistics, and primary care (GP) databases. 260 | - `get_emr_df()` - function to get date first diagnosed with any provided code from any above Electronic Medical Record source. 261 | 262 | ### Bug fixes 263 | - Extra input checking in `get_rap_phenos()` and output more consistent for direct use with `get_emr_*()` functions 264 | - Updated URL for example CKD clinical codes 265 | 266 | 267 | # ukbrapR v0.0.1 (26 Oct 2023) 268 | 269 | Initial release containing two functions: 270 | - `get_rap_phenos()` 271 | - `upload_to_rap()` 272 | 273 | -------------------------------------------------------------------------------- /R/export_tables.R: -------------------------------------------------------------------------------- 1 | #' Export diagnosis files to RAP persistent storage 2 | #' 3 | #' @description In the UK Biobank RAP export tables for HES, GP, death, and cancer registry data, plus self-reported illness fields, using the table-exporter. This is essentially a wrapper function to submit jobs to the table exporter. 4 | #' 5 | #' Suggest executing in an RStudio session. ~10Gb of text files are created. This will cost ~£0.15 per month to store in the RAP standard storage. 6 | #' 7 | #' @return NA 8 | #' 9 | #' @author Luke Pilling 10 | #' 11 | #' @name export_tables 12 | #' 13 | #' @param submit Logical. Actually submit `dx` commands. Default is FALSE i.e., just check inputs & file paths, then print the commands, 14 | #' \code{default=FALSE} 15 | #' @param ignore_warnings Logical. If an exported table already exists do not submit the table-exporter command unless this is TRUE, 16 | #' \code{default=FALSE} 17 | #' @param file_paths A data frame. Columns must be `object` and `path` containing paths to outputted files. If blank, will use the default paths, 18 | #' \code{default=ukbrapR:::ukbrapr_paths} 19 | #' @param dataset A string. If you wish to specify dataset. If blank, will use the most recently dispensed dataset in the main project directory. 20 | #' \code{default=app#####_#####.dataset} 21 | #' @param verbose Logical. Be verbose, 22 | #' \code{default=FALSE} 23 | #' 24 | #' @examples 25 | #' 26 | #' # To keep files organised this package assumes the following file structure 27 | #' # This object is not actually required but illstrates the defaults to be 28 | #' # created in your RAP space (override by providing a new `file_paths`): 29 | #' ukbrapr_paths = data.frame( 30 | #' object=c("death","death_cause","hesin","hesin_diag","hesin_oper","gp_clinical","gp_scripts","selfrep_illness","cancer_registry","baseline_dates"), 31 | #' path=c( 32 | #' "ukbrapr_data/death.tsv", 33 | #' "ukbrapr_data/death_cause.tsv", 34 | #' "ukbrapr_data/hesin.tsv", 35 | #' "ukbrapr_data/hesin_diag.tsv", 36 | #' "ukbrapr_data/hesin_oper.tsv", 37 | #' "ukbrapr_data/gp_clinical.tsv", 38 | #' "ukbrapr_data/gp_scripts.tsv", 39 | #' "ukbrapr_data/selfrep_illness.tsv", 40 | #' "ukbrapr_data/cancer_registry.tsv", 41 | #' "ukbrapr_data/baseline_dates.tsv" 42 | #' ) 43 | #' ) 44 | #' ukbrapr_paths 45 | #' 46 | #' # test run to see `dx run table-exporter` commands - but will not submit jobs 47 | #' export_tables() 48 | #' 49 | #' # Submit all `dx run table-exporter` commands. ~10Gb of text files are created. This will cost ~£0.15 per month to store in the RAP standard storage. 50 | #' export_tables(submit=TRUE) 51 | #' 52 | #' @export 53 | #' 54 | export_tables <- function( 55 | submit = FALSE, 56 | ignore_warnings = FALSE, 57 | file_paths = ukbrapR:::ukbrapr_paths, 58 | dataset = NULL, 59 | verbose = FALSE 60 | ) { 61 | 62 | # is this just a test? (Will not actually run any dx commands) 63 | if (!submit) { 64 | cli::cli_alert_info("Test run. Will not submit any {.code dx} system commands") 65 | ignore_warnings = TRUE 66 | } 67 | 68 | # do any files already exist? Warn if so! 69 | for (fp in file_paths$path) { 70 | if (file.exists(stringr::str_c("/mnt/project/", fp))) { 71 | if (ignore_warnings) cli::cli_alert_danger("File already exists on RAP at {.path {fp}}.") 72 | if (!ignore_warnings) cli::cli_abort("File already exists on RAP at {.path {fp}}.") 73 | } 74 | } 75 | 76 | # if output directory does not exist in the RAP, create it 77 | fp_dir = file_paths$path[1] 78 | if (stringr::str_detect(fp_dir, "/")) { 79 | fp_dir = stringr::str_split(fp_dir, "/")[[1]][1] 80 | if (!dir.exists(stringr::str_c("/mnt/project/", fp_dir))) { 81 | if (verbose) cli::cli_alert("Creating output directory in your RAP storage space: {.code dx mkdir {fp_dir}}") 82 | if (submit) system(stringr::str_c("dx mkdir ", fp_dir)) 83 | } 84 | } 85 | 86 | # get dataset id 87 | if (is.null(dataset)) { 88 | dataset = system("dx describe /*dataset | grep app | awk -F ' ' '{print $2}' | sort | tail -n 1", intern = TRUE) 89 | } 90 | if (verbose) cli::cli_alert_info("Using dataset {.file {dataset}}") 91 | 92 | # run table exporter commands 93 | ukbrapR:::export_tables_emr(dataset=dataset, submit=submit, verbose=verbose) 94 | ukbrapR:::export_tables_selfrep_illness(dataset=dataset, submit=submit, verbose=verbose) 95 | ukbrapR:::export_tables_cancer_registry(dataset=dataset, submit=submit, verbose=verbose) 96 | ukbrapR:::export_tables_baseline_info(dataset=dataset, submit=submit, verbose=verbose) 97 | 98 | cli::cli_alert_success("Submitted all table-exporter jobs.") 99 | cli::cli_alert_info("Can take ~15mins to complete.") 100 | cli::cli_alert_info("Files will be saved to `ukbrapr_data` directory in your RAP project presistent storage space.") 101 | cli::cli_alert_warning("~10Gb of text files are created. This will cost ~£0.15 per month to store in the RAP standard storage.") 102 | 103 | # was this just a test? 104 | if (!submit) { 105 | cli::cli_abort("This was a test run. No {.code dx table-exporter} commands were submitted. Re-run with {.code submit = TRUE} to submit.") 106 | } 107 | 108 | } 109 | 110 | 111 | #' Export tables for HES, GP, and death 112 | #' 113 | #' @description In the UK Biobank RAP export tables for HES, GP, and death using the table-exporter 114 | #' 115 | #' @return NA 116 | #' 117 | #' @author Luke Pilling 118 | #' 119 | #' @name export_tables_emr 120 | #' 121 | #' @noRd 122 | export_tables_emr <- function( 123 | dataset = NULL, 124 | submit = FALSE, 125 | verbose = FALSE 126 | ) { 127 | 128 | cli::cli_alert("Export tables: Electronic Medical Records") 129 | 130 | # get dataset id 131 | if (is.null(dataset)) { 132 | dataset = list.files("/mnt/project") |> stringr::str_subset(".dataset") 133 | dataset = dataset[1] 134 | } 135 | 136 | # submit table-exporter 137 | if (verbose) cli::cli_alert("dx run table-exporter for 'death'") 138 | table_exporter_command = stringr::str_c("dx run table-exporter -idataset_or_cohort_or_dashboard=", dataset, " -ientity='death' -ioutput='death' -ioutput_format='TSV' -icoding_option='RAW' --destination /ukbrapr_data/ --brief --yes") 139 | if (verbose|submit) cli::cli_text(table_exporter_command) 140 | if (submit) system(table_exporter_command) 141 | 142 | if (verbose) cli::cli_alert("dx run table-exporter for 'death_cause'") 143 | table_exporter_command = stringr::str_c("dx run table-exporter -idataset_or_cohort_or_dashboard=", dataset, " -ientity='death_cause' -ioutput='death_cause' -ioutput_format='TSV' -icoding_option='RAW' --destination /ukbrapr_data/ --brief --yes") 144 | if (verbose|submit) cli::cli_text(table_exporter_command) 145 | if (submit) system(table_exporter_command) 146 | 147 | if (verbose) cli::cli_alert("dx run table-exporter for 'hesin'") 148 | table_exporter_command = stringr::str_c("dx run table-exporter -idataset_or_cohort_or_dashboard=", dataset, " -ientity='hesin' -ioutput='hesin' -ioutput_format='TSV' -icoding_option='RAW' --destination /ukbrapr_data/ --brief --yes") 149 | if (verbose|submit) cli::cli_text(table_exporter_command) 150 | if (submit) system(table_exporter_command) 151 | 152 | if (verbose) cli::cli_alert("dx run table-exporter for 'hesin_diag'") 153 | table_exporter_command = stringr::str_c("dx run table-exporter -idataset_or_cohort_or_dashboard=", dataset, " -ientity='hesin_diag' -ioutput='hesin_diag' -ioutput_format='TSV' -icoding_option='RAW' --destination /ukbrapr_data/ --brief --yes") 154 | if (verbose|submit) cli::cli_text(table_exporter_command) 155 | if (submit) system(table_exporter_command) 156 | 157 | if (verbose) cli::cli_alert("dx run table-exporter for 'hesin_oper'") 158 | table_exporter_command = stringr::str_c("dx run table-exporter -idataset_or_cohort_or_dashboard=", dataset, " -ientity='hesin_oper' -ioutput='hesin_oper' -ioutput_format='TSV' -icoding_option='RAW' --destination /ukbrapr_data/ --brief --yes") 159 | if (verbose|submit) cli::cli_text(table_exporter_command) 160 | if (submit) system(table_exporter_command) 161 | 162 | if (verbose) cli::cli_alert("dx run table-exporter for 'gp_clinical'") 163 | table_exporter_command = stringr::str_c("dx run table-exporter -idataset_or_cohort_or_dashboard=", dataset, " -ientity='gp_clinical' -ioutput='gp_clinical' -ioutput_format='TSV' -icoding_option='RAW' --destination /ukbrapr_data/ --brief --yes") 164 | if (verbose|submit) cli::cli_text(table_exporter_command) 165 | if (submit) system(table_exporter_command) 166 | 167 | if (verbose) cli::cli_alert("dx run table-exporter for 'gp_scripts'") 168 | table_exporter_command = stringr::str_c("dx run table-exporter -idataset_or_cohort_or_dashboard=", dataset, " -ientity='gp_scripts' -ioutput='gp_scripts' -ioutput_format='TSV' -icoding_option='RAW' --destination /ukbrapr_data/ --brief --yes") 169 | if (verbose|submit) cli::cli_text(table_exporter_command) 170 | if (submit) system(table_exporter_command) 171 | 172 | } 173 | 174 | 175 | 176 | 177 | #' Extract self-reported illess fields 178 | #' 179 | #' @description In the UK Biobank RAP, submit a table-exporter job to extract the self-reported illness fields: 180 | #' 181 | #' @return NA 182 | #' 183 | #' @author Luke Pilling 184 | #' 185 | #' @name export_tables_selfrep_illness 186 | #' 187 | #' @noRd 188 | export_tables_selfrep_illness <- function( 189 | n_cancer_arrays = 5, 190 | n_noncancer_arrays = 30, 191 | n_instances = 3, 192 | dataset = NULL, 193 | submit = FALSE, 194 | verbose = FALSE 195 | ) { 196 | 197 | cli::cli_alert("Export table: Self-reported Illness fields") 198 | 199 | # RAP stores arrays as separate variables 200 | if (verbose) cli::cli_alert("Determine field names to request") 201 | if (verbose) cli::cli_alert(c("n_instances = ", n_instances)) 202 | if (verbose) cli::cli_alert(c("n_cancer_arrays = ", n_cancer_arrays)) 203 | if (verbose) cli::cli_alert(c("n_noncancer_arrays = ", n_noncancer_arrays)) 204 | 205 | # Determine variable names needed (depends if cancer or non-cancer) 206 | # will use 20001 (cancer code) and 20002 (non-cancer code) 207 | # will use the interpolated year (20006 = cancer year, 20008 = non-cancer year) 208 | 209 | # get field names 210 | names = "eid" 211 | 212 | # phenotypes 213 | # cancer code = 20001 214 | # cancer year = 20006 215 | for (p in c(20001, 20006)) { 216 | 217 | # instances 0:n_instances 218 | for (i in c(0:n_instances)) { 219 | 220 | # cancer arrays 221 | for (a in c(0:n_cancer_arrays)) names <- c(names, stringr::str_c("p", p, "_i", i, "_a", a)) 222 | 223 | } 224 | } 225 | 226 | # phenotypes 227 | # non-cancer illness code = 20002 228 | # non-cancer illness year = 20008 229 | for (p in c(20002, 20008)) { 230 | 231 | # instances 0:3 232 | for (i in c(0:n_instances)) { 233 | 234 | # non-cancer arrays 235 | for (a in c(0:n_noncancer_arrays)) names <- c(names, stringr::str_c("p", p, "_i", i, "_a", a)) 236 | 237 | } 238 | } 239 | 240 | if (verbose) print(names) 241 | 242 | # save and upload names file 243 | readr::write_tsv(data.frame(names), "fieldnames_selfrep_illness.txt", col_names=FALSE) 244 | if (submit) ukbrapR::upload_to_rap("fieldnames_selfrep_illness.txt", dir="/ukbrapr_data") 245 | 246 | # get dataset id 247 | if (is.null(dataset)) { 248 | dataset = list.files("/mnt/project") |> stringr::str_subset(".dataset") 249 | dataset = dataset[1] 250 | } 251 | 252 | # submit table-exporter 253 | table_exporter_command = stringr::str_c("dx run table-exporter -idataset_or_cohort_or_dashboard=", dataset, " -ifield_names_file_txt='/ukbrapr_data/fieldnames_selfrep_illness.txt' -ientity='participant' -ioutput='selfrep_illness' -ioutput_format='TSV' -icoding_option='RAW' --destination /ukbrapr_data/ --brief --yes") 254 | if (verbose|submit) cli::cli_text(table_exporter_command) 255 | if (submit) system(table_exporter_command) 256 | 257 | 258 | } 259 | 260 | 261 | 262 | #' Extract cancer registry fields 263 | #' 264 | #' @description In the UK Biobank RAP, submit a table-exporter job to extract the cancer registry fields: 265 | #' 266 | #' @return NA 267 | #' 268 | #' @author Luke Pilling 269 | #' 270 | #' @name export_tables_cancer_registry 271 | #' 272 | #' @noRd 273 | export_tables_cancer_registry <- function( 274 | n_cancer_arrays = 21, 275 | dataset = NULL, 276 | submit = FALSE, 277 | verbose = FALSE 278 | ) { 279 | 280 | cli::cli_alert("Export table: Cancer Registry fields") 281 | 282 | # RAP stores arrays as separate variables 283 | if (verbose) cli::cli_alert("Determine field names to request") 284 | if (verbose) cli::cli_alert(c("n_cancer_arrays = ", n_cancer_arrays)) 285 | # date vars = 40005 286 | # cancer vars = 40006 287 | # age vars = 40008 288 | # histology vars = 40011 289 | # behaviour vars = 40012 290 | 291 | # get field names 292 | names = "eid" 293 | 294 | # phenotypes 295 | for (p in c(40005, 40006, 40008, 40011, 40012)) { 296 | 297 | # instances 0:n_instances 298 | for (i in c(0:n_cancer_arrays)) names <- c(names, stringr::str_c("p", p, "_i", i)) 299 | 300 | } 301 | 302 | if (verbose) print(names) 303 | 304 | # save and upload names file 305 | readr::write_tsv(data.frame(names), "fieldnames_cancer_registry.txt", col_names=FALSE) 306 | if (submit) ukbrapR::upload_to_rap("fieldnames_cancer_registry.txt", dir="/ukbrapr_data") 307 | 308 | # get dataset id 309 | if (is.null(dataset)) { 310 | dataset = list.files("/mnt/project") |> stringr::str_subset(".dataset") 311 | dataset = dataset[1] 312 | } 313 | 314 | # submit table-exporter 315 | table_exporter_command = stringr::str_c("dx run table-exporter -idataset_or_cohort_or_dashboard=", dataset, " -ifield_names_file_txt='/ukbrapr_data/fieldnames_cancer_registry.txt' -ientity='participant' -ioutput='cancer_registry' -ioutput_format='TSV' -icoding_option='RAW' --destination /ukbrapr_data/ --brief --yes") 316 | if (verbose|submit) cli::cli_text(table_exporter_command) 317 | if (submit) system(table_exporter_command) 318 | 319 | } 320 | 321 | 322 | 323 | #' Extract useful baseline date fields and save to file 324 | #' 325 | #' @description In the UK Biobank RAP, submit a table-exporter job to extract some useful baseline fields: 326 | #' 327 | #' 1) date of baseline assessment [p53_i*], 2) month of birth [p52], 3) year of birth [p34], and 4) sex [p31] 328 | #' 329 | #' @return NA 330 | #' 331 | #' @author Luke Pilling 332 | #' 333 | #' @name export_tables_baseline_info 334 | #' 335 | #' @noRd 336 | export_tables_baseline_info <- function( 337 | dataset = NULL, 338 | submit = FALSE, 339 | verbose = FALSE 340 | ) { 341 | 342 | cli::cli_alert("Export table: Baseline Info fields") 343 | 344 | # get dataset id 345 | if (is.null(dataset)) { 346 | dataset = list.files("/mnt/project") |> stringr::str_subset(".dataset") 347 | dataset = dataset[1] 348 | } 349 | 350 | # baseline fields to export 351 | fields <- c( 352 | "eid", 353 | "p53_i0", "p53_i1", "p53_i2", "p53_i3", # assessment dates 354 | "p54_i0", "p54_i1", "p54_i2", "p54_i3", # assessment centres 355 | "p52", "p34", # month and year of birth 356 | "p31" # sex 357 | ) 358 | readr::write_tsv(data.frame(fields), "fieldnames_baseline_info.txt", col_names=FALSE) 359 | if (submit) ukbrapR::upload_to_rap("fieldnames_baseline_info.txt", dir="/ukbrapr_data") 360 | 361 | # submit table-exporter 362 | table_exporter_command = stringr::str_c("dx run table-exporter -idataset_or_cohort_or_dashboard=", dataset, " -ifield_names_file_txt='/ukbrapr_data/fieldnames_baseline_info.txt' -ientity='participant' -ioutput='baseline_dates' -ioutput_format='TSV' -icoding_option='RAW' --destination /ukbrapr_data/ --brief --yes") 363 | if (verbose|submit) cli::cli_text(table_exporter_command) 364 | if (submit) system(table_exporter_command) 365 | 366 | } 367 | 368 | -------------------------------------------------------------------------------- /R/get_diagnoses.R: -------------------------------------------------------------------------------- 1 | #' Get UK Biobank participant diagnosis data 2 | #' 3 | #' @description For a list of diagnostic codes get the HES, GP, cancer registry, operations, and self-reported illness data, matching the provided codes. 4 | #' 5 | #' Valid code vocabularies are: 6 | #' 7 | #' - ICD10 (for `hesin`, `death_cause` and `cancer_registry` searches) - fuzzy matching 8 | #' 9 | #' - ICD9 (for `hesin` searches) - fuzzy matching 10 | #' 11 | #' - Read2 / CTV3 (for `gp_clinical`) - exact matches on first 5 characters 12 | #' 13 | #' - OPCS3 / OPCS4 (for `hesin_oper`) - fuzzy matching 14 | #' 15 | #' - ukb_cancer / ukb_noncancer (for self-reported illness at UK Biobank assessments - all available will be searched) - exact matches 16 | #' 17 | #' This function relies on exported raw data files and thus does not need to be run in a Spark cluster. If the files are not in the default locations for the package you will need to specify the `file_paths` to exported tables. Recommend to run `export_tables()` once in your project to export the tables to the default paths for the package. 18 | #' 19 | #' @return Returns a list of data frames (the participant data for the requested diagnosis codes: `death_cause`, `hesin_diag`, `hesin_oper`, `gp_clinical`, `cancer_registry` and `selfrep_illness`. Also includes the original codes list) 20 | #' 21 | #' @author Luke Pilling 22 | #' 23 | #' @name get_diagnoses 24 | #' 25 | #' @param codes_df A data frame. Contains two columns: `code` and `vocab_id` i.e., a list of diagnostic codes, and an indicator of the vocabulary (ICD10, Read2, CTV3, OPCS3, OPCS4, ukb_cancer, and ukb_noncancer are recognised). Other columns are ignored. 26 | #' @param file_paths A data frame. Columns must be `object` and `path` containing paths to required files. Default assumes you have the tables exported in the RAP environment from 27 | #' ukbrapR::export_tables() 28 | #' \code{default=ukbrapR:::ukbrapr_paths} 29 | #' @param verbose Logical. Be verbose, 30 | #' \code{default=FALSE} 31 | #' 32 | #' @examples 33 | #' # example diagnostic codes for CKD from GEMINI multimorbidity project 34 | #' codes_df_ckd <- ukbrapR:::codes_df_ckd 35 | #' head(codes_df_ckd) 36 | #' 37 | #' # Get diagnosis data - returns list of data frames (one per source) 38 | #' # -- Requires exported tables - see `export_tables()` 39 | #' diagnosis_list <- get_diagnoses(codes_df_ckd) 40 | #' 41 | #' # don't forget to save and upload data to RAP persistent storage! 42 | #' save(diagnosis_list, "ukbrap.CKD.emr.20231114.RDat") 43 | #' upload_to_rap(file="ukbrap.CKD.*", dir="") 44 | #' 45 | #' @export 46 | #' 47 | get_diagnoses <- function( 48 | codes_df, 49 | file_paths = NULL, 50 | verbose = FALSE 51 | ) { 52 | 53 | v <- packageVersion("ukbrapR") 54 | cli::cli_alert_info("ukbrapR v{v}") 55 | 56 | start_time <- Sys.time() 57 | 58 | vocab_col = "vocab_id" 59 | codes_col = "code" 60 | 61 | # Check input 62 | if (verbose) cli::cli_alert("Checking inputs (codes, file paths, etc)") 63 | if (! any(class(codes_df) %in% c("data.frame","tbl","tbl_df"))) { 64 | cli::cli_abort(c( 65 | "{.var codes_df} must be a data.frame or tibble", 66 | "x" = "You've supplied a {.cls {class(codes_df)}} vector." 67 | )) 68 | } 69 | codes_df = as.data.frame(codes_df) # in case a tibble 70 | 71 | if (! vocab_col %in% colnames(codes_df)) stop("Codelist data frame needs to include vocabulary column `vocab_id`") 72 | if (! codes_col %in% colnames(codes_df)) stop("Codelist data frame needs to include codes column `code`") 73 | 74 | if (! any(c("ICD10","ICD9","Read2","CTV3","OPCS3","OPCS4","ukb_cancer","ukb_noncancer") %in% codes_df[,vocab_col])) stop("Vocabularies need to include at least one of ICD10, ICD9, Read2, CTV3, OPCS3, OPCS4, ukb_cancer, or ukb_noncancer") 75 | 76 | # if file_paths not provided assume default paths 77 | if (is.null(file_paths)) file_paths = ukbrapR:::ukbrapr_paths 78 | 79 | 80 | ######################################################################################################### 81 | # 82 | # check codes provided, determine what datasets we are going to search 83 | 84 | # Check code lists - only first 5 digits are used by UK Biobank 85 | cli::cli_alert("Checking provided codes (remember only the first 5 characters are used)") 86 | get_icd10 <- FALSE 87 | get_icd9 <- FALSE 88 | get_canreg <- FALSE 89 | get_gp <- FALSE 90 | get_oper <- FALSE 91 | get_selfrep <- FALSE 92 | ICD10s <- "" 93 | ICD9s <- "" 94 | Read2s <- "" 95 | CTV3s <- "" 96 | OPCS3s <- "" 97 | OPCS4s <- "" 98 | 99 | # throw error if any provided code has length 0 100 | if (any(stringr::str_length(codes_df[,codes_col]) == 0)) { 101 | cli::cli_abort("Blank code provided. Check your input codes lists to avoid unexpected matches.") 102 | } 103 | 104 | # warn if any provided code has length 1 105 | if (any(stringr::str_length(codes_df[,codes_col]) == 1)) { 106 | cli::cli_warn("Some provided code(s) have length 1. Check your input codes lists and matched outputs to avoid unexpected matches.") 107 | } 108 | 109 | # function to check for hyphens and abort if any provided 110 | # (suggests they want a range of codes. Safer to abort at ask the user to explicitly provide the codes to search for) 111 | hyphen_check <- function(codes, vocab) { 112 | if (any(stringr::str_detect(codes, "-"))) { 113 | codes = codes[ stringr::str_detect(codes, "-") ] 114 | cli::cli_abort(c( 115 | "{vocab} codes cannot contain hyphens, as this suggests you want a range of codes.", 116 | "x" = "Remove hyphens and provide the specific {vocab} codes you wish to ascertain (safer)." 117 | )) 118 | } 119 | } 120 | 121 | # get ICD10s. Remove "." dot characters. First 5 characters only. 122 | if (any(codes_df[,vocab_col] == "ICD10")) { 123 | get_icd10 <- TRUE 124 | ICD10s <- codes_df |> 125 | dplyr::filter(!!rlang::sym(vocab_col) == "ICD10") |> 126 | dplyr::select(!!rlang::sym(codes_col)) |> 127 | dplyr::pull() |> 128 | unique() |> 129 | stringr::str_remove(stringr::fixed(".")) |> 130 | stringr::str_sub(1, 5) 131 | cat(" - N unique ICD10 codes:", length(ICD10s), "\n") 132 | hyphen_check(ICD10s, "ICD10") 133 | if (any(stringr::str_starts(ICD10s, "C"))) get_canreg <- TRUE 134 | } 135 | 136 | # get ICD9s. Remove "." dot characters. First 5 characters only. 137 | if (any(codes_df[,vocab_col] == "ICD9")) { 138 | get_icd9 <- TRUE 139 | ICD9s <- codes_df |> 140 | dplyr::filter(!!rlang::sym(vocab_col) == "ICD9") |> 141 | dplyr::select(!!rlang::sym(codes_col)) |> 142 | dplyr::pull() |> 143 | unique() |> 144 | stringr::str_remove(stringr::fixed(".")) |> 145 | stringr::str_sub(1, 5) 146 | hyphen_check(ICD9s, "ICD9") 147 | cat(" - N unique ICD9 codes:", length(ICD9s), "\n") 148 | } 149 | 150 | # get Read2 and CTV3s. First 5 characters only. 151 | gp_codes = NULL 152 | if (any(codes_df[,vocab_col] == "Read2")) { 153 | get_gp <- TRUE 154 | Read2s <- codes_df[ codes_df[,vocab_col] == "Read2" , codes_col ] 155 | Read2s <- stringr::str_sub(Read2s, 1, 5) |> unique() 156 | gp_codes <- Read2s 157 | cat(" - N unique Read2 codes:", length(Read2s), "\n") 158 | hyphen_check(Read2s, "Read2") 159 | # any <5 characters? 160 | nchar_Read2s = nchar(Read2s) 161 | nchar_Read2s_n5 = length(nchar_Read2s[ nchar_Read2s < 5 ]) 162 | if (nchar_Read2s_n5 > 0) { 163 | cli::cli_abort(c( 164 | "i" = "Read2 codes must be at least 5 characters in length", 165 | "x" = "There {?is/are} {nchar_Read2s_n5} Read2 code{?s} < 5 characters." 166 | )) 167 | } 168 | } 169 | if (any(codes_df[,vocab_col] == "CTV3")) { 170 | get_gp <- TRUE 171 | CTV3s <- codes_df[ codes_df[,vocab_col] == "CTV3" , codes_col ] 172 | CTV3s <- stringr::str_sub(CTV3s, 1, 5) |> unique() 173 | if (is.null(gp_codes)) { 174 | gp_codes <- CTV3s 175 | } else { 176 | gp_codes <- c(gp_codes, CTV3s) 177 | } 178 | cat(" - N unique CTV3 codes:", length(CTV3s), "\n") 179 | hyphen_check(CTV3s, "CTV3") 180 | # any <5 characters? 181 | nchar_CTV3s = nchar(CTV3s) 182 | nchar_CTV3s_n5 = length(nchar_CTV3s[ nchar_CTV3s < 5 ]) 183 | if (nchar_CTV3s_n5 > 0) { 184 | cli::cli_abort(c( 185 | "i" = "CTV3 codes must be at least 5 characters in length", 186 | "x" = "There {?is/are} {nchar_CTV3s_n5} CTV3 code{?s} < 5 characters.", 187 | )) 188 | } 189 | } 190 | 191 | # get OPCS codes? Remove "." dot characters. First 5 characters only. 192 | oper_codes = NULL 193 | if (any(codes_df[,vocab_col] == "OPCS3")) { 194 | get_oper <- TRUE 195 | OPCS3s <- codes_df |> 196 | dplyr::filter(!!rlang::sym(vocab_col) == "OPCS3") |> 197 | dplyr::select(!!rlang::sym(codes_col)) |> 198 | dplyr::pull() |> 199 | unique() |> 200 | stringr::str_remove(stringr::fixed(".")) |> 201 | stringr::str_sub(1, 5) 202 | cat(" - N unique OPCS3 codes:", length(OPCS3s), "\n") 203 | hyphen_check(OPCS3s, "OPCS3") 204 | oper_codes = OPCS3s 205 | } 206 | if (any(codes_df[,vocab_col] == "OPCS4")) { 207 | get_oper <- TRUE 208 | OPCS4s <- codes_df |> 209 | dplyr::filter(!!rlang::sym(vocab_col) == "OPCS4") |> 210 | dplyr::select(!!rlang::sym(codes_col)) |> 211 | dplyr::pull() |> 212 | unique() |> 213 | stringr::str_remove(stringr::fixed(".")) |> 214 | stringr::str_sub(1, 5) 215 | cat(" - N unique OPCS4 codes:", length(OPCS4s), "\n") 216 | hyphen_check(OPCS4s, "OPCS4") 217 | oper_codes = c(oper_codes, OPCS4s) 218 | } 219 | 220 | # check for self-reported codes 221 | n_selfrep = length(unique(codes_df[codes_df[,vocab_col] %in% c("ukb_cancer","ukb_noncancer"),codes_col])) 222 | if (n_selfrep>0) { 223 | get_selfrep <- TRUE 224 | cat(" - N unique UKB-self-reported codes:", n_selfrep, "\n") 225 | } 226 | 227 | 228 | ######################################################################################################### 229 | # 230 | # check data is available, download if required 231 | 232 | # Check file paths are provided 233 | if (is.null(file_paths)) cli::cli_abort("Need to provide {.var file_paths}") 234 | if (! any(class(file_paths) %in% c("data.frame","tbl","tbl_df"))) { 235 | cli::cli_abort(c( 236 | "{.var file_paths} must be a data.frame or tibble", 237 | "x" = "You've supplied a {.cls {class(file_paths)}} vector." 238 | )) 239 | } 240 | if (colnames(file_paths)[1] != "object" | colnames(file_paths)[2] != "path") cli::cli_abort("{.var file_paths} needs two columns: `object` and `path`") 241 | 242 | # check all the required files are included 243 | must_include = "baseline_dates" 244 | if (get_icd10) must_include <- c(must_include, c("death","death_cause","hesin","hesin_diag")) 245 | if (get_icd9) must_include <- c(must_include, c("hesin","hesin_diag")) 246 | if (get_canreg) must_include <- c(must_include, c("cancer_registry")) 247 | if (get_gp) must_include <- c(must_include, c("gp_clinical")) 248 | if (get_oper) must_include <- c(must_include, c("hesin_oper")) 249 | if (get_selfrep) must_include <- c(must_include, c("selfrep_illness")) 250 | must_include = unique(must_include) 251 | 252 | for (file in must_include) if (! file %in% file_paths$object) cli::cli_abort("{.var file_paths} must contain {.path {file}}") 253 | 254 | # if files not already downloaded from RAP then download to user's home directory 255 | # only do this if the file paths are to "ukbrapr_data" 256 | files = file_paths$path[file_paths$object %in% must_include] 257 | if (stringr::str_detect(files[1], "ukbrapr_data")) { 258 | 259 | # get path to users home directory - create ukbrapr_data directory on worker (if already exists then nothing happens) 260 | home_path = as.character(Sys.getenv()["HOME"]) 261 | dir.create(stringr::str_c(home_path, "/ukbrapr_data"), showWarnings = FALSE) 262 | 263 | # do any need downloading from RAP? Or already been done? 264 | # each file now has two paths: a RAP path and a local path: 265 | dx_files <- NULL 266 | home_files <- stringr::str_c(home_path, "/", files) 267 | for (ii in 1:length(files)) if (! file.exists(home_files[ii])) dx_files = c(dx_files, files[ii]) 268 | 269 | # if any were missing, download them 270 | if (!is.null(dx_files)) { 271 | 272 | options(cli.progress_show_after = 0) 273 | cli::cli_progress_bar(format = "Downloading {.path {basename(file)}} from the RAP [{cli::pb_current}/{cli::pb_total}] {cli::pb_bar} {cli::pb_percent}", total = length(dx_files)) 274 | 275 | # copy file from RAP space to instance 276 | for (file in dx_files) { 277 | cli::cli_progress_update() 278 | system(stringr::str_c("dx download \"${DX_PROJECT_CONTEXT_ID}:/", file, "\" -o \"", home_path, "/ukbrapr_data\"")) 279 | } 280 | cli::cli_progress_done() 281 | options(cli.progress_show_after = 2) 282 | 283 | if (verbose) cli::cli_alert_info(c("Time taken so far: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}.")) 284 | 285 | } 286 | 287 | # add home directory prefix to file paths 288 | file_paths$path = stringr::str_c(home_path, "/", file_paths$path) 289 | 290 | } 291 | 292 | # check files exist 293 | for (file in file_paths$path[file_paths$object %in% must_include]) if (! file.exists(file)) cli::cli_abort("Could not find file {.path {file}}") 294 | 295 | # 296 | ######################################################################################################### 297 | # 298 | 299 | # Get data for each code vocabulary 300 | if (verbose) cli::cli_alert("Ascertaining codes from long EMR files") 301 | death_cause_tbl <- NULL # ICD10 302 | hesin_diag_tbl <- NULL # ICD10, ICD9 303 | cancer_registry_tbl <- NULL # ICD10 304 | gp_clinical_tbl <- NULL # Read2 / CTV3 305 | hesin_oper_tbl <- NULL # OPCS3 / OPCS4 306 | selfrep_illness_tbl <- NULL # ukb_cancer / ukb_noncancer 307 | 308 | if (get_icd10) { 309 | 310 | # 311 | # death data ########################################### 312 | # 313 | 314 | cli::cli_alert("Ascertaining cause of death data.") 315 | 316 | death_cause_path = file_paths$path[ file_paths$object=="death_cause" ] 317 | 318 | # create search string 319 | search_string <- paste0("grep -E ", sprintf('"%s"', stringr::str_flatten(ICD10s, collapse = "|")), " ", sprintf('%s', death_cause_path)) 320 | if (verbose) cat(" -- search string: ", search_string, "\n") 321 | 322 | # get file headers 323 | headers <- colnames(readr::read_tsv(death_cause_path, n_max=1, show_col_types=FALSE, progress=FALSE)) 324 | if (! "eid" %in% headers) headers[1] <- "eid" 325 | 326 | # use search string to only read lines that matched a code 327 | death_cause_tbl <- suppressWarnings(readr::read_tsv(pipe(search_string), col_names=headers, show_col_types=FALSE, progress=FALSE)) 328 | 329 | # if any matches returned, make sure eid is formatted nicely (remove file name) and the dates are dates 330 | if (nrow(death_cause_tbl)>0) { 331 | # match with date of death data 332 | death_tbl = suppressWarnings(readr::read_tsv(file_paths$path[ file_paths$object=="death" ], show_col_types=FALSE, progress=FALSE)) 333 | if (! "eid" %in% colnames(death_tbl)) colnames(death_tbl)[1] <- "eid" 334 | death_cause_tbl = dplyr::inner_join(death_tbl, death_cause_tbl, by=c("eid"="eid", "ins_index"="ins_index")) 335 | 336 | # format date col if not "Date" 337 | if (!lubridate::is.Date(death_cause_tbl$date_of_death)) death_cause_tbl <- death_cause_tbl |> dplyr::mutate(date_of_death = lubridate::dmy(date_of_death)) 338 | } 339 | 340 | cli::cli_alert_success("Loaded {.var death_cause} with {nrow(death_cause_tbl)} matched rows.") 341 | 342 | if (verbose) cli::cli_alert_info(c("Time taken so far: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}.")) 343 | 344 | # 345 | # cancer registry #################################### 346 | # 347 | 348 | # do any ICD10s start with a C? Skip if not. 349 | if (get_canreg) { 350 | 351 | cli::cli_alert("Ascertaining cancer registry data.") 352 | 353 | # load data 354 | cancer_registry_dat <- suppressWarnings(readr::read_tsv(file_paths$path[ file_paths$object=="cancer_registry" ], show_col_types = FALSE, progress = FALSE)) 355 | 356 | # get cancer registry data for these ICD10s 357 | cancer_registry_tbl <- ukbrapR:::get_cancer_registry(codes = ICD10s, ukb_dat = cancer_registry_dat, verbose = verbose) 358 | cli::cli_alert_success("Loaded {.var cancer_registry} with {nrow(cancer_registry_tbl)} matched rows.") 359 | 360 | rm(cancer_registry_dat) 361 | 362 | if (verbose) cli::cli_alert_info(c("Time taken so far: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}.")) 363 | 364 | } 365 | 366 | # 367 | # HES diagnosis data (ICD10s) ########################################### 368 | # 369 | 370 | cli::cli_alert("Ascertaining HES diagnosis data (ICD10s).") 371 | 372 | hesin_diag_path = file_paths$path[ file_paths$object=="hesin_diag" ] 373 | 374 | # create search string 375 | search_string <- paste0("grep -E ", sprintf('"%s"', stringr::str_flatten(ICD10s, collapse = "|")), " ", sprintf('%s', hesin_diag_path)) 376 | if (verbose) cat(" -- search string: ", search_string, "\n") 377 | 378 | # get file headers 379 | headers <- colnames(readr::read_tsv(hesin_diag_path, n_max=1, show_col_types=FALSE, progress=FALSE)) 380 | if (! "eid" %in% headers) headers[1] <- "eid" 381 | 382 | # use search string to only read lines that matched a code 383 | hesin_diag_tbl <- suppressWarnings(readr::read_tsv(pipe(search_string), col_names=headers, show_col_types=FALSE, progress=FALSE)) 384 | 385 | # if any matches returned, make sure eid is formatted nicely (remove file name), and dates are dates 386 | if (nrow(hesin_diag_tbl)>0) { 387 | # match with HES episode data 388 | hesin_tbl = suppressWarnings(readr::read_tsv(file_paths$path[ file_paths$object=="hesin" ], show_col_types=FALSE, progress=FALSE)) 389 | if (! "eid" %in% colnames(hesin_tbl)) colnames(hesin_tbl)[1] <- "eid" 390 | hesin_diag_tbl = dplyr::inner_join(hesin_tbl, hesin_diag_tbl, by=c("eid"="eid", "ins_index"="ins_index")) 391 | 392 | # format date cols if not "Date" 393 | date_cols = c("epistart", "epiend", "elecdate", "admidate", "disdate") 394 | for (dc in date_cols) { 395 | dc = rlang::sym(dc) 396 | if (!lubridate::is.Date(hesin_diag_tbl |> dplyr::select(!!dc) |> dplyr::pull())) { 397 | hesin_diag_tbl <- hesin_diag_tbl |> dplyr::mutate(!!dc := lubridate::dmy(!!dc)) 398 | } 399 | } 400 | } 401 | 402 | cli::cli_alert_success("Loaded {.var hesin_diag} with {nrow(hesin_diag_tbl)} matched rows.") 403 | 404 | if (verbose) cli::cli_alert_info(c("Time taken so far: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}.")) 405 | 406 | } 407 | 408 | # 409 | # ICD9 HES diagnosis data ########################################### 410 | # 411 | if (get_icd9) { 412 | 413 | cli::cli_alert("Ascertaining HES diagnosis data (ICD9s).") 414 | 415 | hesin_diag_path = file_paths$path[ file_paths$object=="hesin_diag" ] 416 | 417 | # create search string 418 | search_string <- paste0("grep -E ", sprintf('"%s"', stringr::str_flatten(ICD9s, collapse = "|")), " ", sprintf('%s', hesin_diag_path)) 419 | if (verbose) cat(" -- search string: ", search_string, "\n") 420 | 421 | # get file headers 422 | headers <- colnames(readr::read_tsv(hesin_diag_path, n_max=1, show_col_types=FALSE, progress=FALSE)) 423 | if (! "eid" %in% headers) headers[1] <- "eid" 424 | 425 | # use search string to only read lines that matched a code 426 | hesin_diag_tbl_icd9 <- suppressWarnings(readr::read_tsv(pipe(search_string), col_names=headers, show_col_types=FALSE, progress=FALSE)) 427 | 428 | # check we have actually matched any ICD9s 429 | # exclude missing ICD9s (EIDs may have been matched) 430 | if (nrow(hesin_diag_tbl_icd9)>0) 431 | hesin_diag_tbl_icd9 <- hesin_diag_tbl_icd9 |> dplyr::filter(!is.na(diag_icd9)) 432 | # check codes match START of code string 433 | ICD9_search = stringr::str_flatten(ICD9s, collapse = "|") 434 | if (nrow(hesin_diag_tbl_icd9)>0) 435 | hesin_diag_tbl_icd9 <- hesin_diag_tbl_icd9 |> dplyr::filter(stringr::str_starts(diag_icd9, !! ICD9_search)) 436 | 437 | # if any matches returned, make sure eid is formatted nicely (remove file name), and dates are dates 438 | if (nrow(hesin_diag_tbl_icd9)>0) { 439 | # match with HES episode data 440 | hesin_tbl = suppressWarnings(readr::read_tsv(file_paths$path[ file_paths$object=="hesin" ], show_col_types=FALSE, progress=FALSE)) 441 | if (! "eid" %in% colnames(hesin_tbl)) colnames(hesin_tbl)[1] <- "eid" 442 | hesin_diag_tbl_icd9 = dplyr::inner_join(hesin_tbl, hesin_diag_tbl_icd9, by=c("eid"="eid", "ins_index"="ins_index")) 443 | 444 | # format date cols if not "Date" 445 | date_cols = c("epistart", "epiend", "elecdate", "admidate", "disdate") 446 | for (dc in date_cols) { 447 | dc = rlang::sym(dc) 448 | if (!lubridate::is.Date(hesin_diag_tbl_icd9 |> dplyr::select(!!dc) |> dplyr::pull())) 449 | hesin_diag_tbl_icd9 <- hesin_diag_tbl_icd9 |> dplyr::mutate(!!dc := lubridate::dmy(!!dc)) 450 | } 451 | } 452 | 453 | cli::cli_alert_success("Loaded {.var hesin_diag} with {nrow(hesin_diag_tbl_icd9)} matched rows.") 454 | 455 | # combine with other hesin table? 456 | if (!is.null(hesin_diag_tbl)) hesin_diag_tbl = rbind(hesin_diag_tbl, hesin_diag_tbl_icd9) 457 | if (is.null(hesin_diag_tbl)) hesin_diag_tbl = hesin_diag_tbl_icd9 458 | 459 | if (verbose) cli::cli_alert_info(c("Time taken so far: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}.")) 460 | 461 | } 462 | 463 | # 464 | # HES operations data ########################################### 465 | # 466 | if (get_oper) { 467 | 468 | cli::cli_alert("Ascertaining HES operations data.") 469 | 470 | hesin_oper_path = file_paths$path[ file_paths$object=="hesin_oper" ] 471 | 472 | # create search string 473 | search_string <- paste0("grep -E ", sprintf('"%s"', stringr::str_flatten(oper_codes, collapse = "|")), " ", sprintf('%s', hesin_oper_path)) 474 | if (verbose) cat(" -- search string: ", search_string, "\n") 475 | 476 | # get file headers 477 | headers <- colnames(readr::read_tsv(hesin_oper_path, n_max=1, show_col_types=FALSE, progress=FALSE)) 478 | if (! "eid" %in% headers) headers[1] <- "eid" 479 | 480 | # use search string to only read lines that matched a code 481 | hesin_oper_tbl <- suppressWarnings(readr::read_tsv(pipe(search_string), col_names=headers, show_col_types=FALSE, progress=FALSE)) 482 | 483 | # if any matches returned, make sure eid is formatted nicely (remove file name), and dates are dates 484 | # make sure OPCS3 are exact 485 | if (nrow(hesin_oper_tbl)>0) { 486 | if (OPCS4s[1] != "" & OPCS3s[1] == "") hesin_oper_tbl <- hesin_oper_tbl |> dplyr::filter(stringr::str_detect(oper4, stringr::str_flatten(!! OPCS4s, collapse = "|"))) 487 | if (OPCS4s[1] == "" & OPCS3s[1] != "") hesin_oper_tbl <- hesin_oper_tbl |> dplyr::filter(stringr::str_starts(oper3, stringr::str_flatten(!! OPCS3s, collapse = "|"))) 488 | if (OPCS4s[1] != "" & OPCS3s[1] != "") hesin_oper_tbl <- hesin_oper_tbl |> dplyr::filter(stringr::str_starts(oper3, stringr::str_flatten(!! OPCS3s, collapse = "|")) | stringr::str_detect(oper4, stringr::str_flatten(!!OPCS4s, collapse = "|"))) 489 | 490 | if (is.character(hesin_oper_tbl$opdate)) hesin_oper_tbl$opdate <- lubridate::dmy(hesin_oper_tbl$opdate) 491 | } 492 | 493 | cli::cli_alert_success("Loaded {.var hesin_oper} with {nrow(hesin_oper_tbl)} matched rows.") 494 | 495 | if (verbose) cli::cli_alert_info(c("Time taken so far: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}.")) 496 | 497 | } 498 | 499 | # 500 | # Ascertaining GP clinical ######################################################## 501 | # 502 | if (get_gp) { 503 | 504 | cli::cli_alert("Ascertaining GP data.") 505 | 506 | gp_clinical_path = file_paths$path[ file_paths$object=="gp_clinical" ] 507 | 508 | # create search strings 509 | search_string <- paste0("grep -E ", sprintf('"%s"', stringr::str_flatten(gp_codes, collapse = "|")), " ", sprintf('%s', gp_clinical_path)) 510 | if (verbose) cat(" -- search string: ", search_string, "\n") 511 | 512 | # get file headers 513 | headers <- colnames(readr::read_tsv(gp_clinical_path, n_max=1, show_col_types=FALSE, progress=FALSE)) 514 | if (! "eid" %in% headers) headers[1] <- "eid" 515 | 516 | # use search string to only read lines that matched a code 517 | gp_clinical_tbl <- suppressWarnings(readr::read_tsv(pipe(search_string), col_names=headers, show_col_types=FALSE, progress=FALSE)) 518 | 519 | # if any matches returned, make sure eid is formatted nicely (remove file name), the codes are definite matches, and the dates are dates 520 | if (nrow(gp_clinical_tbl)>0) { 521 | gp_clinical_tbl <- gp_clinical_tbl |> dplyr::filter(read_2 %in% !!Read2s | read_3 %in% !!CTV3s) 522 | 523 | # format date col if not "Date" 524 | if (!lubridate::is.Date(gp_clinical_tbl$event_dt)) gp_clinical_tbl <- gp_clinical_tbl |> dplyr::mutate(event_dt = lubridate::dmy(event_dt)) 525 | } 526 | 527 | cli::cli_alert_success("Loaded {.var gp_clinical} with {nrow(gp_clinical_tbl)} matched rows.") 528 | 529 | if (verbose) cli::cli_alert_info(c("Time taken so far: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}.")) 530 | } 531 | 532 | # 533 | # Ascertaining self-reported illness ######################################################## 534 | # 535 | if (get_selfrep) { 536 | 537 | cli::cli_alert("Ascertaining self-reported illness data.") 538 | 539 | # load data 540 | selfrep_illness_dat <- suppressWarnings(readr::read_tsv(file_paths$path[ file_paths$object=="selfrep_illness" ], show_col_types = FALSE, progress = FALSE)) 541 | 542 | # get self-reported illness data - convert to long 543 | selfrep_illness_tbl <- ukbrapR:::get_selfrep_illness(codes_df = codes_df, ukb_dat = selfrep_illness_dat, verbose = verbose) 544 | cli::cli_alert_success("Loaded {.var selfrep_illness} with {nrow(selfrep_illness_tbl)} matched rows.") 545 | 546 | rm(selfrep_illness_dat) 547 | 548 | } 549 | 550 | 551 | # 552 | # 553 | # 554 | 555 | cli::cli_alert_success(c("Finished. Time taken: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}.")) 556 | 557 | # Return data as list 558 | output_list <- list( 559 | gp_clinical=gp_clinical_tbl, 560 | hesin_diag=hesin_diag_tbl, 561 | death_cause=death_cause_tbl, 562 | cancer_registry=cancer_registry_tbl, 563 | hesin_oper=hesin_oper_tbl, 564 | selfrep_illness=selfrep_illness_tbl, 565 | codes_df=tibble::as_tibble(codes_df) 566 | ) 567 | class(output_list) <- "ukbrapr_emr" 568 | return(output_list) 569 | 570 | } 571 | 572 | 573 | #' The old get_emr() function 574 | #' 575 | #' @export 576 | #' @noRd 577 | get_emr <- function( 578 | codes_df, 579 | spark_master = "spark://master:41000", 580 | file_paths = NULL, 581 | verbose = FALSE 582 | ) { 583 | lifecycle::deprecate_stop("0.2.0", "get_emr()", "get_diagnoses()") 584 | } 585 | 586 | -------------------------------------------------------------------------------- /R/get_df.R: -------------------------------------------------------------------------------- 1 | #' Get UK Biobank participant Date First (DF) diagnosis 2 | #' 3 | #' @description For each participant identify the date of first diagnosis from all available electronic medical records & self-reported data. 4 | #' 5 | #' If `use_baseline_dates=TRUE` (the default) then will also produce a binary 0/1 variable, indicating the controls (people without a diagnosis) and setting the date first `_df` field to the date of censoring (currently 30 October 2022). 6 | #' 7 | #' @return Returns a single, "wide" data frame: the participant data for the requested diagnosis codes with "date first" `_df` variables. One for each source of data, and a combined variable. 8 | #' 9 | #' @author Luke Pilling 10 | #' 11 | #' @name get_df 12 | #' 13 | #' @param diagnosis_list A list of data frames. The output of `get_diagnoses()` i.e., the raw diagnosis and self-reported illness data that matched the provided codes list. 14 | #' @param prefix String. Prefix to add to variable names (e.g., if prefix="chd" the output variables would be "chd_gp_df", "chd_hes_df", "chd_df" etc.) 15 | #' \code{default=NULL} 16 | #' @param group_by String. If the codes list provided to `get_diagnoses()` (i.e., in diagnosis_list$codes_df) contained a grouping/condition variable, indicate the variable name here. 17 | #' "Date first" variables will be created for each prefix in the grouping variable. The `prefix` option is ignored, in favour of the names in the grouping variable. 18 | #' \code{default=NULL} 19 | #' @param include_selfrep_illness logical. Include self-reported diagnosesin the combined Date First output? If present in `diagnosis_list` will still provide a separate `_df` variable 20 | #' \code{default=TRUE} 21 | #' @param include_death_cause logical. Include the cause of death in the combined Date First output? If present in `diagnosis_list` will still provide a separate `_df` variable 22 | #' \code{default=TRUE} 23 | #' @param include_gp_clinical logical. Include the GP data in the combined Date First output? If present in `diagnosis_list` will still provide a separate `_df` variable 24 | #' \code{default=TRUE} 25 | #' @param include_hesin_diag logical. Include the HES diagnosis data in the combined Date First output? If present in `diagnosis_list` will still provide a separate `_df` variable 26 | #' \code{default=TRUE} 27 | #' @param include_hesin_oper logical. Include the HES OPCS (operations) data in the combined Date First output? If present in `diagnosis_list` will still provide a separate `_df` variable 28 | #' \code{default=TRUE} 29 | #' @param include_cancer_registry logical. Include the cancer registry data in the combined Date First output? If present in `diagnosis_list` will still provide a separate `_df` variable 30 | #' \code{default=TRUE} 31 | #' @param use_baseline_dates logical. If `baseline_dates` available in file paths, produce a binary 0/1 variable, indicating the controls (people without a diagnosis) and setting the date first `_df` field to the date of censoring (currently see `censoring_date` option). 32 | #' \code{default=TRUE} 33 | #' @param file_paths A data frame. Columns must be `object` and `path` containing paths to outputted files. If not provided will use those in `ukbrapr_paths` 34 | #' \code{default=NULL} 35 | #' @param censoring_date A string. If using baseline data to infer control participants, include a censoring date (set to NA if not desired). Use dd-mm-yyyy format. Default is the (current) HES date. 36 | #' \code{default="30-10-2022"} 37 | #' @param verbose Logical. Be verbose, 38 | #' \code{default=FALSE} 39 | #' 40 | #' @examples 41 | #' 42 | #' ############################################### 43 | #' # example 1. haemochromatosis 44 | #' 45 | #' # get diagnosis data - returns list of data frames (one per source) 46 | #' diagnosis_list <- get_diagnoses(ukbrapR:::codes_df_hh) 47 | #' 48 | #' # for each participant, get Date First diagnosed with the condition 49 | #' diagnosis_df <- get_df(diagnosis_list, prefix="hh") 50 | #' 51 | #' ############################################### 52 | #' # example 2. get multiple diseases at once 53 | #' # don't have to all have the same code types/data sources 54 | #' 55 | #' codes = rbind(ukbrapR:::codes_df_hh, ukbrapR:::codes_df_ckd) 56 | #' print(codes) 57 | #' 58 | #' # get diagnosis data - returns list of data frames (one per source) 59 | #' diagnosis_list <- get_diagnoses(codes) 60 | #' 61 | #' # for each participant, get Date First diagnosed with the condition 62 | #' diagnosis_df <- get_df(diagnosis_list, group_by="condition") 63 | #' 64 | #' @export 65 | #' 66 | get_df <- function( 67 | diagnosis_list, 68 | prefix = NULL, 69 | group_by = NULL, 70 | include_selfrep_illness = TRUE, 71 | include_death_cause = TRUE, 72 | include_gp_clinical = TRUE, 73 | include_hesin_diag = TRUE, 74 | include_hesin_oper = TRUE, 75 | include_cancer_registry = TRUE, 76 | use_baseline_dates = TRUE, 77 | file_paths = NULL, 78 | censoring_date = "30-10-2022", 79 | verbose = FALSE 80 | ) { 81 | 82 | v <- packageVersion("ukbrapR") 83 | cli::cli_alert_info("ukbrapR v{v}") 84 | 85 | start_time <- Sys.time() 86 | 87 | # use baseline dates? 88 | if (use_baseline_dates) { 89 | 90 | # Is this one of my systems? If so, get the internal file_paths 91 | nodename <- as.character(Sys.info()['nodename']) 92 | if ( is.null(file_paths) & nodename %in% c("SNOW","SHAPTER") ) { 93 | file_paths = ukbrapR:::snow_paths 94 | if (verbose) cli::cli_alert_info("Identified server {nodename} - using predefined paths.") 95 | } 96 | if ( is.null(file_paths) & nodename == "indy.ex.ac.uk" ) { 97 | file_paths = ukbrapR:::indy_paths 98 | if (verbose) cli::cli_alert_info("Identified server {nodename} - using predefined paths.") 99 | } 100 | 101 | # if file_paths not provided assume default paths 102 | if (is.null(file_paths)) { 103 | file_paths = ukbrapR:::ukbrapr_paths 104 | # add users home directory to path 105 | home_path = as.character(Sys.getenv()["HOME"]) 106 | file_paths$path = stringr::str_c(home_path, "/", file_paths$path) 107 | } 108 | 109 | # does baseline_dates file exist? 110 | # -- this is downloaded to users home directory so need to add this to the path 111 | bl_file_path = file_paths$path[ file_paths$object=="baseline_dates" ] 112 | if (file.exists(bl_file_path)) { 113 | 114 | # read baseline dates 115 | bl_data = readr::read_tsv(bl_file_path, show_col_types = FALSE, progress = FALSE) 116 | 117 | # rename assessment date (p53_i0) for ease later 118 | bl_data = bl_data |> 119 | dplyr::rename(assessment_date_0 = p53_i0) |> 120 | dplyr::select(eid, assessment_date_0) 121 | 122 | # censoring date provided? 123 | if (!is.na(censoring_date)) censoring_date = lubridate::dmy(censoring_date) 124 | 125 | } else { 126 | use_baseline_dates = FALSE 127 | cli::cli_alert_warning("Could not find \"baseline dates\" file at path {.file {bl_file_path}} - continued without using it") 128 | } 129 | } 130 | 131 | # are we using a grouping variable? 132 | if (is.null(group_by)) { 133 | 134 | df_tbl = ukbrapR:::get_df1( 135 | diagnosis_list=diagnosis_list, 136 | include_selfrep_illness=include_selfrep_illness, 137 | include_gp_clinical=include_gp_clinical, 138 | include_death_cause=include_death_cause, 139 | include_hesin_diag=include_hesin_diag, 140 | include_cancer_registry=include_cancer_registry, 141 | include_hesin_oper=include_hesin_oper, 142 | prefix=prefix, 143 | verbose=verbose 144 | ) 145 | 146 | # add binary variables (ever, prev) & censoring date (if provided) 147 | if (use_baseline_dates) df_tbl = ukbrapR:::get_df1_add_bin(df=df_tbl, bd=bl_data, cd=censoring_date, prefix=prefix, verbose=verbose) 148 | 149 | } else { 150 | 151 | if (verbose) cli::cli_alert("Grouping variable detected - checking codes") 152 | 153 | # check input codes and group variable 154 | if (class(diagnosis_list) != "ukbrapr_emr") cli::cli_alert_warning(c("{.var diagnosis_list} should be of class {.cls ukbrapr_emr}", "x" = "You've supplied a {.cls {class(diagnosis_list)}} - behaviour may not be as intended.")) 155 | 156 | codes = as.data.frame(diagnosis_list[['codes_df']]) 157 | 158 | if (! group_by %in% colnames(codes)) cli::cli_abort("{.var diagnosis_list} codes data frame needs to contain the group column {group_by}") 159 | if (class(codes[,group_by]) != "character") cli::cli_abort(c("Group column {group_by} needs to be a character vector", "x" = "You've supplied a {.cls {class(codes[,group_by])}}.")) 160 | 161 | # for each grouping variable, subset diagnostic data, run get_df1(), and combine output 162 | groups = unique(codes[,group_by]) 163 | df_tbl = NULL 164 | cli::cli_alert("{length(groups)} group{?s} identified - getting date first for each") 165 | 166 | for (group in groups) { 167 | 168 | if (verbose) cli::cli_alert("Doing group {group}") 169 | 170 | # subset diagnostic codes 171 | codes_sub = codes[codes[,group_by]==group,] 172 | diagnosis_list_sub = diagnosis_list 173 | 174 | ## gp clinical 175 | gp_clinical_sub <- NULL 176 | if (!is.null(diagnosis_list_sub$gp_clinical) & any(codes_sub$vocab_id %in% c("Read2","CTV3"))) { 177 | Read2s = "" 178 | CTV3s = "" 179 | if (any(codes_sub$vocab_id == "Read2")) { 180 | Read2s <- codes_sub$code[codes_sub$vocab_id == "Read2"] 181 | Read2s <- stringr::str_sub(Read2s, 1, 5) |> unique() 182 | } 183 | if (any(codes_sub$vocab_id == "CTV3")) { 184 | CTV3s = codes_sub$code[codes_sub$vocab_id == "CTV3"] 185 | CTV3s <- stringr::str_sub(CTV3s, 1, 5) |> unique() 186 | } 187 | gp_clinical_sub <- diagnosis_list_sub$gp_clinical |> dplyr::filter(read_2 %in% !!Read2s | read_3 %in% !!CTV3s) 188 | } 189 | diagnosis_list_sub$gp_clinical <- gp_clinical_sub 190 | 191 | # create ICD10 search string 192 | if (any(codes_sub$vocab_id == "ICD10")) { 193 | ICD10s <- codes_sub |> 194 | dplyr::filter(vocab_id == "ICD10") |> 195 | dplyr::select(code) |> 196 | dplyr::pull() |> 197 | unique() |> 198 | stringr::str_remove(stringr::fixed(".")) |> 199 | stringr::str_sub(1, 5) 200 | ICD10_search = stringr::str_flatten(ICD10s, collapse = "|") 201 | } 202 | 203 | ## hesin_diag 204 | hesin_diag_sub = NULL 205 | if (!is.null(diagnosis_list_sub$hesin_diag) & any(codes_sub$vocab_id %in% c("ICD10","ICD9"))) { 206 | 207 | if (any(codes_sub$vocab_id == "ICD10")) { 208 | colnames(diagnosis_list_sub$hesin_diag) = tolower(colnames(diagnosis_list_sub$hesin_diag)) 209 | hesin_diag_sub = diagnosis_list_sub$hesin_diag |> dplyr::filter(stringr::str_detect(diag_icd10, !! ICD10_search)) 210 | } 211 | 212 | if (any(codes_sub$vocab_id == "ICD9")) { 213 | ICD9s = "" 214 | if (any(codes_sub$vocab_id == "ICD9")) { 215 | ICD9s <- codes_sub |> 216 | dplyr::filter(vocab_id == "ICD9") |> 217 | dplyr::select(code) |> 218 | dplyr::pull() |> 219 | unique() |> 220 | stringr::str_remove(stringr::fixed(".")) |> 221 | stringr::str_sub(1, 5) 222 | } 223 | ICD9_search = stringr::str_flatten(ICD9s, collapse = "|") 224 | colnames(diagnosis_list_sub$hesin_diag) = tolower(colnames(diagnosis_list_sub$hesin_diag)) 225 | hesin_diag_sub = rbind(hesin_diag_sub, diagnosis_list_sub$hesin_diag |> dplyr::filter(stringr::str_starts(diag_icd9, !! ICD9_search))) 226 | } 227 | 228 | } 229 | diagnosis_list_sub$hesin_diag <- hesin_diag_sub 230 | 231 | ## death_cause 232 | death_cause_sub <- NULL 233 | if (!is.null(diagnosis_list_sub$death_cause) & any(codes_sub$vocab_id == "ICD10")) { 234 | death_cause_sub <- diagnosis_list_sub$death_cause |> dplyr::filter(stringr::str_detect( cause_icd10, !! ICD10_search)) 235 | } 236 | diagnosis_list_sub$death_cause <- death_cause_sub 237 | 238 | ## cancer_registry 239 | cancer_registry_sub <- NULL 240 | if (!is.null(diagnosis_list_sub$cancer_registry) & any(codes_sub$vocab_id == "ICD10")) { 241 | cancer_registry_sub <- diagnosis_list_sub$cancer_registry |> dplyr::filter(stringr::str_detect( icd10, !! ICD10_search)) 242 | } 243 | diagnosis_list_sub$cancer_registry <- cancer_registry_sub 244 | 245 | ## hesin_oper 246 | hesin_oper_sub <- NULL 247 | if (!is.null(diagnosis_list_sub$hesin_oper) & any(codes_sub$vocab_id %in% c("OPCS3","OPCS4"))) { 248 | 249 | # get OPCS4 codes to search for 250 | OPCS4s = "" 251 | if (any(codes_sub$vocab_id == "OPCS4")) { 252 | OPCS4s <- codes_sub |> 253 | dplyr::filter(vocab_id == "OPCS4") |> 254 | dplyr::select(code) |> 255 | dplyr::pull() |> 256 | unique() |> 257 | stringr::str_remove(stringr::fixed(".")) |> 258 | stringr::str_sub(1, 5) 259 | } 260 | 261 | # get OPCS3 codes to search for 262 | OPCS3s = "" 263 | if (any(codes_sub$vocab_id == "OPCS3")) { 264 | OPCS3s <- codes_sub |> 265 | dplyr::filter(vocab_id == "OPCS3") |> 266 | dplyr::select(code) |> 267 | dplyr::pull() |> 268 | unique() |> 269 | stringr::str_remove(stringr::fixed(".")) |> 270 | stringr::str_sub(1, 5) 271 | } 272 | 273 | # subset hesin_oper to those matching either 274 | OPCS4_search = stringr::str_flatten(OPCS4s, collapse = "|") 275 | OPCS3_search = stringr::str_flatten(OPCS3s, collapse = "|") 276 | 277 | if (OPCS4s[1] != "" & OPCS3s[1] == "") hesin_oper_sub <- diagnosis_list_sub$hesin_oper |> dplyr::filter(stringr::str_detect(oper4, !! OPCS4_search)) 278 | if (OPCS4s[1] == "" & OPCS3s[1] != "") hesin_oper_sub <- diagnosis_list_sub$hesin_oper |> dplyr::filter(stringr::str_detect(oper3, !! OPCS3_search)) 279 | if (OPCS4s[1] != "" & OPCS3s[1] != "") hesin_oper_sub <- diagnosis_list_sub$hesin_oper |> dplyr::filter(stringr::str_detect(oper4, !! OPCS4_search) | stringr::str_detect(oper3, !! OPCS3_search)) 280 | } 281 | diagnosis_list_sub$hesin_oper <- hesin_oper_sub 282 | 283 | ## self-reported illness 284 | selfrep_illness_sub <- NULL 285 | if (!is.null(diagnosis_list_sub$selfrep_illness) & any(codes_sub$vocab_id %in% c("ukb_cancer","ukb_noncancer"))) { 286 | if (any(codes_sub$vocab_id == "ukb_cancer")) { 287 | codes_cancer = codes_sub$code[ codes_sub$vocab_id == "ukb_cancer" ] 288 | selfrep_illness_sub <- diagnosis_list_sub$selfrep_illness |> dplyr::filter(cancer_code %in% codes_cancer) 289 | } 290 | if (any(codes_sub$vocab_id == "ukb_noncancer")) { 291 | codes_noncancer = codes_sub$code[ codes_sub$vocab_id == "ukb_noncancer" ] 292 | selfrep_illness_sub <- diagnosis_list_sub$selfrep_illness |> dplyr::filter(noncancer_code %in% codes_noncancer) 293 | } 294 | } 295 | diagnosis_list_sub$selfrep_illness <- selfrep_illness_sub 296 | if (!any(codes_sub$vocab_id %in% c("ukb_cancer","ukb_noncancer"))) include_selfrep_illness <- FALSE 297 | 298 | 299 | # 300 | # 301 | # get DF for this condition 302 | df_tbl_sub = ukbrapR:::get_df1( 303 | diagnosis_list=diagnosis_list_sub, 304 | include_selfrep_illness=include_selfrep_illness, 305 | include_gp_clinical=include_gp_clinical, 306 | include_death_cause=include_death_cause, 307 | include_hesin_diag=include_hesin_diag, 308 | include_cancer_registry=include_cancer_registry, 309 | include_hesin_oper=include_hesin_oper, 310 | prefix=group, 311 | verbose=verbose 312 | ) 313 | 314 | # add binary variables (ever, prev) & censoring date (if provided) 315 | if (use_baseline_dates) df_tbl_sub = ukbrapR:::get_df1_add_bin(df=df_tbl_sub, bd=bl_data, cd=censoring_date, prefix=group, verbose=verbose) 316 | 317 | # merge with main DF table 318 | if (is.null(df_tbl)) { 319 | df_tbl = df_tbl_sub 320 | } else { 321 | df_tbl = dplyr::full_join(df_tbl, df_tbl_sub, by="eid") 322 | } 323 | 324 | } 325 | 326 | cli::cli_alert_info("Finished getting date first diagnosed for each group/condition.") 327 | 328 | } 329 | 330 | if (verbose) cli::cli_alert_success(c("Time taken: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}.")) 331 | 332 | # return table 333 | return(df_tbl) 334 | 335 | } 336 | 337 | 338 | #' Get UK Biobank participant Date First (DF) diagnosis for one condition 339 | #' 340 | #' @description For each participant identify the date of first diagnosis from all available electronic medical records & self-reported data. 341 | #' 342 | #' @return Returns a single, "wide" data frame: the participant data for the requested diagnosis codes with "date first" `_df` variables. One for each source of data, and a combined variable. 343 | #' 344 | #' @author Luke Pilling 345 | #' 346 | #' @name get_df1 347 | #' 348 | #' @noRd 349 | get_df1 <- function( 350 | diagnosis_list, 351 | include_selfrep_illness = TRUE, 352 | include_gp_clinical = TRUE, 353 | include_death_cause = TRUE, 354 | include_hesin_diag = TRUE, 355 | include_cancer_registry = TRUE, 356 | include_hesin_oper = TRUE, 357 | prefix = NULL, 358 | verbose = FALSE 359 | ) { 360 | 361 | #start_time <- Sys.time() 362 | 363 | # Check input 364 | if (verbose) cli::cli_alert("Check inputs\n") 365 | if (class(diagnosis_list) != "ukbrapr_emr") cli::cli_alert_warning(c("{.var diagnosis_list} should be of class {.cls ukbrapr_emr}", "x" = "You've supplied a {.cls {class(diagnosis_list)}} - behaviour may not be as intended.")) 366 | 367 | # "use" if there is any data (i.e., provide an individual _df column) -- "include" in the main combined only if specified by user 368 | use_selfrep <- use_gp_clinical <- use_death_cause <- use_hesin_diag <- use_cancer_registry <- use_hesin_oper <- FALSE 369 | 370 | if (include_selfrep_illness) if ( !is.null(diagnosis_list$selfrep_illness) ) if ( nrow(diagnosis_list$selfrep_illness)>0 ) use_selfrep <- TRUE 371 | 372 | if ( !is.null(diagnosis_list$gp_clinical) ) if ( nrow(diagnosis_list$gp_clinical)>0 ) use_gp_clinical <- TRUE 373 | if ( !is.null(diagnosis_list$death_cause) ) if ( nrow(diagnosis_list$death_cause)>0 ) use_death_cause <- TRUE 374 | if ( !is.null(diagnosis_list$hesin_diag) ) if ( nrow(diagnosis_list$hesin_diag)>0 ) use_hesin_diag <- TRUE 375 | if ( !is.null(diagnosis_list$cancer_registry) ) if ( nrow(diagnosis_list$cancer_registry)>0 ) use_cancer_registry <- TRUE 376 | if ( !is.null(diagnosis_list$hesin_oper) ) if ( nrow(diagnosis_list$hesin_oper)>0 ) use_hesin_oper <- TRUE 377 | 378 | # 379 | # 380 | # 381 | 382 | # Convert self-reported illness to "wide" Date First 383 | if (use_selfrep) { 384 | if (verbose) cli::cli_alert("Get date first diagnosis: selfrep_df\n") 385 | selfrep_illness <- ukbrapR:::get_selfrep_illness_df(codes_df=diagnosis_list$codes_df, ukb_dat=diagnosis_list$selfrep_illness, verbose=verbose) 386 | } 387 | 388 | # Convert gp_clinical to "wide" Date First 389 | if (use_gp_clinical) { 390 | if (verbose) cli::cli_alert("Get date first diagnosis: gp_df\n") 391 | gp_clinical <- diagnosis_list$gp_clinical |> 392 | dplyr::filter(!is.na(event_dt)) |> 393 | dplyr::group_by(eid) |> 394 | dplyr::summarize(gp_df=min(event_dt, na.rm=TRUE)) |> 395 | dplyr::mutate(gp_df = dplyr::if_else(is.finite(gp_df), gp_df, NA)) 396 | } 397 | 398 | # Convert death_cause to "wide" Date First 399 | if (use_death_cause) { 400 | if (verbose) cli::cli_alert("Get date first diagnosis: death_df\n") 401 | death_cause <- diagnosis_list$death_cause |> 402 | dplyr::filter(!is.na(date_of_death)) |> 403 | dplyr::group_by(eid) |> 404 | dplyr::summarize(death_df=min(date_of_death, na.rm=TRUE)) |> 405 | dplyr::mutate(death_df = dplyr::if_else(is.finite(death_df), death_df, NA)) 406 | } 407 | 408 | # Convert hesin_diag to "wide" Date First 409 | if (use_hesin_diag) { 410 | if (verbose) cli::cli_alert("Get date first diagnosis: hes_df\n") 411 | hesin_diag <- diagnosis_list$hesin_diag |> 412 | dplyr::mutate(diagnosis_date = epistart) |> 413 | dplyr::mutate(diagnosis_date = dplyr::if_else(is.na(diagnosis_date), epiend, diagnosis_date)) |> 414 | dplyr::mutate(diagnosis_date = dplyr::if_else(is.na(diagnosis_date), admidate, diagnosis_date)) |> 415 | dplyr::mutate(diagnosis_date = lubridate::as_date(dplyr::if_else(is.na(diagnosis_date), disdate, diagnosis_date))) |> 416 | dplyr::filter(!is.na(diagnosis_date)) |> 417 | dplyr::group_by(eid) |> 418 | dplyr::summarize(hes_df=min(diagnosis_date, na.rm=TRUE)) |> 419 | dplyr::mutate(hes_df = dplyr::if_else(is.finite(hes_df), hes_df, NA)) 420 | } 421 | 422 | # Convert cancer registry to "wide" Date First 423 | if (use_cancer_registry) { 424 | if (verbose) cli::cli_alert("Get date first diagnosis: canreg_df\n") 425 | cancer_registry <- ukbrapR:::get_cancer_registry_df(codes_df=diagnosis_list$codes_df, ukb_dat=diagnosis_list$cancer_registry, verbose=verbose) 426 | } 427 | 428 | # Convert hesin_oper to "wide" Date First 429 | if (use_hesin_oper) { 430 | if (verbose) cli::cli_alert("Get date first diagnosis: oper_df\n") 431 | hesin_oper <- diagnosis_list$hesin_oper |> 432 | dplyr::filter(!is.na(opdate)) |> 433 | dplyr::group_by(eid) |> 434 | dplyr::summarize(oper_df=min(opdate, na.rm=TRUE)) |> 435 | dplyr::mutate(oper_df = dplyr::if_else(is.finite(oper_df), oper_df, NA)) 436 | } 437 | 438 | # 439 | # 440 | # 441 | 442 | # Combine into single data frame 443 | if (verbose) cli::cli_alert("Combine into single wide data frame\n") 444 | diagnosis_df <- NULL 445 | if (use_selfrep) { 446 | diagnosis_df <- selfrep_illness 447 | } 448 | if (use_gp_clinical) { 449 | if (is.null(diagnosis_df)) { 450 | diagnosis_df <- gp_clinical 451 | } else { 452 | diagnosis_df <- dplyr::full_join(diagnosis_df, gp_clinical, by="eid") 453 | } 454 | } 455 | if (use_hesin_diag) { 456 | if (is.null(diagnosis_df)) { 457 | diagnosis_df <- hesin_diag 458 | } else { 459 | diagnosis_df <- dplyr::full_join(diagnosis_df, hesin_diag, by="eid") 460 | } 461 | } 462 | if (use_death_cause) { 463 | if (is.null(diagnosis_df)) { 464 | diagnosis_df <- death_cause 465 | } else { 466 | diagnosis_df <- dplyr::full_join(diagnosis_df, death_cause, by="eid") 467 | } 468 | } 469 | if (use_cancer_registry) { 470 | if (is.null(diagnosis_df)) { 471 | diagnosis_df <- cancer_registry 472 | } else { 473 | diagnosis_df <- dplyr::full_join(diagnosis_df, cancer_registry, by="eid") 474 | } 475 | } 476 | if (use_hesin_oper) { 477 | if (is.null(diagnosis_df)) { 478 | diagnosis_df <- hesin_oper 479 | } else { 480 | diagnosis_df <- dplyr::full_join(diagnosis_df, hesin_oper, by="eid") 481 | } 482 | } 483 | 484 | # 485 | # 486 | # 487 | 488 | # Combined "date first, any source" variable & "source" variable 489 | if (verbose) cli::cli_alert("Combined \"date first, any source\" variable\n") 490 | diagnosis_df$df <- NA 491 | diagnosis_df$src <- "" 492 | 493 | if (include_selfrep_illness & use_selfrep) { 494 | diagnosis_df <- diagnosis_df |> 495 | dplyr::mutate( 496 | df = selfrep_df, 497 | src = dplyr::if_else(! is.na(selfrep_df), stringr::str_c("selfrep_i", selfrep_i), NA)) |> 498 | dplyr::select(-selfrep, -selfrep_i) 499 | } 500 | 501 | if (include_gp_clinical & use_gp_clinical) { 502 | diagnosis_df <- diagnosis_df |> dplyr::mutate( 503 | src = dplyr::case_when( 504 | !is.na(gp_df) & is.na(df) ~ "gp", 505 | !is.na(gp_df) & !is.na(df) & gp_df dplyr::mutate( 516 | src = dplyr::case_when( 517 | !is.na(hes_df) & is.na(df) ~ "hes", 518 | !is.na(hes_df) & !is.na(df) & hes_df dplyr::mutate( 529 | src = dplyr::case_when( 530 | !is.na(death_df) & is.na(df) ~ "death", 531 | !is.na(death_df) & !is.na(df) & death_df dplyr::mutate( 542 | src = dplyr::case_when( 543 | !is.na(canreg_df) & is.na(df) ~ "canreg", 544 | !is.na(canreg_df) & !is.na(df) & canreg_df 551 | dplyr::select(-canreg) 552 | 553 | } 554 | 555 | if (include_hesin_oper & use_hesin_oper) { 556 | diagnosis_df <- diagnosis_df |> dplyr::mutate( 557 | src = dplyr::case_when( 558 | !is.na(oper_df) & is.na(df) ~ "hesin_oper", 559 | !is.na(oper_df) & !is.na(df) & oper_df dplyr::filter(src!="" & !is.na(df)) 570 | 571 | # adding variable name prefix? 572 | if (!is.null(prefix)) { 573 | if (is.character(prefix) & length(prefix) == 1) { 574 | names(diagnosis_df)[2:ncol(diagnosis_df)] = stringr::str_c(prefix, "_", names(diagnosis_df)[2:ncol(diagnosis_df)]) 575 | } else { 576 | cli::cli_alert_warning("Prefix was not a single string - variables names left as default") 577 | } 578 | } 579 | 580 | # 581 | # 582 | # done! 583 | 584 | #if (verbose) cat("Done. Time taken:", Sys.time() - start_time, "\n") 585 | 586 | diagnosis_df_nrow = nrow(diagnosis_df) 587 | if (is.null(prefix)) cli::cli_alert_success("Identified date of first diagnosis in {diagnosis_df_nrow} participants.") 588 | if (!is.null(prefix)) cli::cli_alert_success("Identified date of first {prefix} diagnosis in {diagnosis_df_nrow} participants.") 589 | 590 | # Return data frame 591 | diagnosis_df 592 | 593 | } 594 | 595 | 596 | 597 | #' Add binary variables and censoring date 598 | #' 599 | #' @description Ever and prevalent binary vars. Censoring date. Only to the combined _df variable 600 | #' 601 | #' @return NA 602 | #' 603 | #' @author Luke Pilling 604 | #' 605 | #' @name get_df1_add_bin 606 | #' 607 | #' @noRd 608 | get_df1_add_bin = function( 609 | df, 610 | bd, 611 | cd, 612 | prefix = NULL, 613 | verbose = FALSE 614 | ) { 615 | 616 | if (verbose) cli::cli_alert("Creating binary \"ever diagnosed\" field - adding censoring date {cd} to date first `_df` field") 617 | 618 | # if no prefix them colnames are just `df` etc. - if one provided then include an underscore 619 | if (is.null(prefix)) { 620 | prefix = "" 621 | } else { 622 | prefix = stringr::str_c(prefix, "_") 623 | } 624 | 625 | # define new variable names 626 | var_df = rlang::sym(stringr::str_c(prefix, "df")) 627 | var_bin = rlang::sym(stringr::str_c(prefix, "bin")) 628 | var_bin_prev = rlang::sym(stringr::str_c(prefix, "bin_prev")) 629 | 630 | # merge df and baseline data 631 | df = dplyr::full_join(df, bd, by="eid") 632 | 633 | # create binary "ever" 634 | df = df |> dplyr::mutate(!!var_bin := dplyr::if_else(!is.na(!!var_df), 1, 0)) 635 | 636 | # create prevalent variables 637 | df = df |> dplyr::mutate(!!var_bin_prev := dplyr::if_else(!!var_bin==1 & !!var_df dplyr::select(!assessment_date_0) 641 | 642 | # relocate src to end 643 | df = df |> dplyr::relocate(!!rlang::sym(stringr::str_c(prefix, "src")), .after = dplyr::last_col()) 644 | 645 | # add censoring date 646 | if (!is.na(cd)) df = df |> dplyr::mutate(!!var_df := dplyr::if_else(!!var_bin==0, cd, !!var_df)) 647 | 648 | # return 649 | return(df) 650 | } 651 | 652 | 653 | #' Get date first for cancer registry data 654 | #' 655 | #' @return NA 656 | #' 657 | #' @author Luke Pilling 658 | #' 659 | #' @name get_cancer_registry_df 660 | #' 661 | #' @noRd 662 | 663 | get_cancer_registry_df <- function( 664 | codes_df, 665 | ukb_dat, 666 | verbose = FALSE 667 | ) { 668 | 669 | start_time <- Sys.time() 670 | 671 | if (verbose) cat("Getting cancer registry data\n") 672 | 673 | # format codes 674 | vocab_col = "vocab_id" 675 | codes_col = "code" 676 | 677 | codes <- codes_df |> 678 | dplyr::filter(!!rlang::sym(vocab_col) == "ICD10") |> 679 | dplyr::select(!!rlang::sym(codes_col)) |> 680 | dplyr::pull() |> 681 | unique() |> 682 | stringr::str_remove(stringr::fixed(".")) |> 683 | stringr::str_sub(1, 5) 684 | codes_string = stringr::str_flatten(codes, collapse = "|") 685 | 686 | # create empty vars in ukb_dat to modify 687 | ukb_dat$canreg <- 0 688 | ukb_dat$canreg_df <- NA 689 | 690 | # for this instance, check if participant self-reported this code and record which array 691 | 692 | # Number of diagnosis columns 693 | n_i <- length(unique(ukb_dat$instance)) 694 | 695 | # Iterate through each diagnosis column 696 | for (i in 0:(n_i-1)) { 697 | 698 | if (verbose) cat("Get cancer registry data from instance ", i, "\n") 699 | 700 | # Update where the code matches 701 | ukb_dat <- ukb_dat |> dplyr::mutate( 702 | canreg_df = dplyr::if_else(canreg == 0 & stringr::str_detect(icd10, codes_string), date, canreg_df, canreg_df), 703 | canreg = dplyr::if_else(canreg == 0 & stringr::str_detect(icd10, codes_string), 1, canreg, canreg) 704 | ) 705 | } 706 | 707 | # keep date first for each participant 708 | ukb_dat <- ukb_dat |> 709 | dplyr::filter(canreg==1) |> 710 | dplyr::filter(!is.na(canreg_df)) |> 711 | dplyr::group_by(eid) |> 712 | dplyr::summarize(canreg_df=min(canreg_df, na.rm=TRUE)) |> 713 | dplyr::mutate(canreg_df = dplyr::if_else(is.finite(canreg_df), canreg_df, NA)) 714 | 715 | # add binary variable back in 716 | ukb_dat$canreg <- 1 717 | 718 | # finish 719 | if (verbose) cli::cli_alert_info(c("Finished cancer registry: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}.")) 720 | 721 | # Return data 722 | return(ukb_dat[,c("eid", "canreg", "canreg_df")]) 723 | 724 | } 725 | 726 | 727 | #' Get date first for self-reported illness data 728 | #' 729 | #' @return NA 730 | #' 731 | #' @author Luke Pilling 732 | #' 733 | #' @name get_selfrep_illness_df 734 | #' 735 | #' @noRd 736 | get_selfrep_illness_df <- function( 737 | codes_df, 738 | ukb_dat, 739 | verbose = FALSE 740 | ) { 741 | 742 | start_time <- Sys.time() 743 | 744 | if (verbose) cat("Getting self-reported illness data\n") 745 | 746 | # format codes 747 | vocab_col = "vocab_id" 748 | codes_col = "code" 749 | 750 | # get codes 751 | codes_cancer <- codes_noncancer <- NULL 752 | if (any(codes_df$vocab_id == "ukb_cancer")) codes_cancer = codes_df$code[ codes_df$vocab_id == "ukb_cancer" ] 753 | if (any(codes_df$vocab_id == "ukb_noncancer")) codes_noncancer = codes_df$code[ codes_df$vocab_id == "ukb_noncancer" ] 754 | 755 | # split instance and array 756 | ukb_dat <- ukb_dat |> 757 | tidyr::separate_wider_delim(instance, delim = "_", names = c("instance", "array")) |> 758 | dplyr::mutate( 759 | instance = stringr::str_replace_all(instance, "i", ""), 760 | array = stringr::str_replace_all(array, "a", "") 761 | ) 762 | 763 | # create empty vars in ukb_dat to modify 764 | ukb_dat$selfrep <- 0 765 | ukb_dat$selfrep_df <- NA 766 | ukb_dat$selfrep_i <- NA 767 | 768 | # for each instance, check if participant self-reported this code and record which array 769 | 770 | # Update where the code matches 771 | if (!is.null(codes_cancer)) { 772 | ukb_dat <- ukb_dat |> dplyr::mutate( 773 | selfrep_i = dplyr::if_else(selfrep == 0 & cancer_code %in% codes_cancer, stringr::str_c(instance, "_cancer"), selfrep_i, selfrep_i), 774 | selfrep_df = dplyr::if_else(selfrep == 0 & cancer_code %in% codes_cancer, cancer_year, selfrep_df, selfrep_df), 775 | selfrep = dplyr::if_else(selfrep == 0 & cancer_code %in% codes_cancer, 1, selfrep, selfrep) 776 | ) 777 | } 778 | if (!is.null(codes_noncancer)) { 779 | ukb_dat <- ukb_dat |> dplyr::mutate( 780 | selfrep_i = dplyr::if_else(selfrep == 0 & noncancer_code %in% codes_noncancer, stringr::str_c(instance, "_noncancer"), selfrep_i, selfrep_i), 781 | selfrep_df = dplyr::if_else(selfrep == 0 & noncancer_code %in% codes_noncancer, noncancer_year, selfrep_df, selfrep_df), 782 | selfrep = dplyr::if_else(selfrep == 0 & noncancer_code %in% codes_noncancer, 1, selfrep, selfrep) 783 | ) 784 | } 785 | 786 | # determine earliest date 787 | # remove if year is < 1900 788 | ukb_dat <- ukb_dat |> 789 | dplyr::filter(!is.na(selfrep_df) & selfrep_df >= 1936) |> 790 | dplyr::group_by(eid) |> 791 | dplyr::slice(which.min(selfrep_df)) |> 792 | dplyr::ungroup() 793 | 794 | # make sure date is actually a date and not year 795 | if (! lubridate::is.Date(ukb_dat$selfrep_df)) ukb_dat <- ukb_dat |> dplyr::mutate(selfrep_df = lubridate::as_date(lubridate::date_decimal(selfrep_df))) 796 | 797 | # finish 798 | if (verbose) cli::cli_alert_info(c("Finished self-reported illness: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}.")) 799 | 800 | # Return data 801 | return(ukb_dat[,c("eid", "selfrep", "selfrep_df", "selfrep_i")]) 802 | 803 | } 804 | 805 | 806 | 807 | 808 | --------------------------------------------------------------------------------