├── vignettes
├── .gitignore
├── spark_functions.Rmd
├── label_fields.Rmd
└── ascertain_diagnoses.Rmd
├── .Rbuildignore
├── R
├── sysdata.rda
├── get_emr_spark.R
├── upload_to_rap.R
├── get_cancer_registry.R
├── get_selfrep_illness.R
├── get_rap_phenos.R
├── fields_to_phenos.R
├── label_ukb_field.R
├── get_selfrep_illness_spark.R
├── export_tables.R
├── get_diagnoses.R
└── get_df.R
├── man
├── figures
│ └── ukbrapR.png
├── load_bed.Rd
├── download_from_rap.Rd
├── upload_to_rap.Rd
├── get_emr_spark.Rd
├── make_imputed_bed.Rd
├── make_dragen_bed.Rd
├── label_ukb_field.Rd
├── fields_to_phenos.Rd
├── label_ukb_fields.Rd
├── get_rap_phenos.Rd
├── extract_variants.Rd
├── create_pgs.Rd
├── get_diagnoses.Rd
├── get_selfrep_illness_spark.Rd
├── export_tables.Rd
└── get_df.Rd
├── inst
└── files
│ ├── dragen_pvcf_coordinates.csv.gz
│ ├── pgs_liver_cirrhosis.txt
│ └── pgs_liver_cirrhosis.README
├── NAMESPACE
├── .gitignore
├── _pkgdown.yml
├── DESCRIPTION
├── data-raw
└── internal_files.R
├── README.md
└── NEWS.md
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 |
--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^_pkgdown\.yml$
2 | ^docs$
3 | ^pkgdown$
4 |
--------------------------------------------------------------------------------
/R/sysdata.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lcpilling/ukbrapR/HEAD/R/sysdata.rda
--------------------------------------------------------------------------------
/man/figures/ukbrapR.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lcpilling/ukbrapR/HEAD/man/figures/ukbrapR.png
--------------------------------------------------------------------------------
/inst/files/dragen_pvcf_coordinates.csv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lcpilling/ukbrapR/HEAD/inst/files/dragen_pvcf_coordinates.csv.gz
--------------------------------------------------------------------------------
/inst/files/pgs_liver_cirrhosis.txt:
--------------------------------------------------------------------------------
1 | rsID CHR POS effect_allele other_allele effect_weight locus_name
2 | rs2642438 1 220796686 A G -0.177 MARC1
3 | rs11925835 3 56831417 T C -0.235 ARHGEF3
4 | rs72613567 4 87310241 TA T -0.166 HSD17B13
5 | rs2954038 8 125495147 C A 0.16 TRIB1
6 | rs11065384 12 120985482 T C 0.275 HNF1A
7 | rs28929474 14 94378610 T C 0.561 SERPINA1
8 | rs10401969 19 19296909 C T 0.678 SUGP1
9 | rs15052 19 41307470 C T 0.222 HNRNPUL1
10 | rs738408 22 43928850 T C 0.734 PNPLA3
11 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | export(create_pgs)
4 | export(download_from_rap)
5 | export(export_tables)
6 | export(extract_variants)
7 | export(fields_to_phenos)
8 | export(get_df)
9 | export(get_diagnoses)
10 | export(get_emr)
11 | export(get_emr_spark)
12 | export(get_rap_phenos)
13 | export(get_selfrep_illness_spark)
14 | export(label_ukb_field)
15 | export(label_ukb_fields)
16 | export(load_bed)
17 | export(make_dragen_bed)
18 | export(make_imputed_bed)
19 | export(upload_to_rap)
20 |
--------------------------------------------------------------------------------
/inst/files/pgs_liver_cirrhosis.README:
--------------------------------------------------------------------------------
1 | ###PGS CATALOG SCORING FILE - see https://www.pgscatalog.org/downloads/#dl_ftp_scoring for additional information
2 | #format_version=2.0
3 | ##POLYGENIC SCORE (PGS) INFORMATION
4 | #pgs_id=PGS000776
5 | #pgs_name=GRS9_Cirr
6 | #trait_reported=Cirrhosis
7 | #trait_mapped=cirrhosis of liver
8 | #trait_efo=EFO_0001422
9 | #genome_build=NR
10 | #variants_number=9
11 | #weight_type=beta
12 | ##SOURCE INFORMATION
13 | #pgp_id=PGP000180
14 | #citation=Innes H et al. Gastroenterology (2020). doi:10.1053/j.gastro.2020.06.014
15 | ##HARMONIZATION DETAILS
16 | #HmPOS_build=GRCh38
17 | #HmPOS_date=2022-07-29
18 | #HmPOS_match_chr={"True": null, "False": null}
19 | #HmPOS_match_pos={"True": null, "False": null}
--------------------------------------------------------------------------------
/man/load_bed.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/extract_variants.R
3 | \name{load_bed}
4 | \alias{load_bed}
5 | \title{Load BED file into memory}
6 | \usage{
7 | load_bed(in_bed, verbose = FALSE, very_verbose = FALSE)
8 | }
9 | \arguments{
10 | \item{in_bed}{A string. BED prefix}
11 |
12 | \item{verbose}{Logical. Be verbose (show individual steps),
13 | \code{default=FALSE}}
14 |
15 | \item{very_verbose}{Logical. Be very verbose (show individual steps & show terminal output from Plink etc),
16 | \code{default=FALSE}}
17 | }
18 | \value{
19 | A data frame
20 | }
21 | \description{
22 | Use Plink to convert BED to RAW then easily load it
23 | }
24 | \examples{
25 |
26 | liver_variants <- load_bed(in_bed="liver_cirrhosis.imputed.variants")
27 |
28 | }
29 | \author{
30 | Luke Pilling
31 | }
32 |
--------------------------------------------------------------------------------
/man/download_from_rap.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/upload_to_rap.R
3 | \name{download_from_rap}
4 | \alias{download_from_rap}
5 | \title{Use R to download a file to the UK Biobank RAP}
6 | \usage{
7 | download_from_rap(file, verbose = FALSE)
8 | }
9 | \arguments{
10 | \item{file}{A string. Filename of the file to be downloaded (character)}
11 |
12 | \item{verbose}{Logical. Be verbose,
13 | \code{default=FALSE}}
14 | }
15 | \value{
16 | NA
17 | }
18 | \description{
19 | Use R to download a file to the UK Biobank RAP (really just a wrapper for `dx download`)
20 | }
21 | \examples{
22 |
23 | readr::write_tsv(data.frame(x=1:10,y=11:20), "ukbrap.dummy.20231114.txt.gz")
24 |
25 | # download file to RAP storage
26 | download_from_rap(file="ukbrap.dummy.20231114.txt.gz", dir="extracts/")
27 |
28 | }
29 | \author{
30 | Luke Pilling
31 | }
32 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # History files
2 | .Rhistory
3 | .Rapp.history
4 |
5 | # Session Data files
6 | .RData
7 | .RDataTmp
8 |
9 | # User-specific files
10 | .Ruserdata
11 |
12 | # Example code in package build process
13 | *-Ex.R
14 |
15 | # Output files from R CMD build
16 | /*.tar.gz
17 |
18 | # Output files from R CMD check
19 | /*.Rcheck/
20 |
21 | # RStudio files
22 | .Rproj.user/
23 |
24 | # produced vignettes
25 | vignettes/*.html
26 | vignettes/*.pdf
27 |
28 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
29 | .httr-oauth
30 |
31 | # knitr and R markdown default cache directories
32 | *_cache/
33 | /cache/
34 |
35 | # Temporary files created by R markdown
36 | *.utf8.md
37 | *.knit.md
38 |
39 | # R Environment Variables
40 | .Renviron
41 |
42 | # pkgdown site
43 | docs/
44 |
45 | # translation temp files
46 | po/*~
47 |
48 | # RStudio Connect folder
49 | rsconnect/
50 | docs
51 | inst/doc
52 |
--------------------------------------------------------------------------------
/man/upload_to_rap.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/upload_to_rap.R
3 | \name{upload_to_rap}
4 | \alias{upload_to_rap}
5 | \title{Use R to upload a file to the UK Biobank RAP}
6 | \usage{
7 | upload_to_rap(file, dir = "FALSE", verbose = FALSE)
8 | }
9 | \arguments{
10 | \item{file}{A string. Filename of the file to be uploaded (character)}
11 |
12 | \item{dir}{A string. Target directory in the RAP space. If blank, the current working directory (character)}
13 |
14 | \item{verbose}{Logical. Be verbose,
15 | \code{default=FALSE}}
16 | }
17 | \value{
18 | NA
19 | }
20 | \description{
21 | Use R to upload a file to the UK Biobank RAP (really just a wrapper for `dx upload`)
22 | }
23 | \examples{
24 |
25 | readr::write_tsv(data.frame(x=1:10,y=11:20), "ukbrap.dummy.20231114.txt.gz")
26 |
27 | # upload file to RAP storage
28 | upload_to_rap(file="ukbrap.dummy.20231114.txt.gz", dir="extracts/")
29 |
30 | }
31 | \author{
32 | Luke Pilling
33 | }
34 |
--------------------------------------------------------------------------------
/man/get_emr_spark.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/get_emr_spark.R
3 | \name{get_emr_spark}
4 | \alias{get_emr_spark}
5 | \title{Get UK Biobank participant Electronic Medical Records (EMR) data in a RAP Spark environment}
6 | \usage{
7 | get_emr_spark(
8 | codes_df = NULL,
9 | spark_master = "spark://master:41000",
10 | verbose = FALSE
11 | )
12 | }
13 | \arguments{
14 | \item{codes_df}{A data frame. Contains two columns: `code` and `vocab_id` i.e., a list of diagnostic codes, and an indicator of the vocabulary. Other columns are ignored.}
15 |
16 | \item{spark_master}{A string. The `master` argmuent passed to `sparklyr::spark_connect()`.
17 | \code{default='spark://master:41000'}}
18 |
19 | \item{verbose}{Logical. Be verbose,
20 | \code{default=FALSE}}
21 | }
22 | \value{
23 | Returns nothing
24 | }
25 | \description{
26 | This function is completely removed. Better to use `get_diagnoses()`. Use a historic release of this package if you really need it https://github.com/lcpilling/ukbrapR/releases
27 | }
28 | \author{
29 | Luke Pilling
30 | }
31 |
--------------------------------------------------------------------------------
/R/get_emr_spark.R:
--------------------------------------------------------------------------------
1 | #' Get UK Biobank participant Electronic Medical Records (EMR) data in a RAP Spark environment
2 | #'
3 | #' @description
4 | #'
5 | #' This function is completely removed. Better to use `get_diagnoses()`. Use a historic release of this package if you really need it https://github.com/lcpilling/ukbrapR/releases
6 | #'
7 | #' @return Returns nothing
8 | #'
9 | #' @author Luke Pilling
10 | #'
11 | #' @name get_emr_spark
12 | #'
13 | #' @param codes_df A data frame. Contains two columns: `code` and `vocab_id` i.e., a list of diagnostic codes, and an indicator of the vocabulary. Other columns are ignored.
14 | #' @param spark_master A string. The `master` argmuent passed to `sparklyr::spark_connect()`.
15 | #' \code{default='spark://master:41000'}
16 | #' @param verbose Logical. Be verbose,
17 | #' \code{default=FALSE}
18 | #'
19 | #' @export
20 | #'
21 | get_emr_spark <- function(
22 | codes_df=NULL,
23 | spark_master = "spark://master:41000",
24 | verbose=FALSE
25 | ) {
26 |
27 | lifecycle::deprecate_warn("0.3.0", "get_emr_spark()", "get_diagnoses()", details="This function is now completely removed. Use a historic release of this package if you really need it https://github.com/lcpilling/ukbrapR/releases")
28 |
29 | }
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
1 | url: https://lcpilling.github.io/ukbrapR/
2 | template:
3 | bootstrap: 5
4 | light-switch: true
5 |
6 | authors:
7 | Luke Pilling:
8 | href: "https://lcpilling.github.io"
9 |
10 | reference:
11 | - title: Export required data
12 | - contents:
13 | - export_tables
14 | - title: Extract genetic variants
15 | - contents:
16 | - extract_variants
17 | - make_imputed_bed
18 | - make_dragen_bed
19 | - load_bed
20 | - title: Create polygenic score
21 | - contents:
22 | - create_pgs
23 | - title: Ascertain diagnoses and determine date first diagnosed
24 | - contents:
25 | - get_diagnoses
26 | - get_df
27 | - title: Working with UK Biobank data fields
28 | - contents:
29 | - fields_to_phenos
30 | - label_ukb_fields
31 | - label_ukb_field
32 | - title: Upload/download from RAP/worker storage
33 | - contents:
34 | - upload_to_rap
35 | - download_from_rap
36 | - title: Spark
37 | des: These functions do not require intermediate files but must be executed in a JupyterLab Spark environment.
38 | - contents:
39 | - get_rap_phenos
40 | - get_selfrep_illness_spark
41 | - get_emr_spark
42 |
43 | articles:
44 | - title: Get started
45 | navbar: ~
46 | contents:
47 | - ascertain_diagnoses
48 | - label_fields
49 | - spark_functions
50 |
51 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: ukbrapR
2 | Title: R functions to use in the UK Biobank Research Analysis Platform (RAP)
3 | Version: 0.3.9
4 | Authors@R: c(person("Luke", "Pilling",
5 | email = "L.Pilling@exeter.ac.uk",
6 | role = c("aut", "cre"),
7 | comment = c(ORCID = "0000-0002-3332-8454")))
8 | Description: R functions to use in the UK Biobank Research Analysis Platform (RAP). The aim is to make working in the RAP quicker, easier, and more reproducible. The package is designed to work in a "normal" cluster using RStudio, and raw UK Biobank data from the table-exporter (rather than on a Spark cluster).
9 | License: GPL-3
10 | URL: https://lcpilling.github.io/ukbrapR, https://github.com/lcpilling/ukbrapR
11 | Depends:
12 | R (>= 4.1.1)
13 | Imports:
14 | dplyr (>= 1.1.0),
15 | readr (>= 2.0.0),
16 | stringr (>= 1.5.0),
17 | purrr (>= 1.0.0),
18 | lubridate (>= 1.9.0),
19 | tidyr (>= 1.3.0),
20 | rlang (>= 1.0.0),
21 | cli (>= 3.6.1),
22 | prettyunits (>= 1.0.0),
23 | lifecycle (>= 1.0.0),
24 | haven (>= 2.5.0)
25 | Encoding: UTF-8
26 | LazyData: true
27 | RoxygenNote: 7.3.2
28 | BugReports: https://github.com/lcpilling/ukbrapR/issues
29 | Suggests:
30 | knitr,
31 | rmarkdown
32 | VignetteBuilder: knitr
33 |
--------------------------------------------------------------------------------
/man/make_imputed_bed.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/extract_variants.R
3 | \name{make_imputed_bed}
4 | \alias{make_imputed_bed}
5 | \title{Extract variants from imputed genotype file(s) into single BED file}
6 | \usage{
7 | make_imputed_bed(
8 | in_file,
9 | out_bed,
10 | progress = TRUE,
11 | verbose = FALSE,
12 | very_verbose = FALSE
13 | )
14 | }
15 | \arguments{
16 | \item{in_file}{A data frame or file path. Contains at least two columns: `rsID` and `CHR`. Other columns are ignored.}
17 |
18 | \item{out_bed}{A string.}
19 |
20 | \item{progress}{Logical. Show progress through each individual file,
21 | \code{default=TRUE}}
22 |
23 | \item{verbose}{Logical. Be verbose (show individual steps),
24 | \code{default=FALSE}}
25 |
26 | \item{very_verbose}{Logical. Be very verbose (show individual steps & show terminal output from Plink etc),
27 | \code{default=FALSE}}
28 | }
29 | \value{
30 | A single merged BED file (and BIM and FAM files)
31 | }
32 | \description{
33 | For a given set of genomic coordinates extract from the UK Biobank imputed genotypes (BGEN files, field 22828) into a single BED file.
34 | }
35 | \examples{
36 |
37 | make_imputed_bed(in_file=system.file("files", "pgs_liver_cirrhosis.txt", package="ukbrapR"), out_bed="liver_cirrhosis.imputed.variants")
38 |
39 | }
40 | \author{
41 | Luke Pilling
42 | }
43 |
--------------------------------------------------------------------------------
/man/make_dragen_bed.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/extract_variants.R
3 | \name{make_dragen_bed}
4 | \alias{make_dragen_bed}
5 | \title{Extract variants from DRAGEN BGEN file(s) into single BED file}
6 | \usage{
7 | make_dragen_bed(
8 | in_file,
9 | out_bed,
10 | progress = TRUE,
11 | verbose = FALSE,
12 | very_verbose = FALSE
13 | )
14 | }
15 | \arguments{
16 | \item{in_file}{A data frame or file path. Contains at least two columns: `chr` and `pos` (in build 38). Other columns are ignored.}
17 |
18 | \item{out_bed}{A string.}
19 |
20 | \item{progress}{Logical. Show progress through each individual file,
21 | \code{default=TRUE}}
22 |
23 | \item{verbose}{Logical. Be verbose (show individual steps),
24 | \code{default=FALSE}}
25 |
26 | \item{very_verbose}{Logical. Be very verbose (show individual steps & show terminal output from Plink etc),
27 | \code{default=FALSE}}
28 | }
29 | \value{
30 | A single merged BED file (and BIM and FAM files)
31 | }
32 | \description{
33 | For a given set of genomic coordinates extract the UK Biobank WGS DRAGEN variant calls (from the BGEN format, field 24309) into a single BED file.
34 |
35 | This assumes your project has access to the WGS BGEN files released April 2025. If not, run `ukbrapR:::make_dragen_bed_from_pvcfs()` to use [tabix] and [plink] to subset the [DRAGEN WGS pVCF files].
36 | }
37 | \examples{
38 |
39 | make_dragen_bed(in_file=system.file("files", "pgs_liver_cirrhosis.txt", package="ukbrapR"), out_bed="liver_cirrhosis.dragen.variants")
40 |
41 | }
42 | \author{
43 | Luke Pilling
44 | }
45 |
--------------------------------------------------------------------------------
/man/label_ukb_field.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/label_ukb_field.R
3 | \name{label_ukb_field}
4 | \alias{label_ukb_field}
5 | \title{Update UK Biobank field with `title` and `label` from the schema}
6 | \usage{
7 | label_ukb_field(d, field, field_id = NULL, verbose = FALSE)
8 | }
9 | \arguments{
10 | \item{d}{A data frame. The data frame containing the `field` to update.}
11 |
12 | \item{field}{A string. The field (e.g., `p54_i0`) in the provided data frame to update.}
13 |
14 | \item{field_id}{A string. If the field has been renamed (to e.g.,"assessment_centre") provide the field id here (e.g., "54").
15 | \code{default=NULL}}
16 |
17 | \item{verbose}{Logical. Be verbose,
18 | \code{default=FALSE}}
19 | }
20 | \value{
21 | Returns a data frame.
22 | }
23 | \description{
24 | Variables such as education and ethnicity are provided as integers but have specific codes.
25 |
26 | The UK Biobank schema are machine-readable dictionaries and mappings defining the internal structure of the online Showcase. https://biobank.ctsu.ox.ac.uk/crystal/schema.cgi
27 |
28 | This function updates a field in a data frame of UK Biobank with information from the Schema.
29 | }
30 | \examples{
31 |
32 | # update the Assessment Centre variable
33 | ukb <- ukbrapR::label_ukb_field(ukb, field="p54_i0")
34 |
35 | table(ukb$p54_i0) # tabulates the values
36 | table(haven::as_factor(ukb$p54_i0)) # tabulates the labels
37 | haven::print_labels(ukb$p54_i0) # show the value:label mapping for this variable
38 |
39 | # if the variable has been renamed, provide the "field" and "field_id" e.g.,
40 | ukb <- ukbrapR::label_ukb_field(ukb, field="assessment_centre", field_id="54")
41 |
42 | }
43 | \author{
44 | Luke Pilling
45 | }
46 |
--------------------------------------------------------------------------------
/man/fields_to_phenos.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/fields_to_phenos.R
3 | \name{fields_to_phenos}
4 | \alias{fields_to_phenos}
5 | \title{Check UK Biobank field IDs}
6 | \usage{
7 | fields_to_phenos(fields, filename = "", abort = TRUE, verbose = FALSE)
8 | }
9 | \arguments{
10 | \item{fields}{A vector of character strings. The field IDs to check if valid.}
11 |
12 | \item{filename}{A string. If provided, will save as a fieldname file ready for the table-exporter (including "eid").
13 | \code{default=""}}
14 |
15 | \item{abort}{Logical. Abort if a field is missing?,
16 | \code{default=TRUE}}
17 |
18 | \item{verbose}{Logical. Be verbose,
19 | \code{default=FALSE}}
20 | }
21 | \value{
22 | Returns a vector of strings (valid phenotypes).
23 | }
24 | \description{
25 | Check if provided field IDs are valid and return all possible phenotype names in the UK Biobank RAP
26 | }
27 | \examples{
28 |
29 | # not instanced, not arrayed
30 | fields_to_phenos("31") # sex
31 |
32 | # instanced, not arrayed
33 | fields_to_phenos("53") # assessment date
34 |
35 | # instanced and arrayed
36 | fields_to_phenos("93") # systolic blood pressure, manual reading
37 |
38 | # instanced and arrayed, MRI assessments only
39 | fields_to_phenos("12673") # Heart rate recorded during vicorder run (Heart MRI)
40 |
41 | # check multiple simultaneously
42 | fields_to_phenos(c("31","93"))
43 |
44 | # only warn if an invalid field is provided (default is to abort)
45 | fields_to_phenos(c("31","notafield","93"))
46 | fields_to_phenos(c("31","notafield","93"), abort=FALSE)
47 |
48 | # save as fieldname file for the table-exporter (don't forget to upload to the RAP)
49 | fields_to_phenos(c("31","93"), filename="fieldnames.txt")
50 |
51 | }
52 | \author{
53 | Luke Pilling
54 | }
55 |
--------------------------------------------------------------------------------
/man/label_ukb_fields.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/label_ukb_field.R
3 | \name{label_ukb_fields}
4 | \alias{label_ukb_fields}
5 | \title{Update a data frame containing UK Biobank fields with `title` and `label` from the schema}
6 | \usage{
7 | label_ukb_fields(d, verbose = FALSE)
8 | }
9 | \arguments{
10 | \item{d}{A data frame. The data frame containing UK Biobank fields to update.}
11 |
12 | \item{verbose}{Logical. Be verbose,
13 | \code{default=FALSE}}
14 | }
15 | \value{
16 | Returns a data frame.
17 | }
18 | \description{
19 | Variables such as education and ethnicity are provided as integers but have specific codes.
20 |
21 | The UK Biobank schema are machine-readable dictionaries and mappings defining the internal structure of the online Showcase. https://biobank.ctsu.ox.ac.uk/crystal/schema.cgi
22 |
23 | This function updates a data frame of UK Biobank data field with information from the Schema.
24 |
25 | It is in effect a wrapper to apply ukbrapR::label_ukb_field() to each variable in a data frame that looks like a UK Biobank field.
26 |
27 | Only recognised fields are modified (variables named things like "p54_i0"). Other variables are ignored.
28 | }
29 | \examples{
30 |
31 | # say the below data frame contains 4 variables: `eid`, `p54_i0`, `p50_i0` and `age_at_assessment`
32 | names(ukb)
33 |
34 | # update the variables that looks like UK Biobank fields with titles and, where cateogrical, labels
35 | # i.e., `p54_i0` and `p50_i0` only -- `eid` and `age_at_assessment` are ignored
36 | ukb <- ukbrapR::label_ukb_fields(ukb)
37 |
38 | table(ukb$p54_i0) # tabulates the values
39 | table(haven::as_factor(ukb$p54_i0)) # tabulates the labels
40 | haven::print_labels(ukb$p54_i0) # show the value:label mapping for this variable
41 |
42 | }
43 | \author{
44 | Luke Pilling
45 | }
46 |
--------------------------------------------------------------------------------
/man/get_rap_phenos.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/get_rap_phenos.R
3 | \name{get_rap_phenos}
4 | \alias{get_rap_phenos}
5 | \title{Get UK Biobank participant phenotype data}
6 | \usage{
7 | get_rap_phenos(
8 | names,
9 | value_coding = "replace",
10 | names_are_titles = FALSE,
11 | record = NULL,
12 | verbose = FALSE
13 | )
14 | }
15 | \arguments{
16 | \item{names}{A string or vector of strings. The variable name(s) required. e.g., c("eid","p31","p21003_i0")}
17 |
18 | \item{value_coding}{A string. How to handle coded fields. "replace" if a coding value exists, replace the raw value with the code; "raw" export the raw values of the field; "exclude" if a coding value exists, do not export the value (most commonly used with sparse fields).
19 | \code{default="replace"}}
20 |
21 | \item{names_are_titles}{Logical. Passing DNAnexus variable "titles" e.g., c("Age at recruitment", "Standing height | Instance 0").
22 | \code{default=FALSE}}
23 |
24 | \item{record}{A string. The `dnanexus_link` file descriptor of the .dataset to use. Default (if left as NULL) is to use the most recently dispensed dataset.
25 | \code{default=most recent dataset}}
26 |
27 | \item{verbose}{Logical. Be verbose,
28 | \code{default=FALSE}}
29 | }
30 | \value{
31 | Returns a data.frame (the participant data for the requested variables)
32 | }
33 | \description{
34 | Using a Spark node/cluster on the UK Biobank Research Analysis Platform (DNAnexus), use R to extract a provided set of variables. Using code from the UK Biobank DNAnexus team https://github.com/UK-Biobank/UKB-RAP-Notebooks/blob/main/NBs_Prelim/105_export_participant_data_to_r.ipynb
35 | }
36 | \examples{
37 | # get phenotype data
38 | ukb <- get_rap_phenos(c("eid","p31","p21003_i0","p53_i0"))
39 |
40 | # save to file on the RAP worker node
41 | readr::write_tsv(ukb, "ukbrap.phenos.20231114.txt.gz")
42 |
43 | # upload data to RAP storage
44 | upload_to_rap(file="ukbrap.phenos.20231114.txt.gz", dir="")
45 |
46 | }
47 | \author{
48 | Luke Pilling
49 | }
50 |
--------------------------------------------------------------------------------
/vignettes/spark_functions.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Spark functions"
3 | description: >
4 | Pull phenotype data from Spark environment.
5 | output: rmarkdown::html_vignette
6 | vignette: >
7 | %\VignetteIndexEntry{Spark functions}
8 | %\VignetteEngine{knitr::rmarkdown}
9 | %\VignetteEncoding{UTF-8}
10 | ---
11 |
12 | ```{r, include = FALSE}
13 | knitr::opts_chunk$set(
14 | collapse = TRUE,
15 | comment = "#>"
16 | )
17 | library(ukbrapR)
18 | ```
19 |
20 |
21 | ## Pull phenotype data from Spark environment to an R data frame
22 |
23 | **Needs to be run in an Apache Spark environment on the UK Biobank DNAnexus RAP.**
24 |
25 | Recommend launching a Spark cluster with at least `mem1_hdd1_v2_x16` and **2 nodes** otherwise this can fail with error "...ensure that workers...have sufficient resources"
26 |
27 | The underlying code is mostly from the [UK Biobank GitHub](https://github.com/UK-Biobank/UKB-RAP-Notebooks/blob/main/NBs_Prelim/105_export_participant_data_to_r.ipynb).
28 |
29 | ```{r, eval=FALSE, echo=TRUE}
30 | # get phenotype data (participant ID, sex, baseline age, and baseline assessment date)
31 | ukb <- get_rap_phenos(c("eid", "p31", "p21003_i0", "p53_i0"))
32 | #> 48.02 sec elapsed
33 |
34 | # summary of data
35 | table(ukb$p31)
36 | #> Female Male
37 | #> 273297 229067
38 | summary(ukb$p21003_i0)
39 | #> Min. 1st Qu. Median Mean 3rd Qu. Max.
40 | #> 37.00 50.00 58.00 56.53 63.00 73.00
41 | ```
42 |
43 | ### No more updates...
44 |
45 | I am moving away from using Spark as the default environment, mostly due to the cost implications; it is significantly cheaper (and quicker!) to store and search exported raw text files in the RAP persistant storage than do everything in a Spark environment (plus the added benefit that the RStudio interface is available in "normal" instances).
46 |
47 | The Spark functions are available as before but all updates are to improve functionality in "normal" instances using RStudio, as we move to the new era of RAP-only UK Biobank analysis.
48 |
49 | If you need to see the previous release documentation follow the tags to the version required: https://github.com/lcpilling/ukbrapR/tree/v0.1.7
50 |
51 |
--------------------------------------------------------------------------------
/man/extract_variants.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/extract_variants.R
3 | \name{extract_variants}
4 | \alias{extract_variants}
5 | \title{Extract variants from bulk data and load to memory}
6 | \usage{
7 | extract_variants(
8 | in_file,
9 | out_bed = "tmp",
10 | source = "imputed",
11 | overwrite = FALSE,
12 | progress = FALSE,
13 | verbose = FALSE,
14 | very_verbose = FALSE
15 | )
16 | }
17 | \arguments{
18 | \item{in_file}{A data frame or file path. Contains rsid, chr, and pos. For imputed genos pos is build 37. For DRAGEN pos is build 38. Other columns are ignored.}
19 |
20 | \item{out_bed}{A string. Prefix for output files (optional)
21 | \code{default="tmp"}}
22 |
23 | \item{source}{A string. Either "imputed" or "dragen" - indicating whether the variants should be from "UKB imputation from genotype" (field 22828) or "DRAGEN population level WGS variants, PLINK format [500k release]" (field 24308)
24 | \code{default="imputed"}}
25 |
26 | \item{overwrite}{Logical. Overwrite output BED files? (If output prefix is left as 'tmp' overwrite is set to TRUE),
27 | \code{default=FALSE}}
28 |
29 | \item{progress}{Logical. Show progress through each individual file,
30 | \code{default=FALSE}}
31 |
32 | \item{verbose}{Logical. Be verbose (show individual steps),
33 | \code{default=FALSE}}
34 |
35 | \item{very_verbose}{Logical. Be very verbose (show individual steps & show terminal output from Plink etc),
36 | \code{default=FALSE}}
37 | }
38 | \value{
39 | A data frame
40 | }
41 | \description{
42 | Use user-provided list of genetic variants to extract from imputed BGEN files (field 22828) or WGS DRAGEN BGEN files (field 24309) data and load as data.frame
43 |
44 | If selecting the DRAGEN data as the source, this assumes your project has access to the WGS BGEN files released April 2025. If not, run `ukbrapR:::make_dragen_bed_from_pvcfs()` to use [tabix] and [plink] to subset the [DRAGEN WGS pVCF files].
45 | }
46 | \examples{
47 |
48 | liver_variants <- extract_variants(in_file=system.file("files", "pgs_liver_cirrhosis.txt", package="ukbrapR"), out_bed="liver_cirrhosis.imputed.variants")
49 |
50 | }
51 | \author{
52 | Luke Pilling
53 | }
54 |
--------------------------------------------------------------------------------
/vignettes/label_fields.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Label fields"
3 | description: >
4 | Assign categorical UK Biobank fields the labels from the showcase schema.
5 | output: rmarkdown::html_vignette
6 | vignette: >
7 | %\VignetteIndexEntry{Label fields}
8 | %\VignetteEngine{knitr::rmarkdown}
9 | %\VignetteEncoding{UTF-8}
10 | ---
11 |
12 | ```{r, include = FALSE}
13 | knitr::opts_chunk$set(
14 | collapse = TRUE,
15 | comment = "#>"
16 | )
17 | library(ukbrapR)
18 | ```
19 |
20 | Categorical fields are exported as integers but are encoded with labels.
21 |
22 | For example [20116 "Smoking status"](https://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=20116):
23 |
24 | | Coding | Meaning |
25 | |--------|----------------------|
26 | | -3 | Prefer not to answer |
27 | | 0 | Never |
28 | | 1 | Previous |
29 | | 2 | Current |
30 |
31 | This package includes two functions to label a single UK Biobank field or a data frame of them using the [UK Biobank encoding schema](https://biobank.ctsu.ox.ac.uk/crystal/schema.cgi). Examples:
32 |
33 | ```{r, eval=FALSE, echo=TRUE}
34 | # update the Smoking status field
35 | ukb <- label_ukb_field(ukb, field="p20116_i0")
36 |
37 | table(ukb$p20116_i0) # tabulates the values
38 | #> -3 0 1 2
39 | #> 2057 273405 172966 52949
40 |
41 | table(haven::as_factor(ukb$p20116_i0)) # tabulates the labels
42 | #> Prefer not to answer Never Previous Current
43 | #> 2057 273405 172966 52949
44 |
45 | haven::print_labels(ukb$p20116_i0) # show the value:label mapping for this variable
46 | #> Labels:
47 | #> value label
48 | #> -3 Prefer not to answer
49 | #> 0 Never
50 | #> 1 Previous
51 | #> 2 Current
52 |
53 | #
54 | # if you have a whole data frame of exported fields, you can use the wrapper function label_ukb_fields()
55 |
56 | # say the `ukb` data frame contains 4 variables: `eid`, `p54_i0`, `p31` and `age_at_assessment`
57 |
58 | # update the variables that looks like UK Biobank fields with titles and, where cateogrical, labels
59 | # i.e., `p54_i0` and `p31` only -- `eid` and `age_at_assessment` are ignored
60 | ukb <- label_ukb_fields(ukb)
61 |
62 | table(ukb$p31) # tabulates the values
63 | #> 0 1
64 | #> 273238 229031
65 |
66 | table(haven::as_factor(ukb$p31)) # tabulates the labels
67 | #> Female Male
68 | #> 273238 229031
69 | ```
70 |
71 |
--------------------------------------------------------------------------------
/man/create_pgs.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/extract_variants.R
3 | \name{create_pgs}
4 | \alias{create_pgs}
5 | \title{Create a polygenic score}
6 | \usage{
7 | create_pgs(
8 | in_file,
9 | out_file = "tmp",
10 | pgs_name = "pgs",
11 | source = "imputed",
12 | is_bed = FALSE,
13 | overwrite = FALSE,
14 | progress = FALSE,
15 | verbose = FALSE,
16 | very_verbose = FALSE
17 | )
18 | }
19 | \arguments{
20 | \item{in_file}{A data frame or file path. Must contain rsid, chr, pos, effect_allele, other_allele, beta. For imputed genos pos is build 37. For DRAGEN pos is build 38. Other columns are ignored.}
21 |
22 | \item{out_file}{A string. Prefix for output files (optional)
23 | \code{default="tmp"}}
24 |
25 | \item{pgs_name}{A string. Variable name for created PGS (optional)
26 | \code{default="pgs"}}
27 |
28 | \item{source}{A string. Either "imputed" or "dragen" - indicating whether the variants should be from "UKB imputation from genotype" (field 22828) or "DRAGEN population level WGS variants, PLINK format [500k release]" (field 24308). Can instead be a path to a local BED file, if `is_bed=TRUE`.
29 | \code{default="imputed"}}
30 |
31 | \item{is_bed}{Logical. If you already have a BED file containing the required variants set this to TRUE and provide a path to the BED file in the `source` option,
32 | \code{default=FALSE}}
33 |
34 | \item{overwrite}{Logical. Overwrite output BED files? (If out_file is left as 'tmp' overwrite is set to TRUE),
35 | \code{default=FALSE}}
36 |
37 | \item{progress}{Logical. Show progress through each individual file,
38 | \code{default=FALSE}}
39 |
40 | \item{verbose}{Logical. Be verbose (show individual steps),
41 | \code{default=FALSE}}
42 |
43 | \item{very_verbose}{Logical. Be very verbose (show individual steps & show terminal output from Plink etc),
44 | \code{default=FALSE}}
45 | }
46 | \value{
47 | A data frame
48 | }
49 | \description{
50 | Use user-provided list of genetic variants with weights for a trait to create a polygenic score. Uses the imputed BGEN files (field 22828) or WGS DRAGEN BGEN files (field 24309) data and load as data.frame
51 |
52 | If selecting the DRAGEN data as the source, this assumes your project has access to the WGS BGEN files released April 2025. If not, run `ukbrapR:::make_dragen_bed_from_pvcfs()` to use [tabix] and [plink] to subset the [DRAGEN WGS pVCF files].
53 | }
54 | \examples{
55 |
56 | liver_pgs <- create_pgs(in_file=system.file("files", "pgs_liver_cirrhosis.txt", package="ukbrapR"), out_file="liver_cirrhosis.imputed.pgs", pgs_name="liver_cirrhosis_pgs")
57 |
58 | }
59 | \author{
60 | Luke Pilling
61 | }
62 |
--------------------------------------------------------------------------------
/R/upload_to_rap.R:
--------------------------------------------------------------------------------
1 | #' Use R to upload a file to the UK Biobank RAP
2 | #'
3 | #' @description Use R to upload a file to the UK Biobank RAP (really just a wrapper for `dx upload`)
4 | #'
5 | #' @return NA
6 | #'
7 | #' @author Luke Pilling
8 | #'
9 | #' @name upload_to_rap
10 | #'
11 | #' @param file A string. Filename of the file to be uploaded (character)
12 | #' @param dir A string. Target directory in the RAP space. If blank, the current working directory (character)
13 | #' @param verbose Logical. Be verbose,
14 | #' \code{default=FALSE}
15 | #'
16 | #' @examples
17 | #'
18 | #' readr::write_tsv(data.frame(x=1:10,y=11:20), "ukbrap.dummy.20231114.txt.gz")
19 | #'
20 | #' # upload file to RAP storage
21 | #' upload_to_rap(file="ukbrap.dummy.20231114.txt.gz", dir="extracts/")
22 | #'
23 | #' @export
24 | #'
25 | upload_to_rap <- function(
26 | file,
27 | dir = "FALSE",
28 | verbose = FALSE
29 | ) {
30 |
31 | if (verbose) cat("Check input & options\n")
32 | if (! is.character(file) ) stop("`file` needs to be a string of the filename to upload")
33 | if (! file.exists(file) ) stop("`file` not found")
34 |
35 | if (dir == "FALSE") {
36 | if (verbose) cat("Uploading to current working directory\n")
37 | system(paste0("dx upload ", file))
38 | } else {
39 | if (verbose) cat(paste0("Uploading to `dir`: ", dir, "\n"))
40 | if (substr(dir, nchar(dir), nchar(dir)) != "/") dir <- paste0(dir, "/")
41 | system(paste0("dx mkdir -p ", dir))
42 | system(paste0("dx upload ", file, " --path ", dir))
43 | }
44 |
45 | }
46 |
47 |
48 | #' Use R to download a file to the UK Biobank RAP
49 | #'
50 | #' @description Use R to download a file to the UK Biobank RAP (really just a wrapper for `dx download`)
51 | #'
52 | #' @return NA
53 | #'
54 | #' @author Luke Pilling
55 | #'
56 | #' @name download_from_rap
57 | #'
58 | #' @param file A string. Filename of the file to be downloaded (character)
59 | #' @param verbose Logical. Be verbose,
60 | #' \code{default=FALSE}
61 | #'
62 | #' @examples
63 | #'
64 | #' readr::write_tsv(data.frame(x=1:10,y=11:20), "ukbrap.dummy.20231114.txt.gz")
65 | #'
66 | #' # download file to RAP storage
67 | #' download_from_rap(file="ukbrap.dummy.20231114.txt.gz", dir="extracts/")
68 | #'
69 | #' @export
70 | #'
71 | download_from_rap <- function(
72 | file,
73 | verbose = FALSE
74 | ) {
75 |
76 | if (verbose) cat("Check input & options\n")
77 | if (! is.character(file) ) stop("`file` needs to be a string of the filename to download")
78 | if (! file.exists(file) ) stop("`file` not found")
79 |
80 | if (verbose) cat("downloading to current working directory\n")
81 | system(paste0("dx download ", file))
82 |
83 | }
84 |
85 |
86 |
--------------------------------------------------------------------------------
/man/get_diagnoses.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/get_diagnoses.R
3 | \name{get_diagnoses}
4 | \alias{get_diagnoses}
5 | \title{Get UK Biobank participant diagnosis data}
6 | \usage{
7 | get_diagnoses(codes_df, file_paths = NULL, verbose = FALSE)
8 | }
9 | \arguments{
10 | \item{codes_df}{A data frame. Contains two columns: `code` and `vocab_id` i.e., a list of diagnostic codes, and an indicator of the vocabulary (ICD10, Read2, CTV3, OPCS3, OPCS4, ukb_cancer, and ukb_noncancer are recognised). Other columns are ignored.}
11 |
12 | \item{file_paths}{A data frame. Columns must be `object` and `path` containing paths to required files. Default assumes you have the tables exported in the RAP environment from
13 | ukbrapR::export_tables()
14 | \code{default=ukbrapR:::ukbrapr_paths}}
15 |
16 | \item{verbose}{Logical. Be verbose,
17 | \code{default=FALSE}}
18 | }
19 | \value{
20 | Returns a list of data frames (the participant data for the requested diagnosis codes: `death_cause`, `hesin_diag`, `hesin_oper`, `gp_clinical`, `cancer_registry` and `selfrep_illness`. Also includes the original codes list)
21 | }
22 | \description{
23 | For a list of diagnostic codes get the HES, GP, cancer registry, operations, and self-reported illness data, matching the provided codes.
24 |
25 | Valid code vocabularies are:
26 |
27 | - ICD10 (for `hesin`, `death_cause` and `cancer_registry` searches) - fuzzy matching
28 |
29 | - ICD9 (for `hesin` searches) - fuzzy matching
30 |
31 | - Read2 / CTV3 (for `gp_clinical`) - exact matches on first 5 characters
32 |
33 | - OPCS3 / OPCS4 (for `hesin_oper`) - fuzzy matching
34 |
35 | - ukb_cancer / ukb_noncancer (for self-reported illness at UK Biobank assessments - all available will be searched) - exact matches
36 |
37 | This function relies on exported raw data files and thus does not need to be run in a Spark cluster. If the files are not in the default locations for the package you will need to specify the `file_paths` to exported tables. Recommend to run `export_tables()` once in your project to export the tables to the default paths for the package.
38 | }
39 | \examples{
40 | # example diagnostic codes for CKD from GEMINI multimorbidity project
41 | codes_df_ckd <- ukbrapR:::codes_df_ckd
42 | head(codes_df_ckd)
43 |
44 | # Get diagnosis data - returns list of data frames (one per source)
45 | # -- Requires exported tables - see `export_tables()`
46 | diagnosis_list <- get_diagnoses(codes_df_ckd)
47 |
48 | # don't forget to save and upload data to RAP persistent storage!
49 | save(diagnosis_list, "ukbrap.CKD.emr.20231114.RDat")
50 | upload_to_rap(file="ukbrap.CKD.*", dir="")
51 |
52 | }
53 | \author{
54 | Luke Pilling
55 | }
56 |
--------------------------------------------------------------------------------
/man/get_selfrep_illness_spark.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/get_selfrep_illness_spark.R
3 | \name{get_selfrep_illness_spark}
4 | \alias{get_selfrep_illness_spark}
5 | \title{Get UK Biobank participant self-reported illness/year data for specific codes}
6 | \usage{
7 | get_selfrep_illness_spark(
8 | codes_df,
9 | vocab_col = "vocab_id",
10 | codes_col = "code",
11 | ukb_dat = NULL,
12 | n_cancer_arrays = 5,
13 | n_noncancer_arrays = 30,
14 | verbose = FALSE
15 | )
16 | }
17 | \arguments{
18 | \item{codes_df}{A data frame. Contains the `vocab_col` and `codes_col` i.e., an indicator of the vocabulary and the diagnostic codes.}
19 |
20 | \item{vocab_col}{A string. Column name in `codes_df` that contains the vocabulary indicator for the code (for self-reported it needs to be either "ukb_cancer" or "ukb_noncancer").
21 | \code{default='vocab_id'}}
22 |
23 | \item{codes_col}{A string. Column name in `codes_df` that contains the self-reported disease code (e.g., 1507).
24 | \code{default='code'}}
25 |
26 | \item{ukb_dat}{A data frame. Optional. If not provided, will get the phenotypes from the RAP. Contains the self-reported illness fields e.g., `p20008_i0_a0`.
27 | \code{default=NULL}}
28 |
29 | \item{n_cancer_arrays}{An integer. It is not trivial to determine the max number of arrays to request from Spark for the self-reported illnesses. The defaults match the currently (Feb 2024) available data but may need increasing in the future.
30 | \code{default=5}}
31 |
32 | \item{n_noncancer_arrays}{An integer. It is not trivial to determine the max number of arrays to request from Spark for the self-reported illnesses. The defaults match the currently (Feb 2024) available data but may need increasing in the future.
33 | \code{default=30}}
34 |
35 | \item{verbose}{Logical. Be verbose,
36 | \code{default=FALSE}}
37 | }
38 | \value{
39 | Returns a data frame with four variables: eid, selfrep [binary, codes identified?], selfrep_df [date of reported illness], selfrep_i [instance the illness was first reported]
40 | }
41 | \description{
42 | For a specific self-reported illness code or codes, identify whether the participant has self-reported at any visit, and identify the year.
43 | Intended for use on the UK Biobank DNnexus Research Analysis Platform, but if the user provides their own dataframe of UK Biobank self-reported fields this works on any system.
44 | }
45 | \examples{
46 | # example diagnostic codes for haemochromatosis
47 | print(codes_df_hh)
48 |
49 | # get self-reported data - a data frame
50 | selfrep_df <- get_selfrep_illness_spark(codes_df_hh)
51 |
52 | # inspect variables
53 | table(selfrep_df$selfrep)
54 | summary(selfrep_df$selfrep_df)
55 | table(selfrep_df$selfrep_i)
56 |
57 | }
58 | \author{
59 | Luke Pilling
60 | }
61 |
--------------------------------------------------------------------------------
/man/export_tables.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/export_tables.R
3 | \name{export_tables}
4 | \alias{export_tables}
5 | \title{Export diagnosis files to RAP persistent storage}
6 | \usage{
7 | export_tables(
8 | submit = FALSE,
9 | ignore_warnings = FALSE,
10 | file_paths = ukbrapR:::ukbrapr_paths,
11 | dataset = NULL,
12 | verbose = FALSE
13 | )
14 | }
15 | \arguments{
16 | \item{submit}{Logical. Actually submit `dx` commands. Default is FALSE i.e., just check inputs & file paths, then print the commands,
17 | \code{default=FALSE}}
18 |
19 | \item{ignore_warnings}{Logical. If an exported table already exists do not submit the table-exporter command unless this is TRUE,
20 | \code{default=FALSE}}
21 |
22 | \item{file_paths}{A data frame. Columns must be `object` and `path` containing paths to outputted files. If blank, will use the default paths,
23 | \code{default=ukbrapR:::ukbrapr_paths}}
24 |
25 | \item{dataset}{A string. If you wish to specify dataset. If blank, will use the most recently dispensed dataset in the main project directory.
26 | \code{default=app#####_#####.dataset}}
27 |
28 | \item{verbose}{Logical. Be verbose,
29 | \code{default=FALSE}}
30 | }
31 | \value{
32 | NA
33 | }
34 | \description{
35 | In the UK Biobank RAP export tables for HES, GP, death, and cancer registry data, plus self-reported illness fields, using the table-exporter. This is essentially a wrapper function to submit jobs to the table exporter.
36 |
37 | Suggest executing in an RStudio session. ~10Gb of text files are created. This will cost ~£0.15 per month to store in the RAP standard storage.
38 | }
39 | \examples{
40 |
41 | # To keep files organised this package assumes the following file structure
42 | # This object is not actually required but illstrates the defaults to be
43 | # created in your RAP space (override by providing a new `file_paths`):
44 | ukbrapr_paths = data.frame(
45 | object=c("death","death_cause","hesin","hesin_diag","hesin_oper","gp_clinical","gp_scripts","selfrep_illness","cancer_registry","baseline_dates"),
46 | path=c(
47 | "ukbrapr_data/death.tsv",
48 | "ukbrapr_data/death_cause.tsv",
49 | "ukbrapr_data/hesin.tsv",
50 | "ukbrapr_data/hesin_diag.tsv",
51 | "ukbrapr_data/hesin_oper.tsv",
52 | "ukbrapr_data/gp_clinical.tsv",
53 | "ukbrapr_data/gp_scripts.tsv",
54 | "ukbrapr_data/selfrep_illness.tsv",
55 | "ukbrapr_data/cancer_registry.tsv",
56 | "ukbrapr_data/baseline_dates.tsv"
57 | )
58 | )
59 | ukbrapr_paths
60 |
61 | # test run to see `dx run table-exporter` commands - but will not submit jobs
62 | export_tables()
63 |
64 | # Submit all `dx run table-exporter` commands. ~10Gb of text files are created. This will cost ~£0.15 per month to store in the RAP standard storage.
65 | export_tables(submit=TRUE)
66 |
67 | }
68 | \author{
69 | Luke Pilling
70 | }
71 |
--------------------------------------------------------------------------------
/R/get_cancer_registry.R:
--------------------------------------------------------------------------------
1 | #' Get cancer registry data for specific codes
2 | #'
3 | #' @author Luke Pilling
4 | #'
5 | #' @name get_cancer_registry
6 | #'
7 | #' @noRd
8 | get_cancer_registry <- function(
9 | codes,
10 | ukb_dat,
11 | verbose = FALSE
12 | ) {
13 |
14 | start_time <- Sys.time()
15 |
16 | # Check input
17 | if (verbose) cli::cli_alert_info("Searching cancer registry data for {length(unique(codes))} ICD10 codes")
18 |
19 | # remove rows where participant has no cancer data
20 | ukb_dat = ukb_dat |> dplyr::filter(
21 | dplyr::if_any(
22 | dplyr::starts_with("p"),
23 | ~!is.na(.)
24 | )
25 | )
26 |
27 | # check all visits for participant - create `canreg` (binary, ever), `canreg_df` (date first) and `canreg_i` (the "instance" i.e., visit)
28 | # https://biobank.ctsu.ox.ac.uk/crystal/label.cgi?id=100092
29 | # date vars = 40005
30 | # cancer vars = 40006
31 | # age vars = 40008
32 | # histology vars = 40011
33 | # behaviour vars = 40012
34 |
35 | # variable prefix
36 | v_icd10 <- "p40006_"
37 | v_date <- "p40005_"
38 | v_age <- "p40008_"
39 | v_histology <- "p40011_"
40 | v_behaviour <- "p40012_"
41 |
42 | # use `tidyr::pivot_longer` to reduce the number of columns and increase the number of rows
43 | if (verbose) cli::cli_alert("Pivot cancer registry data")
44 | pivot_cancer <- function(d, v, n) {
45 | d |>
46 | dplyr::select(eid, dplyr::contains(v)) |>
47 | tidyr::pivot_longer(!eid, names_to = "instance", names_prefix = v, values_to = n)
48 | }
49 | ukb_dat_icd10 <- pivot_cancer(ukb_dat, v_icd10, "icd10")
50 | ukb_dat_date <- pivot_cancer(ukb_dat, v_date, "date")
51 | ukb_dat_age <- pivot_cancer(ukb_dat, v_age, "age")
52 | ukb_dat_histology <- pivot_cancer(ukb_dat, v_histology, "histology")
53 | ukb_dat_behaviour <- pivot_cancer(ukb_dat, v_behaviour, "behaviour")
54 |
55 | # join tables
56 | if (verbose) cli::cli_alert("Join cancer registry data")
57 | ukb_dat_cr = purrr::reduce(list(ukb_dat_icd10, ukb_dat_date, ukb_dat_age, ukb_dat_histology, ukb_dat_behaviour), dplyr::full_join, by = c("eid"="eid", "instance"="instance"))
58 |
59 | # remove rows where participant has no cancer data
60 | ukb_dat_cr = ukb_dat_cr |> dplyr::filter(
61 | dplyr::if_any(
62 | c("icd10","date","age","histology","behaviour"),
63 | ~!is.na(.)
64 | )
65 | )
66 |
67 | # subset to ICD10s in provided codes
68 | if (verbose) cli::cli_alert("Identify matching codes")
69 | ukb_dat_cr = ukb_dat_cr |>
70 | dplyr::filter(
71 | stringr::str_detect(
72 | icd10,
73 | stringr::str_flatten(codes, collapse = "|")
74 | )
75 | )
76 |
77 | # finish
78 | #if (verbose) cli::cli_alert_success(c("Finished cancer registry: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}."))
79 |
80 | # Return data
81 | return(ukb_dat_cr)
82 |
83 | }
84 |
85 |
--------------------------------------------------------------------------------
/data-raw/internal_files.R:
--------------------------------------------------------------------------------
1 |
2 | #
3 | # file paths ################################################
4 | #
5 |
6 | ukbrapr_paths = data.frame(
7 | object=c(
8 | "hesin",
9 | "hesin_diag",
10 | "hesin_oper",
11 | "gp_clinical",
12 | "gp_scripts",
13 | "death",
14 | "death_cause",
15 | "selfrep_illness",
16 | "cancer_registry",
17 | "baseline_dates"),
18 | path=c(
19 | "ukbrapr_data/hesin.tsv",
20 | "ukbrapr_data/hesin_diag.tsv",
21 | "ukbrapr_data/hesin_oper.tsv",
22 | "ukbrapr_data/gp_clinical.tsv",
23 | "ukbrapr_data/gp_scripts.tsv",
24 | "ukbrapr_data/death.tsv",
25 | "ukbrapr_data/death_cause.tsv",
26 | "ukbrapr_data/selfrep_illness.tsv",
27 | "ukbrapr_data/cancer_registry.tsv",
28 | "ukbrapr_data/baseline_dates.tsv"
29 | )
30 | )
31 |
32 |
33 | #
34 | # codes lists ###################################################
35 | #
36 |
37 | library(tidyverse)
38 |
39 | # CKD -- from https://github.com/GEMINI-multimorbidity/GEMINI-LTC-code-list-Public
40 | ICD10 <- c("N18.3", "N18.4", "N18.5", "N18.6", "N18.9", "N19", "Z94.0")
41 | Read2 <- c("1Z12.", "1Z13.", "1Z14.", "1Z15.", "1Z16.", "1Z1a.", "1Z1B.", "1Z1b.", "1Z1C.", "1Z1c.", "1Z1D.", "1Z1d.", "1Z1E.", "1Z1e.", "1Z1F.", "1Z1f.", "1Z1G.", "1Z1H.", "1Z1J.", "1Z1K.", "1Z1L.", "1Z1T.", "1Z1V.", "1Z1W.", "1Z1X.", "1Z1Y.", "1Z1Z.", "K053.", "K054.", "K055.", "1Z1..", "K0E..", "K05..", "Kyu21", "D2150", "K06..", "6AA..", "66i..", "661M2", "661N2")
42 | CTV3 <- c("X30In", "XaLHI", "XaLHJ", "XaLHK", "XaNbn", "XaNbo", "XacAb", "XacAd", "XaO3t", "XacAe", "XaO3u", "XacAf", "XaO3v", "XacAh", "XaO3w", "XacAi", "XaO3x", "XaO3y", "XaO3z", "XaO40", "XaO41", "XaO42", "XacAM", "XacAN", "XacAO", "XacAV", "XacAW", "XacAX", "XaYb9", "XaYZW", "XaMJD", "XaMGE", "XaCLy", "XE0df", "XE0dg", "X30Iz", "Kyu21")
43 | codes_df_ckd <- data.frame(
44 | vocab_id = c(
45 | rep("ICD10", length(ICD10)),
46 | rep("Read2", length(Read2)),
47 | rep("CTV3", length(CTV3))
48 | ),
49 | code = c(ICD10, Read2, CTV3)
50 | )
51 | codes_df_ckd$condition = "ckd"
52 | codes_df_ckd = codes_df_ckd[,c("condition","vocab_id","code")]
53 |
54 | # Haemochromatosis (include self-reported)
55 | selfrep <- c("1507")
56 | ICD10 <- c("E83.1")
57 | ICD9 <- c("275.01")
58 | Read2 <- c("126A.","4L41.","677C0","C350.","C3500")
59 | CTV3 <- c("C3500","X40QQ","XaIyI","XaIyx","XaXHI","XE13K","X307o","X307p")
60 | codes_df_hh <- data.frame(
61 | vocab_id = c(
62 | rep("ukb_noncancer", length(selfrep)),
63 | rep("ICD10", length(ICD10)),
64 | rep("ICD9", length(ICD9)),
65 | rep("Read2", length(Read2)),
66 | rep("CTV3", length(CTV3))
67 | ),
68 | code = c(selfrep, ICD10, ICD9, Read2, CTV3)
69 | )
70 | codes_df_hh$condition = "hh"
71 | codes_df_hh = codes_df_hh[,c("condition","vocab_id","code")]
72 |
73 | # synthetic - all types of code!
74 | cancers = c("C22")
75 | OPCS3 = c("502","509")
76 | OPCS4 = c("J01")
77 | ICD9 = c("280","28243","440.2")
78 | codes_df_test = rbind(
79 | codes_df_hh,
80 | data.frame(
81 | condition="test",
82 | vocab_id = c(
83 | rep("ICD10", length(cancers)),
84 | rep("OPCS3", length(OPCS3)),
85 | rep("OPCS4", length(OPCS4)),
86 | rep("ICD9", length(ICD9))
87 | ),
88 | code = c(cancers, OPCS3, OPCS4, ICD9)
89 | )
90 | )
91 | codes_df_test$condition = "test"
92 |
93 |
94 | #
95 | # UK Biobank field label schema #################################
96 | #
97 |
98 | # from https://biobank.ctsu.ox.ac.uk/crystal/download.cgi
99 | ukb_schema <- NULL
100 | ukb_schema[["field"]] <- readr::read_tsv("http://biobank.ctsu.ox.ac.uk/ukb/scdown.cgi?fmt=txt&id=1", progress=FALSE, show_col_types=FALSE)
101 | ukb_schema[["time"]] <- readr::read_tsv("http://biobank.ctsu.ox.ac.uk/ukb/scdown.cgi?fmt=txt&id=20", progress=FALSE, show_col_types=FALSE)
102 | ukb_schema[["real"]] <- readr::read_tsv("http://biobank.ctsu.ox.ac.uk/ukb/scdown.cgi?fmt=txt&id=7", progress=FALSE, show_col_types=FALSE)
103 | ukb_schema[["date"]] <- readr::read_tsv("http://biobank.ctsu.ox.ac.uk/ukb/scdown.cgi?fmt=txt&id=8", progress=FALSE, show_col_types=FALSE)
104 | ukb_schema[["int"]] <- readr::read_tsv("http://biobank.ctsu.ox.ac.uk/ukb/scdown.cgi?fmt=txt&id=5", progress=FALSE, show_col_types=FALSE)
105 | ukb_schema[["string"]] <- readr::read_tsv("http://biobank.ctsu.ox.ac.uk/ukb/scdown.cgi?fmt=txt&id=6", progress=FALSE, show_col_types=FALSE)
106 |
107 |
108 |
109 | #
110 | # save as internal ###########################################
111 | #
112 | usethis::use_data(
113 | ukbrapr_paths,
114 | codes_df_ckd, codes_df_hh, codes_df_test,
115 | ukb_schema,
116 | internal = TRUE, overwrite = TRUE, compress = 'xz')
117 |
118 |
119 |
120 |
121 |
122 |
--------------------------------------------------------------------------------
/man/get_df.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/get_df.R
3 | \name{get_df}
4 | \alias{get_df}
5 | \title{Get UK Biobank participant Date First (DF) diagnosis}
6 | \usage{
7 | get_df(
8 | diagnosis_list,
9 | prefix = NULL,
10 | group_by = NULL,
11 | include_selfrep_illness = TRUE,
12 | include_death_cause = TRUE,
13 | include_gp_clinical = TRUE,
14 | include_hesin_diag = TRUE,
15 | include_hesin_oper = TRUE,
16 | include_cancer_registry = TRUE,
17 | use_baseline_dates = TRUE,
18 | file_paths = NULL,
19 | censoring_date = "30-10-2022",
20 | verbose = FALSE
21 | )
22 | }
23 | \arguments{
24 | \item{diagnosis_list}{A list of data frames. The output of `get_diagnoses()` i.e., the raw diagnosis and self-reported illness data that matched the provided codes list.}
25 |
26 | \item{prefix}{String. Prefix to add to variable names (e.g., if prefix="chd" the output variables would be "chd_gp_df", "chd_hes_df", "chd_df" etc.)
27 | \code{default=NULL}}
28 |
29 | \item{group_by}{String. If the codes list provided to `get_diagnoses()` (i.e., in diagnosis_list$codes_df) contained a grouping/condition variable, indicate the variable name here.
30 | "Date first" variables will be created for each prefix in the grouping variable. The `prefix` option is ignored, in favour of the names in the grouping variable.
31 | \code{default=NULL}}
32 |
33 | \item{include_selfrep_illness}{logical. Include self-reported diagnosesin the combined Date First output? If present in `diagnosis_list` will still provide a separate `_df` variable
34 | \code{default=TRUE}}
35 |
36 | \item{include_death_cause}{logical. Include the cause of death in the combined Date First output? If present in `diagnosis_list` will still provide a separate `_df` variable
37 | \code{default=TRUE}}
38 |
39 | \item{include_gp_clinical}{logical. Include the GP data in the combined Date First output? If present in `diagnosis_list` will still provide a separate `_df` variable
40 | \code{default=TRUE}}
41 |
42 | \item{include_hesin_diag}{logical. Include the HES diagnosis data in the combined Date First output? If present in `diagnosis_list` will still provide a separate `_df` variable
43 | \code{default=TRUE}}
44 |
45 | \item{include_hesin_oper}{logical. Include the HES OPCS (operations) data in the combined Date First output? If present in `diagnosis_list` will still provide a separate `_df` variable
46 | \code{default=TRUE}}
47 |
48 | \item{include_cancer_registry}{logical. Include the cancer registry data in the combined Date First output? If present in `diagnosis_list` will still provide a separate `_df` variable
49 | \code{default=TRUE}}
50 |
51 | \item{use_baseline_dates}{logical. If `baseline_dates` available in file paths, produce a binary 0/1 variable, indicating the controls (people without a diagnosis) and setting the date first `_df` field to the date of censoring (currently see `censoring_date` option).
52 | \code{default=TRUE}}
53 |
54 | \item{file_paths}{A data frame. Columns must be `object` and `path` containing paths to outputted files. If not provided will use those in `ukbrapr_paths`
55 | \code{default=NULL}}
56 |
57 | \item{censoring_date}{A string. If using baseline data to infer control participants, include a censoring date (set to NA if not desired). Use dd-mm-yyyy format. Default is the (current) HES date.
58 | \code{default="30-10-2022"}}
59 |
60 | \item{verbose}{Logical. Be verbose,
61 | \code{default=FALSE}}
62 | }
63 | \value{
64 | Returns a single, "wide" data frame: the participant data for the requested diagnosis codes with "date first" `_df` variables. One for each source of data, and a combined variable.
65 | }
66 | \description{
67 | For each participant identify the date of first diagnosis from all available electronic medical records & self-reported data.
68 |
69 | If `use_baseline_dates=TRUE` (the default) then will also produce a binary 0/1 variable, indicating the controls (people without a diagnosis) and setting the date first `_df` field to the date of censoring (currently 30 October 2022).
70 | }
71 | \examples{
72 |
73 | ###############################################
74 | # example 1. haemochromatosis
75 |
76 | # get diagnosis data - returns list of data frames (one per source)
77 | diagnosis_list <- get_diagnoses(ukbrapR:::codes_df_hh)
78 |
79 | # for each participant, get Date First diagnosed with the condition
80 | diagnosis_df <- get_df(diagnosis_list, prefix="hh")
81 |
82 | ###############################################
83 | # example 2. get multiple diseases at once
84 | # don't have to all have the same code types/data sources
85 |
86 | codes = rbind(ukbrapR:::codes_df_hh, ukbrapR:::codes_df_ckd)
87 | print(codes)
88 |
89 | # get diagnosis data - returns list of data frames (one per source)
90 | diagnosis_list <- get_diagnoses(codes)
91 |
92 | # for each participant, get Date First diagnosed with the condition
93 | diagnosis_df <- get_df(diagnosis_list, group_by="condition")
94 |
95 | }
96 | \author{
97 | Luke Pilling
98 | }
99 |
--------------------------------------------------------------------------------
/R/get_selfrep_illness.R:
--------------------------------------------------------------------------------
1 | #' Get UK Biobank participant self-reported illness/year data for specific codes
2 | #'
3 | #' @author Luke Pilling
4 | #'
5 | #' @name get_selfrep_illness
6 | #'
7 | #' @noRd
8 | get_selfrep_illness <- function(
9 | codes_df,
10 | ukb_dat,
11 | verbose = FALSE
12 | ) {
13 |
14 | start_time <- Sys.time()
15 |
16 | vocab_col = "vocab_id"
17 | codes_col = "code"
18 |
19 | # Check input
20 | if (verbose) cat("Check inputs\n")
21 | if (! any(class(codes_df) %in% c("data.frame","tbl_df"))) stop("Codelist needs to be provided as a data frame")
22 | codes_df = as.data.frame(codes_df) # in case a tibble
23 |
24 | # check `codes_df` -- does it contain "ukb_cancer" or "ukb_noncancer" codes? Cannot have both. Makes no sense.
25 | if (! any(c("ukb_cancer","ukb_noncancer") %in% codes_df[,vocab_col])) stop("`vocab_col` column in `codes_df` needs to contain either 'ukb_cancer' or 'ukb_noncancer'")
26 | #if (all(c("ukb_cancer","ukb_noncancer") %in% codes_df[,vocab_col])) stop("`vocab_col` column in `codes_df` needs to contain either 'ukb_cancer' or 'ukb_noncancer' (not both)")
27 |
28 | # get codes
29 | codes_cancer <- codes_df[ codes_df[,vocab_col] == "ukb_cancer", codes_col]
30 | codes_noncancer <- codes_df[ codes_df[,vocab_col] == "ukb_noncancer" , codes_col]
31 | if (verbose) cat(" - ", length(unique(codes_cancer)), " cancer codes\n")
32 | if (verbose) cat(" - ", length(unique(codes_noncancer)), " non-cancer codes\n")
33 |
34 | # which are we getting?
35 | get_noncancer <- get_cancer <- FALSE
36 | if (length(codes_cancer)>0) get_cancer <- TRUE
37 | if (length(codes_noncancer)>0) get_noncancer <- TRUE
38 |
39 | # remove rows where participant has no data
40 | ukb_dat = ukb_dat |> dplyr::filter(
41 | dplyr::if_any(
42 | dplyr::starts_with("p"),
43 | ~!is.na(.)
44 | )
45 | )
46 |
47 | # check all visits for participant - create `selfrep` (binary, ever), `selfrep_df` (the "Date First" variable) and `selfrep_i` (the "instance" i.e., visit)
48 | # https://biobank.ctsu.ox.ac.uk/crystal/label.cgi?id=100074
49 | # will use 20001 (cancer code) and 20002 (non-cancer code)
50 | # will use 20006-9 the interpolated year/age (6 = cancer year, 8 = non-cancer year)
51 | # interpolated:
52 | # - If the participant gave a calendar year, then the best-fit time is half-way through that year. For example if the year was given as 1970, then the value presented is 1970.5
53 | # - If the participant gave their age then the value presented is the fractional year corresponding to the mid-point of that age. For example, if the participant said they were 30 years old then the value is the date at which they were 30years+6months.
54 | # - Interpolated values before the date of birth were truncated forwards to that time.
55 | # - Interpolated values after the time of data acquisition were truncated back to that time.
56 | # [-1] = "Date uncertain or unknown" and [-3] = "Preferred not to answer" (exclude both from DF)
57 |
58 | # use `tidyr::pivot_longer` to reduce the number of columns and increase the number of rows
59 | if (verbose) cli::cli_alert("Pivot self-report data")
60 | pivot_table <- function(d, v, n) {
61 | d |>
62 | dplyr::select(eid, dplyr::contains(v)) |>
63 | tidyr::pivot_longer(!eid, names_to = "instance", names_prefix = v, values_to = n) |>
64 | dplyr::filter(!is.na(!!rlang::sym(n)))
65 | }
66 |
67 | # create tmp empty objects
68 | ukb_dat_sr <- ukb_dat_noncancer_long <- ukb_dat_cancer_long <- NULL
69 |
70 | if (get_cancer) {
71 | ukb_dat_code <- pivot_table(ukb_dat, "p20001_", "cancer_code")
72 | ukb_dat_year <- pivot_table(ukb_dat, "p20006_", "cancer_year")
73 | ukb_dat_cancer_long <- dplyr::full_join(ukb_dat_code, ukb_dat_year, by = c("eid"="eid", "instance"="instance"))
74 | }
75 | if (get_noncancer) {
76 | ukb_dat_code <- pivot_table(ukb_dat, "p20002_", "noncancer_code")
77 | ukb_dat_year <- pivot_table(ukb_dat, "p20008_", "noncancer_year")
78 | ukb_dat_noncancer_long <- dplyr::full_join(ukb_dat_code, ukb_dat_year, by = c("eid"="eid", "instance"="instance"))
79 | }
80 |
81 | # subset to ukb-codes in provided list
82 | if (verbose) cli::cli_alert("Identify matching codes")
83 | if (get_cancer) ukb_dat_cancer_long = ukb_dat_cancer_long |> dplyr::filter(cancer_code %in% codes_cancer)
84 | if (get_noncancer) ukb_dat_noncancer_long = ukb_dat_noncancer_long |> dplyr::filter(noncancer_code %in% codes_noncancer)
85 |
86 | # join tables if both types of codes provided, otherwise just use one
87 | if (verbose) cli::cli_alert("Joining tables")
88 | if (!is.null(ukb_dat_cancer_long) & !is.null(ukb_dat_noncancer_long)) {
89 | # both code types provided
90 | ukb_dat_sr <- dplyr::full_join(ukb_dat_cancer_long, ukb_dat_noncancer_long, by = c("eid"="eid", "instance"="instance"))
91 | } else if (!is.null(ukb_dat_noncancer_long)) {
92 | # just non-cancer
93 | ukb_dat_sr <- ukb_dat_noncancer_long
94 | } else {
95 | # just cancer
96 | ukb_dat_sr <- ukb_dat_cancer_long
97 | }
98 |
99 | # finish
100 | #if (verbose) cli::cli_alert_success(c("Finished cancer registry: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}."))
101 |
102 | # Return data
103 | return(ukb_dat_sr)
104 |
105 | }
106 |
107 |
--------------------------------------------------------------------------------
/R/get_rap_phenos.R:
--------------------------------------------------------------------------------
1 | #' Get UK Biobank participant phenotype data
2 | #'
3 | #' @description Using a Spark node/cluster on the UK Biobank Research Analysis Platform (DNAnexus), use R to extract a provided set of variables. Using code from the UK Biobank DNAnexus team https://github.com/UK-Biobank/UKB-RAP-Notebooks/blob/main/NBs_Prelim/105_export_participant_data_to_r.ipynb
4 | #'
5 | #' @return Returns a data.frame (the participant data for the requested variables)
6 | #'
7 | #' @author Luke Pilling
8 | #'
9 | #' @name get_rap_phenos
10 | #'
11 | #' @param names A string or vector of strings. The variable name(s) required. e.g., c("eid","p31","p21003_i0")
12 | #' @param value_coding A string. How to handle coded fields. "replace" if a coding value exists, replace the raw value with the code; "raw" export the raw values of the field; "exclude" if a coding value exists, do not export the value (most commonly used with sparse fields).
13 | #' \code{default="replace"}
14 | #' @param names_are_titles Logical. Passing DNAnexus variable "titles" e.g., c("Age at recruitment", "Standing height | Instance 0").
15 | #' \code{default=FALSE}
16 | #' @param record A string. The `dnanexus_link` file descriptor of the .dataset to use. Default (if left as NULL) is to use the most recently dispensed dataset.
17 | #' \code{default=most recent dataset}
18 | #' @param verbose Logical. Be verbose,
19 | #' \code{default=FALSE}
20 | #'
21 | #' @examples
22 | #' # get phenotype data
23 | #' ukb <- get_rap_phenos(c("eid","p31","p21003_i0","p53_i0"))
24 | #'
25 | #' # save to file on the RAP worker node
26 | #' readr::write_tsv(ukb, "ukbrap.phenos.20231114.txt.gz")
27 | #'
28 | #' # upload data to RAP storage
29 | #' upload_to_rap(file="ukbrap.phenos.20231114.txt.gz", dir="")
30 | #'
31 | #' @export
32 | #'
33 | get_rap_phenos <- function(
34 | names,
35 | value_coding = "replace",
36 | names_are_titles = FALSE,
37 | record = NULL,
38 | verbose = FALSE
39 | ) {
40 |
41 | start_time <- Sys.time()
42 |
43 | # install packages if required
44 | install.packages(setdiff(c("reticulate", "arrow", "sparklyr"), rownames(installed.packages())), dependencies = TRUE, quiet = TRUE)
45 |
46 | # Check input & options
47 | if (verbose) cat("Check input & options\n")
48 | if (! is.character(names) ) stop("`names` needs to be a character vector of UK Biobank RAP phenotype names, e.g., c(\"eid\",\"p31\",\"p21003_i0\")")
49 |
50 | # Check `eid` is included
51 | if (verbose) cat("Check `eid` is included\n")
52 | if (! "eid" %in% names) {
53 | cat("Adding `eid` to the requested variable names\n")
54 | names <- c("eid",names)
55 | }
56 |
57 | cat("Getting data for ", length(names), " phenotypes\n")
58 |
59 | # Import dxdata package and initialize Spark (dxdata) engine
60 | if (verbose) cat("Import dxdata package and initialize Spark (dxdata) engine\n")
61 | cat("*** If asked about creating a python environment, choose 'no' ****\n")
62 | dxdata <- reticulate::import("dxdata")
63 |
64 | # Connect to the dataset
65 | if (verbose) cat("Connect to the dataset\n")
66 | project <- system("dx env | grep project- | awk -F '\t' '{print $2}'", intern = TRUE)
67 | if (is.null(record)) {
68 | record <- system("dx describe *dataset | grep record- | awk -F ' ' '{print $2}' | sort | tail -n 1" , intern = TRUE)
69 | }
70 | DATASET_ID <- paste0(project, ":", record)
71 | dataset <- dxdata$load_dataset(id=DATASET_ID)
72 | if (verbose) cat(" - [", DATASET_ID, "]\n")
73 |
74 | # Select participant table
75 | if (verbose) cat("Select participant table\n")
76 | pheno <- dataset$entities_by_name[['participant']]
77 |
78 | # Select fields from participant table
79 | if (verbose) cat("Select fields from participant table\n")
80 | if (names_are_titles) {
81 | fld = purrr::map(names, function(x) pheno$find_field(title=x))
82 | } else {
83 | fld = purrr::map(names, function(x) pheno$find_field(name=x) )
84 | }
85 |
86 | # Define the Spark engine
87 | if (verbose) cat("Define the Spark engine\n")
88 | engine <- dxdata$connect(dialect="hive+pyspark")
89 |
90 | # Retrieve the fields defined in fld list
91 | if (verbose) cat("Retrieve the fields defined in fld list\n")
92 | df <- pheno$retrieve_fields(engine=engine, fields=fld, coding_values=value_coding)
93 |
94 | # Write the data to a temporary parquet file
95 | if (verbose) cat("Write the data to a temporary parquet file\n")
96 | uid_parquet <- runif(n=1, min=1e6, max=9e6)
97 | parquet_name <- paste0('tmpdf.',uid_parquet,'.parquet')
98 | system(paste0('hadoop fs -rm -r -f ', parquet_name), intern = TRUE)
99 | df$write$parquet(parquet_name)
100 | if (verbose) cat(paste0("Witten to ", parquet_name, "\n"))
101 |
102 | # Copy the temporary parquet file from distributed to the local file system
103 | if (verbose) cat("Copy the temporary parquet file from distributed to the local file system\n")
104 | if(dir.exists(parquet_name)) unlink(paste0('tmpdf.',uid_parquet,'.parquet'), recursive=TRUE)
105 | system(paste0('hadoop fs -copyToLocal ', parquet_name), intern = TRUE)
106 |
107 | # Read the dataset information R using Apache arrow package
108 | if (verbose) cat("Read the dataset information R using Apache arrow package\n")
109 | ds <- arrow::open_dataset(parquet_name)
110 |
111 | # Collect the data from the dataset to R memory
112 | if (verbose) cat("Collect the data from the dataset to R memory\n")
113 | tbl <- ds |> dplyr::collect()
114 |
115 | if (verbose) cat("Done. Time taken:", Sys.time() - start_time, "\n")
116 |
117 | tbl
118 |
119 | }
120 |
121 |
--------------------------------------------------------------------------------
/vignettes/ascertain_diagnoses.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Ascertain diagnoses"
3 | description: >
4 | Ascertain UK Biobank participant diagnoses from all sources (medical records and self-report data).
5 | output: rmarkdown::html_vignette
6 | vignette: >
7 | %\VignetteIndexEntry{Ascertain diagnoses}
8 | %\VignetteEngine{knitr::rmarkdown}
9 | %\VignetteEncoding{UTF-8}
10 | ---
11 |
12 | ```{r, include = FALSE}
13 | knitr::opts_chunk$set(
14 | collapse = TRUE,
15 | comment = "#>"
16 | )
17 | library(ukbrapR)
18 | ```
19 |
20 | Diagnosis of conditions in UK Biobank participants come from multiple data sources:
21 |
22 | * Self-report during assessment
23 |
24 | * Hospital inpatient records (HES)
25 |
26 | * Primary care (GP)
27 |
28 | * Cancer registry
29 |
30 | * Cause of death
31 |
32 | The {ukbrapR} package makes it fast and easy to ascertain diagnoses from multiple UK Biobank data sources in the DNAnexus Research Analysis Platform (RAP).
33 |
34 |
35 | ## Requires exported files
36 |
37 | This only needs to happen once per project. Running `export_tables()` will submit the necessary `table-exporter` jobs to save the raw medical records files to the RAP persistent storage for the project. ~10Gb of text files are created. This will cost ~£0.15 per month to store in the RAP standard storage.
38 |
39 | Once the files are exported (~15mins) these can then be used by the below functions to extract diagnoses based on codes lists.
40 |
41 |
42 | ## Input
43 |
44 | Depending on the data source different coding vocabularies are required:
45 |
46 | * `ICD10` (for searching HES diagnoses, cause of death, and cancer registry)
47 |
48 | * `ICD9` (for searching older HES diagnosis data)
49 |
50 | * `Read2` and `CTV3` (for GP clinical events)
51 |
52 | * `OPCS3` and `OPCS4` (for HES operations)
53 |
54 | * `ukb_cancer` and `ukb_noncancer` (for self-reported illness at UK Biobank assessments - all instances will be searched)
55 |
56 | Ascertaining diagnoses typically takes two steps:
57 |
58 |
59 | ## 1. Get medical records and self-reported illness data for provided codes
60 |
61 | For a given set of diagnostic codes get the participant medical events and self-reported data. Returns a list of 6 data frames: the subset of the long clinical files with matched codes.
62 |
63 | Codes need to be provided as a data frame with two fields: `vocab_id` and `code`. Valid code vocabularies are listed above. Other cols (such as condition and description) are ignored.
64 |
65 | ```{r}
66 | # example diagnostic codes for Chronic Kidney Disease
67 | codes_df_ckd <- ukbrapR:::codes_df_ckd
68 | head(codes_df_ckd)
69 |
70 | # get diagnosis data - returns list of data frames (one per source)
71 | diagnosis_list <- get_diagnoses(codes_df_ckd)
72 |
73 | # N records for each source
74 | nrow(diagnosis_list$gp_clinical)
75 | nrow(diagnosis_list$hesin_diag)
76 | nrow(diagnosis_list$death_cause)
77 | ```
78 |
79 | If providing primary care codes for measures (BMI etc) these are also returned (the `gp_clinical` object in the returned list contains all cols for matched codes).
80 |
81 |
82 | ## 2. Get date first diagnosed
83 |
84 | Usually the user is interested in combining the separate data sources into a combined phenotype: the date first diagnosed for each participant from the data/codes in step 1 (cause of death, HES diagnoses, GP clinical, cancer registry, HES operations, and self-reported illness fields).
85 |
86 | In addition to the "date first" `df` field are:
87 |
88 | - a `src` field indicating the source of the date of first diagnosis.
89 | - a `bin` field indicating the cases [1] and controls [0]. This relies on a small number of baseline fields also exported. The `df` field for the controls is the date of censoring (currently 30 October 2022).
90 | - a `bin_prev` field indicating whether the case was before the UK Biobank baseline assessment
91 |
92 | ```{r}
93 | # for each participant, get Date First diagnosed with the condition
94 | diagnosis_df <- get_df(diagnosis_list)
95 |
96 | names(diagnosis_df)
97 | summary(diagnosis_df)
98 | ```
99 |
100 | You can add a prefix to all the variable names by specifying the "prefix" option:
101 |
102 | ```{r}
103 | diagnosis_df <- get_df(diagnosis_list, prefix="ckd")
104 |
105 | # how many cases ascertained?
106 | table(diagnosis_df$ckd_bin)
107 |
108 | # source of earliest diagnosis date
109 | table(diagnosis_df$ckd_src)
110 |
111 | # date of diagnosis for prevalent cases (i.e., before UK Biobank baseline assessment)
112 | summary(diagnosis_df$ckd_df[ diagnosis_df$ckd_bin_prev == 1 ])
113 | ```
114 |
115 | ## Ascertaining multiple conditions at once
116 |
117 | The default `get_df()` behaviour is to use all available codes. However, the most time-efficient way to get multiple conditions is to run `get_diagnoses()` once for all codes for the conditions you wish to ascertain, then get the "date first diagnosed" for each condition separately. In the codes data frame you just need a field indicating the condition name, that will become the variable prefixes.
118 |
119 | ```{r}
120 | # combine haemochromatosis and CKD codes together
121 | # each contain there columns: condition, vocab_id, and code
122 | # where `condition` is either "hh" or "ckd" and will become the variable prefix
123 | codes_df_combined <- rbind(ukbrapR:::codes_df_hh, ukbrapR:::codes_df_ckd)
124 |
125 | # get diagnosis data - returns list of data frames (one per source)
126 | diagnosis_list <- get_diagnoses(codes_df_combined)
127 |
128 | # for each participant, get Date First diagnosed with the condition
129 | diagnosis_df <- get_df(diagnosis_list, group_by="condition")
130 |
131 | # each condition has full set of output
132 | table(diagnosis_df$hh_bin)
133 |
134 | table(diagnosis_df$ckd_bin)
135 | ```
136 |
137 | In the above example we also included a UK Biobank self-reported illness code for haemochromatosis, that was also ascertained (the Date First is run on each condition separately, they do not all need to have the same data sources).
138 |
139 |
140 |
--------------------------------------------------------------------------------
/R/fields_to_phenos.R:
--------------------------------------------------------------------------------
1 | #' Check UK Biobank field IDs
2 | #'
3 | #' @description Check if provided field IDs are valid and return all possible phenotype names in the UK Biobank RAP
4 | #'
5 | #' @return Returns a vector of strings (valid phenotypes).
6 | #'
7 | #' @author Luke Pilling
8 | #'
9 | #' @name fields_to_phenos
10 | #'
11 | #' @param fields A vector of character strings. The field IDs to check if valid.
12 | #' @param filename A string. If provided, will save as a fieldname file ready for the table-exporter (including "eid").
13 | #' \code{default=""}
14 | #' @param abort Logical. Abort if a field is missing?,
15 | #' \code{default=TRUE}
16 | #' @param verbose Logical. Be verbose,
17 | #' \code{default=FALSE}
18 | #'
19 | #' @examples
20 | #'
21 | #' # not instanced, not arrayed
22 | #' fields_to_phenos("31") # sex
23 | #'
24 | #' # instanced, not arrayed
25 | #' fields_to_phenos("53") # assessment date
26 | #'
27 | #' # instanced and arrayed
28 | #' fields_to_phenos("93") # systolic blood pressure, manual reading
29 | #'
30 | #' # instanced and arrayed, MRI assessments only
31 | #' fields_to_phenos("12673") # Heart rate recorded during vicorder run (Heart MRI)
32 | #'
33 | #' # check multiple simultaneously
34 | #' fields_to_phenos(c("31","93"))
35 | #'
36 | #' # only warn if an invalid field is provided (default is to abort)
37 | #' fields_to_phenos(c("31","notafield","93"))
38 | #' fields_to_phenos(c("31","notafield","93"), abort=FALSE)
39 | #'
40 | #' # save as fieldname file for the table-exporter (don't forget to upload to the RAP)
41 | #' fields_to_phenos(c("31","93"), filename="fieldnames.txt")
42 | #'
43 | #' @export
44 | #'
45 | fields_to_phenos <- function(
46 | fields,
47 | filename="",
48 | abort=TRUE,
49 | verbose=FALSE
50 | ) {
51 |
52 | # Check if 'field' is a character string of length 1
53 | if (class(fields)[1] != "character") {
54 | cli::cli_abort("Provided fields need to be a vector of character strings") # Abort if field is not a character string
55 | }
56 |
57 | # If filename provided, check it is valid
58 | save_file <- FALSE
59 | if (filename != "") {
60 | if (! is.character(filename)) cli::cli_abort("Provided filename need to be a character string")
61 | if (! length(filename) == 1) cli::cli_abort("Provided filename need to be a character string of length 1")
62 | save_file <- TRUE
63 | }
64 |
65 | # Apply field_to_phenos() to each field
66 | phenos <- purrr::map(fields, \(x) ukbrapR:::field_to_phenos(field=x, abort=abort, verbose=verbose)) |> purrr::list_c()
67 |
68 | # Save file, or return vector?
69 | if (save_file) {
70 | data.frame(c("eid",phenos)) |> readr::write_tsv(filename, col_names=FALSE, progress=FALSE)
71 | cli::cli_alert_success(stringr::str_c("Saved fields and phenos to {.file ", filename, "}"))
72 | } else {
73 | return(phenos)
74 | }
75 |
76 | }
77 |
78 |
79 | #' Function to check individual field_id
80 | #'
81 | #' @return Returns a vector of strings (valid phenotypes).
82 | #'
83 | #' @author Luke Pilling
84 | #'
85 | #' @name field_to_phenos
86 | #'
87 | #' @param field A string. The field ID (e.g., `31`) to check if valid.
88 | #' @param abort Logical. Abort if field is missing?,
89 | #' \code{default=TRUE}
90 | #' @param verbose Logical. Be verbose,
91 | #' \code{default=FALSE}
92 | #'
93 | #' @noRd
94 | #'
95 | field_to_phenos <- function(
96 | field,
97 | abort=TRUE,
98 | verbose=FALSE
99 | ) {
100 |
101 | # Check if 'field' is a character string of length 1
102 | if (class(field)[1] == "character") {
103 | if (length(field) > 1) {
104 | cli::cli_abort("field needs to be length 1") # Abort if field length is greater than 1
105 | }
106 | } else {
107 | cli::cli_abort("field needs to be a character string") # Abort if field is not a character string
108 | }
109 |
110 | # Extract the field ID, in case it's passed as 'p4080_i0_a0' or similar
111 | field_id <- field |> stringr::str_remove("p") |> stringr::str_split_i("_", 1)
112 | if (verbose) cli::cli_alert(c("Field ID: ", field_id))
113 |
114 | # Get information for this field from the schema
115 | field_info <- ukbrapR:::ukb_schema[["field"]] |> dplyr::filter(field_id == !!field_id)
116 |
117 | # Was the provided field ID in the schema?
118 | if (nrow(field_info)==0) {
119 | if (abort) cli::cli_abort(stringr::str_c("Field ID [", field_id, "] not present in the UK Biobank schema (https://biobank.ctsu.ox.ac.uk/ukb/schema.cgi?id=1)"))
120 | if (!abort) {
121 | cli::cli_warn(stringr::str_c("Field ID [", field_id, "] not present in the UK Biobank schema (https://biobank.ctsu.ox.ac.uk/ukb/schema.cgi?id=1)"))
122 | return(NULL)
123 | }
124 | }
125 |
126 | # Initialize the basic field ID and valid fields list
127 | p_field_id <- stringr::str_c("p", field_id)
128 | valid_fields <- NULL
129 |
130 | # Check if the field is instanced and generate instances if true
131 | if (field_info$instanced == 1) {
132 | instances <- seq(field_info$instance_min, field_info$instance_max, 1)
133 | if (verbose) cli::cli_alert(stringr::str_c("Is instaced [", stringr::str_c(instances, collapse=","), "]"))
134 | }
135 |
136 | # Check if the field is arrayed and generate arrays if true
137 | # If the field is "multiple choice" value type then it is *not* arrayed
138 | if (field_info$arrayed == 1 & field_info$value_type != 22) {
139 | arrays <- seq(field_info$array_min, field_info$array_max, 1)
140 | if (verbose) cli::cli_alert(stringr::str_c("Is arrayed [", stringr::str_c(arrays, collapse=","), "]"))
141 |
142 | # Generate valid fields for non-instanced arrayed fields
143 | if (field_info$instanced == 0) {
144 | for (aa in 1:length(arrays)) {
145 | valid_fields <- c(valid_fields, stringr::str_c(p_field_id, "_a", arrays[aa]))
146 | }
147 | }
148 |
149 | # Generate valid fields for instanced arrayed fields
150 | if (field_info$instanced == 1) {
151 | for (ii in 1:length(instances)) {
152 | for (aa in 1:length(arrays)) {
153 | valid_fields <- c(valid_fields, stringr::str_c(p_field_id, "_i", instances[ii], "_a", arrays[aa]))
154 | }
155 | }
156 | }
157 |
158 | } else {
159 |
160 | # Generate valid fields for instanced non-arrayed fields
161 | if (field_info$instanced == 1) {
162 | for (ii in 1:length(instances)) {
163 | valid_fields <- c(valid_fields, stringr::str_c(p_field_id, "_i", instances[ii]))
164 | }
165 | } else {
166 |
167 | # not instanced or arrayed!
168 | valid_fields <- p_field_id
169 |
170 | }
171 |
172 | }
173 |
174 | # Return the list of valid fields for this field ID
175 | return(valid_fields)
176 |
177 | }
178 |
--------------------------------------------------------------------------------
/R/label_ukb_field.R:
--------------------------------------------------------------------------------
1 | #' Update UK Biobank field with `title` and `label` from the schema
2 | #'
3 | #' @description Variables such as education and ethnicity are provided as integers but have specific codes.
4 | #'
5 | #' The UK Biobank schema are machine-readable dictionaries and mappings defining the internal structure of the online Showcase. https://biobank.ctsu.ox.ac.uk/crystal/schema.cgi
6 | #'
7 | #' This function updates a field in a data frame of UK Biobank with information from the Schema.
8 | #'
9 | #' @return Returns a data frame.
10 | #'
11 | #' @author Luke Pilling
12 | #'
13 | #' @name label_ukb_field
14 | #'
15 | #' @param d A data frame. The data frame containing the `field` to update.
16 | #' @param field A string. The field (e.g., `p54_i0`) in the provided data frame to update.
17 | #' @param field_id A string. If the field has been renamed (to e.g.,"assessment_centre") provide the field id here (e.g., "54").
18 | #' \code{default=NULL}
19 | #' @param verbose Logical. Be verbose,
20 | #' \code{default=FALSE}
21 | #'
22 | #' @examples
23 | #'
24 | #' # update the Assessment Centre variable
25 | #' ukb <- ukbrapR::label_ukb_field(ukb, field="p54_i0")
26 | #'
27 | #' table(ukb$p54_i0) # tabulates the values
28 | #' table(haven::as_factor(ukb$p54_i0)) # tabulates the labels
29 | #' haven::print_labels(ukb$p54_i0) # show the value:label mapping for this variable
30 | #'
31 | #' # if the variable has been renamed, provide the "field" and "field_id" e.g.,
32 | #' ukb <- ukbrapR::label_ukb_field(ukb, field="assessment_centre", field_id="54")
33 | #'
34 | #' @export
35 | #'
36 | label_ukb_field <- function(
37 | d, # the data frame
38 | field, # the variable name in the data
39 | field_id=NULL, # if not in UKB format, provide the field ID
40 | verbose=FALSE
41 | ) {
42 |
43 | # get symbol to use as variable name later
44 | if (verbose) cat(" - ", field, "\n")
45 | field_sym <- rlang::sym(field)
46 |
47 | # get field ID
48 | if (is.null(field_id)) field_id <- field |> stringr::str_remove("p") |> stringr::str_split_i("_", 1)
49 | if (verbose) print(" -- ", field_id, "\n")
50 |
51 | # get encoding ID & variable title
52 | field_eid <- ukbrapR:::ukb_schema[["field"]] |> dplyr::filter(field_id==!!field_id) |> dplyr::select(encoding_id) |> dplyr::pull()
53 | field_title <- ukbrapR:::ukb_schema[["field"]] |> dplyr::filter(field_id==!!field_id) |> dplyr::select(title) |> dplyr::pull()
54 |
55 | # search schema for coding
56 | index <- 0
57 | for (i in 2:6) if (field_eid %in% ukbrapR:::ukb_schema[[i]]$encoding_id) index <- i
58 |
59 | # if index was found, include labels, otherwise just update title
60 | if (index != 0) {
61 |
62 | # subset and convert to named vector
63 | field_coding <- ukbrapR:::ukb_schema[[index]] |>
64 | dplyr::filter(encoding_id == field_eid) |>
65 | dplyr::select(meaning, value) |>
66 | tibble::deframe()
67 |
68 | # skip labelling if values contain the "pipe" (implies collapsed array variables)
69 | if ( ! any(stringr::str_detect(na.omit(d[[field]]), stringr::fixed("|"))) ) {
70 |
71 | # update variable label
72 | d <- d |> dplyr::mutate(
73 | !! field_sym := haven::labelled(
74 | !! field_sym,
75 | labels = !! field_coding,
76 | label = !! field_title
77 | )
78 | )
79 |
80 | }
81 |
82 | } else {
83 |
84 | # update variable title
85 | d <- d |> dplyr::mutate(
86 | !! field_sym := haven::labelled(
87 | !! field_sym,
88 | label = !! field_title
89 | )
90 | )
91 |
92 | }
93 |
94 | return(d)
95 |
96 | }
97 |
98 |
99 | #' Update a data frame containing UK Biobank fields with `title` and `label` from the schema
100 | #'
101 | #' @description Variables such as education and ethnicity are provided as integers but have specific codes.
102 | #'
103 | #' The UK Biobank schema are machine-readable dictionaries and mappings defining the internal structure of the online Showcase. https://biobank.ctsu.ox.ac.uk/crystal/schema.cgi
104 | #'
105 | #' This function updates a data frame of UK Biobank data field with information from the Schema.
106 | #'
107 | #' It is in effect a wrapper to apply ukbrapR::label_ukb_field() to each variable in a data frame that looks like a UK Biobank field.
108 | #'
109 | #' Only recognised fields are modified (variables named things like "p54_i0"). Other variables are ignored.
110 | #'
111 | #' @return Returns a data frame.
112 | #'
113 | #' @author Luke Pilling
114 | #'
115 | #' @name label_ukb_fields
116 | #'
117 | #' @param d A data frame. The data frame containing UK Biobank fields to update.
118 | #' @param verbose Logical. Be verbose,
119 | #' \code{default=FALSE}
120 | #'
121 | #' @examples
122 | #'
123 | #' # say the below data frame contains 4 variables: `eid`, `p54_i0`, `p50_i0` and `age_at_assessment`
124 | #' names(ukb)
125 | #'
126 | #' # update the variables that looks like UK Biobank fields with titles and, where cateogrical, labels
127 | #' # i.e., `p54_i0` and `p50_i0` only -- `eid` and `age_at_assessment` are ignored
128 | #' ukb <- ukbrapR::label_ukb_fields(ukb)
129 | #'
130 | #' table(ukb$p54_i0) # tabulates the values
131 | #' table(haven::as_factor(ukb$p54_i0)) # tabulates the labels
132 | #' haven::print_labels(ukb$p54_i0) # show the value:label mapping for this variable
133 | #'
134 | #' @export
135 | #'
136 | label_ukb_fields <- function(
137 | d,
138 | verbose=FALSE
139 | ) {
140 |
141 | v <- packageVersion("ukbrapR")
142 | cli::cli_alert_info("ukbrapR v{v}")
143 |
144 | start_time <- Sys.time()
145 |
146 | # identify variables matching UK Biobank field format: start with "p", then have an integer before any "_"
147 | fields <- colnames(d)
148 | fields_int <- fields |>
149 | stringr::str_subset("^p") |>
150 | stringr::str_replace("p", "") |>
151 | stringr::str_split_i("_", 1)
152 | int_check <- function(vect) {
153 | vect <- as.character(vect)
154 | vect_int <- purrr::map(vect, \(x) all(unlist(stringr::str_split(x, "")) %in% 0:9)) |> purrr::list_c()
155 | return(vect[vect_int])
156 | }
157 | fields_int <- int_check(fields_int)
158 | fields <- fields[ stringr::str_detect( fields, stringr::str_c( stringr::str_c("p", fields_int), collapse="|") ) ]
159 |
160 | # any matched?
161 | n_fields <- length(fields)
162 | if (n_fields==0) cli::cli_abort(c("x" = "No UK Biobank fields (e.g., 'p54_i0') identified in the provided data frame."))
163 |
164 | # label each field identified
165 | cli::cli_alert("Labelling {n_fields} field{?s}")
166 | for (f in fields) d <- ukbrapR::label_ukb_field(d=d, field=f, verbose=verbose)
167 |
168 | # finished!
169 | cli::cli_alert_success(c("Finished. Time taken: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}."))
170 |
171 | # return it
172 | return(d)
173 |
174 | }
--------------------------------------------------------------------------------
/R/get_selfrep_illness_spark.R:
--------------------------------------------------------------------------------
1 | #' Get UK Biobank participant self-reported illness/year data for specific codes
2 | #'
3 | #' @description For a specific self-reported illness code or codes, identify whether the participant has self-reported at any visit, and identify the year.
4 | #' Intended for use on the UK Biobank DNnexus Research Analysis Platform, but if the user provides their own dataframe of UK Biobank self-reported fields this works on any system.
5 | #'
6 | #' @return Returns a data frame with four variables: eid, selfrep [binary, codes identified?], selfrep_df [date of reported illness], selfrep_i [instance the illness was first reported]
7 | #'
8 | #' @author Luke Pilling
9 | #'
10 | #' @name get_selfrep_illness_spark
11 | #'
12 | #' @param codes_df A data frame. Contains the `vocab_col` and `codes_col` i.e., an indicator of the vocabulary and the diagnostic codes.
13 | #' @param vocab_col A string. Column name in `codes_df` that contains the vocabulary indicator for the code (for self-reported it needs to be either "ukb_cancer" or "ukb_noncancer").
14 | #' \code{default='vocab_id'}
15 | #' @param codes_col A string. Column name in `codes_df` that contains the self-reported disease code (e.g., 1507).
16 | #' \code{default='code'}
17 | #' @param ukb_dat A data frame. Optional. If not provided, will get the phenotypes from the RAP. Contains the self-reported illness fields e.g., `p20008_i0_a0`.
18 | #' \code{default=NULL}
19 | #' @param n_cancer_arrays An integer. It is not trivial to determine the max number of arrays to request from Spark for the self-reported illnesses. The defaults match the currently (Feb 2024) available data but may need increasing in the future.
20 | #' \code{default=5}
21 | #' @param n_noncancer_arrays An integer. It is not trivial to determine the max number of arrays to request from Spark for the self-reported illnesses. The defaults match the currently (Feb 2024) available data but may need increasing in the future.
22 | #' \code{default=30}
23 | #' @param verbose Logical. Be verbose,
24 | #' \code{default=FALSE}
25 | #'
26 | #' @examples
27 | #' # example diagnostic codes for haemochromatosis
28 | #' print(codes_df_hh)
29 | #'
30 | #' # get self-reported data - a data frame
31 | #' selfrep_df <- get_selfrep_illness_spark(codes_df_hh)
32 | #'
33 | #' # inspect variables
34 | #' table(selfrep_df$selfrep)
35 | #' summary(selfrep_df$selfrep_df)
36 | #' table(selfrep_df$selfrep_i)
37 | #'
38 | #' @export
39 | #'
40 | get_selfrep_illness_spark <- function(
41 | codes_df,
42 | vocab_col = "vocab_id",
43 | codes_col = "code",
44 | ukb_dat = NULL,
45 | n_cancer_arrays = 5,
46 | n_noncancer_arrays = 30,
47 | verbose = FALSE
48 | ) {
49 |
50 | start_time <- Sys.time()
51 |
52 | # Check input
53 | if (verbose) cat("Check inputs\n")
54 | if (! any(class(codes_df) %in% c("data.frame","tbl_df"))) stop("Codelist needs to be provided as a data frame")
55 | codes_df = as.data.frame(codes_df) # in case a tibble
56 |
57 | # check `codes_df` -- does it contain "ukb_cancer" or "ukb_noncancer" codes? Cannot have both. Makes no sense.
58 | if (! any(c("ukb_cancer","ukb_noncancer") %in% codes_df[,vocab_col])) stop("`vocab_col` column in `codes_df` needs to contain either 'ukb_cancer' or 'ukb_noncancer'")
59 | if (all(c("ukb_cancer","ukb_noncancer") %in% codes_df[,vocab_col])) stop("`vocab_col` column in `codes_df` needs to contain either 'ukb_cancer' or 'ukb_noncancer' (not both)")
60 |
61 | # get codes
62 | if (verbose) cat("Get codes, determine if cancer\n")
63 | codes <- codes_df[ codes_df[,vocab_col] %in% c("ukb_cancer","ukb_noncancer") ,codes_col]
64 | cat("Getting data on ", length(unique(codes)), " codes\n")
65 |
66 | # is cancer?
67 | is_cancer <- TRUE
68 | if ("ukb_noncancer" %in% codes_df[,vocab_col]) is_cancer <- FALSE
69 |
70 | # if `ukb_dat` is NULL use `get_rap_phenos()` to get phenotype data - all instances, all arrays
71 | # need: eid, 20001, 20002, 20006, 20007, 20008, 20009
72 | if (is.null(ukb_dat)) {
73 |
74 | if (verbose) cat("No `ukb_dat` object provided - will get from RAP\n")
75 |
76 | # Determine variable names needed (depends if cancer or non-cancer)
77 | # will use 20001 (cancer code) and 20002 (non-cancer code)
78 | # will use the interpolated year (20006 = cancer year, 20008 = non-cancer year)
79 |
80 | # RAP stores arrays as separate variables
81 |
82 | # get field names
83 | if (verbose) cat("Determine field names to request\n")
84 | names = "eid"
85 |
86 | # if cancer or non-cancer code:
87 | if (is_cancer) {
88 | # cancer code = 20001
89 | for (a in c(0:n_cancer_arrays)) names <- c(names, stringr::str_c("p20001_i", 0, "_a", a))
90 | for (a in c(0:n_cancer_arrays)) names <- c(names, stringr::str_c("p20001_i", 1, "_a", a))
91 | for (a in c(0:n_cancer_arrays)) names <- c(names, stringr::str_c("p20001_i", 2, "_a", a))
92 | for (a in c(0:n_cancer_arrays)) names <- c(names, stringr::str_c("p20001_i", 3, "_a", a))
93 | # cancer year = 20006
94 | for (a in c(0:n_cancer_arrays)) names <- c(names, stringr::str_c("p20006_i", 0, "_a", a))
95 | for (a in c(0:n_cancer_arrays)) names <- c(names, stringr::str_c("p20006_i", 1, "_a", a))
96 | for (a in c(0:n_cancer_arrays)) names <- c(names, stringr::str_c("p20006_i", 2, "_a", a))
97 | for (a in c(0:n_cancer_arrays)) names <- c(names, stringr::str_c("p20006_i", 3, "_a", a))
98 | } else {
99 | # non-cancer illness code = 20002
100 | for (a in c(0:n_noncancer_arrays)) names <- c(names, stringr::str_c("p20002_i", 0, "_a", a))
101 | for (a in c(0:n_noncancer_arrays)) names <- c(names, stringr::str_c("p20002_i", 1, "_a", a))
102 | for (a in c(0:n_noncancer_arrays)) names <- c(names, stringr::str_c("p20002_i", 2, "_a", a))
103 | for (a in c(0:n_noncancer_arrays)) names <- c(names, stringr::str_c("p20002_i", 3, "_a", a))
104 | # non-cancer illness year = 20008
105 | for (a in c(0:n_noncancer_arrays)) names <- c(names, stringr::str_c("p20008_i", 0, "_a", a))
106 | for (a in c(0:n_noncancer_arrays)) names <- c(names, stringr::str_c("p20008_i", 1, "_a", a))
107 | for (a in c(0:n_noncancer_arrays)) names <- c(names, stringr::str_c("p20008_i", 2, "_a", a))
108 | for (a in c(0:n_noncancer_arrays)) names <- c(names, stringr::str_c("p20008_i", 3, "_a", a))
109 | }
110 |
111 | if (verbose) print(names)
112 |
113 | # get fields from the RAP
114 | if (verbose) cat("Get fields from the RAP\n")
115 | ukb_dat <- ukbrapR::get_rap_phenos(names, value_coding = "raw", verbose = verbose)
116 |
117 | } else {
118 | cat("User is providing own `ukb_dat` with self-reported illness fields\n")
119 | }
120 |
121 | # remove any cols with all missing values
122 | ukb_dat = ukb_dat |> dplyr::select_if(~ !all(is.na(.)))
123 |
124 | # check all visits for participant - create `selfrep` (binary, ever), `selfrep_df` (the "Date First" variable) and `selfrep_i` (the "instance" i.e., visit)
125 | # https://biobank.ctsu.ox.ac.uk/crystal/label.cgi?id=100074
126 | # will use 20001 (cancer code) and 20002 (non-cancer code)
127 | # will use 20006-9 the interpolated year/age (6 = cancer year, 8 = non-cancer year)
128 | # interpolated:
129 | # - If the participant gave a calendar year, then the best-fit time is half-way through that year. For example if the year was given as 1970, then the value presented is 1970.5
130 | # - If the participant gave their age then the value presented is the fractional year corresponding to the mid-point of that age. For example, if the participant said they were 30 years old then the value is the date at which they were 30years+6months.
131 | # - Interpolated values before the date of birth were truncated forwards to that time.
132 | # - Interpolated values after the time of data acquisition were truncated back to that time.
133 | # [-1] = "Date uncertain or unknown" and [-3] = "Preferred not to answer" (exclude both from DF)
134 |
135 | # create empty vars in ukb_dat to modify
136 | ukb_dat$selfrep <- 0
137 | ukb_dat$selfrep_df <- NA
138 | ukb_dat$selfrep_i <- NA
139 |
140 | # for this instance, check if participant self-reported this code and record which array
141 | get_selfrep_i <- function(ukb_dat, codes, i, verbose) {
142 |
143 | if (verbose) cat("Get data from instance ", i, "\n")
144 |
145 | # variable prefix depends on cancer or not:
146 | if (is_cancer) {
147 | v_diag <- paste0("p20001_i", i)
148 | v_year <- paste0("p20006_i", i)
149 | }
150 | if (!is_cancer) {
151 | v_diag <- paste0("p20002_i", i)
152 | v_year <- paste0("p20008_i", i)
153 | }
154 |
155 | # Number of diagnosis columns
156 | n_cols <- sum(stringr::str_detect(names(ukb_dat), v_diag))
157 |
158 | # Iterate through each diagnosis column
159 | for (a in 0:(n_cols-1)) {
160 |
161 | if (verbose) cat("Get data from instance ", i, " array ", a, "\n")
162 |
163 | diag_col <- rlang::sym(paste0(v_diag, "_a", a))
164 | year_col <- rlang::sym(paste0(v_year, "_a", a))
165 |
166 | # Update where the code matches
167 | ukb_dat <- ukb_dat |> dplyr::mutate(
168 | selfrep_i = dplyr::if_else(selfrep == 0 & !!diag_col %in% codes, i, selfrep_i, selfrep_i),
169 | selfrep_df = dplyr::if_else(selfrep == 0 & !!diag_col %in% codes, !!year_col, selfrep_df, selfrep_df),
170 | selfrep = dplyr::if_else(selfrep == 0 & !!diag_col %in% codes, 1, selfrep, selfrep)
171 | )
172 | }
173 |
174 | return(ukb_dat)
175 |
176 | }
177 |
178 | # update selfrep variables with data from each instance
179 | ukb_dat = get_selfrep_i(ukb_dat, codes, 0, verbose)
180 | ukb_dat = get_selfrep_i(ukb_dat, codes, 1, verbose)
181 | ukb_dat = get_selfrep_i(ukb_dat, codes, 2, verbose)
182 | ukb_dat = get_selfrep_i(ukb_dat, codes, 3, verbose)
183 |
184 | if (verbose) cat("Format date first variable\n")
185 |
186 | # replace unknown or missing years with NA
187 | ukb_dat = dplyr::mutate(ukb_dat, selfrep_df = dplyr::if_else(selfrep_df < 0, NA, selfrep_df))
188 |
189 | # convert decimal to date
190 | ukb_dat = dplyr::mutate(ukb_dat, selfrep_df = lubridate::as_date(lubridate::date_decimal(selfrep_df)))
191 |
192 | # finish
193 | if (verbose) cat("Done. Time taken:", Sys.time() - start_time, "\n")
194 |
195 | # Return data
196 | return(ukb_dat[,c("eid", "selfrep", "selfrep_df", "selfrep_i")])
197 |
198 | }
199 |
200 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ukbrapR
2 |
3 |
4 | [](https://github.com/lcpilling/ukbrapR)
5 | [](https://github.com/lcpilling/ukbrapR/commits/main)
6 | [](https://www.tidyverse.org/lifecycle/#experimental)
7 | [](https://zenodo.org/doi/10.5281/zenodo.11517716)
8 |
9 |
10 | ukbrapR (phonetically: 'U-K-B-wrapper') is an R package for working in the UK Biobank Research Analysis Platform (RAP). The aim is to make it quicker, easier, and more reproducible.
11 |
12 | > Since `v0.2.0` ukbrapR works best on a "normal" cluster using RStudio and raw data from the table-exporter. Old Spark functions are still available but are not updated.
13 |
14 | Wrapped server icon by DALL-E
15 |
16 | ## Installation
17 |
18 | In the DNAnexus Tools menu launch Posit Workbench and start an RStudio environment.
19 |
20 | ```r
21 | # install current version
22 | remotes::install_github("lcpilling/ukbrapR")
23 | ```
24 |
25 | ## Features
26 |
27 | There are three main groups of functions:
28 |
29 | * :dna: [Genetics](#genetic-variants): extract genotypes from Bulk data, create polygenic score
30 | * :clipboard: [Diagnoses](#ascertain-diagnoses): ascertain from health records and self-reported illness data, determine date first diagnosed
31 | * :hammer_and_wrench: [Utilities](#utility-functions): Check field names, label data fields, upload/download files from RAP, and pull phenotypes from Spark
32 |
33 | ## Genetic variants
34 |
35 | Bulk imputed genotypes and variant calls from Whole Genome Sequencing are available and can be easily accessed in an RStudio instance.
36 |
37 | ### Extract variants
38 |
39 | `extract_variants()` by default uses [bgenix](https://enkre.net/cgi-bin/code/bgen) and [plink](https://www.cog-genomics.org/plink/) to subset the [imputed BGEN files](https://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=22828) and read it quickly and easily into R.
40 |
41 | The only required input is a data frame (or path to file) containing "rsid" and "chr" variables. See function documentation for further details/options (including the available `make_imputed_bed()` and `load_bed()` internal functions).
42 |
43 | ```r
44 | varlist <- data.frame(rsid=c("rs1800562","rs429358"), chr=c(6,19))
45 |
46 | imputed_genotypes <- extract_variants(varlist)
47 | #> ~10 seconds
48 |
49 | dim(imputed_genotypes)
50 | #> [1] 487409 3
51 | ```
52 |
53 | By setting option `source="dragen"` the function will instead subset the [DRAGEN WGS BGEN files](https://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=24309). This requires "pos" in the input data frame (build 38).
54 |
55 | ```r
56 | varlist_b38 <- data.frame(rsid=c("rs1800562","rs429358"), chr=c(6,19), pos=c(26092913,44908684))
57 |
58 | dragen_genotypes <- extract_variants(varlist_b38, source="dragen")
59 | #> ~15 seconds
60 | ```
61 |
62 | The highlight of developing this feature was naming the internal function `make_dragen_bed()` :dragon: :bed:
63 |
64 | > This assumes your project has access to the WGS BGEN files released April 2025. If not, run `ukbrapR:::make_dragen_bed_from_pvcfs()` to use [tabix](https://www.htslib.org/doc/tabix.html) and [plink](https://www.cog-genomics.org/plink/) to subset the [DRAGEN WGS pVCF files](https://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=24310). Much slower.
65 |
66 |
67 | ### Create polygenic score
68 |
69 | `create_pgs()` takes a data frame containing a list of variant associations with a trait and creates a weighted allele score using [plink](https://www.cog-genomics.org/plink/1.9/score). By default it uses the [imputed](https://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=22828) genotypes.
70 |
71 | The only required input is a data frame (or path to file) containing rsid, chr, pos, effect_allele, other_allele, beta. For DRAGEN pos is build 38.
72 |
73 | ```r
74 | # weights from GWAS of liver cirrhosis (Innes 2020 Gastroenterology doi:10.1053/j.gastro.2020.06.014)
75 | varlist_pgs <- readr::read_tsv(system.file("files", "pgs_liver_cirrhosis.txt", package="ukbrapR"))
76 | head(varlist_pgs)
77 | #> rsID CHR POS effect_allele other_allele effect_weight locus_name
78 | #>
79 | #> 1 rs2642438 1 220796686 A G -0.177 MARC1
80 | #> 2 rs11925835 3 56831417 T C -0.235 ARHGEF3
81 | #> 3 rs72613567 4 87310241 TA T -0.166 HSD17B13
82 | #> 4 rs2954038 8 125495147 C A 0.16 TRIB1
83 | #> 5 rs11065384 12 120985482 T C 0.275 HNF1A
84 | #> 6 rs28929474 14 94378610 T C 0.561 SERPINA1
85 |
86 | liver_pgs <- create_pgs(
87 | in_file=varlist_pgs, # can be a data frame or file path
88 | out_file="liver_cirrhosis.imputed.pgs", # {optional} prefix for created .bed and .tsv files
89 | pgs_name="liver_cirrhosis_pgs") # {optional} variable name
90 | #> → Extracting 9 variants from 8 imputed files
91 | #> ✔ PGS created! See file liver_cirrhosis.imputed.pgs.tsv
92 | #> ~1 minute
93 |
94 | summary(liver_pgs$liver_cirrhosis_pgs)
95 | #> Min. 1st Qu. Median Mean 3rd Qu. Max.
96 | #> 0.00000 0.06006 0.08200 0.08589 0.10722 0.26639
97 | ```
98 |
99 | ## Ascertain diagnoses
100 |
101 | Diagnosis of conditions in UK Biobank participants come from multiple data sources. {ukbrapR} makes it fast and easy to ascertain diagnoses from multiple UK Biobank data sources in the DNAnexus Research Analysis Platform (RAP). Follow the below steps. See the website article for more details.
102 |
103 | ### 1. Export tables of raw data
104 |
105 | This only needs to happen once per project. Run `export_tables()` to submit the `table-exporter` jobs to save the required files to the RAP persistent storage. ~10Gb of text files are created, costing ~£0.15 per month to store.
106 |
107 | ### 2. Get diagnoses from all data sources
108 |
109 | For a given set of diagnostic codes get the participant Electronic Medical Records (EMR) and self-reported illess data. Returns a list containing up to 6 data frames: the subset of the clinical files with matched codes.
110 |
111 | Codes need to be provided as a data frame with two fields: `vocab_id` and `code`. Valid code vocabularies are:
112 |
113 | - `ICD10` (for searching HES diagnoses, cause of death, and cancer registry)
114 | - `ICD9` (for searching older HES diagnosis data)
115 | - `Read2` and `CTV3` (for GP clinical events)
116 | - `OPCS3` and `OPCS4` (for HES operations)
117 | - `ukb_cancer` and `ukb_noncancer` (for self-reported illness at UK Biobank assessments - all instances will be searched)
118 |
119 | ```r
120 | # example diagnostic codes for CKD
121 | codes_df_ckd <- ukbrapR:::codes_df_ckd
122 | head(codes_df_ckd)
123 | #> condition vocab_id code
124 | #> 1 ckd ICD10 N18.3
125 | #> 2 ckd ICD10 N18.4
126 | #> 3 ckd ICD10 N18.5
127 | #> ...
128 |
129 | # get diagnosis data - returns list of data frames (one per source)
130 | diagnosis_list <- get_diagnoses(codes_df_ckd)
131 | #> 7 ICD10 codes, 40 Read2 codes, 37 CTV3 codes
132 | #> ~2 minutes
133 |
134 | # N records for each source
135 | nrow(diagnosis_list$gp_clinical) # 29,083
136 | nrow(diagnosis_list$hesin_diag) # 206,390
137 | nrow(diagnosis_list$death_cause) # 1,962
138 | ```
139 |
140 | ### 3. Get date first diagnosed
141 |
142 | Identify the date first diagnosed for each participant from any of datasets searched with `get_diagnoses()` (cause of death, HES diagnoses, GP clinical, cancer registry, HES operations, and self-reported illness fields).
143 |
144 | Also included are:
145 |
146 | - a `src` field indicating the source of the date of first diagnosis.
147 | - a `bin` field indicating the cases [1] and controls [0]. This relies on a small number of baseline fields also exported. The `df` field for the controls is the date of censoring (currently 30 October 2022).
148 | - a `bin_prev` field indicating whether the case was before the UK Biobank baseline assessment
149 |
150 | ```r
151 | # for each participant, get Date First diagnosed with the condition
152 | # {optional} add a prefix to the variable names with "prefix"
153 | diagnosis_df <- get_df(diagnosis_list, prefix="ckd")
154 | #> ~2 seconds
155 |
156 | # how many cases ascertained?
157 | table(diagnosis_df$ckd_bin)
158 | #> 0 1
159 | #> 470334 31935
160 |
161 | # source of earliest diagnosis date
162 | table(diagnosis_df$ckd_src)
163 | #> death gp hes selfrep_i0 selfrep_i1 selfrep_i2 selfrep_i3
164 | #> 224 12394 19310 85 16 63 3
165 |
166 | # date of diagnosis for prevalent cases (i.e., before UK Biobank baseline assessment)
167 | summary(diagnosis_df$ckd_df[ diagnosis_df$ckd_bin_prev == 1 ])
168 | #> Min. 1st Qu. Median Mean 3rd Qu. Max.
169 | #> "1958-01-01" "2006-06-21" "2007-01-12" "2006-06-24" "2007-11-19" "2010-06-16"
170 | ```
171 |
172 | ### Ascertaining multiple conditions at once
173 |
174 | The default `get_df()` behaviour is to use all available codes. However the most time-efficient way to get multiple conditions is to run `get_diagnoses()` once for all codes for the conditions you wish to ascertain, then get the "date first diagnosed" for each condition separately. In the codes data frame you just need a field indicating the condition name, that will become the variable prefixes.
175 |
176 | ```r
177 | # combine haemochromatosis and CKD codes together
178 | # each contain there columns: condition, vocab_id, and code
179 | # where `condition` is either "hh" or "ckd" and will become the variable prefix
180 | codes_df_combined = rbind(ukbrapR:::codes_df_hh, ukbrapR:::codes_df_ckd)
181 |
182 | # get diagnosis data - returns list of data frames (one per source)
183 | diagnosis_list <- get_diagnoses(codes_df_combined)
184 |
185 | # for each participant, get Date First diagnosed with the condition
186 | diagnosis_df = get_df(diagnosis_list, group_by="condition")
187 |
188 | # each condition has full set of output
189 | table(diagnosis_df$hh_bin)
190 | #> 0 1
191 | #> 500254 2015
192 |
193 | table(diagnosis_df$ckd_bin)
194 | #> 0 1
195 | #> 470334 31935
196 | ```
197 |
198 | In the above example we also included a UK Biobank self-reported illness code for haemochromatosis, that was also ascertained (the Date First is run on each condition separately, they do not all need to have the same data sources).
199 |
200 | ## Utility functions
201 |
202 | * Check if field IDs are valid and return phenotype names with `fields_to_phenos()`
203 | * Label UK Biobank data fields with `label_ukb_fields()`
204 | * Upload/download files between worker and RAP with `upload_to_rap()` and `download_from_rap()`
205 | * Pull phenotypes from Spark instance with `get_rap_phenos()`
206 |
207 | ## Questions and comments
208 |
209 | Please report any bugs or [issues](https://github.com/lcpilling/ukbrapR/issues), and feel free to suggest changes as [pull requests](https://github.com/lcpilling/ukbrapR/pulls). Alternatively, feel free to contact me via e-mail L.Pilling@exeter.ac.uk
210 |
--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
1 | # ukbrapR v0.3.9 (4th Dec 2025)
2 |
3 | ### Changes
4 | - Fix issue with `dx` paths when downloading exported data in `/ukbrapr_data`. Previous method worked fine in RStudio but not Cloud Workstation, and highlighted a dependency on relative paths for `$ dx download` internal commands.
5 | - Add {tidyr} package to requirements. Has always been a requirement, but now it is explicit!
6 |
7 |
8 | # ukbrapR v0.3.8 (2nd Dec 2025)
9 |
10 | ### Changes
11 | - Implement suggestion by @nsandau (issue #34) to use `--brief --yes` flags when exporting tables with `export_tables()` (if running from terminal user is prompted for each job).
12 |
13 |
14 | # ukbrapR v0.3.7 (20th June 2025)
15 |
16 | ### Changes
17 | - `make_XXX_bed()` functions now give a more accurate ETA. Previously only considered the number of CHRs to search. Now considers the number of variants also.
18 |
19 | ### Bug fixes
20 | - Fix issue #31 where `fields_to_phenos()` incorrectly arrayed some "multiple choice" questions
21 |
22 |
23 | # ukbrapR v0.3.6 (29th May 2025)
24 |
25 | ### Change
26 | - `make_dragen_bed()` returns a more informative error if DRAGEN BGENs not found (suggests user is in a Project not updated since April 2025).
27 | - `make_XXX_bed()` functions now give a more accurate ETA. Previously only considered the number of CHRs to search. Now considers the number of variants also.
28 | - Removed indy/snow paths (i.e., my local servers) because we are now fully RAP working.
29 | - Removed Windows functionality. The RAP is always UNIX.
30 |
31 |
32 | # ukbrapR v0.3.5 (16th April 2025)
33 |
34 | ### New features
35 | - `create_pgs()` can now use a local BED file of already-extracted variants. Before, it assumed the user wanted to extract the variants prior to creating the PGS.
36 |
37 |
38 | # ukbrapR v0.3.4 (6th April 2025)
39 |
40 | ### New features
41 | - `make_dragen_bed()` now uses the BGEN files released in April 2025 (in prior versions used the pVCFs).
42 | - Means when `extract_variants()` or `create_pgs()` are called with option `source="dragen"` this is the default.
43 | - If your project does not have the new BGEN files available you can use `ukbrapR:::make_dragen_bed_from_pvcfs()` to extract from the pVCFs (slow).
44 |
45 | ### Changes
46 | - Updated internal schema to include new/updated fields.
47 | - Added "filename" option to `fields_to_phenos()` so that phenotype names can be written straight to a file ready for table-exporter.
48 |
49 |
50 | # ukbrapR v0.3.3 (18th March 2025)
51 |
52 | ### New features
53 | - New function `fields_to_phenos()` takes a vector of field IDs, check they are valid, and returns all possible UK Biobank RAP phenotypes from the schema
54 |
55 |
56 | # ukbrapR v0.3.2 (19th February 2025)
57 |
58 | ### Changes
59 | - Remove bundled plink, plink2 and bgenix files. Instead, download only if needed.
60 | - Add more consistent progress updates for `make_dragen_bed()` and `make_imputed_bed()`
61 | - Add "progress" options to `extract_variants()` and `create_pgs()` (default is FALSE). Default is TRUE if you directly call `make_dragen_bed()` or `make_imputed_bed()`
62 |
63 | ### Bug fixes
64 | - Fix `make_dragen_bed()` so it doesn't crash if the pVCF subset is empty (i.e., a searched-for chr:pos was missing)
65 |
66 |
67 | # ukbrapR v0.3.1 (10th February 2025)
68 |
69 | ### Bug fixes
70 | - Fix `make_dragen_bed()` position awk search, plink call
71 | - Fix `create_pgs()` when using WGS - needed to use chr:pos:a1:a2 not rsid
72 | - Fix `make_imputed_bed()` so it doesn't crash if the BGEN subset is empty (i.e., a searched-for rsid was missing)
73 |
74 |
75 | # ukbrapR v0.3.0 (29th January 2025)
76 |
77 | ### New features
78 | Suite of functions to extract and load genetic variants. Main ones of interest will be:
79 | 1. `extract_variants()` takes a list of variant rsIDs as input and extracts the imputed genotypes, loading to memory. This is really a wrapper around two other new functions: `make_imputed_bed()` and `load_bed()`. Also available in `make_dragen_bed()` to extract from whole genome sequence VCF files but this is pretty slow so usually user wants imputed variants.
80 | 2. `create_pgs()` creates a polygenic score (weighted allele score) using user-provided variants and weights. Loaded to memory but also saves a nicely formatted .tsv
81 |
82 | ### Breaking changes
83 | - Removing dependencies: reticulate, arrow, sparklyr. These take a few previous seconds to install every time and are rarely needed. Instead will be installed if user tries to use `get_rap_phenos()`
84 | - `get_emr_spark()` removed entirely. Much better to use `get_diagnoses()` which has had a *lot* of updates to functionality ad bud fixes.
85 |
86 |
87 |
88 | # ukbrapR v0.2.9 (12th January 2025)
89 |
90 | ### Bug fixes
91 | - Fixes for issue #19 (thanks to @nsandau for the help):
92 | 1. Where OPCS searches were not always performed correctly if only OPCS3/4 codes were provided.
93 | 2. When using "group_by" in `get_df()` some diagnoses were incorrectly carried over between groups when different vocabs were provided for each group (condition).
94 |
95 | ### Updates
96 | - Additional checking of `get_diagnoses()` input to abort if "blank" codes are provided to the grep.
97 | - When getting date first from self-reported illness data exclude "year" if < 1936 (earliest birth year for any participant)
98 |
99 |
100 | # ukbrapR v0.2.8 (05 October 2024)
101 |
102 | ### Bug fixes
103 | - Baseline dates TSV is now correctly located even if user changes working directory
104 | - HES operations dates were sometimes parsed as character - this is now fixed to parse as dates
105 |
106 | ### Updates
107 | - Warnings relating to parsing issues during grepping that are safe to ignore are now suppressed
108 | - Updates to documentation / examples / pkgdown site
109 | - New website articles to `ascertain_diagnoses`, `label_fields` and for `spark_functions`
110 |
111 |
112 | # ukbrapR v0.2.7 (30 September 2024)
113 |
114 | ### Updates
115 | - New function `label_ukb_field()` allows user to add titles and labels to UK Biobank fields provided as integers but are categorical.
116 | - New function `label_ukb_fields()` is a wrapper for the above. User just provides a data frame containing UK Biobank fields, and they all get formatted with titles (and labels if categorical).
117 | - Data from the UK Biobank schema (https://biobank.ctsu.ox.ac.uk/crystal/schema.cgi) are stored internally in `ukbrapR:::ukb_schema`
118 | - {haven} dependency added for labelling
119 | - Exported `baseline_dates.tsv` now also includes the assessment centres for completeness (but keeps the same filename to avoid any issues for current projects relying on already-exported files)
120 |
121 |
122 | # ukbrapR v0.2.6 (16 September 2024)
123 |
124 | ### Bug fix
125 | - Fix for issue #10. Grep issues if user provided only Read2 or CTV3 codes, if Read2 or CTV3 were <5 characters, or if Read2/CTV3 codes contained a hyphen. Thanks to @Simon-Leyss for highlighting.
126 | - Fix for issue #11. When getting self-reported illness codes there was a problem joining the tables if user only provided cancer codes. Thanks to @LauricF for highlighting.
127 | - Fix for when both types self-reported illness codes were provided. (Incorrect subsetting to just those codes provided after pivoting the long object.)
128 |
129 |
130 | # ukbrapR v0.2.5 (07 September 2024)
131 |
132 | ### Bug fix
133 | - When getting the date first cancer registry diagnosis, some rows were duplicated. This is now fixed so only one row per participant (the date first for any matched cancer ICD10) is returned.
134 |
135 |
136 | # ukbrapR v0.2.4 (05 September 2024)
137 |
138 | ### Changes
139 | - Updated internal paths for my servers `indy` and `snow` (for ongoing projects whilst we can still use local files...)
140 | - Updated how `get_diagnoses()` and `get_df()` handle a user-provided `file_paths` object
141 |
142 |
143 | # ukbrapR v0.2.3 (22 August 2024)
144 |
145 | ### Bug fixes
146 | - Fix for issue #8. In moving the HES ICD10 code block below the cancer registry code I acctidently put it within the `if (get_canreg) { }` condition. Thanks to @LauricF for highlighting.
147 | - Fix bullet points in pkgdown version of docs
148 |
149 |
150 | # ukbrapR v0.2.2 (21 August 2024)
151 |
152 | ### Update
153 | - The HESIN diagnosis search can now also include ICD9 codes in the provided codes data frame. These use fuzzy matching (similar to the ICD10s) so that searching for "280" also returns "2809" etc
154 |
155 |
156 | # ukbrapR v0.2.1 (10 August 2024)
157 |
158 | ### Bug fix
159 | - Fix for issue #5. The file paths for exported tables were not correctly specified in later calls of `get_diagnoses()` when the working directory is not the home directory. Thanks to @LauricF for highlighting.
160 |
161 |
162 | # ukbrapR v0.2.0 (30 July 2024)
163 |
164 | This is a major update as I move away from using Spark as the default environment, mostly due to the cost implications; it is significantly cheaper (and quicker!) to store and search exported raw text files in the RAP persistant storage than do everything in a Spark environment (plus the added benefit that the RStudio interface is available in "normal" instances).
165 |
166 | The Spark functions are available as before but all updates are to improve functionality in "normal" instances using RStudio, as we move to the new era of RAP-only UK Biobank analysis.
167 |
168 | ### Changes
169 | - Added internal data frame containing default paths for exported files in a RAP project (view with `ukbrapR:::ukbrapr_paths`)
170 | - Added function `export_tables()` which only needs to be run once when a new project is created. This submits the required table exporter commands to extract each of the tables in `ukbrapR:::ukbrapr_paths`. This can take ~15 minutes to export all the tables. ~10Gb of text files are created. This will cost ~£0.15 per month to store in the RAP standard storage.
171 | - `get_emr()` is split into two primary underlying functions: `get_emr_spark()` which has not changed, and `get_emr()` which is the "new way" (i.e., `get_emr_local()` is entirely removed)
172 | - Added functionality for `hesin_oper` (HES OPCS operations) searching for ICD10 codes in `get_emr()`
173 | - New/updated internal functions `get_cancer_registry()` asceratains cases using ICD10s in the `cancer_registry` data, and works much the same as `get_selfrep_illness()`
174 | - New function `get_diagnoses()` is a wrapper to get HES diagnosis, operations, cause of death, GP, cancer registry, and self-reported illness data -- i.e., once function to provide all codes to, and return all health-related data
175 | - `get_df()` takes all output from `get_diagnoses()` i.e., now also identifies date of first in matched `cancer_registry` and `hesin_oper` entries, in addition to `hes_diag`, `gp_clinical`, `death_cause` and `selfrep_illness` as before.
176 | - When getting "date first" using `get_df()` the baseline data is used to create binary case/control variables (for ever and prevalent), and for controls the censoring date is included in the overall `_df` variable (default is 30-10-2022).
177 |
178 | **To make it absolutely clear:** the Spark function `get_emr_spark()` has not been updated but I am no longer focussed on doing things this way. If you want to submit Pull Requests to improve functions please do. The below changes are to substantially improve the experience of using exported tables in the RAP environment only (if you have all the data on a local system already it will work, assuming you format correctly and provide the paths, but the RAP is the future).
179 |
180 |
181 | # ukbrapR v0.1.7 (28 July 2024)
182 |
183 | ### Bug fixes
184 | - Fix Spark database error when >1 dataset file is available. Fixes issue #3
185 |
186 |
187 | # ukbrapR v0.1.6 (03 July 2024)
188 |
189 | ### Bug fixes
190 | - Fix `get_df()` error when ascertaining GP diagnoses if 7-character codes were provided rather than 5
191 |
192 | ### Changes
193 | - `get_emr()` now accepts option "file_paths" - if not provided, attempts to get from Spark
194 | - Improve documentation and examples
195 |
196 |
197 | # ukbrapR v0.1.5 (01 July 2024)
198 |
199 | ### Bug fixes
200 | - Fix `get_df()` error occurring when not all sources are desired
201 |
202 | ### Changes
203 | - `get_emr_local()` option "local_paths" is now "file_paths"
204 | - Improve documentation and examples
205 |
206 |
207 | # ukbrapR v0.1.4 (12 June 2024)
208 |
209 | ### Bug fixes
210 | - Fix problem identifying ICD10 column name in RAP HESIN
211 | - Fix problem getting date first for GP data (excluding missing dates before summarizing)
212 |
213 |
214 | # ukbrapR v0.1.3 (8 June 2024)
215 |
216 | ### New feature
217 | - It is quicker/easier to ascertain multiple conditions at once to supply `get_emr()` with all the codes (as before), but now can use `get_df()` with option "group_by" to indicate the condition names in the `codes_df` object provided. See documentation.
218 |
219 | ### Changes
220 | - It is no longer possible to provide custom names for the `codes_df` to `get_emr()` -- these now must be `vocab_id` and `code` -- makes things much simpler.
221 | - Remove ICD9 code from `codes_df_hh` example as these are not currently used
222 |
223 |
224 | # ukbrapR v0.1.2 (6 June 2024)
225 |
226 | ### New features
227 | - New function `get_emr_local()`. If the user has text files for `hesin_diag` and `gp_clinical` etc. these can be searched (rather than Apache Spark queries). This therefore can work on "normal" DNAnexus nodes, or local servers. Most downstream functions also do not rely on Spark clusters if data extracts are available.
228 |
229 | ### Changes
230 | - Change URL to reflect my GitHub username change from `lukepilling` to `lcpilling` to be more consistent between different logins, websites, and social media
231 | -- https://lcpilling.github.io/ukbrapR
232 | -- https://github.com/lcpilling/ukbrapR
233 | - Added dependency {cli} for improved alert/error reporting
234 |
235 |
236 | # ukbrapR v0.1.1 (6 March 2024)
237 |
238 | ### New features
239 | - New argument "prefix" for `get_df()` - user can provide a string to prefix to the output variable names
240 |
241 |
242 | # ukbrapR v0.1.0 (21 Feb 2024)
243 |
244 | ### New features
245 | - `get_selfrep_illness()` - gets illness information from self-report fields. Derives a "date first" from the age/year reported, incorporating all visits for the participant
246 | - Two example code lists are incuded: `codes_df_ckd` (GEMINI CKD), and `codes_df_hh` (haemochromatosis, with self-report)
247 |
248 | ### Changes
249 | - `get_emr_df()` is re-named `get_df()` to reflect it can now include information from self-reported illness
250 | - `get_emr_diagnoses()` is re-named `get_emr()` to reflect it actually retrieves any record in `gp_clinical` not just diagnoses (e.g., BMI if appropriate codes provided)
251 |
252 | ### Bug fixes
253 | - So many
254 |
255 |
256 | # ukbrapR v0.0.2 (14 Nov 2023)
257 |
258 | ### New features
259 | - `get_emr_diagnoses()` - function to get electronic medical records diagnoses from Spark-based death records, hospital episode statistics, and primary care (GP) databases.
260 | - `get_emr_df()` - function to get date first diagnosed with any provided code from any above Electronic Medical Record source.
261 |
262 | ### Bug fixes
263 | - Extra input checking in `get_rap_phenos()` and output more consistent for direct use with `get_emr_*()` functions
264 | - Updated URL for example CKD clinical codes
265 |
266 |
267 | # ukbrapR v0.0.1 (26 Oct 2023)
268 |
269 | Initial release containing two functions:
270 | - `get_rap_phenos()`
271 | - `upload_to_rap()`
272 |
273 |
--------------------------------------------------------------------------------
/R/export_tables.R:
--------------------------------------------------------------------------------
1 | #' Export diagnosis files to RAP persistent storage
2 | #'
3 | #' @description In the UK Biobank RAP export tables for HES, GP, death, and cancer registry data, plus self-reported illness fields, using the table-exporter. This is essentially a wrapper function to submit jobs to the table exporter.
4 | #'
5 | #' Suggest executing in an RStudio session. ~10Gb of text files are created. This will cost ~£0.15 per month to store in the RAP standard storage.
6 | #'
7 | #' @return NA
8 | #'
9 | #' @author Luke Pilling
10 | #'
11 | #' @name export_tables
12 | #'
13 | #' @param submit Logical. Actually submit `dx` commands. Default is FALSE i.e., just check inputs & file paths, then print the commands,
14 | #' \code{default=FALSE}
15 | #' @param ignore_warnings Logical. If an exported table already exists do not submit the table-exporter command unless this is TRUE,
16 | #' \code{default=FALSE}
17 | #' @param file_paths A data frame. Columns must be `object` and `path` containing paths to outputted files. If blank, will use the default paths,
18 | #' \code{default=ukbrapR:::ukbrapr_paths}
19 | #' @param dataset A string. If you wish to specify dataset. If blank, will use the most recently dispensed dataset in the main project directory.
20 | #' \code{default=app#####_#####.dataset}
21 | #' @param verbose Logical. Be verbose,
22 | #' \code{default=FALSE}
23 | #'
24 | #' @examples
25 | #'
26 | #' # To keep files organised this package assumes the following file structure
27 | #' # This object is not actually required but illstrates the defaults to be
28 | #' # created in your RAP space (override by providing a new `file_paths`):
29 | #' ukbrapr_paths = data.frame(
30 | #' object=c("death","death_cause","hesin","hesin_diag","hesin_oper","gp_clinical","gp_scripts","selfrep_illness","cancer_registry","baseline_dates"),
31 | #' path=c(
32 | #' "ukbrapr_data/death.tsv",
33 | #' "ukbrapr_data/death_cause.tsv",
34 | #' "ukbrapr_data/hesin.tsv",
35 | #' "ukbrapr_data/hesin_diag.tsv",
36 | #' "ukbrapr_data/hesin_oper.tsv",
37 | #' "ukbrapr_data/gp_clinical.tsv",
38 | #' "ukbrapr_data/gp_scripts.tsv",
39 | #' "ukbrapr_data/selfrep_illness.tsv",
40 | #' "ukbrapr_data/cancer_registry.tsv",
41 | #' "ukbrapr_data/baseline_dates.tsv"
42 | #' )
43 | #' )
44 | #' ukbrapr_paths
45 | #'
46 | #' # test run to see `dx run table-exporter` commands - but will not submit jobs
47 | #' export_tables()
48 | #'
49 | #' # Submit all `dx run table-exporter` commands. ~10Gb of text files are created. This will cost ~£0.15 per month to store in the RAP standard storage.
50 | #' export_tables(submit=TRUE)
51 | #'
52 | #' @export
53 | #'
54 | export_tables <- function(
55 | submit = FALSE,
56 | ignore_warnings = FALSE,
57 | file_paths = ukbrapR:::ukbrapr_paths,
58 | dataset = NULL,
59 | verbose = FALSE
60 | ) {
61 |
62 | # is this just a test? (Will not actually run any dx commands)
63 | if (!submit) {
64 | cli::cli_alert_info("Test run. Will not submit any {.code dx} system commands")
65 | ignore_warnings = TRUE
66 | }
67 |
68 | # do any files already exist? Warn if so!
69 | for (fp in file_paths$path) {
70 | if (file.exists(stringr::str_c("/mnt/project/", fp))) {
71 | if (ignore_warnings) cli::cli_alert_danger("File already exists on RAP at {.path {fp}}.")
72 | if (!ignore_warnings) cli::cli_abort("File already exists on RAP at {.path {fp}}.")
73 | }
74 | }
75 |
76 | # if output directory does not exist in the RAP, create it
77 | fp_dir = file_paths$path[1]
78 | if (stringr::str_detect(fp_dir, "/")) {
79 | fp_dir = stringr::str_split(fp_dir, "/")[[1]][1]
80 | if (!dir.exists(stringr::str_c("/mnt/project/", fp_dir))) {
81 | if (verbose) cli::cli_alert("Creating output directory in your RAP storage space: {.code dx mkdir {fp_dir}}")
82 | if (submit) system(stringr::str_c("dx mkdir ", fp_dir))
83 | }
84 | }
85 |
86 | # get dataset id
87 | if (is.null(dataset)) {
88 | dataset = system("dx describe /*dataset | grep app | awk -F ' ' '{print $2}' | sort | tail -n 1", intern = TRUE)
89 | }
90 | if (verbose) cli::cli_alert_info("Using dataset {.file {dataset}}")
91 |
92 | # run table exporter commands
93 | ukbrapR:::export_tables_emr(dataset=dataset, submit=submit, verbose=verbose)
94 | ukbrapR:::export_tables_selfrep_illness(dataset=dataset, submit=submit, verbose=verbose)
95 | ukbrapR:::export_tables_cancer_registry(dataset=dataset, submit=submit, verbose=verbose)
96 | ukbrapR:::export_tables_baseline_info(dataset=dataset, submit=submit, verbose=verbose)
97 |
98 | cli::cli_alert_success("Submitted all table-exporter jobs.")
99 | cli::cli_alert_info("Can take ~15mins to complete.")
100 | cli::cli_alert_info("Files will be saved to `ukbrapr_data` directory in your RAP project presistent storage space.")
101 | cli::cli_alert_warning("~10Gb of text files are created. This will cost ~£0.15 per month to store in the RAP standard storage.")
102 |
103 | # was this just a test?
104 | if (!submit) {
105 | cli::cli_abort("This was a test run. No {.code dx table-exporter} commands were submitted. Re-run with {.code submit = TRUE} to submit.")
106 | }
107 |
108 | }
109 |
110 |
111 | #' Export tables for HES, GP, and death
112 | #'
113 | #' @description In the UK Biobank RAP export tables for HES, GP, and death using the table-exporter
114 | #'
115 | #' @return NA
116 | #'
117 | #' @author Luke Pilling
118 | #'
119 | #' @name export_tables_emr
120 | #'
121 | #' @noRd
122 | export_tables_emr <- function(
123 | dataset = NULL,
124 | submit = FALSE,
125 | verbose = FALSE
126 | ) {
127 |
128 | cli::cli_alert("Export tables: Electronic Medical Records")
129 |
130 | # get dataset id
131 | if (is.null(dataset)) {
132 | dataset = list.files("/mnt/project") |> stringr::str_subset(".dataset")
133 | dataset = dataset[1]
134 | }
135 |
136 | # submit table-exporter
137 | if (verbose) cli::cli_alert("dx run table-exporter for 'death'")
138 | table_exporter_command = stringr::str_c("dx run table-exporter -idataset_or_cohort_or_dashboard=", dataset, " -ientity='death' -ioutput='death' -ioutput_format='TSV' -icoding_option='RAW' --destination /ukbrapr_data/ --brief --yes")
139 | if (verbose|submit) cli::cli_text(table_exporter_command)
140 | if (submit) system(table_exporter_command)
141 |
142 | if (verbose) cli::cli_alert("dx run table-exporter for 'death_cause'")
143 | table_exporter_command = stringr::str_c("dx run table-exporter -idataset_or_cohort_or_dashboard=", dataset, " -ientity='death_cause' -ioutput='death_cause' -ioutput_format='TSV' -icoding_option='RAW' --destination /ukbrapr_data/ --brief --yes")
144 | if (verbose|submit) cli::cli_text(table_exporter_command)
145 | if (submit) system(table_exporter_command)
146 |
147 | if (verbose) cli::cli_alert("dx run table-exporter for 'hesin'")
148 | table_exporter_command = stringr::str_c("dx run table-exporter -idataset_or_cohort_or_dashboard=", dataset, " -ientity='hesin' -ioutput='hesin' -ioutput_format='TSV' -icoding_option='RAW' --destination /ukbrapr_data/ --brief --yes")
149 | if (verbose|submit) cli::cli_text(table_exporter_command)
150 | if (submit) system(table_exporter_command)
151 |
152 | if (verbose) cli::cli_alert("dx run table-exporter for 'hesin_diag'")
153 | table_exporter_command = stringr::str_c("dx run table-exporter -idataset_or_cohort_or_dashboard=", dataset, " -ientity='hesin_diag' -ioutput='hesin_diag' -ioutput_format='TSV' -icoding_option='RAW' --destination /ukbrapr_data/ --brief --yes")
154 | if (verbose|submit) cli::cli_text(table_exporter_command)
155 | if (submit) system(table_exporter_command)
156 |
157 | if (verbose) cli::cli_alert("dx run table-exporter for 'hesin_oper'")
158 | table_exporter_command = stringr::str_c("dx run table-exporter -idataset_or_cohort_or_dashboard=", dataset, " -ientity='hesin_oper' -ioutput='hesin_oper' -ioutput_format='TSV' -icoding_option='RAW' --destination /ukbrapr_data/ --brief --yes")
159 | if (verbose|submit) cli::cli_text(table_exporter_command)
160 | if (submit) system(table_exporter_command)
161 |
162 | if (verbose) cli::cli_alert("dx run table-exporter for 'gp_clinical'")
163 | table_exporter_command = stringr::str_c("dx run table-exporter -idataset_or_cohort_or_dashboard=", dataset, " -ientity='gp_clinical' -ioutput='gp_clinical' -ioutput_format='TSV' -icoding_option='RAW' --destination /ukbrapr_data/ --brief --yes")
164 | if (verbose|submit) cli::cli_text(table_exporter_command)
165 | if (submit) system(table_exporter_command)
166 |
167 | if (verbose) cli::cli_alert("dx run table-exporter for 'gp_scripts'")
168 | table_exporter_command = stringr::str_c("dx run table-exporter -idataset_or_cohort_or_dashboard=", dataset, " -ientity='gp_scripts' -ioutput='gp_scripts' -ioutput_format='TSV' -icoding_option='RAW' --destination /ukbrapr_data/ --brief --yes")
169 | if (verbose|submit) cli::cli_text(table_exporter_command)
170 | if (submit) system(table_exporter_command)
171 |
172 | }
173 |
174 |
175 |
176 |
177 | #' Extract self-reported illess fields
178 | #'
179 | #' @description In the UK Biobank RAP, submit a table-exporter job to extract the self-reported illness fields:
180 | #'
181 | #' @return NA
182 | #'
183 | #' @author Luke Pilling
184 | #'
185 | #' @name export_tables_selfrep_illness
186 | #'
187 | #' @noRd
188 | export_tables_selfrep_illness <- function(
189 | n_cancer_arrays = 5,
190 | n_noncancer_arrays = 30,
191 | n_instances = 3,
192 | dataset = NULL,
193 | submit = FALSE,
194 | verbose = FALSE
195 | ) {
196 |
197 | cli::cli_alert("Export table: Self-reported Illness fields")
198 |
199 | # RAP stores arrays as separate variables
200 | if (verbose) cli::cli_alert("Determine field names to request")
201 | if (verbose) cli::cli_alert(c("n_instances = ", n_instances))
202 | if (verbose) cli::cli_alert(c("n_cancer_arrays = ", n_cancer_arrays))
203 | if (verbose) cli::cli_alert(c("n_noncancer_arrays = ", n_noncancer_arrays))
204 |
205 | # Determine variable names needed (depends if cancer or non-cancer)
206 | # will use 20001 (cancer code) and 20002 (non-cancer code)
207 | # will use the interpolated year (20006 = cancer year, 20008 = non-cancer year)
208 |
209 | # get field names
210 | names = "eid"
211 |
212 | # phenotypes
213 | # cancer code = 20001
214 | # cancer year = 20006
215 | for (p in c(20001, 20006)) {
216 |
217 | # instances 0:n_instances
218 | for (i in c(0:n_instances)) {
219 |
220 | # cancer arrays
221 | for (a in c(0:n_cancer_arrays)) names <- c(names, stringr::str_c("p", p, "_i", i, "_a", a))
222 |
223 | }
224 | }
225 |
226 | # phenotypes
227 | # non-cancer illness code = 20002
228 | # non-cancer illness year = 20008
229 | for (p in c(20002, 20008)) {
230 |
231 | # instances 0:3
232 | for (i in c(0:n_instances)) {
233 |
234 | # non-cancer arrays
235 | for (a in c(0:n_noncancer_arrays)) names <- c(names, stringr::str_c("p", p, "_i", i, "_a", a))
236 |
237 | }
238 | }
239 |
240 | if (verbose) print(names)
241 |
242 | # save and upload names file
243 | readr::write_tsv(data.frame(names), "fieldnames_selfrep_illness.txt", col_names=FALSE)
244 | if (submit) ukbrapR::upload_to_rap("fieldnames_selfrep_illness.txt", dir="/ukbrapr_data")
245 |
246 | # get dataset id
247 | if (is.null(dataset)) {
248 | dataset = list.files("/mnt/project") |> stringr::str_subset(".dataset")
249 | dataset = dataset[1]
250 | }
251 |
252 | # submit table-exporter
253 | table_exporter_command = stringr::str_c("dx run table-exporter -idataset_or_cohort_or_dashboard=", dataset, " -ifield_names_file_txt='/ukbrapr_data/fieldnames_selfrep_illness.txt' -ientity='participant' -ioutput='selfrep_illness' -ioutput_format='TSV' -icoding_option='RAW' --destination /ukbrapr_data/ --brief --yes")
254 | if (verbose|submit) cli::cli_text(table_exporter_command)
255 | if (submit) system(table_exporter_command)
256 |
257 |
258 | }
259 |
260 |
261 |
262 | #' Extract cancer registry fields
263 | #'
264 | #' @description In the UK Biobank RAP, submit a table-exporter job to extract the cancer registry fields:
265 | #'
266 | #' @return NA
267 | #'
268 | #' @author Luke Pilling
269 | #'
270 | #' @name export_tables_cancer_registry
271 | #'
272 | #' @noRd
273 | export_tables_cancer_registry <- function(
274 | n_cancer_arrays = 21,
275 | dataset = NULL,
276 | submit = FALSE,
277 | verbose = FALSE
278 | ) {
279 |
280 | cli::cli_alert("Export table: Cancer Registry fields")
281 |
282 | # RAP stores arrays as separate variables
283 | if (verbose) cli::cli_alert("Determine field names to request")
284 | if (verbose) cli::cli_alert(c("n_cancer_arrays = ", n_cancer_arrays))
285 | # date vars = 40005
286 | # cancer vars = 40006
287 | # age vars = 40008
288 | # histology vars = 40011
289 | # behaviour vars = 40012
290 |
291 | # get field names
292 | names = "eid"
293 |
294 | # phenotypes
295 | for (p in c(40005, 40006, 40008, 40011, 40012)) {
296 |
297 | # instances 0:n_instances
298 | for (i in c(0:n_cancer_arrays)) names <- c(names, stringr::str_c("p", p, "_i", i))
299 |
300 | }
301 |
302 | if (verbose) print(names)
303 |
304 | # save and upload names file
305 | readr::write_tsv(data.frame(names), "fieldnames_cancer_registry.txt", col_names=FALSE)
306 | if (submit) ukbrapR::upload_to_rap("fieldnames_cancer_registry.txt", dir="/ukbrapr_data")
307 |
308 | # get dataset id
309 | if (is.null(dataset)) {
310 | dataset = list.files("/mnt/project") |> stringr::str_subset(".dataset")
311 | dataset = dataset[1]
312 | }
313 |
314 | # submit table-exporter
315 | table_exporter_command = stringr::str_c("dx run table-exporter -idataset_or_cohort_or_dashboard=", dataset, " -ifield_names_file_txt='/ukbrapr_data/fieldnames_cancer_registry.txt' -ientity='participant' -ioutput='cancer_registry' -ioutput_format='TSV' -icoding_option='RAW' --destination /ukbrapr_data/ --brief --yes")
316 | if (verbose|submit) cli::cli_text(table_exporter_command)
317 | if (submit) system(table_exporter_command)
318 |
319 | }
320 |
321 |
322 |
323 | #' Extract useful baseline date fields and save to file
324 | #'
325 | #' @description In the UK Biobank RAP, submit a table-exporter job to extract some useful baseline fields:
326 | #'
327 | #' 1) date of baseline assessment [p53_i*], 2) month of birth [p52], 3) year of birth [p34], and 4) sex [p31]
328 | #'
329 | #' @return NA
330 | #'
331 | #' @author Luke Pilling
332 | #'
333 | #' @name export_tables_baseline_info
334 | #'
335 | #' @noRd
336 | export_tables_baseline_info <- function(
337 | dataset = NULL,
338 | submit = FALSE,
339 | verbose = FALSE
340 | ) {
341 |
342 | cli::cli_alert("Export table: Baseline Info fields")
343 |
344 | # get dataset id
345 | if (is.null(dataset)) {
346 | dataset = list.files("/mnt/project") |> stringr::str_subset(".dataset")
347 | dataset = dataset[1]
348 | }
349 |
350 | # baseline fields to export
351 | fields <- c(
352 | "eid",
353 | "p53_i0", "p53_i1", "p53_i2", "p53_i3", # assessment dates
354 | "p54_i0", "p54_i1", "p54_i2", "p54_i3", # assessment centres
355 | "p52", "p34", # month and year of birth
356 | "p31" # sex
357 | )
358 | readr::write_tsv(data.frame(fields), "fieldnames_baseline_info.txt", col_names=FALSE)
359 | if (submit) ukbrapR::upload_to_rap("fieldnames_baseline_info.txt", dir="/ukbrapr_data")
360 |
361 | # submit table-exporter
362 | table_exporter_command = stringr::str_c("dx run table-exporter -idataset_or_cohort_or_dashboard=", dataset, " -ifield_names_file_txt='/ukbrapr_data/fieldnames_baseline_info.txt' -ientity='participant' -ioutput='baseline_dates' -ioutput_format='TSV' -icoding_option='RAW' --destination /ukbrapr_data/ --brief --yes")
363 | if (verbose|submit) cli::cli_text(table_exporter_command)
364 | if (submit) system(table_exporter_command)
365 |
366 | }
367 |
368 |
--------------------------------------------------------------------------------
/R/get_diagnoses.R:
--------------------------------------------------------------------------------
1 | #' Get UK Biobank participant diagnosis data
2 | #'
3 | #' @description For a list of diagnostic codes get the HES, GP, cancer registry, operations, and self-reported illness data, matching the provided codes.
4 | #'
5 | #' Valid code vocabularies are:
6 | #'
7 | #' - ICD10 (for `hesin`, `death_cause` and `cancer_registry` searches) - fuzzy matching
8 | #'
9 | #' - ICD9 (for `hesin` searches) - fuzzy matching
10 | #'
11 | #' - Read2 / CTV3 (for `gp_clinical`) - exact matches on first 5 characters
12 | #'
13 | #' - OPCS3 / OPCS4 (for `hesin_oper`) - fuzzy matching
14 | #'
15 | #' - ukb_cancer / ukb_noncancer (for self-reported illness at UK Biobank assessments - all available will be searched) - exact matches
16 | #'
17 | #' This function relies on exported raw data files and thus does not need to be run in a Spark cluster. If the files are not in the default locations for the package you will need to specify the `file_paths` to exported tables. Recommend to run `export_tables()` once in your project to export the tables to the default paths for the package.
18 | #'
19 | #' @return Returns a list of data frames (the participant data for the requested diagnosis codes: `death_cause`, `hesin_diag`, `hesin_oper`, `gp_clinical`, `cancer_registry` and `selfrep_illness`. Also includes the original codes list)
20 | #'
21 | #' @author Luke Pilling
22 | #'
23 | #' @name get_diagnoses
24 | #'
25 | #' @param codes_df A data frame. Contains two columns: `code` and `vocab_id` i.e., a list of diagnostic codes, and an indicator of the vocabulary (ICD10, Read2, CTV3, OPCS3, OPCS4, ukb_cancer, and ukb_noncancer are recognised). Other columns are ignored.
26 | #' @param file_paths A data frame. Columns must be `object` and `path` containing paths to required files. Default assumes you have the tables exported in the RAP environment from
27 | #' ukbrapR::export_tables()
28 | #' \code{default=ukbrapR:::ukbrapr_paths}
29 | #' @param verbose Logical. Be verbose,
30 | #' \code{default=FALSE}
31 | #'
32 | #' @examples
33 | #' # example diagnostic codes for CKD from GEMINI multimorbidity project
34 | #' codes_df_ckd <- ukbrapR:::codes_df_ckd
35 | #' head(codes_df_ckd)
36 | #'
37 | #' # Get diagnosis data - returns list of data frames (one per source)
38 | #' # -- Requires exported tables - see `export_tables()`
39 | #' diagnosis_list <- get_diagnoses(codes_df_ckd)
40 | #'
41 | #' # don't forget to save and upload data to RAP persistent storage!
42 | #' save(diagnosis_list, "ukbrap.CKD.emr.20231114.RDat")
43 | #' upload_to_rap(file="ukbrap.CKD.*", dir="")
44 | #'
45 | #' @export
46 | #'
47 | get_diagnoses <- function(
48 | codes_df,
49 | file_paths = NULL,
50 | verbose = FALSE
51 | ) {
52 |
53 | v <- packageVersion("ukbrapR")
54 | cli::cli_alert_info("ukbrapR v{v}")
55 |
56 | start_time <- Sys.time()
57 |
58 | vocab_col = "vocab_id"
59 | codes_col = "code"
60 |
61 | # Check input
62 | if (verbose) cli::cli_alert("Checking inputs (codes, file paths, etc)")
63 | if (! any(class(codes_df) %in% c("data.frame","tbl","tbl_df"))) {
64 | cli::cli_abort(c(
65 | "{.var codes_df} must be a data.frame or tibble",
66 | "x" = "You've supplied a {.cls {class(codes_df)}} vector."
67 | ))
68 | }
69 | codes_df = as.data.frame(codes_df) # in case a tibble
70 |
71 | if (! vocab_col %in% colnames(codes_df)) stop("Codelist data frame needs to include vocabulary column `vocab_id`")
72 | if (! codes_col %in% colnames(codes_df)) stop("Codelist data frame needs to include codes column `code`")
73 |
74 | if (! any(c("ICD10","ICD9","Read2","CTV3","OPCS3","OPCS4","ukb_cancer","ukb_noncancer") %in% codes_df[,vocab_col])) stop("Vocabularies need to include at least one of ICD10, ICD9, Read2, CTV3, OPCS3, OPCS4, ukb_cancer, or ukb_noncancer")
75 |
76 | # if file_paths not provided assume default paths
77 | if (is.null(file_paths)) file_paths = ukbrapR:::ukbrapr_paths
78 |
79 |
80 | #########################################################################################################
81 | #
82 | # check codes provided, determine what datasets we are going to search
83 |
84 | # Check code lists - only first 5 digits are used by UK Biobank
85 | cli::cli_alert("Checking provided codes (remember only the first 5 characters are used)")
86 | get_icd10 <- FALSE
87 | get_icd9 <- FALSE
88 | get_canreg <- FALSE
89 | get_gp <- FALSE
90 | get_oper <- FALSE
91 | get_selfrep <- FALSE
92 | ICD10s <- ""
93 | ICD9s <- ""
94 | Read2s <- ""
95 | CTV3s <- ""
96 | OPCS3s <- ""
97 | OPCS4s <- ""
98 |
99 | # throw error if any provided code has length 0
100 | if (any(stringr::str_length(codes_df[,codes_col]) == 0)) {
101 | cli::cli_abort("Blank code provided. Check your input codes lists to avoid unexpected matches.")
102 | }
103 |
104 | # warn if any provided code has length 1
105 | if (any(stringr::str_length(codes_df[,codes_col]) == 1)) {
106 | cli::cli_warn("Some provided code(s) have length 1. Check your input codes lists and matched outputs to avoid unexpected matches.")
107 | }
108 |
109 | # function to check for hyphens and abort if any provided
110 | # (suggests they want a range of codes. Safer to abort at ask the user to explicitly provide the codes to search for)
111 | hyphen_check <- function(codes, vocab) {
112 | if (any(stringr::str_detect(codes, "-"))) {
113 | codes = codes[ stringr::str_detect(codes, "-") ]
114 | cli::cli_abort(c(
115 | "{vocab} codes cannot contain hyphens, as this suggests you want a range of codes.",
116 | "x" = "Remove hyphens and provide the specific {vocab} codes you wish to ascertain (safer)."
117 | ))
118 | }
119 | }
120 |
121 | # get ICD10s. Remove "." dot characters. First 5 characters only.
122 | if (any(codes_df[,vocab_col] == "ICD10")) {
123 | get_icd10 <- TRUE
124 | ICD10s <- codes_df |>
125 | dplyr::filter(!!rlang::sym(vocab_col) == "ICD10") |>
126 | dplyr::select(!!rlang::sym(codes_col)) |>
127 | dplyr::pull() |>
128 | unique() |>
129 | stringr::str_remove(stringr::fixed(".")) |>
130 | stringr::str_sub(1, 5)
131 | cat(" - N unique ICD10 codes:", length(ICD10s), "\n")
132 | hyphen_check(ICD10s, "ICD10")
133 | if (any(stringr::str_starts(ICD10s, "C"))) get_canreg <- TRUE
134 | }
135 |
136 | # get ICD9s. Remove "." dot characters. First 5 characters only.
137 | if (any(codes_df[,vocab_col] == "ICD9")) {
138 | get_icd9 <- TRUE
139 | ICD9s <- codes_df |>
140 | dplyr::filter(!!rlang::sym(vocab_col) == "ICD9") |>
141 | dplyr::select(!!rlang::sym(codes_col)) |>
142 | dplyr::pull() |>
143 | unique() |>
144 | stringr::str_remove(stringr::fixed(".")) |>
145 | stringr::str_sub(1, 5)
146 | hyphen_check(ICD9s, "ICD9")
147 | cat(" - N unique ICD9 codes:", length(ICD9s), "\n")
148 | }
149 |
150 | # get Read2 and CTV3s. First 5 characters only.
151 | gp_codes = NULL
152 | if (any(codes_df[,vocab_col] == "Read2")) {
153 | get_gp <- TRUE
154 | Read2s <- codes_df[ codes_df[,vocab_col] == "Read2" , codes_col ]
155 | Read2s <- stringr::str_sub(Read2s, 1, 5) |> unique()
156 | gp_codes <- Read2s
157 | cat(" - N unique Read2 codes:", length(Read2s), "\n")
158 | hyphen_check(Read2s, "Read2")
159 | # any <5 characters?
160 | nchar_Read2s = nchar(Read2s)
161 | nchar_Read2s_n5 = length(nchar_Read2s[ nchar_Read2s < 5 ])
162 | if (nchar_Read2s_n5 > 0) {
163 | cli::cli_abort(c(
164 | "i" = "Read2 codes must be at least 5 characters in length",
165 | "x" = "There {?is/are} {nchar_Read2s_n5} Read2 code{?s} < 5 characters."
166 | ))
167 | }
168 | }
169 | if (any(codes_df[,vocab_col] == "CTV3")) {
170 | get_gp <- TRUE
171 | CTV3s <- codes_df[ codes_df[,vocab_col] == "CTV3" , codes_col ]
172 | CTV3s <- stringr::str_sub(CTV3s, 1, 5) |> unique()
173 | if (is.null(gp_codes)) {
174 | gp_codes <- CTV3s
175 | } else {
176 | gp_codes <- c(gp_codes, CTV3s)
177 | }
178 | cat(" - N unique CTV3 codes:", length(CTV3s), "\n")
179 | hyphen_check(CTV3s, "CTV3")
180 | # any <5 characters?
181 | nchar_CTV3s = nchar(CTV3s)
182 | nchar_CTV3s_n5 = length(nchar_CTV3s[ nchar_CTV3s < 5 ])
183 | if (nchar_CTV3s_n5 > 0) {
184 | cli::cli_abort(c(
185 | "i" = "CTV3 codes must be at least 5 characters in length",
186 | "x" = "There {?is/are} {nchar_CTV3s_n5} CTV3 code{?s} < 5 characters.",
187 | ))
188 | }
189 | }
190 |
191 | # get OPCS codes? Remove "." dot characters. First 5 characters only.
192 | oper_codes = NULL
193 | if (any(codes_df[,vocab_col] == "OPCS3")) {
194 | get_oper <- TRUE
195 | OPCS3s <- codes_df |>
196 | dplyr::filter(!!rlang::sym(vocab_col) == "OPCS3") |>
197 | dplyr::select(!!rlang::sym(codes_col)) |>
198 | dplyr::pull() |>
199 | unique() |>
200 | stringr::str_remove(stringr::fixed(".")) |>
201 | stringr::str_sub(1, 5)
202 | cat(" - N unique OPCS3 codes:", length(OPCS3s), "\n")
203 | hyphen_check(OPCS3s, "OPCS3")
204 | oper_codes = OPCS3s
205 | }
206 | if (any(codes_df[,vocab_col] == "OPCS4")) {
207 | get_oper <- TRUE
208 | OPCS4s <- codes_df |>
209 | dplyr::filter(!!rlang::sym(vocab_col) == "OPCS4") |>
210 | dplyr::select(!!rlang::sym(codes_col)) |>
211 | dplyr::pull() |>
212 | unique() |>
213 | stringr::str_remove(stringr::fixed(".")) |>
214 | stringr::str_sub(1, 5)
215 | cat(" - N unique OPCS4 codes:", length(OPCS4s), "\n")
216 | hyphen_check(OPCS4s, "OPCS4")
217 | oper_codes = c(oper_codes, OPCS4s)
218 | }
219 |
220 | # check for self-reported codes
221 | n_selfrep = length(unique(codes_df[codes_df[,vocab_col] %in% c("ukb_cancer","ukb_noncancer"),codes_col]))
222 | if (n_selfrep>0) {
223 | get_selfrep <- TRUE
224 | cat(" - N unique UKB-self-reported codes:", n_selfrep, "\n")
225 | }
226 |
227 |
228 | #########################################################################################################
229 | #
230 | # check data is available, download if required
231 |
232 | # Check file paths are provided
233 | if (is.null(file_paths)) cli::cli_abort("Need to provide {.var file_paths}")
234 | if (! any(class(file_paths) %in% c("data.frame","tbl","tbl_df"))) {
235 | cli::cli_abort(c(
236 | "{.var file_paths} must be a data.frame or tibble",
237 | "x" = "You've supplied a {.cls {class(file_paths)}} vector."
238 | ))
239 | }
240 | if (colnames(file_paths)[1] != "object" | colnames(file_paths)[2] != "path") cli::cli_abort("{.var file_paths} needs two columns: `object` and `path`")
241 |
242 | # check all the required files are included
243 | must_include = "baseline_dates"
244 | if (get_icd10) must_include <- c(must_include, c("death","death_cause","hesin","hesin_diag"))
245 | if (get_icd9) must_include <- c(must_include, c("hesin","hesin_diag"))
246 | if (get_canreg) must_include <- c(must_include, c("cancer_registry"))
247 | if (get_gp) must_include <- c(must_include, c("gp_clinical"))
248 | if (get_oper) must_include <- c(must_include, c("hesin_oper"))
249 | if (get_selfrep) must_include <- c(must_include, c("selfrep_illness"))
250 | must_include = unique(must_include)
251 |
252 | for (file in must_include) if (! file %in% file_paths$object) cli::cli_abort("{.var file_paths} must contain {.path {file}}")
253 |
254 | # if files not already downloaded from RAP then download to user's home directory
255 | # only do this if the file paths are to "ukbrapr_data"
256 | files = file_paths$path[file_paths$object %in% must_include]
257 | if (stringr::str_detect(files[1], "ukbrapr_data")) {
258 |
259 | # get path to users home directory - create ukbrapr_data directory on worker (if already exists then nothing happens)
260 | home_path = as.character(Sys.getenv()["HOME"])
261 | dir.create(stringr::str_c(home_path, "/ukbrapr_data"), showWarnings = FALSE)
262 |
263 | # do any need downloading from RAP? Or already been done?
264 | # each file now has two paths: a RAP path and a local path:
265 | dx_files <- NULL
266 | home_files <- stringr::str_c(home_path, "/", files)
267 | for (ii in 1:length(files)) if (! file.exists(home_files[ii])) dx_files = c(dx_files, files[ii])
268 |
269 | # if any were missing, download them
270 | if (!is.null(dx_files)) {
271 |
272 | options(cli.progress_show_after = 0)
273 | cli::cli_progress_bar(format = "Downloading {.path {basename(file)}} from the RAP [{cli::pb_current}/{cli::pb_total}] {cli::pb_bar} {cli::pb_percent}", total = length(dx_files))
274 |
275 | # copy file from RAP space to instance
276 | for (file in dx_files) {
277 | cli::cli_progress_update()
278 | system(stringr::str_c("dx download \"${DX_PROJECT_CONTEXT_ID}:/", file, "\" -o \"", home_path, "/ukbrapr_data\""))
279 | }
280 | cli::cli_progress_done()
281 | options(cli.progress_show_after = 2)
282 |
283 | if (verbose) cli::cli_alert_info(c("Time taken so far: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}."))
284 |
285 | }
286 |
287 | # add home directory prefix to file paths
288 | file_paths$path = stringr::str_c(home_path, "/", file_paths$path)
289 |
290 | }
291 |
292 | # check files exist
293 | for (file in file_paths$path[file_paths$object %in% must_include]) if (! file.exists(file)) cli::cli_abort("Could not find file {.path {file}}")
294 |
295 | #
296 | #########################################################################################################
297 | #
298 |
299 | # Get data for each code vocabulary
300 | if (verbose) cli::cli_alert("Ascertaining codes from long EMR files")
301 | death_cause_tbl <- NULL # ICD10
302 | hesin_diag_tbl <- NULL # ICD10, ICD9
303 | cancer_registry_tbl <- NULL # ICD10
304 | gp_clinical_tbl <- NULL # Read2 / CTV3
305 | hesin_oper_tbl <- NULL # OPCS3 / OPCS4
306 | selfrep_illness_tbl <- NULL # ukb_cancer / ukb_noncancer
307 |
308 | if (get_icd10) {
309 |
310 | #
311 | # death data ###########################################
312 | #
313 |
314 | cli::cli_alert("Ascertaining cause of death data.")
315 |
316 | death_cause_path = file_paths$path[ file_paths$object=="death_cause" ]
317 |
318 | # create search string
319 | search_string <- paste0("grep -E ", sprintf('"%s"', stringr::str_flatten(ICD10s, collapse = "|")), " ", sprintf('%s', death_cause_path))
320 | if (verbose) cat(" -- search string: ", search_string, "\n")
321 |
322 | # get file headers
323 | headers <- colnames(readr::read_tsv(death_cause_path, n_max=1, show_col_types=FALSE, progress=FALSE))
324 | if (! "eid" %in% headers) headers[1] <- "eid"
325 |
326 | # use search string to only read lines that matched a code
327 | death_cause_tbl <- suppressWarnings(readr::read_tsv(pipe(search_string), col_names=headers, show_col_types=FALSE, progress=FALSE))
328 |
329 | # if any matches returned, make sure eid is formatted nicely (remove file name) and the dates are dates
330 | if (nrow(death_cause_tbl)>0) {
331 | # match with date of death data
332 | death_tbl = suppressWarnings(readr::read_tsv(file_paths$path[ file_paths$object=="death" ], show_col_types=FALSE, progress=FALSE))
333 | if (! "eid" %in% colnames(death_tbl)) colnames(death_tbl)[1] <- "eid"
334 | death_cause_tbl = dplyr::inner_join(death_tbl, death_cause_tbl, by=c("eid"="eid", "ins_index"="ins_index"))
335 |
336 | # format date col if not "Date"
337 | if (!lubridate::is.Date(death_cause_tbl$date_of_death)) death_cause_tbl <- death_cause_tbl |> dplyr::mutate(date_of_death = lubridate::dmy(date_of_death))
338 | }
339 |
340 | cli::cli_alert_success("Loaded {.var death_cause} with {nrow(death_cause_tbl)} matched rows.")
341 |
342 | if (verbose) cli::cli_alert_info(c("Time taken so far: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}."))
343 |
344 | #
345 | # cancer registry ####################################
346 | #
347 |
348 | # do any ICD10s start with a C? Skip if not.
349 | if (get_canreg) {
350 |
351 | cli::cli_alert("Ascertaining cancer registry data.")
352 |
353 | # load data
354 | cancer_registry_dat <- suppressWarnings(readr::read_tsv(file_paths$path[ file_paths$object=="cancer_registry" ], show_col_types = FALSE, progress = FALSE))
355 |
356 | # get cancer registry data for these ICD10s
357 | cancer_registry_tbl <- ukbrapR:::get_cancer_registry(codes = ICD10s, ukb_dat = cancer_registry_dat, verbose = verbose)
358 | cli::cli_alert_success("Loaded {.var cancer_registry} with {nrow(cancer_registry_tbl)} matched rows.")
359 |
360 | rm(cancer_registry_dat)
361 |
362 | if (verbose) cli::cli_alert_info(c("Time taken so far: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}."))
363 |
364 | }
365 |
366 | #
367 | # HES diagnosis data (ICD10s) ###########################################
368 | #
369 |
370 | cli::cli_alert("Ascertaining HES diagnosis data (ICD10s).")
371 |
372 | hesin_diag_path = file_paths$path[ file_paths$object=="hesin_diag" ]
373 |
374 | # create search string
375 | search_string <- paste0("grep -E ", sprintf('"%s"', stringr::str_flatten(ICD10s, collapse = "|")), " ", sprintf('%s', hesin_diag_path))
376 | if (verbose) cat(" -- search string: ", search_string, "\n")
377 |
378 | # get file headers
379 | headers <- colnames(readr::read_tsv(hesin_diag_path, n_max=1, show_col_types=FALSE, progress=FALSE))
380 | if (! "eid" %in% headers) headers[1] <- "eid"
381 |
382 | # use search string to only read lines that matched a code
383 | hesin_diag_tbl <- suppressWarnings(readr::read_tsv(pipe(search_string), col_names=headers, show_col_types=FALSE, progress=FALSE))
384 |
385 | # if any matches returned, make sure eid is formatted nicely (remove file name), and dates are dates
386 | if (nrow(hesin_diag_tbl)>0) {
387 | # match with HES episode data
388 | hesin_tbl = suppressWarnings(readr::read_tsv(file_paths$path[ file_paths$object=="hesin" ], show_col_types=FALSE, progress=FALSE))
389 | if (! "eid" %in% colnames(hesin_tbl)) colnames(hesin_tbl)[1] <- "eid"
390 | hesin_diag_tbl = dplyr::inner_join(hesin_tbl, hesin_diag_tbl, by=c("eid"="eid", "ins_index"="ins_index"))
391 |
392 | # format date cols if not "Date"
393 | date_cols = c("epistart", "epiend", "elecdate", "admidate", "disdate")
394 | for (dc in date_cols) {
395 | dc = rlang::sym(dc)
396 | if (!lubridate::is.Date(hesin_diag_tbl |> dplyr::select(!!dc) |> dplyr::pull())) {
397 | hesin_diag_tbl <- hesin_diag_tbl |> dplyr::mutate(!!dc := lubridate::dmy(!!dc))
398 | }
399 | }
400 | }
401 |
402 | cli::cli_alert_success("Loaded {.var hesin_diag} with {nrow(hesin_diag_tbl)} matched rows.")
403 |
404 | if (verbose) cli::cli_alert_info(c("Time taken so far: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}."))
405 |
406 | }
407 |
408 | #
409 | # ICD9 HES diagnosis data ###########################################
410 | #
411 | if (get_icd9) {
412 |
413 | cli::cli_alert("Ascertaining HES diagnosis data (ICD9s).")
414 |
415 | hesin_diag_path = file_paths$path[ file_paths$object=="hesin_diag" ]
416 |
417 | # create search string
418 | search_string <- paste0("grep -E ", sprintf('"%s"', stringr::str_flatten(ICD9s, collapse = "|")), " ", sprintf('%s', hesin_diag_path))
419 | if (verbose) cat(" -- search string: ", search_string, "\n")
420 |
421 | # get file headers
422 | headers <- colnames(readr::read_tsv(hesin_diag_path, n_max=1, show_col_types=FALSE, progress=FALSE))
423 | if (! "eid" %in% headers) headers[1] <- "eid"
424 |
425 | # use search string to only read lines that matched a code
426 | hesin_diag_tbl_icd9 <- suppressWarnings(readr::read_tsv(pipe(search_string), col_names=headers, show_col_types=FALSE, progress=FALSE))
427 |
428 | # check we have actually matched any ICD9s
429 | # exclude missing ICD9s (EIDs may have been matched)
430 | if (nrow(hesin_diag_tbl_icd9)>0)
431 | hesin_diag_tbl_icd9 <- hesin_diag_tbl_icd9 |> dplyr::filter(!is.na(diag_icd9))
432 | # check codes match START of code string
433 | ICD9_search = stringr::str_flatten(ICD9s, collapse = "|")
434 | if (nrow(hesin_diag_tbl_icd9)>0)
435 | hesin_diag_tbl_icd9 <- hesin_diag_tbl_icd9 |> dplyr::filter(stringr::str_starts(diag_icd9, !! ICD9_search))
436 |
437 | # if any matches returned, make sure eid is formatted nicely (remove file name), and dates are dates
438 | if (nrow(hesin_diag_tbl_icd9)>0) {
439 | # match with HES episode data
440 | hesin_tbl = suppressWarnings(readr::read_tsv(file_paths$path[ file_paths$object=="hesin" ], show_col_types=FALSE, progress=FALSE))
441 | if (! "eid" %in% colnames(hesin_tbl)) colnames(hesin_tbl)[1] <- "eid"
442 | hesin_diag_tbl_icd9 = dplyr::inner_join(hesin_tbl, hesin_diag_tbl_icd9, by=c("eid"="eid", "ins_index"="ins_index"))
443 |
444 | # format date cols if not "Date"
445 | date_cols = c("epistart", "epiend", "elecdate", "admidate", "disdate")
446 | for (dc in date_cols) {
447 | dc = rlang::sym(dc)
448 | if (!lubridate::is.Date(hesin_diag_tbl_icd9 |> dplyr::select(!!dc) |> dplyr::pull()))
449 | hesin_diag_tbl_icd9 <- hesin_diag_tbl_icd9 |> dplyr::mutate(!!dc := lubridate::dmy(!!dc))
450 | }
451 | }
452 |
453 | cli::cli_alert_success("Loaded {.var hesin_diag} with {nrow(hesin_diag_tbl_icd9)} matched rows.")
454 |
455 | # combine with other hesin table?
456 | if (!is.null(hesin_diag_tbl)) hesin_diag_tbl = rbind(hesin_diag_tbl, hesin_diag_tbl_icd9)
457 | if (is.null(hesin_diag_tbl)) hesin_diag_tbl = hesin_diag_tbl_icd9
458 |
459 | if (verbose) cli::cli_alert_info(c("Time taken so far: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}."))
460 |
461 | }
462 |
463 | #
464 | # HES operations data ###########################################
465 | #
466 | if (get_oper) {
467 |
468 | cli::cli_alert("Ascertaining HES operations data.")
469 |
470 | hesin_oper_path = file_paths$path[ file_paths$object=="hesin_oper" ]
471 |
472 | # create search string
473 | search_string <- paste0("grep -E ", sprintf('"%s"', stringr::str_flatten(oper_codes, collapse = "|")), " ", sprintf('%s', hesin_oper_path))
474 | if (verbose) cat(" -- search string: ", search_string, "\n")
475 |
476 | # get file headers
477 | headers <- colnames(readr::read_tsv(hesin_oper_path, n_max=1, show_col_types=FALSE, progress=FALSE))
478 | if (! "eid" %in% headers) headers[1] <- "eid"
479 |
480 | # use search string to only read lines that matched a code
481 | hesin_oper_tbl <- suppressWarnings(readr::read_tsv(pipe(search_string), col_names=headers, show_col_types=FALSE, progress=FALSE))
482 |
483 | # if any matches returned, make sure eid is formatted nicely (remove file name), and dates are dates
484 | # make sure OPCS3 are exact
485 | if (nrow(hesin_oper_tbl)>0) {
486 | if (OPCS4s[1] != "" & OPCS3s[1] == "") hesin_oper_tbl <- hesin_oper_tbl |> dplyr::filter(stringr::str_detect(oper4, stringr::str_flatten(!! OPCS4s, collapse = "|")))
487 | if (OPCS4s[1] == "" & OPCS3s[1] != "") hesin_oper_tbl <- hesin_oper_tbl |> dplyr::filter(stringr::str_starts(oper3, stringr::str_flatten(!! OPCS3s, collapse = "|")))
488 | if (OPCS4s[1] != "" & OPCS3s[1] != "") hesin_oper_tbl <- hesin_oper_tbl |> dplyr::filter(stringr::str_starts(oper3, stringr::str_flatten(!! OPCS3s, collapse = "|")) | stringr::str_detect(oper4, stringr::str_flatten(!!OPCS4s, collapse = "|")))
489 |
490 | if (is.character(hesin_oper_tbl$opdate)) hesin_oper_tbl$opdate <- lubridate::dmy(hesin_oper_tbl$opdate)
491 | }
492 |
493 | cli::cli_alert_success("Loaded {.var hesin_oper} with {nrow(hesin_oper_tbl)} matched rows.")
494 |
495 | if (verbose) cli::cli_alert_info(c("Time taken so far: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}."))
496 |
497 | }
498 |
499 | #
500 | # Ascertaining GP clinical ########################################################
501 | #
502 | if (get_gp) {
503 |
504 | cli::cli_alert("Ascertaining GP data.")
505 |
506 | gp_clinical_path = file_paths$path[ file_paths$object=="gp_clinical" ]
507 |
508 | # create search strings
509 | search_string <- paste0("grep -E ", sprintf('"%s"', stringr::str_flatten(gp_codes, collapse = "|")), " ", sprintf('%s', gp_clinical_path))
510 | if (verbose) cat(" -- search string: ", search_string, "\n")
511 |
512 | # get file headers
513 | headers <- colnames(readr::read_tsv(gp_clinical_path, n_max=1, show_col_types=FALSE, progress=FALSE))
514 | if (! "eid" %in% headers) headers[1] <- "eid"
515 |
516 | # use search string to only read lines that matched a code
517 | gp_clinical_tbl <- suppressWarnings(readr::read_tsv(pipe(search_string), col_names=headers, show_col_types=FALSE, progress=FALSE))
518 |
519 | # if any matches returned, make sure eid is formatted nicely (remove file name), the codes are definite matches, and the dates are dates
520 | if (nrow(gp_clinical_tbl)>0) {
521 | gp_clinical_tbl <- gp_clinical_tbl |> dplyr::filter(read_2 %in% !!Read2s | read_3 %in% !!CTV3s)
522 |
523 | # format date col if not "Date"
524 | if (!lubridate::is.Date(gp_clinical_tbl$event_dt)) gp_clinical_tbl <- gp_clinical_tbl |> dplyr::mutate(event_dt = lubridate::dmy(event_dt))
525 | }
526 |
527 | cli::cli_alert_success("Loaded {.var gp_clinical} with {nrow(gp_clinical_tbl)} matched rows.")
528 |
529 | if (verbose) cli::cli_alert_info(c("Time taken so far: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}."))
530 | }
531 |
532 | #
533 | # Ascertaining self-reported illness ########################################################
534 | #
535 | if (get_selfrep) {
536 |
537 | cli::cli_alert("Ascertaining self-reported illness data.")
538 |
539 | # load data
540 | selfrep_illness_dat <- suppressWarnings(readr::read_tsv(file_paths$path[ file_paths$object=="selfrep_illness" ], show_col_types = FALSE, progress = FALSE))
541 |
542 | # get self-reported illness data - convert to long
543 | selfrep_illness_tbl <- ukbrapR:::get_selfrep_illness(codes_df = codes_df, ukb_dat = selfrep_illness_dat, verbose = verbose)
544 | cli::cli_alert_success("Loaded {.var selfrep_illness} with {nrow(selfrep_illness_tbl)} matched rows.")
545 |
546 | rm(selfrep_illness_dat)
547 |
548 | }
549 |
550 |
551 | #
552 | #
553 | #
554 |
555 | cli::cli_alert_success(c("Finished. Time taken: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}."))
556 |
557 | # Return data as list
558 | output_list <- list(
559 | gp_clinical=gp_clinical_tbl,
560 | hesin_diag=hesin_diag_tbl,
561 | death_cause=death_cause_tbl,
562 | cancer_registry=cancer_registry_tbl,
563 | hesin_oper=hesin_oper_tbl,
564 | selfrep_illness=selfrep_illness_tbl,
565 | codes_df=tibble::as_tibble(codes_df)
566 | )
567 | class(output_list) <- "ukbrapr_emr"
568 | return(output_list)
569 |
570 | }
571 |
572 |
573 | #' The old get_emr() function
574 | #'
575 | #' @export
576 | #' @noRd
577 | get_emr <- function(
578 | codes_df,
579 | spark_master = "spark://master:41000",
580 | file_paths = NULL,
581 | verbose = FALSE
582 | ) {
583 | lifecycle::deprecate_stop("0.2.0", "get_emr()", "get_diagnoses()")
584 | }
585 |
586 |
--------------------------------------------------------------------------------
/R/get_df.R:
--------------------------------------------------------------------------------
1 | #' Get UK Biobank participant Date First (DF) diagnosis
2 | #'
3 | #' @description For each participant identify the date of first diagnosis from all available electronic medical records & self-reported data.
4 | #'
5 | #' If `use_baseline_dates=TRUE` (the default) then will also produce a binary 0/1 variable, indicating the controls (people without a diagnosis) and setting the date first `_df` field to the date of censoring (currently 30 October 2022).
6 | #'
7 | #' @return Returns a single, "wide" data frame: the participant data for the requested diagnosis codes with "date first" `_df` variables. One for each source of data, and a combined variable.
8 | #'
9 | #' @author Luke Pilling
10 | #'
11 | #' @name get_df
12 | #'
13 | #' @param diagnosis_list A list of data frames. The output of `get_diagnoses()` i.e., the raw diagnosis and self-reported illness data that matched the provided codes list.
14 | #' @param prefix String. Prefix to add to variable names (e.g., if prefix="chd" the output variables would be "chd_gp_df", "chd_hes_df", "chd_df" etc.)
15 | #' \code{default=NULL}
16 | #' @param group_by String. If the codes list provided to `get_diagnoses()` (i.e., in diagnosis_list$codes_df) contained a grouping/condition variable, indicate the variable name here.
17 | #' "Date first" variables will be created for each prefix in the grouping variable. The `prefix` option is ignored, in favour of the names in the grouping variable.
18 | #' \code{default=NULL}
19 | #' @param include_selfrep_illness logical. Include self-reported diagnosesin the combined Date First output? If present in `diagnosis_list` will still provide a separate `_df` variable
20 | #' \code{default=TRUE}
21 | #' @param include_death_cause logical. Include the cause of death in the combined Date First output? If present in `diagnosis_list` will still provide a separate `_df` variable
22 | #' \code{default=TRUE}
23 | #' @param include_gp_clinical logical. Include the GP data in the combined Date First output? If present in `diagnosis_list` will still provide a separate `_df` variable
24 | #' \code{default=TRUE}
25 | #' @param include_hesin_diag logical. Include the HES diagnosis data in the combined Date First output? If present in `diagnosis_list` will still provide a separate `_df` variable
26 | #' \code{default=TRUE}
27 | #' @param include_hesin_oper logical. Include the HES OPCS (operations) data in the combined Date First output? If present in `diagnosis_list` will still provide a separate `_df` variable
28 | #' \code{default=TRUE}
29 | #' @param include_cancer_registry logical. Include the cancer registry data in the combined Date First output? If present in `diagnosis_list` will still provide a separate `_df` variable
30 | #' \code{default=TRUE}
31 | #' @param use_baseline_dates logical. If `baseline_dates` available in file paths, produce a binary 0/1 variable, indicating the controls (people without a diagnosis) and setting the date first `_df` field to the date of censoring (currently see `censoring_date` option).
32 | #' \code{default=TRUE}
33 | #' @param file_paths A data frame. Columns must be `object` and `path` containing paths to outputted files. If not provided will use those in `ukbrapr_paths`
34 | #' \code{default=NULL}
35 | #' @param censoring_date A string. If using baseline data to infer control participants, include a censoring date (set to NA if not desired). Use dd-mm-yyyy format. Default is the (current) HES date.
36 | #' \code{default="30-10-2022"}
37 | #' @param verbose Logical. Be verbose,
38 | #' \code{default=FALSE}
39 | #'
40 | #' @examples
41 | #'
42 | #' ###############################################
43 | #' # example 1. haemochromatosis
44 | #'
45 | #' # get diagnosis data - returns list of data frames (one per source)
46 | #' diagnosis_list <- get_diagnoses(ukbrapR:::codes_df_hh)
47 | #'
48 | #' # for each participant, get Date First diagnosed with the condition
49 | #' diagnosis_df <- get_df(diagnosis_list, prefix="hh")
50 | #'
51 | #' ###############################################
52 | #' # example 2. get multiple diseases at once
53 | #' # don't have to all have the same code types/data sources
54 | #'
55 | #' codes = rbind(ukbrapR:::codes_df_hh, ukbrapR:::codes_df_ckd)
56 | #' print(codes)
57 | #'
58 | #' # get diagnosis data - returns list of data frames (one per source)
59 | #' diagnosis_list <- get_diagnoses(codes)
60 | #'
61 | #' # for each participant, get Date First diagnosed with the condition
62 | #' diagnosis_df <- get_df(diagnosis_list, group_by="condition")
63 | #'
64 | #' @export
65 | #'
66 | get_df <- function(
67 | diagnosis_list,
68 | prefix = NULL,
69 | group_by = NULL,
70 | include_selfrep_illness = TRUE,
71 | include_death_cause = TRUE,
72 | include_gp_clinical = TRUE,
73 | include_hesin_diag = TRUE,
74 | include_hesin_oper = TRUE,
75 | include_cancer_registry = TRUE,
76 | use_baseline_dates = TRUE,
77 | file_paths = NULL,
78 | censoring_date = "30-10-2022",
79 | verbose = FALSE
80 | ) {
81 |
82 | v <- packageVersion("ukbrapR")
83 | cli::cli_alert_info("ukbrapR v{v}")
84 |
85 | start_time <- Sys.time()
86 |
87 | # use baseline dates?
88 | if (use_baseline_dates) {
89 |
90 | # Is this one of my systems? If so, get the internal file_paths
91 | nodename <- as.character(Sys.info()['nodename'])
92 | if ( is.null(file_paths) & nodename %in% c("SNOW","SHAPTER") ) {
93 | file_paths = ukbrapR:::snow_paths
94 | if (verbose) cli::cli_alert_info("Identified server {nodename} - using predefined paths.")
95 | }
96 | if ( is.null(file_paths) & nodename == "indy.ex.ac.uk" ) {
97 | file_paths = ukbrapR:::indy_paths
98 | if (verbose) cli::cli_alert_info("Identified server {nodename} - using predefined paths.")
99 | }
100 |
101 | # if file_paths not provided assume default paths
102 | if (is.null(file_paths)) {
103 | file_paths = ukbrapR:::ukbrapr_paths
104 | # add users home directory to path
105 | home_path = as.character(Sys.getenv()["HOME"])
106 | file_paths$path = stringr::str_c(home_path, "/", file_paths$path)
107 | }
108 |
109 | # does baseline_dates file exist?
110 | # -- this is downloaded to users home directory so need to add this to the path
111 | bl_file_path = file_paths$path[ file_paths$object=="baseline_dates" ]
112 | if (file.exists(bl_file_path)) {
113 |
114 | # read baseline dates
115 | bl_data = readr::read_tsv(bl_file_path, show_col_types = FALSE, progress = FALSE)
116 |
117 | # rename assessment date (p53_i0) for ease later
118 | bl_data = bl_data |>
119 | dplyr::rename(assessment_date_0 = p53_i0) |>
120 | dplyr::select(eid, assessment_date_0)
121 |
122 | # censoring date provided?
123 | if (!is.na(censoring_date)) censoring_date = lubridate::dmy(censoring_date)
124 |
125 | } else {
126 | use_baseline_dates = FALSE
127 | cli::cli_alert_warning("Could not find \"baseline dates\" file at path {.file {bl_file_path}} - continued without using it")
128 | }
129 | }
130 |
131 | # are we using a grouping variable?
132 | if (is.null(group_by)) {
133 |
134 | df_tbl = ukbrapR:::get_df1(
135 | diagnosis_list=diagnosis_list,
136 | include_selfrep_illness=include_selfrep_illness,
137 | include_gp_clinical=include_gp_clinical,
138 | include_death_cause=include_death_cause,
139 | include_hesin_diag=include_hesin_diag,
140 | include_cancer_registry=include_cancer_registry,
141 | include_hesin_oper=include_hesin_oper,
142 | prefix=prefix,
143 | verbose=verbose
144 | )
145 |
146 | # add binary variables (ever, prev) & censoring date (if provided)
147 | if (use_baseline_dates) df_tbl = ukbrapR:::get_df1_add_bin(df=df_tbl, bd=bl_data, cd=censoring_date, prefix=prefix, verbose=verbose)
148 |
149 | } else {
150 |
151 | if (verbose) cli::cli_alert("Grouping variable detected - checking codes")
152 |
153 | # check input codes and group variable
154 | if (class(diagnosis_list) != "ukbrapr_emr") cli::cli_alert_warning(c("{.var diagnosis_list} should be of class {.cls ukbrapr_emr}", "x" = "You've supplied a {.cls {class(diagnosis_list)}} - behaviour may not be as intended."))
155 |
156 | codes = as.data.frame(diagnosis_list[['codes_df']])
157 |
158 | if (! group_by %in% colnames(codes)) cli::cli_abort("{.var diagnosis_list} codes data frame needs to contain the group column {group_by}")
159 | if (class(codes[,group_by]) != "character") cli::cli_abort(c("Group column {group_by} needs to be a character vector", "x" = "You've supplied a {.cls {class(codes[,group_by])}}."))
160 |
161 | # for each grouping variable, subset diagnostic data, run get_df1(), and combine output
162 | groups = unique(codes[,group_by])
163 | df_tbl = NULL
164 | cli::cli_alert("{length(groups)} group{?s} identified - getting date first for each")
165 |
166 | for (group in groups) {
167 |
168 | if (verbose) cli::cli_alert("Doing group {group}")
169 |
170 | # subset diagnostic codes
171 | codes_sub = codes[codes[,group_by]==group,]
172 | diagnosis_list_sub = diagnosis_list
173 |
174 | ## gp clinical
175 | gp_clinical_sub <- NULL
176 | if (!is.null(diagnosis_list_sub$gp_clinical) & any(codes_sub$vocab_id %in% c("Read2","CTV3"))) {
177 | Read2s = ""
178 | CTV3s = ""
179 | if (any(codes_sub$vocab_id == "Read2")) {
180 | Read2s <- codes_sub$code[codes_sub$vocab_id == "Read2"]
181 | Read2s <- stringr::str_sub(Read2s, 1, 5) |> unique()
182 | }
183 | if (any(codes_sub$vocab_id == "CTV3")) {
184 | CTV3s = codes_sub$code[codes_sub$vocab_id == "CTV3"]
185 | CTV3s <- stringr::str_sub(CTV3s, 1, 5) |> unique()
186 | }
187 | gp_clinical_sub <- diagnosis_list_sub$gp_clinical |> dplyr::filter(read_2 %in% !!Read2s | read_3 %in% !!CTV3s)
188 | }
189 | diagnosis_list_sub$gp_clinical <- gp_clinical_sub
190 |
191 | # create ICD10 search string
192 | if (any(codes_sub$vocab_id == "ICD10")) {
193 | ICD10s <- codes_sub |>
194 | dplyr::filter(vocab_id == "ICD10") |>
195 | dplyr::select(code) |>
196 | dplyr::pull() |>
197 | unique() |>
198 | stringr::str_remove(stringr::fixed(".")) |>
199 | stringr::str_sub(1, 5)
200 | ICD10_search = stringr::str_flatten(ICD10s, collapse = "|")
201 | }
202 |
203 | ## hesin_diag
204 | hesin_diag_sub = NULL
205 | if (!is.null(diagnosis_list_sub$hesin_diag) & any(codes_sub$vocab_id %in% c("ICD10","ICD9"))) {
206 |
207 | if (any(codes_sub$vocab_id == "ICD10")) {
208 | colnames(diagnosis_list_sub$hesin_diag) = tolower(colnames(diagnosis_list_sub$hesin_diag))
209 | hesin_diag_sub = diagnosis_list_sub$hesin_diag |> dplyr::filter(stringr::str_detect(diag_icd10, !! ICD10_search))
210 | }
211 |
212 | if (any(codes_sub$vocab_id == "ICD9")) {
213 | ICD9s = ""
214 | if (any(codes_sub$vocab_id == "ICD9")) {
215 | ICD9s <- codes_sub |>
216 | dplyr::filter(vocab_id == "ICD9") |>
217 | dplyr::select(code) |>
218 | dplyr::pull() |>
219 | unique() |>
220 | stringr::str_remove(stringr::fixed(".")) |>
221 | stringr::str_sub(1, 5)
222 | }
223 | ICD9_search = stringr::str_flatten(ICD9s, collapse = "|")
224 | colnames(diagnosis_list_sub$hesin_diag) = tolower(colnames(diagnosis_list_sub$hesin_diag))
225 | hesin_diag_sub = rbind(hesin_diag_sub, diagnosis_list_sub$hesin_diag |> dplyr::filter(stringr::str_starts(diag_icd9, !! ICD9_search)))
226 | }
227 |
228 | }
229 | diagnosis_list_sub$hesin_diag <- hesin_diag_sub
230 |
231 | ## death_cause
232 | death_cause_sub <- NULL
233 | if (!is.null(diagnosis_list_sub$death_cause) & any(codes_sub$vocab_id == "ICD10")) {
234 | death_cause_sub <- diagnosis_list_sub$death_cause |> dplyr::filter(stringr::str_detect( cause_icd10, !! ICD10_search))
235 | }
236 | diagnosis_list_sub$death_cause <- death_cause_sub
237 |
238 | ## cancer_registry
239 | cancer_registry_sub <- NULL
240 | if (!is.null(diagnosis_list_sub$cancer_registry) & any(codes_sub$vocab_id == "ICD10")) {
241 | cancer_registry_sub <- diagnosis_list_sub$cancer_registry |> dplyr::filter(stringr::str_detect( icd10, !! ICD10_search))
242 | }
243 | diagnosis_list_sub$cancer_registry <- cancer_registry_sub
244 |
245 | ## hesin_oper
246 | hesin_oper_sub <- NULL
247 | if (!is.null(diagnosis_list_sub$hesin_oper) & any(codes_sub$vocab_id %in% c("OPCS3","OPCS4"))) {
248 |
249 | # get OPCS4 codes to search for
250 | OPCS4s = ""
251 | if (any(codes_sub$vocab_id == "OPCS4")) {
252 | OPCS4s <- codes_sub |>
253 | dplyr::filter(vocab_id == "OPCS4") |>
254 | dplyr::select(code) |>
255 | dplyr::pull() |>
256 | unique() |>
257 | stringr::str_remove(stringr::fixed(".")) |>
258 | stringr::str_sub(1, 5)
259 | }
260 |
261 | # get OPCS3 codes to search for
262 | OPCS3s = ""
263 | if (any(codes_sub$vocab_id == "OPCS3")) {
264 | OPCS3s <- codes_sub |>
265 | dplyr::filter(vocab_id == "OPCS3") |>
266 | dplyr::select(code) |>
267 | dplyr::pull() |>
268 | unique() |>
269 | stringr::str_remove(stringr::fixed(".")) |>
270 | stringr::str_sub(1, 5)
271 | }
272 |
273 | # subset hesin_oper to those matching either
274 | OPCS4_search = stringr::str_flatten(OPCS4s, collapse = "|")
275 | OPCS3_search = stringr::str_flatten(OPCS3s, collapse = "|")
276 |
277 | if (OPCS4s[1] != "" & OPCS3s[1] == "") hesin_oper_sub <- diagnosis_list_sub$hesin_oper |> dplyr::filter(stringr::str_detect(oper4, !! OPCS4_search))
278 | if (OPCS4s[1] == "" & OPCS3s[1] != "") hesin_oper_sub <- diagnosis_list_sub$hesin_oper |> dplyr::filter(stringr::str_detect(oper3, !! OPCS3_search))
279 | if (OPCS4s[1] != "" & OPCS3s[1] != "") hesin_oper_sub <- diagnosis_list_sub$hesin_oper |> dplyr::filter(stringr::str_detect(oper4, !! OPCS4_search) | stringr::str_detect(oper3, !! OPCS3_search))
280 | }
281 | diagnosis_list_sub$hesin_oper <- hesin_oper_sub
282 |
283 | ## self-reported illness
284 | selfrep_illness_sub <- NULL
285 | if (!is.null(diagnosis_list_sub$selfrep_illness) & any(codes_sub$vocab_id %in% c("ukb_cancer","ukb_noncancer"))) {
286 | if (any(codes_sub$vocab_id == "ukb_cancer")) {
287 | codes_cancer = codes_sub$code[ codes_sub$vocab_id == "ukb_cancer" ]
288 | selfrep_illness_sub <- diagnosis_list_sub$selfrep_illness |> dplyr::filter(cancer_code %in% codes_cancer)
289 | }
290 | if (any(codes_sub$vocab_id == "ukb_noncancer")) {
291 | codes_noncancer = codes_sub$code[ codes_sub$vocab_id == "ukb_noncancer" ]
292 | selfrep_illness_sub <- diagnosis_list_sub$selfrep_illness |> dplyr::filter(noncancer_code %in% codes_noncancer)
293 | }
294 | }
295 | diagnosis_list_sub$selfrep_illness <- selfrep_illness_sub
296 | if (!any(codes_sub$vocab_id %in% c("ukb_cancer","ukb_noncancer"))) include_selfrep_illness <- FALSE
297 |
298 |
299 | #
300 | #
301 | # get DF for this condition
302 | df_tbl_sub = ukbrapR:::get_df1(
303 | diagnosis_list=diagnosis_list_sub,
304 | include_selfrep_illness=include_selfrep_illness,
305 | include_gp_clinical=include_gp_clinical,
306 | include_death_cause=include_death_cause,
307 | include_hesin_diag=include_hesin_diag,
308 | include_cancer_registry=include_cancer_registry,
309 | include_hesin_oper=include_hesin_oper,
310 | prefix=group,
311 | verbose=verbose
312 | )
313 |
314 | # add binary variables (ever, prev) & censoring date (if provided)
315 | if (use_baseline_dates) df_tbl_sub = ukbrapR:::get_df1_add_bin(df=df_tbl_sub, bd=bl_data, cd=censoring_date, prefix=group, verbose=verbose)
316 |
317 | # merge with main DF table
318 | if (is.null(df_tbl)) {
319 | df_tbl = df_tbl_sub
320 | } else {
321 | df_tbl = dplyr::full_join(df_tbl, df_tbl_sub, by="eid")
322 | }
323 |
324 | }
325 |
326 | cli::cli_alert_info("Finished getting date first diagnosed for each group/condition.")
327 |
328 | }
329 |
330 | if (verbose) cli::cli_alert_success(c("Time taken: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}."))
331 |
332 | # return table
333 | return(df_tbl)
334 |
335 | }
336 |
337 |
338 | #' Get UK Biobank participant Date First (DF) diagnosis for one condition
339 | #'
340 | #' @description For each participant identify the date of first diagnosis from all available electronic medical records & self-reported data.
341 | #'
342 | #' @return Returns a single, "wide" data frame: the participant data for the requested diagnosis codes with "date first" `_df` variables. One for each source of data, and a combined variable.
343 | #'
344 | #' @author Luke Pilling
345 | #'
346 | #' @name get_df1
347 | #'
348 | #' @noRd
349 | get_df1 <- function(
350 | diagnosis_list,
351 | include_selfrep_illness = TRUE,
352 | include_gp_clinical = TRUE,
353 | include_death_cause = TRUE,
354 | include_hesin_diag = TRUE,
355 | include_cancer_registry = TRUE,
356 | include_hesin_oper = TRUE,
357 | prefix = NULL,
358 | verbose = FALSE
359 | ) {
360 |
361 | #start_time <- Sys.time()
362 |
363 | # Check input
364 | if (verbose) cli::cli_alert("Check inputs\n")
365 | if (class(diagnosis_list) != "ukbrapr_emr") cli::cli_alert_warning(c("{.var diagnosis_list} should be of class {.cls ukbrapr_emr}", "x" = "You've supplied a {.cls {class(diagnosis_list)}} - behaviour may not be as intended."))
366 |
367 | # "use" if there is any data (i.e., provide an individual _df column) -- "include" in the main combined only if specified by user
368 | use_selfrep <- use_gp_clinical <- use_death_cause <- use_hesin_diag <- use_cancer_registry <- use_hesin_oper <- FALSE
369 |
370 | if (include_selfrep_illness) if ( !is.null(diagnosis_list$selfrep_illness) ) if ( nrow(diagnosis_list$selfrep_illness)>0 ) use_selfrep <- TRUE
371 |
372 | if ( !is.null(diagnosis_list$gp_clinical) ) if ( nrow(diagnosis_list$gp_clinical)>0 ) use_gp_clinical <- TRUE
373 | if ( !is.null(diagnosis_list$death_cause) ) if ( nrow(diagnosis_list$death_cause)>0 ) use_death_cause <- TRUE
374 | if ( !is.null(diagnosis_list$hesin_diag) ) if ( nrow(diagnosis_list$hesin_diag)>0 ) use_hesin_diag <- TRUE
375 | if ( !is.null(diagnosis_list$cancer_registry) ) if ( nrow(diagnosis_list$cancer_registry)>0 ) use_cancer_registry <- TRUE
376 | if ( !is.null(diagnosis_list$hesin_oper) ) if ( nrow(diagnosis_list$hesin_oper)>0 ) use_hesin_oper <- TRUE
377 |
378 | #
379 | #
380 | #
381 |
382 | # Convert self-reported illness to "wide" Date First
383 | if (use_selfrep) {
384 | if (verbose) cli::cli_alert("Get date first diagnosis: selfrep_df\n")
385 | selfrep_illness <- ukbrapR:::get_selfrep_illness_df(codes_df=diagnosis_list$codes_df, ukb_dat=diagnosis_list$selfrep_illness, verbose=verbose)
386 | }
387 |
388 | # Convert gp_clinical to "wide" Date First
389 | if (use_gp_clinical) {
390 | if (verbose) cli::cli_alert("Get date first diagnosis: gp_df\n")
391 | gp_clinical <- diagnosis_list$gp_clinical |>
392 | dplyr::filter(!is.na(event_dt)) |>
393 | dplyr::group_by(eid) |>
394 | dplyr::summarize(gp_df=min(event_dt, na.rm=TRUE)) |>
395 | dplyr::mutate(gp_df = dplyr::if_else(is.finite(gp_df), gp_df, NA))
396 | }
397 |
398 | # Convert death_cause to "wide" Date First
399 | if (use_death_cause) {
400 | if (verbose) cli::cli_alert("Get date first diagnosis: death_df\n")
401 | death_cause <- diagnosis_list$death_cause |>
402 | dplyr::filter(!is.na(date_of_death)) |>
403 | dplyr::group_by(eid) |>
404 | dplyr::summarize(death_df=min(date_of_death, na.rm=TRUE)) |>
405 | dplyr::mutate(death_df = dplyr::if_else(is.finite(death_df), death_df, NA))
406 | }
407 |
408 | # Convert hesin_diag to "wide" Date First
409 | if (use_hesin_diag) {
410 | if (verbose) cli::cli_alert("Get date first diagnosis: hes_df\n")
411 | hesin_diag <- diagnosis_list$hesin_diag |>
412 | dplyr::mutate(diagnosis_date = epistart) |>
413 | dplyr::mutate(diagnosis_date = dplyr::if_else(is.na(diagnosis_date), epiend, diagnosis_date)) |>
414 | dplyr::mutate(diagnosis_date = dplyr::if_else(is.na(diagnosis_date), admidate, diagnosis_date)) |>
415 | dplyr::mutate(diagnosis_date = lubridate::as_date(dplyr::if_else(is.na(diagnosis_date), disdate, diagnosis_date))) |>
416 | dplyr::filter(!is.na(diagnosis_date)) |>
417 | dplyr::group_by(eid) |>
418 | dplyr::summarize(hes_df=min(diagnosis_date, na.rm=TRUE)) |>
419 | dplyr::mutate(hes_df = dplyr::if_else(is.finite(hes_df), hes_df, NA))
420 | }
421 |
422 | # Convert cancer registry to "wide" Date First
423 | if (use_cancer_registry) {
424 | if (verbose) cli::cli_alert("Get date first diagnosis: canreg_df\n")
425 | cancer_registry <- ukbrapR:::get_cancer_registry_df(codes_df=diagnosis_list$codes_df, ukb_dat=diagnosis_list$cancer_registry, verbose=verbose)
426 | }
427 |
428 | # Convert hesin_oper to "wide" Date First
429 | if (use_hesin_oper) {
430 | if (verbose) cli::cli_alert("Get date first diagnosis: oper_df\n")
431 | hesin_oper <- diagnosis_list$hesin_oper |>
432 | dplyr::filter(!is.na(opdate)) |>
433 | dplyr::group_by(eid) |>
434 | dplyr::summarize(oper_df=min(opdate, na.rm=TRUE)) |>
435 | dplyr::mutate(oper_df = dplyr::if_else(is.finite(oper_df), oper_df, NA))
436 | }
437 |
438 | #
439 | #
440 | #
441 |
442 | # Combine into single data frame
443 | if (verbose) cli::cli_alert("Combine into single wide data frame\n")
444 | diagnosis_df <- NULL
445 | if (use_selfrep) {
446 | diagnosis_df <- selfrep_illness
447 | }
448 | if (use_gp_clinical) {
449 | if (is.null(diagnosis_df)) {
450 | diagnosis_df <- gp_clinical
451 | } else {
452 | diagnosis_df <- dplyr::full_join(diagnosis_df, gp_clinical, by="eid")
453 | }
454 | }
455 | if (use_hesin_diag) {
456 | if (is.null(diagnosis_df)) {
457 | diagnosis_df <- hesin_diag
458 | } else {
459 | diagnosis_df <- dplyr::full_join(diagnosis_df, hesin_diag, by="eid")
460 | }
461 | }
462 | if (use_death_cause) {
463 | if (is.null(diagnosis_df)) {
464 | diagnosis_df <- death_cause
465 | } else {
466 | diagnosis_df <- dplyr::full_join(diagnosis_df, death_cause, by="eid")
467 | }
468 | }
469 | if (use_cancer_registry) {
470 | if (is.null(diagnosis_df)) {
471 | diagnosis_df <- cancer_registry
472 | } else {
473 | diagnosis_df <- dplyr::full_join(diagnosis_df, cancer_registry, by="eid")
474 | }
475 | }
476 | if (use_hesin_oper) {
477 | if (is.null(diagnosis_df)) {
478 | diagnosis_df <- hesin_oper
479 | } else {
480 | diagnosis_df <- dplyr::full_join(diagnosis_df, hesin_oper, by="eid")
481 | }
482 | }
483 |
484 | #
485 | #
486 | #
487 |
488 | # Combined "date first, any source" variable & "source" variable
489 | if (verbose) cli::cli_alert("Combined \"date first, any source\" variable\n")
490 | diagnosis_df$df <- NA
491 | diagnosis_df$src <- ""
492 |
493 | if (include_selfrep_illness & use_selfrep) {
494 | diagnosis_df <- diagnosis_df |>
495 | dplyr::mutate(
496 | df = selfrep_df,
497 | src = dplyr::if_else(! is.na(selfrep_df), stringr::str_c("selfrep_i", selfrep_i), NA)) |>
498 | dplyr::select(-selfrep, -selfrep_i)
499 | }
500 |
501 | if (include_gp_clinical & use_gp_clinical) {
502 | diagnosis_df <- diagnosis_df |> dplyr::mutate(
503 | src = dplyr::case_when(
504 | !is.na(gp_df) & is.na(df) ~ "gp",
505 | !is.na(gp_df) & !is.na(df) & gp_df dplyr::mutate(
516 | src = dplyr::case_when(
517 | !is.na(hes_df) & is.na(df) ~ "hes",
518 | !is.na(hes_df) & !is.na(df) & hes_df dplyr::mutate(
529 | src = dplyr::case_when(
530 | !is.na(death_df) & is.na(df) ~ "death",
531 | !is.na(death_df) & !is.na(df) & death_df dplyr::mutate(
542 | src = dplyr::case_when(
543 | !is.na(canreg_df) & is.na(df) ~ "canreg",
544 | !is.na(canreg_df) & !is.na(df) & canreg_df
551 | dplyr::select(-canreg)
552 |
553 | }
554 |
555 | if (include_hesin_oper & use_hesin_oper) {
556 | diagnosis_df <- diagnosis_df |> dplyr::mutate(
557 | src = dplyr::case_when(
558 | !is.na(oper_df) & is.na(df) ~ "hesin_oper",
559 | !is.na(oper_df) & !is.na(df) & oper_df dplyr::filter(src!="" & !is.na(df))
570 |
571 | # adding variable name prefix?
572 | if (!is.null(prefix)) {
573 | if (is.character(prefix) & length(prefix) == 1) {
574 | names(diagnosis_df)[2:ncol(diagnosis_df)] = stringr::str_c(prefix, "_", names(diagnosis_df)[2:ncol(diagnosis_df)])
575 | } else {
576 | cli::cli_alert_warning("Prefix was not a single string - variables names left as default")
577 | }
578 | }
579 |
580 | #
581 | #
582 | # done!
583 |
584 | #if (verbose) cat("Done. Time taken:", Sys.time() - start_time, "\n")
585 |
586 | diagnosis_df_nrow = nrow(diagnosis_df)
587 | if (is.null(prefix)) cli::cli_alert_success("Identified date of first diagnosis in {diagnosis_df_nrow} participants.")
588 | if (!is.null(prefix)) cli::cli_alert_success("Identified date of first {prefix} diagnosis in {diagnosis_df_nrow} participants.")
589 |
590 | # Return data frame
591 | diagnosis_df
592 |
593 | }
594 |
595 |
596 |
597 | #' Add binary variables and censoring date
598 | #'
599 | #' @description Ever and prevalent binary vars. Censoring date. Only to the combined _df variable
600 | #'
601 | #' @return NA
602 | #'
603 | #' @author Luke Pilling
604 | #'
605 | #' @name get_df1_add_bin
606 | #'
607 | #' @noRd
608 | get_df1_add_bin = function(
609 | df,
610 | bd,
611 | cd,
612 | prefix = NULL,
613 | verbose = FALSE
614 | ) {
615 |
616 | if (verbose) cli::cli_alert("Creating binary \"ever diagnosed\" field - adding censoring date {cd} to date first `_df` field")
617 |
618 | # if no prefix them colnames are just `df` etc. - if one provided then include an underscore
619 | if (is.null(prefix)) {
620 | prefix = ""
621 | } else {
622 | prefix = stringr::str_c(prefix, "_")
623 | }
624 |
625 | # define new variable names
626 | var_df = rlang::sym(stringr::str_c(prefix, "df"))
627 | var_bin = rlang::sym(stringr::str_c(prefix, "bin"))
628 | var_bin_prev = rlang::sym(stringr::str_c(prefix, "bin_prev"))
629 |
630 | # merge df and baseline data
631 | df = dplyr::full_join(df, bd, by="eid")
632 |
633 | # create binary "ever"
634 | df = df |> dplyr::mutate(!!var_bin := dplyr::if_else(!is.na(!!var_df), 1, 0))
635 |
636 | # create prevalent variables
637 | df = df |> dplyr::mutate(!!var_bin_prev := dplyr::if_else(!!var_bin==1 & !!var_df dplyr::select(!assessment_date_0)
641 |
642 | # relocate src to end
643 | df = df |> dplyr::relocate(!!rlang::sym(stringr::str_c(prefix, "src")), .after = dplyr::last_col())
644 |
645 | # add censoring date
646 | if (!is.na(cd)) df = df |> dplyr::mutate(!!var_df := dplyr::if_else(!!var_bin==0, cd, !!var_df))
647 |
648 | # return
649 | return(df)
650 | }
651 |
652 |
653 | #' Get date first for cancer registry data
654 | #'
655 | #' @return NA
656 | #'
657 | #' @author Luke Pilling
658 | #'
659 | #' @name get_cancer_registry_df
660 | #'
661 | #' @noRd
662 |
663 | get_cancer_registry_df <- function(
664 | codes_df,
665 | ukb_dat,
666 | verbose = FALSE
667 | ) {
668 |
669 | start_time <- Sys.time()
670 |
671 | if (verbose) cat("Getting cancer registry data\n")
672 |
673 | # format codes
674 | vocab_col = "vocab_id"
675 | codes_col = "code"
676 |
677 | codes <- codes_df |>
678 | dplyr::filter(!!rlang::sym(vocab_col) == "ICD10") |>
679 | dplyr::select(!!rlang::sym(codes_col)) |>
680 | dplyr::pull() |>
681 | unique() |>
682 | stringr::str_remove(stringr::fixed(".")) |>
683 | stringr::str_sub(1, 5)
684 | codes_string = stringr::str_flatten(codes, collapse = "|")
685 |
686 | # create empty vars in ukb_dat to modify
687 | ukb_dat$canreg <- 0
688 | ukb_dat$canreg_df <- NA
689 |
690 | # for this instance, check if participant self-reported this code and record which array
691 |
692 | # Number of diagnosis columns
693 | n_i <- length(unique(ukb_dat$instance))
694 |
695 | # Iterate through each diagnosis column
696 | for (i in 0:(n_i-1)) {
697 |
698 | if (verbose) cat("Get cancer registry data from instance ", i, "\n")
699 |
700 | # Update where the code matches
701 | ukb_dat <- ukb_dat |> dplyr::mutate(
702 | canreg_df = dplyr::if_else(canreg == 0 & stringr::str_detect(icd10, codes_string), date, canreg_df, canreg_df),
703 | canreg = dplyr::if_else(canreg == 0 & stringr::str_detect(icd10, codes_string), 1, canreg, canreg)
704 | )
705 | }
706 |
707 | # keep date first for each participant
708 | ukb_dat <- ukb_dat |>
709 | dplyr::filter(canreg==1) |>
710 | dplyr::filter(!is.na(canreg_df)) |>
711 | dplyr::group_by(eid) |>
712 | dplyr::summarize(canreg_df=min(canreg_df, na.rm=TRUE)) |>
713 | dplyr::mutate(canreg_df = dplyr::if_else(is.finite(canreg_df), canreg_df, NA))
714 |
715 | # add binary variable back in
716 | ukb_dat$canreg <- 1
717 |
718 | # finish
719 | if (verbose) cli::cli_alert_info(c("Finished cancer registry: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}."))
720 |
721 | # Return data
722 | return(ukb_dat[,c("eid", "canreg", "canreg_df")])
723 |
724 | }
725 |
726 |
727 | #' Get date first for self-reported illness data
728 | #'
729 | #' @return NA
730 | #'
731 | #' @author Luke Pilling
732 | #'
733 | #' @name get_selfrep_illness_df
734 | #'
735 | #' @noRd
736 | get_selfrep_illness_df <- function(
737 | codes_df,
738 | ukb_dat,
739 | verbose = FALSE
740 | ) {
741 |
742 | start_time <- Sys.time()
743 |
744 | if (verbose) cat("Getting self-reported illness data\n")
745 |
746 | # format codes
747 | vocab_col = "vocab_id"
748 | codes_col = "code"
749 |
750 | # get codes
751 | codes_cancer <- codes_noncancer <- NULL
752 | if (any(codes_df$vocab_id == "ukb_cancer")) codes_cancer = codes_df$code[ codes_df$vocab_id == "ukb_cancer" ]
753 | if (any(codes_df$vocab_id == "ukb_noncancer")) codes_noncancer = codes_df$code[ codes_df$vocab_id == "ukb_noncancer" ]
754 |
755 | # split instance and array
756 | ukb_dat <- ukb_dat |>
757 | tidyr::separate_wider_delim(instance, delim = "_", names = c("instance", "array")) |>
758 | dplyr::mutate(
759 | instance = stringr::str_replace_all(instance, "i", ""),
760 | array = stringr::str_replace_all(array, "a", "")
761 | )
762 |
763 | # create empty vars in ukb_dat to modify
764 | ukb_dat$selfrep <- 0
765 | ukb_dat$selfrep_df <- NA
766 | ukb_dat$selfrep_i <- NA
767 |
768 | # for each instance, check if participant self-reported this code and record which array
769 |
770 | # Update where the code matches
771 | if (!is.null(codes_cancer)) {
772 | ukb_dat <- ukb_dat |> dplyr::mutate(
773 | selfrep_i = dplyr::if_else(selfrep == 0 & cancer_code %in% codes_cancer, stringr::str_c(instance, "_cancer"), selfrep_i, selfrep_i),
774 | selfrep_df = dplyr::if_else(selfrep == 0 & cancer_code %in% codes_cancer, cancer_year, selfrep_df, selfrep_df),
775 | selfrep = dplyr::if_else(selfrep == 0 & cancer_code %in% codes_cancer, 1, selfrep, selfrep)
776 | )
777 | }
778 | if (!is.null(codes_noncancer)) {
779 | ukb_dat <- ukb_dat |> dplyr::mutate(
780 | selfrep_i = dplyr::if_else(selfrep == 0 & noncancer_code %in% codes_noncancer, stringr::str_c(instance, "_noncancer"), selfrep_i, selfrep_i),
781 | selfrep_df = dplyr::if_else(selfrep == 0 & noncancer_code %in% codes_noncancer, noncancer_year, selfrep_df, selfrep_df),
782 | selfrep = dplyr::if_else(selfrep == 0 & noncancer_code %in% codes_noncancer, 1, selfrep, selfrep)
783 | )
784 | }
785 |
786 | # determine earliest date
787 | # remove if year is < 1900
788 | ukb_dat <- ukb_dat |>
789 | dplyr::filter(!is.na(selfrep_df) & selfrep_df >= 1936) |>
790 | dplyr::group_by(eid) |>
791 | dplyr::slice(which.min(selfrep_df)) |>
792 | dplyr::ungroup()
793 |
794 | # make sure date is actually a date and not year
795 | if (! lubridate::is.Date(ukb_dat$selfrep_df)) ukb_dat <- ukb_dat |> dplyr::mutate(selfrep_df = lubridate::as_date(lubridate::date_decimal(selfrep_df)))
796 |
797 | # finish
798 | if (verbose) cli::cli_alert_info(c("Finished self-reported illness: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}."))
799 |
800 | # Return data
801 | return(ukb_dat[,c("eid", "selfrep", "selfrep_df", "selfrep_i")])
802 |
803 | }
804 |
805 |
806 |
807 |
808 |
--------------------------------------------------------------------------------