├── .Rbuildignore ├── .gitattributes ├── .github └── workflows │ ├── check-standard.yaml │ ├── pkgdown.yaml │ ├── rhub.yaml │ └── test-coverage.yaml ├── .gitignore ├── CRAN-SUBMISSION ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── NEWS.md ├── R ├── checks.R ├── metacore.R ├── spec_builder.R ├── utils-pipe.R ├── utils.R ├── validators.R ├── xml_builders.R ├── xml_helpers.R └── zzz.R ├── README.Rmd ├── README.md ├── _pkgdown.yml ├── cran-comments.md ├── inst └── extdata │ ├── ADaM_define.xml │ ├── SDTM_define.xml │ ├── SDTM_spec_CDISC_pilot.xlsx │ ├── mock_spec.xlsx │ ├── p21_mock.xlsx │ ├── pilot_ADaM.rda │ └── pilot_SDTM.rda ├── man ├── MetaCore_filter.Rd ├── check_columns.Rd ├── check_structure.Rd ├── check_words.Rd ├── checks.Rd ├── create_tbl.Rd ├── define_to_metacore.Rd ├── figures │ ├── .DS_Store │ ├── labeled-code_list.png │ ├── labeled-derivation.png │ ├── labeled-ds_spec.png │ ├── labeled-ds_vars.png │ ├── labeled-supp.png │ ├── labeled-value_spec.png │ ├── labeled-var_spec.png │ ├── labeled_schema.png │ ├── metacore.PNG │ └── schema-colors.png ├── get_control_term.Rd ├── get_keys.Rd ├── is_metacore.Rd ├── load_metacore.Rd ├── metacore.Rd ├── metacore_example.Rd ├── pipe.Rd ├── read_all_sheets.Rd ├── save_metacore.Rd ├── select_dataset.Rd ├── spec_to_metacore.Rd ├── spec_type.Rd ├── spec_type_to_codelist.Rd ├── spec_type_to_derivations.Rd ├── spec_type_to_ds_spec.Rd ├── spec_type_to_ds_vars.Rd ├── spec_type_to_value_spec.Rd ├── spec_type_to_var_spec.Rd ├── xml_to_codelist.Rd ├── xml_to_derivations.Rd ├── xml_to_ds_spec.Rd ├── xml_to_ds_vars.Rd ├── xml_to_value_spec.Rd └── xml_to_var_spec.Rd ├── metacore.Rproj ├── tests ├── testthat.R └── testthat │ ├── define-2021.xml │ ├── define2-0-0.xsl │ ├── example_spec.xlsx │ ├── metacore-figure │ ├── unnamed-chunk-1-1.png │ └── unnamed-chunk-2-1.png │ ├── spec_no_val.xlsx │ ├── test-checks.R │ ├── test-metacore.R │ ├── test-reader.R │ ├── test-utils.R │ └── test-validators.R └── vignettes ├── .gitignore ├── Building_Specification_Readers.Rmd └── Example.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^metacore\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^dev$ 4 | ^LICENSE\.md$ 5 | ^README\.Rmd$ 6 | ^codecov\.yml$ 7 | ^_pkgdown\.yml$ 8 | ^docs$ 9 | ^pkgdown$ 10 | ^\.github$ 11 | cran-comments.md 12 | ^doc$ 13 | ^Meta$ 14 | ^CRAN-RELEASE$ 15 | ^CRAN-SUBMISSION$ 16 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | tests/testthat/* linguist-vendored 2 | -------------------------------------------------------------------------------- /.github/workflows/check-standard.yaml: -------------------------------------------------------------------------------- 1 | # For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag. 2 | # https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | - dev 9 | pull_request: 10 | branches: 11 | - main 12 | - master 13 | - dev 14 | 15 | name: R-CMD-check 16 | 17 | jobs: 18 | R-CMD-check: 19 | runs-on: ${{ matrix.config.os }} 20 | 21 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 22 | 23 | strategy: 24 | fail-fast: false 25 | matrix: 26 | config: 27 | - {os: windows-latest, r: 'release'} 28 | - {os: macOS-latest, r: 'release'} 29 | - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} 30 | - {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} 31 | 32 | env: 33 | REPOS: ${{ matrix.config.repos }} 34 | R_REMOTES_NO_ERRORS_FROM_WARNINGS: true 35 | RSPM: ${{ matrix.config.rspm }} 36 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 37 | 38 | steps: 39 | - uses: actions/checkout@v2 40 | 41 | - uses: r-lib/actions/setup-r@v2 42 | with: 43 | r-version: ${{ matrix.config.r }} 44 | 45 | - uses: r-lib/actions/setup-pandoc@v2 46 | 47 | - name: Query dependencies 48 | run: | 49 | install.packages('remotes') 50 | saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) 51 | writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") 52 | shell: Rscript {0} 53 | 54 | - name: Restore R package cache 55 | if: runner.os != 'Windows' 56 | uses: actions/cache@v2 57 | with: 58 | path: ${{ env.R_LIBS_USER }} 59 | key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} 60 | restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- 61 | 62 | - name: Install system dependencies 63 | if: runner.os == 'Linux' 64 | run: | 65 | while read -r cmd 66 | do 67 | eval sudo $cmd 68 | done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))') 69 | 70 | - name: Install dependencies 71 | run: | 72 | repos <- Sys.getenv("REPOS") 73 | if (repos != "") options(repos = repos) 74 | remotes::install_deps(dependencies = TRUE) 75 | remotes::install_cran("rcmdcheck") 76 | shell: Rscript {0} 77 | 78 | - name: Check 79 | env: 80 | _R_CHECK_CRAN_INCOMING_REMOTE_: false 81 | run: | 82 | options(crayon.enabled = TRUE) 83 | rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check") 84 | shell: Rscript {0} 85 | 86 | - name: Upload check results 87 | if: failure() 88 | uses: actions/upload-artifact@main 89 | with: 90 | name: ${{ runner.os }}-r${{ matrix.config.r }}-results 91 | path: check 92 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | - master 6 | 7 | name: pkgdown 8 | 9 | jobs: 10 | pkgdown: 11 | runs-on: macOS-latest 12 | env: 13 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 14 | steps: 15 | - uses: actions/checkout@v2 16 | 17 | - uses: r-lib/actions/setup-r@v2 18 | 19 | - uses: r-lib/actions/setup-pandoc@v2 20 | 21 | - name: Query dependencies 22 | run: | 23 | install.packages('remotes') 24 | saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) 25 | writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") 26 | shell: Rscript {0} 27 | 28 | - name: Restore R package cache 29 | uses: actions/cache@v2 30 | with: 31 | path: ${{ env.R_LIBS_USER }} 32 | key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} 33 | restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- 34 | 35 | - name: Install dependencies 36 | run: | 37 | remotes::install_deps(dependencies = TRUE) 38 | install.packages("pkgdown", type = "binary") 39 | shell: Rscript {0} 40 | 41 | - name: Install package 42 | run: R CMD INSTALL . 43 | 44 | - name: Deploy package 45 | run: | 46 | git config --local user.email "actions@github.com" 47 | git config --local user.name "GitHub Actions" 48 | Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)' 49 | -------------------------------------------------------------------------------- /.github/workflows/rhub.yaml: -------------------------------------------------------------------------------- 1 | # R-hub's generic GitHub Actions workflow file. It's canonical location is at 2 | # https://github.com/r-hub/actions/blob/v1/workflows/rhub.yaml 3 | # You can update this file to a newer version using the rhub2 package: 4 | # 5 | # rhub::rhub_setup() 6 | # 7 | # It is unlikely that you need to modify this file manually. 8 | 9 | name: R-hub 10 | run-name: "${{ github.event.inputs.id }}: ${{ github.event.inputs.name || format('Manually run by {0}', github.triggering_actor) }}" 11 | 12 | on: 13 | workflow_dispatch: 14 | inputs: 15 | config: 16 | description: 'A comma separated list of R-hub platforms to use.' 17 | type: string 18 | default: 'linux,windows,macos' 19 | name: 20 | description: 'Run name. You can leave this empty now.' 21 | type: string 22 | id: 23 | description: 'Unique ID. You can leave this empty now.' 24 | type: string 25 | 26 | jobs: 27 | 28 | setup: 29 | runs-on: ubuntu-latest 30 | outputs: 31 | containers: ${{ steps.rhub-setup.outputs.containers }} 32 | platforms: ${{ steps.rhub-setup.outputs.platforms }} 33 | 34 | steps: 35 | # NO NEED TO CHECKOUT HERE 36 | - uses: r-hub/actions/setup@v1 37 | with: 38 | config: ${{ github.event.inputs.config }} 39 | id: rhub-setup 40 | 41 | linux-containers: 42 | needs: setup 43 | if: ${{ needs.setup.outputs.containers != '[]' }} 44 | runs-on: ubuntu-latest 45 | name: ${{ matrix.config.label }} 46 | strategy: 47 | fail-fast: false 48 | matrix: 49 | config: ${{ fromJson(needs.setup.outputs.containers) }} 50 | container: 51 | image: ${{ matrix.config.container }} 52 | 53 | steps: 54 | - uses: r-hub/actions/checkout@v1 55 | - uses: r-hub/actions/platform-info@v1 56 | with: 57 | token: ${{ secrets.RHUB_TOKEN }} 58 | job-config: ${{ matrix.config.job-config }} 59 | - uses: r-hub/actions/setup-deps@v1 60 | with: 61 | token: ${{ secrets.RHUB_TOKEN }} 62 | job-config: ${{ matrix.config.job-config }} 63 | - uses: r-hub/actions/run-check@v1 64 | with: 65 | token: ${{ secrets.RHUB_TOKEN }} 66 | job-config: ${{ matrix.config.job-config }} 67 | 68 | other-platforms: 69 | needs: setup 70 | if: ${{ needs.setup.outputs.platforms != '[]' }} 71 | runs-on: ${{ matrix.config.os }} 72 | name: ${{ matrix.config.label }} 73 | strategy: 74 | fail-fast: false 75 | matrix: 76 | config: ${{ fromJson(needs.setup.outputs.platforms) }} 77 | 78 | steps: 79 | - uses: r-hub/actions/checkout@v1 80 | - uses: r-hub/actions/setup-r@v1 81 | with: 82 | job-config: ${{ matrix.config.job-config }} 83 | token: ${{ secrets.RHUB_TOKEN }} 84 | - uses: r-hub/actions/platform-info@v1 85 | with: 86 | token: ${{ secrets.RHUB_TOKEN }} 87 | job-config: ${{ matrix.config.job-config }} 88 | - uses: r-hub/actions/setup-deps@v1 89 | with: 90 | job-config: ${{ matrix.config.job-config }} 91 | token: ${{ secrets.RHUB_TOKEN }} 92 | - uses: r-hub/actions/run-check@v1 93 | with: 94 | job-config: ${{ matrix.config.job-config }} 95 | token: ${{ secrets.RHUB_TOKEN }} 96 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | - master 6 | - dev 7 | pull_request: 8 | branches: 9 | - main 10 | - master 11 | - dev 12 | 13 | name: test-coverage 14 | 15 | jobs: 16 | test-coverage: 17 | runs-on: macOS-latest 18 | env: 19 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 20 | steps: 21 | - uses: actions/checkout@v2 22 | 23 | - uses: r-lib/actions/setup-r@v2 24 | 25 | - uses: r-lib/actions/setup-pandoc@v2 26 | 27 | - name: Query dependencies 28 | run: | 29 | install.packages('remotes') 30 | saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) 31 | writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") 32 | shell: Rscript {0} 33 | 34 | - name: Restore R package cache 35 | uses: actions/cache@v2 36 | with: 37 | path: ${{ env.R_LIBS_USER }} 38 | key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} 39 | restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- 40 | 41 | - name: Install dependencies 42 | run: | 43 | install.packages(c("remotes")) 44 | remotes::install_deps(dependencies = TRUE) 45 | remotes::install_cran("covr") 46 | shell: Rscript {0} 47 | 48 | - name: Test coverage 49 | run: covr::codecov() 50 | shell: Rscript {0} 51 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | .gitignore 8 | 9 | # Example code in package build process 10 | *-Ex.R 11 | 12 | # Output files from R CMD build 13 | /*.tar.gz 14 | 15 | # Output files from R CMD check 16 | /*.Rcheck/ 17 | 18 | # RStudio files 19 | .Rproj.user/ 20 | 21 | # produced vignettes 22 | vignettes/*.html 23 | vignettes/*.pdf 24 | 25 | # Files to draft tests 26 | */define_examples/* 27 | 28 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 29 | .httr-oauth 30 | 31 | # knitr and R markdown default cache directories 32 | /*_cache/ 33 | /cache/ 34 | 35 | # Temporary files created by R markdown 36 | *.utf8.md 37 | *.knit.md 38 | 39 | # Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html 40 | rsconnect/ 41 | .Rproj.user 42 | inst/doc 43 | tests/testthat/temp_data/* 44 | 45 | docs 46 | doc 47 | Meta 48 | .DS_Store? 49 | .DS_Store 50 | -------------------------------------------------------------------------------- /CRAN-SUBMISSION: -------------------------------------------------------------------------------- 1 | Version: 0.1.3 2 | Date: 2024-05-02 08:11:47 UTC 3 | SHA: dfac124015ca54e065a7c11a88aa2a492969b4f4 4 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: metacore 2 | Title: A Centralized Metadata Object Focus on Clinical Trial Data Programming Workflows 3 | Version: 0.1.3 4 | Authors@R: 5 | c(person(given = "Christina", 6 | family = "Fillmore", 7 | role = c("aut", "cre"), 8 | email = "christina.e.fillmore@gsk.com", 9 | comment = c(ORCID = "0000-0003-0595-2302")), 10 | person(given = "Maya", 11 | family = "Gans", 12 | role = "aut", 13 | email = "maya.gans@atorusresearch.com", 14 | comment = c(ORCID = "0000-0002-5452-6089")), 15 | person(given = "Ashley", 16 | family = "Tarasiewicz", 17 | role = "aut", 18 | email = "ashley.tarasiewicz@atorusresearch.com"), 19 | person(given = "Mike", 20 | family = "Stackhouse", 21 | role = "aut", 22 | email = "mike.stackhouse@atorusresearch.com", 23 | comment = c(ORCID = "0000-0001-6030-723X")), 24 | person(given = "Tamara", 25 | family = "Senior", 26 | role = "aut", 27 | email = "tamara.senior@roche.com"), 28 | person(given = "GSK/Atorus JPT", 29 | role = c("cph", "fnd"))) 30 | Description: Create an immutable container holding metadata for the purpose of better enabling programming activities and functionality of other packages within the clinical programming workflow. 31 | License: MIT + file LICENSE 32 | Encoding: UTF-8 33 | Roxygen: list(markdown = TRUE, r6 = FALSE) 34 | RoxygenNote: 7.3.1 35 | Depends: 36 | R (>= 3.6) 37 | Suggests: 38 | testthat, 39 | knitr, 40 | rmarkdown, 41 | covr 42 | Imports: 43 | R6, 44 | tidyr, 45 | dplyr, 46 | stringr, 47 | magrittr, 48 | xml2, 49 | purrr, 50 | readxl, 51 | rlang, 52 | tibble, 53 | tidyselect 54 | VignetteBuilder: knitr 55 | URL: https://atorus-research.github.io/metacore/, https://github.com/atorus-research/metacore 56 | BugReports: https://github.com/atorus-research/metacore/issues 57 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2021 2 | COPYRIGHT HOLDER: GSK/Atorus JPT 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2021 GSK/Atorus JPT 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export("%>%") 4 | export(check_inconsistent_formats) 5 | export(check_inconsistent_labels) 6 | export(check_inconsistent_types) 7 | export(create_tbl) 8 | export(define_to_metacore) 9 | export(get_control_term) 10 | export(get_keys) 11 | export(is_metacore) 12 | export(load_metacore) 13 | export(metacore) 14 | export(metacore_example) 15 | export(read_all_sheets) 16 | export(save_metacore) 17 | export(select_dataset) 18 | export(spec_to_metacore) 19 | export(spec_type) 20 | export(spec_type_to_codelist) 21 | export(spec_type_to_derivations) 22 | export(spec_type_to_ds_spec) 23 | export(spec_type_to_ds_vars) 24 | export(spec_type_to_value_spec) 25 | export(spec_type_to_var_spec) 26 | export(xml_to_codelist) 27 | export(xml_to_derivations) 28 | export(xml_to_ds_spec) 29 | export(xml_to_ds_vars) 30 | export(xml_to_value_spec) 31 | export(xml_to_var_spec) 32 | import(R6) 33 | importFrom(dplyr,across) 34 | importFrom(dplyr,anti_join) 35 | importFrom(dplyr,arrange) 36 | importFrom(dplyr,bind_rows) 37 | importFrom(dplyr,case_when) 38 | importFrom(dplyr,distinct) 39 | importFrom(dplyr,filter) 40 | importFrom(dplyr,full_join) 41 | importFrom(dplyr,group_by) 42 | importFrom(dplyr,group_split) 43 | importFrom(dplyr,if_else) 44 | importFrom(dplyr,inner_join) 45 | importFrom(dplyr,left_join) 46 | importFrom(dplyr,mutate) 47 | importFrom(dplyr,n) 48 | importFrom(dplyr,n_distinct) 49 | importFrom(dplyr,pull) 50 | importFrom(dplyr,right_join) 51 | importFrom(dplyr,row_number) 52 | importFrom(dplyr,rowwise) 53 | importFrom(dplyr,select) 54 | importFrom(dplyr,summarise) 55 | importFrom(dplyr,ungroup) 56 | importFrom(magrittr,"%>%") 57 | importFrom(purrr,compact) 58 | importFrom(purrr,discard) 59 | importFrom(purrr,keep) 60 | importFrom(purrr,map) 61 | importFrom(purrr,map2) 62 | importFrom(purrr,map2_chr) 63 | importFrom(purrr,map2_lgl) 64 | importFrom(purrr,map_chr) 65 | importFrom(purrr,map_dfr) 66 | importFrom(purrr,map_int) 67 | importFrom(purrr,map_lgl) 68 | importFrom(purrr,pmap_chr) 69 | importFrom(purrr,quietly) 70 | importFrom(purrr,reduce) 71 | importFrom(purrr,safely) 72 | importFrom(readxl,excel_sheets) 73 | importFrom(readxl,read_excel) 74 | importFrom(rlang,"!!") 75 | importFrom(rlang,as_label) 76 | importFrom(rlang,as_name) 77 | importFrom(rlang,as_string) 78 | importFrom(rlang,enexpr) 79 | importFrom(rlang,expr) 80 | importFrom(rlang,prim_name) 81 | importFrom(rlang,sym) 82 | importFrom(stats,na.omit) 83 | importFrom(stats,var) 84 | importFrom(stringr,regex) 85 | importFrom(stringr,str_c) 86 | importFrom(stringr,str_count) 87 | importFrom(stringr,str_detect) 88 | importFrom(stringr,str_extract) 89 | importFrom(stringr,str_glue) 90 | importFrom(stringr,str_length) 91 | importFrom(stringr,str_remove) 92 | importFrom(stringr,str_remove_all) 93 | importFrom(stringr,str_replace) 94 | importFrom(stringr,str_replace_all) 95 | importFrom(stringr,str_replace_na) 96 | importFrom(stringr,str_split) 97 | importFrom(stringr,str_starts) 98 | importFrom(stringr,str_subset) 99 | importFrom(stringr,str_to_lower) 100 | importFrom(stringr,str_trim) 101 | importFrom(tibble,as_tibble) 102 | importFrom(tibble,tibble) 103 | importFrom(tibble,tribble) 104 | importFrom(tidyr,nest) 105 | importFrom(tidyr,replace_na) 106 | importFrom(tidyr,unnest) 107 | importFrom(tidyselect,everything) 108 | importFrom(tidyselect,matches) 109 | importFrom(xml2,read_xml) 110 | importFrom(xml2,xml_attr) 111 | importFrom(xml2,xml_find_all) 112 | importFrom(xml2,xml_find_first) 113 | importFrom(xml2,xml_ns_strip) 114 | importFrom(xml2,xml_parent) 115 | importFrom(xml2,xml_text) 116 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # Metacore 0.1.3 2 | - Add `get_keys` function which returns the dataset keys for a given dataset [#102](https://github.com/atorus-research/metacore/issues/102) 3 | - Fix issues with `select_dataset(simplify = TRUE)` [#97](https://github.com/atorus-research/metacore/issues/97) 4 | 5 | # Metacore 0.1.2 6 | - Update to resolve issues from the dplyr updates 7 | 8 | # Metacore 0.1.1 9 | - Add variable cross checks to check consistent use of label, type, and format 10 | - Add `is_metacore()` function to check if it is a metacore object 11 | - [#64](https://github.com/atorus-research/metacore/issues/64) Resolve issue so define and spec readers work when value levels are empty 12 | 13 | # Metacore 0.1.0 14 | - Updates for latest version of {tidyselect} 15 | - Move from XML to xml2 for define.xml processing 16 | - Correct issue with warning on ds_var$core 17 | - Correct bug with not reading the derivation of predecessor and assigned variables correctly 18 | 19 | # Metacore 0.0.6 20 | - Correct typos 21 | - Limits origin value/make all origins lower case 22 | 23 | # Metacore 0.0.5 24 | - [#47](https://github.com/atorus-research/metacore/issues/47) Improve `yn_to_tf()` to be more robust in parsing yes/no columns 25 | 26 | # Metacore 0.0.4 27 | - Adds a supp table to store the `idvar` and `qeval` information 28 | - Adds `sig_dig`column to the `value_spec` table 29 | 30 | 31 | # Metacore 0.0.3 32 | - Fixes bugs found in the `get_control_term` function and improves error messages for `get_control_term` 33 | - Improves internal naming consistency by renaming `define_to_MetaCore` to `define_to_metacore` 34 | - Adds quiet option to readers, to allow users to optionally ignore the reader warnings 35 | - Creates new example metacore object that is an .rda so much faster to read in 36 | - Fixes issue with select dataset when variables are duplicated 37 | 38 | 39 | # Metacore 0.0.1 40 | This fixes the following issues: 41 | 42 | - [#16](https://github.com/atorus-research/metacore/issues/16) the metacore function now accepts any empty datasets and creates an empty dataset with the correct column names and types 43 | - [#10](https://github.com/atorus-research/metacore/issues/10) yn function checks for logicals and returns them 44 | - [#11](https://github.com/atorus-research/metacore/issues/11) updated function description to make this clearer 45 | - [#12](https://github.com/atorus-research/metacore/issues/12) updated regex so to "[F|f]ormat" so it can accept lower case 46 | - [#14](https://github.com/atorus-research/metacore/issues/14) added supp_flag to ds_vars (on a side note we did a really good job with this it was super easy to change and only required a few edits) 47 | - [#15](https://github.com/atorus-research/metacore/issues/15) modified create_tbl so if there are two potential matches in the same dataset and one is an exact match it uses that 48 | 49 | Additionally, it adds the `get_control_term` function to get pull out the control term for a given variable. 50 | -------------------------------------------------------------------------------- /R/checks.R: -------------------------------------------------------------------------------- 1 | #' Optional checks to consistency of metadata 2 | #' 3 | #' @description These functions check to see if values (e.g labels, formats) 4 | #' that should be consistent for a variable across all data are actually 5 | #' consistent. 6 | #' 7 | #' @param metacore metacore object to check 8 | #' 9 | #' @return If all variables are consistent it will return a message. If there 10 | #' are inconsistencies it will return a message and a dataset of the variables 11 | #' with inconsistencies. 12 | #' 13 | #' @export 14 | #' 15 | #' @examples 16 | #' ## EXAMPLE WITH DUPLICATES 17 | #' # Loads in a metacore obj called metacore 18 | #' load(metacore_example("pilot_ADaM.rda")) 19 | #' check_inconsistent_labels(metacore) 20 | #' 21 | #' check_inconsistent_types(metacore) 22 | #' 23 | #' ## EXAMPLE WITHOUT DUPLICATES 24 | #' # Loads in a metacore obj called metacore 25 | #' load(metacore_example("pilot_SDTM.rda")) 26 | #' check_inconsistent_labels(metacore) 27 | #' 28 | #' check_inconsistent_formats(metacore) 29 | #' 30 | #' check_inconsistent_types(metacore) 31 | #' @rdname checks 32 | check_inconsistent_labels <- function(metacore){ 33 | basic_check(label, metacore) 34 | } 35 | 36 | #' @export 37 | #' @rdname checks 38 | check_inconsistent_types <- function(metacore){ 39 | basic_check(type, metacore) 40 | } 41 | 42 | #' @export 43 | #' @rdname checks 44 | check_inconsistent_formats <- function(metacore){ 45 | basic_check(format, metacore) 46 | } 47 | 48 | #' Base code for running cross variable checks 49 | #' 50 | #' @param col_to_check column to check 51 | #' @param metacore metacore obj 52 | #' 53 | #' @return A message if all is well or dataset with all the variables with 54 | #' duplicated variables. 55 | #' @noRd 56 | #' @importFrom stringr str_glue 57 | #' @importFrom dplyr across 58 | basic_check <- function(col_to_check, metacore){ 59 | if(!is_metacore(metacore)){ 60 | stop("Expects a metacore object", call. = FALSE) 61 | } 62 | 63 | report_df <- metacore$var_spec %>% 64 | mutate(var1 = str_remove(variable, "[[:alnum:]]+\\.")) %>% 65 | group_by(var1) %>% 66 | mutate(n_lab = n_distinct({{col_to_check}})) %>% 67 | filter(n_lab > 1) %>% 68 | mutate(across(everything(), remove_label)) %>% 69 | group_by(var1, {{col_to_check}}) %>% 70 | summarise(n_vars = n(), 71 | ls_of_vars = list(variable), 72 | .groups = "drop") %>% 73 | select(variable = var1, everything()) 74 | 75 | if(nrow(report_df) > 0){ 76 | message(str_glue("Mismatch {as_label(enexpr(col_to_check))}s detected")) 77 | return(report_df) 78 | } else { 79 | message(str_glue("No mismatch {as_label(enexpr(col_to_check))}s detected")) 80 | } 81 | } 82 | 83 | remove_label <- function(x) { 84 | attr(x, "label") <- NULL 85 | x 86 | } 87 | -------------------------------------------------------------------------------- /R/metacore.R: -------------------------------------------------------------------------------- 1 | #' This file includes the internal functions needed to create the readonly 2 | #' Metacore R6 object 3 | #' 4 | #' @param ds_spec contains each dataset in the study, with the labels for each 5 | #' @param ds_vars information on what variables are in each dataset + plus 6 | #' dataset specific variable information 7 | #' @param var_spec variable information that is shared across all datasets 8 | #' @param value_spec parameter specific information, as data is long the specs 9 | #' for wbc might be difference the hgb 10 | #' @param derivations contains derivation, it allows for different variables to 11 | #' have the same derivation 12 | #' @param code_list contains the code/decode information 13 | #' @param supp contains the idvar and qeval information for supplemental variables 14 | #' 15 | #' @family Metacore 16 | #' @noRd 17 | #' 18 | #' @importFrom stringr str_to_lower 19 | MetaCore_initialize <- function(ds_spec, ds_vars, var_spec, value_spec, derivations, codelist, supp){ 20 | 21 | private$.ds_spec <- ds_spec %>% 22 | add_labs(dataset = "Dataset Name", 23 | structure = "Value Structure", 24 | label = "Dataset Label") 25 | 26 | private$.ds_vars <- ds_vars %>% 27 | add_labs(dataset = "Dataset Name", 28 | variable = "Variable Name", 29 | key_seq = "Sequence Key", 30 | order = "Variable Order", 31 | keep = "Keep (Boolean)", 32 | core = "ADaM core (Expected, Required, Permissible)", 33 | supp_flag = "Supplemental Flag") 34 | 35 | private$.var_spec <- var_spec %>% 36 | add_labs(variable = "Variable Name", 37 | length = "Variable Length", 38 | label = "Variable Label", 39 | type = "Variable Class", 40 | common = "Common Across ADaM", 41 | format = "Variable Format") 42 | 43 | private$.value_spec <- value_spec %>% 44 | add_labs(type = "Value Type", 45 | orgin = "Origin of Value", 46 | code_id = "ID of the Code List", 47 | dataset = "Dataset Name", 48 | variable = "Variable Name", 49 | where = "Value of the Variable", 50 | derivation_id = "ID of Derivation") %>% 51 | mutate(origin = str_to_lower(.data$origin)) 52 | 53 | 54 | private$.derivations <- derivations %>% 55 | add_labs(derivation_id = "ID of Derivation", 56 | derivation = "Derivation") 57 | 58 | private$.codelist <- codelist %>% 59 | add_labs(code_id = "ID of the Code List", 60 | names = "Name of the Code List", 61 | type = "Code List/Permitted Values/External Library", 62 | codes = "List of Codes") 63 | 64 | private$.codelist <- codelist %>% 65 | add_labs(code_id = "ID of the Code List", 66 | names = "Name of the Code List", 67 | type = "Code List/Permitted Values/External Library", 68 | codes = "List of Codes") 69 | 70 | private$.supp <- supp %>% 71 | add_labs(dataset = "Dataset Name", 72 | variable = "Variable Name", 73 | idvar = "Identifying Variable", 74 | qeval = "Evaluator") 75 | 76 | self$validate() 77 | message("\n Metadata successfully imported") 78 | } 79 | 80 | 81 | #' Metacore class print function 82 | #' 83 | #' @param ... pass in the dataframes to be validated 84 | #' @family Metacore 85 | #' @noRd 86 | #' 87 | MetaCore_print <- function(...){ 88 | ds_len <- private$.ds_spec %>% pull(.data$dataset) %>% length() 89 | paste0("Metacore object contains metadata for ", ds_len, " datasets\n") %>% 90 | cat() 91 | } 92 | 93 | 94 | #' Metacore R6 object validation function 95 | #' 96 | #' This checks that the labels and lengths of ds_vars match var_spec 97 | #' @family Metacore 98 | #' @noRd 99 | #' 100 | MetaCore_validate <- function() { 101 | if(var_name_check(private)){ 102 | 103 | if(nrow(private$.ds_spec) == 0 & 104 | nrow(private$.ds_vars) == 0 & 105 | nrow(private$.var_spec) == 0 & 106 | nrow(private$.value_spec) == 0 & 107 | nrow(private$.derivations) == 0 & 108 | nrow(private$.codelist) == 0 & 109 | nrow(private$.supp) == 0 ){ 110 | warning("Other checks were not preformed, because all datasets are empty", 111 | call. = FALSE) 112 | } else { 113 | check_columns(private$.ds_spec, 114 | private$.ds_vars, 115 | private$.var_spec, 116 | private$.value_spec, 117 | private$.derivations, 118 | private$.codelist, 119 | private$.supp 120 | ) 121 | 122 | ds_vars_check(private$.ds_vars, private$.var_spec) 123 | value_check(private$.ds_vars, private$.value_spec) 124 | derivation_check(private$.value_spec, private$.derivations) 125 | codelist_check(private$.value_spec, private$.codelist) 126 | if(nrow(private$.supp) == 0){ 127 | supp_check(private$.ds_vars, private$.supp) 128 | } 129 | 130 | } 131 | 132 | } else { 133 | warning("Other checks were not preformed, because column names were incorrect", 134 | call. = FALSE) 135 | } 136 | } 137 | 138 | 139 | 140 | #' readonly function factory 141 | #' 142 | #' This function is used inside the R6 active method and allows us 143 | #' to read the selected dataframe and prevents overwriting 144 | #' 145 | #' @param name the name of the readonly object 146 | #' @param value any attempt at assignment to the readonly object 147 | #' @family Metacore 148 | #' @noRd 149 | #' 150 | readonly <- function(name) { 151 | private <- NULL 152 | inside <- function(value) { 153 | name <- attr(sys.function(sys.parent()), "name") 154 | if (missing(value)) { 155 | private[[paste0(".", name)]] 156 | } else { 157 | stop(paste0(name, " is read only"), call. = FALSE) 158 | } 159 | } 160 | attributes(inside) <- list(name = name) 161 | inside 162 | } 163 | 164 | #' Select method to subset by a single dataframe 165 | #' @param value the dataframe to subset by 166 | #' 167 | MetaCore_filter <- function(value) { 168 | 169 | private$.ds_spec <- private$.ds_spec %>% filter(dataset == value) 170 | if(nrow(private$.ds_spec) == 0){ 171 | stop(paste0(value, " is not a dataset in the metacore object", call. = FALSE)) 172 | } 173 | private$.ds_vars <- private$.ds_vars %>% filter(dataset == value) 174 | private$.value_spec <- private$.value_spec %>% filter(dataset == value) 175 | 176 | 177 | # Need clarity on X.Y.Z situation: SUPPY8.QVAL 178 | private$.var_spec <- private$.var_spec %>% 179 | # variables have the dataset prefix so we make this into its own column 180 | mutate(dataset = ifelse(str_detect(variable, "\\."), str_extract(variable, "^.*(?=\\.)"), ""), 181 | variable = str_remove(variable, "^.*\\.") 182 | ) %>% 183 | # then keep the variables that occur once or in the dataset to filter 184 | filter(dataset == "" | dataset == value) %>% 185 | # remove the temporary column 186 | select(-dataset) %>% 187 | # right join 188 | right_join(private$.ds_vars %>% select(variable), by="variable", 189 | multiple = "all") %>% 190 | distinct(variable, .keep_all = TRUE) # for when duplicates gett through and have different lables but the same name 191 | 192 | # Get values/variables that need derivations 193 | val_deriv <- private$.value_spec %>% 194 | distinct(.data$derivation_id) %>% 195 | na.omit() 196 | 197 | private$.derivations <- private$.derivations %>% 198 | right_join(val_deriv, by = "derivation_id", multiple = "all") 199 | 200 | private$.codelist <- private$.codelist %>% 201 | right_join(private$.value_spec %>% 202 | distinct(.data$code_id) %>% 203 | na.omit(), by = "code_id", multiple = "all") 204 | 205 | private$.supp <- private$.supp %>% filter(dataset == value) 206 | } 207 | 208 | #' The Metacore R6 Class 209 | #' 210 | #' This uses the initialize, print, and validate functions above to create a single object 211 | #' The user can query 212 | #' 213 | #' @family Metacore 214 | #' @noRd 215 | # 216 | MetaCore <- R6::R6Class("Metacore", 217 | public = list( 218 | initialize = MetaCore_initialize, 219 | print = MetaCore_print, 220 | validate = MetaCore_validate, 221 | metacore_filter = MetaCore_filter 222 | ), 223 | private = list( 224 | .ds_spec = tibble(dataset = character(), structure = character(), label = character()), 225 | .ds_vars = tibble(dataset = character(), variable = character(), keep = logical(), 226 | key_seq = integer(), order = integer(), core = character(), 227 | supp_flag = logical()), 228 | .var_spec = tibble(variable = character(), label = character(), length = integer(), 229 | type = character(), common = character(), format = character()), 230 | .value_spec = tibble(dataset = character(), 231 | variable = character(), 232 | where = character(), 233 | type = character(), 234 | sig_dig = integer(), 235 | code_id = character(), 236 | origin = character(), 237 | derivation_id = integer()), 238 | .derivations = tibble(derivation_id = integer(), derivation = character()), 239 | # code_type == df | permitted_val | external_lib 240 | .codelist = tibble(code_id = character(), name = character(), type = character(), codes = list()), 241 | .supp = tibble(dataset = character(), variable = character(), idvar = character(), qeval = character()) 242 | ), 243 | active = list( 244 | ds_spec = readonly('ds_spec'), 245 | ds_vars = readonly('ds_vars'), 246 | var_spec = readonly('var_spec'), 247 | value_spec = readonly('value_spec'), 248 | derivations = readonly('derivations'), 249 | codelist = readonly('codelist'), 250 | supp = readonly('supp') 251 | ) 252 | ) 253 | 254 | 255 | #' R6 Class wrapper to create your own metacore object 256 | #' 257 | #' @param ds_spec contains each dataset in the study, with the labels for each 258 | #' @param ds_vars information on what variables are in each dataset + plus dataset specific variable information 259 | #' @param var_spec variable information that is shared across all datasets 260 | #' @param value_spec parameter specific information, as data is long the specs for wbc might be difference the hgb 261 | #' @param derivations contains derivation, it allows for different variables to have the same derivation 262 | #' @param codelist contains the code/decode information 263 | #' @param supp contains the idvar and qeval information for supplemental variables 264 | #' 265 | #' @family Metacore 266 | #' 267 | #' @export 268 | #' 269 | metacore <- function(ds_spec = tibble(dataset = character(), structure = character(), label = character()), 270 | ds_vars = tibble(dataset = character(), variable = character(), keep = logical(), 271 | key_seq = integer(), order = integer(), core = character(), 272 | supp_flag = logical()), 273 | var_spec = tibble(variable = character(), label = character(), length = integer(), 274 | type = character(), common = character(), format = character()), 275 | value_spec = tibble(dataset = character(), 276 | variable = character(), 277 | where = character(), 278 | type = character(), 279 | sig_dig = integer(), 280 | code_id = character(), 281 | origin = character(), 282 | derivation_id = integer()), 283 | derivations = tibble(derivation_id = integer(), derivation = character()), 284 | codelist = tibble(code_id = character(), name = character(), type = character(), codes = list()), 285 | supp = tibble(dataset = character(), variable = character(), idvar = character(), qeval = character())) { 286 | # Check if there are any empty datasets that need adding 287 | is_empty_df <- as.list(environment()) %>% 288 | keep(is.null) 289 | if(length(is_empty_df) > 0) { 290 | # Adding empty datasets 291 | to_replace <- all_message() %>% 292 | #get the type each variable needs to be 293 | mutate(convert = 294 | map(.data$test, function(x){ 295 | if(identical(x, .Primitive("is.numeric"))){ 296 | numeric() 297 | } else if(identical(x, .Primitive("is.logical"))){ 298 | logical() 299 | } else { 300 | character() 301 | } 302 | })) %>% 303 | filter(dataset %in% names(is_empty_df)) %>% 304 | group_by(dataset) %>% 305 | group_split() 306 | replaced <- to_replace %>% 307 | map(function(df){ 308 | names(df$convert) <- df$var 309 | df$convert %>% 310 | as_tibble() 311 | }) 312 | names(replaced) <- to_replace %>% map_chr(~unique(.$dataset)) 313 | list2env(replaced, environment()) 314 | } 315 | MetaCore$new(ds_spec, ds_vars, var_spec, value_spec, derivations, codelist, supp) 316 | } 317 | 318 | 319 | 320 | #' Select metacore object to single dataset 321 | #' 322 | #' @param .data the metacore object of dataframes 323 | #' @param dataset the specific dataset to subset by 324 | #' @param simplify return a single dataframe 325 | #' 326 | #' @return a filtered subset of the metacore object 327 | #' @export 328 | #' 329 | select_dataset <- function(.data, dataset, simplify = FALSE) { 330 | 331 | cl <- .data$clone() 332 | cl$metacore_filter(dataset) 333 | 334 | if (simplify) { 335 | 336 | test <- suppressMessages( 337 | list( 338 | cl$ds_vars, 339 | cl$var_spec, 340 | cl$value_spec, 341 | cl$derivations, 342 | select(cl$codelist, code_id, codes), 343 | cl$supp 344 | ) %>% 345 | reduce(left_join) 346 | ) 347 | 348 | } else { 349 | return(cl) 350 | } 351 | } 352 | 353 | 354 | 355 | #' Get Control Term 356 | #' 357 | #' Returns the control term (a vector for permitted values and a tibble for code 358 | #' lists) for a given variable. The dataset can be optionally specified if there 359 | #' is different control terminology for different datasets 360 | #' 361 | #' @param metacode metacore object 362 | #' @param variable A variable name to get the controlled terms for. This can 363 | #' either be a string or just the name of the variable 364 | #' @param dataset A dataset name. This is not required if there is only one set 365 | #' of control terminology across all datasets 366 | #' 367 | #' @return a vector for permitted values and a 2-column tibble for codelists 368 | #' @export 369 | #' 370 | #' @importFrom rlang as_label enexpr as_name 371 | #' 372 | #' @examples 373 | #' \dontrun{ 374 | #' meta_ex <- spec_to_metacore(metacore_example("p21_mock.xlsx")) 375 | #' get_control_term(meta_ex, QVAL, SUPPAE) 376 | #' get_control_term(meta_ex, "QVAL", "SUPPAE") 377 | #' } 378 | get_control_term <- function(metacode, variable, dataset = NULL){ 379 | var_str <- ifelse(str_detect(as_label(enexpr(variable)), "\""), 380 | as_name(variable), as_label(enexpr(variable))) 381 | dataset_val <- ifelse(str_detect(as_label(enexpr(dataset)), "\""), 382 | as_name(dataset), as_label(enexpr(dataset))) # to make the filter more explicit 383 | if(!var_str %in% metacode$value_spec$variable){ 384 | stop(paste0(var_str, " not found in the value_spec table. Please check the variable name")) 385 | } 386 | if(dataset_val == "NULL"){ 387 | var_code_id <- metacode$value_spec %>% 388 | filter(variable == var_str) %>% 389 | pull(code_id) %>% 390 | unique() 391 | } else { 392 | subset_data <- metacode$value_spec %>% 393 | filter(dataset == dataset_val) 394 | if(nrow(subset_data) == 0){ 395 | stop(paste0(dataset_val, " not found in the value_spec table. Please check the dataset name")) 396 | } 397 | var_code_id <- subset_data %>% 398 | filter(variable == var_str) %>% 399 | pull(code_id) %>% 400 | unique() 401 | } 402 | if(length(var_code_id) > 1){ 403 | stop(paste0(var_str, " does not have a unique control term, consider spcificing a dataset")) 404 | } 405 | ct <- metacode$codelist %>% 406 | filter(code_id == var_code_id) %>% 407 | pull(codes) 408 | if(length(ct) == 0){ 409 | message(paste0(var_str, " has no control terminology")) 410 | } else { 411 | return(ct[[1]]) 412 | } 413 | } 414 | 415 | 416 | #' Get Dataset Keys 417 | #' 418 | #' Returns the dataset keys for a given dataset 419 | #' 420 | #' @param metacode metacore object 421 | #' @param dataset A dataset name 422 | #' 423 | #' @return a 2-column tibble with dataset key variables and key sequence 424 | #' @export 425 | #' 426 | #' @importFrom rlang as_label enexpr as_name 427 | #' 428 | #' @examples 429 | #' \dontrun{ 430 | #' meta_ex <- spec_to_metacore(metacore_example("p21_mock.xlsx")) 431 | #' get_keys(meta_ex, "AE") 432 | #' get_keys(meta_ex, AE) 433 | #' } 434 | get_keys <- function(metacode, dataset){ 435 | dataset_val <- ifelse(str_detect(as_label(enexpr(dataset)), "\""), 436 | as_name(dataset), as_label(enexpr(dataset))) # to make the filter more explicit 437 | 438 | subset_data <- metacode$ds_vars %>% 439 | filter(dataset == dataset_val) 440 | if(nrow(subset_data) == 0){ 441 | stop(paste0(dataset_val, " not found in the ds_vars table. Please check the dataset name")) 442 | } 443 | 444 | keys <- subset_data %>% 445 | filter(!is.na(key_seq)) %>% 446 | select(variable, key_seq) 447 | 448 | keys <- keys[order(keys$key_seq),] 449 | 450 | return(keys) 451 | } 452 | 453 | 454 | #' save metacore object 455 | #' 456 | #' @param metacore_object the metacore object in memory to save to disc 457 | #' @param path file path and file name to save metacore object 458 | #' 459 | #' @return an .rda file 460 | #' @export 461 | #' 462 | save_metacore <- function(metacore_object, path = NULL) { 463 | # if no path save to working directory 464 | # with same name as object 465 | if (is.null(path)) { 466 | nm <- deparse(substitute(metacore_object)) 467 | path <- paste0(nm, ".rds") 468 | 469 | # check the suffix of the path 470 | } else { 471 | suffix <- str_extract(path, "\\.\\w*$") 472 | # if the extension is .rda keep it 473 | if (suffix == ".rds") { 474 | path <- path 475 | 476 | # otherwise we need to replace it with .rda 477 | } else { 478 | prefix <- str_remove(path, "\\.\\w*$") 479 | path <- paste0(prefix, ".rds") 480 | } 481 | } 482 | saveRDS(metacore_object, path) 483 | } 484 | 485 | #' load metacore object 486 | #' 487 | #' @param path location of the metacore object to load into memory 488 | #' 489 | #' @return metacore object in memory 490 | #' @export 491 | load_metacore <- function(path = NULL) { 492 | if (is.null(path)) { 493 | rdss <- list.files(".", ".rds") 494 | if (length(rdss) == 0) { 495 | stop("please supply path to metacore object ending with extension .rds", call. = FALSE) 496 | } else { 497 | stop("metacore object path required, did you mean:", 498 | paste(" ", rdss, sep = "\n "), call. = FALSE) 499 | } 500 | } 501 | readRDS(path) 502 | } 503 | -------------------------------------------------------------------------------- /R/spec_builder.R: -------------------------------------------------------------------------------- 1 | #' Specification document to metacore object 2 | #' 3 | #' This function takes the location of an excel specification document and reads 4 | #' it in as a meta core object. At the moment it only supports specification in 5 | #' the format of pinnacle 21 specifications. But, the section level spec builder can 6 | #' be used as building blocks for bespoke specification documents. 7 | #' 8 | #' @param path string of file location 9 | #' @param quiet Option to quietly load in, this will suppress warnings, but not 10 | #' errors 11 | #' @param where_sep_sheet Option to tell if the where is in a separate sheet, 12 | #' like in older p21 specs or in a single sheet like newer p21 specs 13 | #' 14 | #' @return given a spec document it returns a metacore object 15 | #' @export 16 | spec_to_metacore <- function(path, quiet = FALSE, where_sep_sheet = TRUE){ 17 | doc <- read_all_sheets(path) 18 | 19 | if(spec_type(path) == "by_type"){ 20 | ds_spec <- spec_type_to_ds_spec(doc) 21 | ds_vars <- spec_type_to_ds_vars(doc) 22 | var_spec <- spec_type_to_var_spec(doc) 23 | value_spec <- spec_type_to_value_spec(doc, where_sep_sheet = where_sep_sheet) 24 | derivations <- spec_type_to_derivations(doc) 25 | code_list <- spec_type_to_codelist(doc) 26 | if(!quiet){ 27 | out <- metacore(ds_spec, ds_vars, var_spec, value_spec, derivations, codelist = code_list) 28 | } else{ 29 | out<- suppressWarnings(metacore(ds_spec, ds_vars, var_spec, value_spec, derivations, codelist = code_list)) 30 | message("Loading in metacore object with suppressed warnings") 31 | } 32 | } else { 33 | stop("This specification format is not currently supported. You will need to write your own reader", 34 | call. = FALSE) 35 | } 36 | out 37 | } 38 | 39 | 40 | 41 | 42 | #' Check the type of spec document 43 | #' 44 | #' @param path file location as a string 45 | #' 46 | #' @return returns string indicating the type of spec document 47 | #' @export 48 | #' 49 | spec_type <- function(path){ 50 | sheets <- excel_sheets(path) 51 | if(!any(sheets %>% str_detect("[D|d]omains?|[D|d]atasets?"))){ 52 | stop("File does not contain a Domain/Datasets tab, which is needed. Please either modify the spec document or write a reader (see documentation for more information)", 53 | call. = FALSE) 54 | } else if(any(sheets %>% str_detect("ADSL|DM"))){ 55 | type <- "by_ds" 56 | } else if(any(sheets %>% str_detect("[V|v]ariables?"))){ 57 | type <- "by_type" 58 | } else { 59 | stop("File in an unknown format. Please either modify the spec document or write a reader (see documentation for more information)", 60 | call. = FALSE) 61 | } 62 | type 63 | } 64 | 65 | 66 | 67 | 68 | #' Read in all Sheets 69 | #' 70 | #' Given a path to a file, this function reads in all sheets of an excel file 71 | #' 72 | #' @param path string of the file path 73 | #' @export 74 | #' 75 | #' @return a list of datasets 76 | read_all_sheets <- function(path){ 77 | sheets <- excel_sheets(path) 78 | all_dat <- sheets %>% 79 | map(~read_excel(path, sheet = ., col_types = "text")) 80 | names(all_dat) <- sheets 81 | all_dat 82 | } 83 | 84 | 85 | #' Spec to ds_spec 86 | #' 87 | #' Creates the ds_spec from a list of datasets (optionally filtered by the sheet 88 | #' input). The named vector `cols` is used to determine which is the correct 89 | #' sheet and renames the columns 90 | #' @param doc Named list of datasets @seealso [read_all_sheets()] for exact 91 | #' format 92 | #' @param cols Named vector of column names. The column names can be regular 93 | #' expressions for more flexibility. But, the names must follow the given pattern 94 | #' @param sheet Regular expression for the sheet name 95 | #' 96 | #' @return a dataset formatted for the metacore object 97 | #' @export 98 | #' 99 | #' @family spec builders 100 | spec_type_to_ds_spec <- function(doc, cols = c("dataset" = "[N|n]ame|[D|d]ataset|[D|d]omain", 101 | "structure" = "[S|s]tructure", 102 | "label" = "[L|l]abel|[D|d]escription"), sheet = NULL){ 103 | name_check <- names(cols) %in% c("dataset", "structure", "label") %>% 104 | all() 105 | if(!name_check | is.null(names(cols))){ 106 | stop("Supplied column vector must be named using the following names: 107 | 'dataset', 'structure', 'label'") 108 | } 109 | if(!is.null(sheet)){ 110 | sheet_ls <- str_subset(names(doc), sheet) 111 | doc <- doc[sheet_ls] 112 | } 113 | 114 | # Get missing columns 115 | missing <- col_vars()$.ds_spec %>% 116 | discard(~. %in% names(cols)) 117 | 118 | create_tbl(doc, cols) %>% 119 | distinct() %>% 120 | `is.na<-`(missing) 121 | } 122 | 123 | #' Spec to ds_vars 124 | #' 125 | #' Creates the ds_vars from a list of datasets (optionally filtered by the sheet 126 | #' input). The named vector `cols` is used to determine which is the correct 127 | #' sheet and renames the columns 128 | #' 129 | #' @param doc Named list of datasets @seealso [read_all_sheets()] for exact 130 | #' format 131 | #' @param cols Named vector of column names. The column names can be regular 132 | #' expressions for more flexibility. But, the names must follow the given 133 | #' pattern 134 | #' @param sheet Regular expression for the sheet names 135 | #' @param key_seq_sep_sheet A boolean to indicate if the key sequence is on a 136 | #' separate sheet. If set to false add the key_seq column name to the `cols` 137 | #' vector. 138 | #' @param key_seq_cols names vector to get the key_sequence for each dataset 139 | #' 140 | #' @return a dataset formatted for the metacore object 141 | #' @export 142 | #' 143 | #' @family spec builders 144 | spec_type_to_ds_vars <- function(doc, cols = c("dataset" = "[D|d]ataset|[D|d]omain", 145 | "variable" = "[V|v]ariable [[N|n]ame]?|[V|v]ariables?", 146 | "order" = "[V|v]ariable [O|o]rder|[O|o]rder", 147 | "keep" = "[K|k]eep|[M|m]andatory"), 148 | key_seq_sep_sheet = TRUE, 149 | key_seq_cols = c("dataset" = "Dataset", 150 | "key_seq" = "Key Variables"), 151 | sheet = "[V|v]ar|Datasets"){ 152 | 153 | name_check <- names(cols) %in% c("variable", "dataset", "order", 154 | "keep", "key_seq", "core", "supp_flag") %>% 155 | all() 156 | 157 | name_check_extra <- names(key_seq_cols) %in% c("dataset", "key_seq") %>% 158 | all() %>% 159 | ifelse(key_seq_sep_sheet, ., TRUE) # Adding it cause we only want to check when sep sheet is true 160 | 161 | # Testing for names of vectors 162 | if(any(!name_check, !name_check_extra, is.null(names(cols)))){ 163 | stop("Supplied column vector must be named using the following names: 164 | 'variable', 'dataset', 'order', 'keep', 'core', 'key_seq', 'supp_flag'") 165 | } 166 | # Subsetting sheets 167 | if(!is.null(sheet)){ 168 | sheet_ls <- str_subset(names(doc), sheet) 169 | doc <- doc[sheet_ls] 170 | } 171 | #Get base doc 172 | out <-doc %>% 173 | create_tbl(cols) 174 | 175 | # Getting the key seq values 176 | if(key_seq_sep_sheet){ 177 | key_seq_df <- doc %>% 178 | create_tbl(key_seq_cols) %>% 179 | mutate(key_seq = str_split(key_seq, ",\\s"), 180 | key_seq = map(key_seq, function(x){ 181 | tibble(variable = x) %>% 182 | mutate(key_seq = row_number()) 183 | })) %>% 184 | unnest(key_seq) 185 | out <- left_join(out, key_seq_df, by = c("dataset", "variable")) 186 | } 187 | 188 | # Get missing columns 189 | missing <- col_vars()$.ds_vars %>% 190 | discard(~. %in% names(out)) 191 | 192 | out %>% 193 | distinct() %>% 194 | `is.na<-`(missing) %>% 195 | mutate(key_seq = as.integer(key_seq), 196 | keep = yn_to_tf(keep), 197 | core = as.character(core), 198 | order = as.numeric(order)) 199 | } 200 | 201 | 202 | #' Spec to var_spec 203 | #' 204 | #' Creates the var_spec from a list of datasets (optionally filtered by the sheet 205 | #' input). The named vector `cols` is used to determine which is the correct 206 | #' sheet and renames the columns. (Note: the keep column will be converted logical) 207 | #' 208 | #' @param doc Named list of datasets @seealso [read_all_sheets()] for exact 209 | #' format 210 | #' @param cols Named vector of column names. The column names can be regular 211 | #' expressions for more flexibility. But, the names must follow the given pattern 212 | #' @param sheet Regular expression for the sheet name 213 | #' 214 | #' @return a dataset formatted for the metacore object 215 | #' @export 216 | #' 217 | #' @family spec builders 218 | spec_type_to_var_spec <- function(doc, cols = c("variable" = "[N|n]ame|[V|v]ariables?", 219 | "length" = "[L|l]ength", 220 | "label" = "[L|l]abel", 221 | "type" = "[T|t]ype", 222 | "dataset" = "[D|d]ataset|[D|d]omain", 223 | "format" = "[F|f]ormat"), 224 | sheet = "[V|v]ar"){ 225 | # Check the names 226 | name_check <- names(cols) %in% c("variable", "length", "label", 227 | "type", "dataset", "common", "format") %>% 228 | all() 229 | if(!name_check | is.null(names(cols))){ 230 | stop("Supplied column vector must be named using the following names: 231 | 'variable', 'length', 'label', 'type', 'dataset', 'common', 'format' 232 | If common is not avaliable it can be excluded and will be automatically filled in. 233 | Additionally, dataset is only used to clarify if information differs by domain") 234 | } 235 | 236 | # Check if sheet is specified 237 | if(!is.null(sheet)){ 238 | sheet_ls <- str_subset(names(doc), sheet) 239 | doc <- doc[sheet_ls] 240 | } 241 | out <- create_tbl(doc, cols) 242 | if(!"dataset" %in% names(out)){ 243 | dups <- out %>% 244 | distinct() %>% 245 | group_by(variable) %>% 246 | summarise(n = n(), .groups = "drop") %>% 247 | filter(n > 1) 248 | if(nrow(dups) > 0){ 249 | dups %>% 250 | pull(variable) %>% 251 | paste(collapse = "\n") %>% 252 | paste0("The following variables are repeated with different metadata for different datasets:\n", 253 | ., "\nPlease add 'dataset' = [Name of dataset column] to your named cols vector, to correct for this") %>% 254 | stop(., call. = FALSE) 255 | } 256 | } else { 257 | if(!"common" %in% names(cols)){ 258 | # Get the variable common to all datasets can only be calculated with ds present 259 | common_vars <- out %>% 260 | group_by(dataset) %>% 261 | select(dataset, variable) %>% 262 | group_split(.keep = FALSE) %>% 263 | reduce(inner_join, by = "variable") %>% 264 | mutate(common = TRUE) 265 | out <- out %>% 266 | left_join(common_vars, by = "variable") %>% 267 | replace_na(list(common = FALSE)) 268 | } 269 | 270 | # Remove any multiples and add ds if different metadata for different ds's 271 | out <- out %>% 272 | group_by(variable) %>% 273 | mutate(unique = n_distinct(length, label, type), 274 | variable = if_else(unique == 1, variable, 275 | paste0(dataset, ".", variable)), 276 | length = as.numeric(length)) %>% 277 | distinct(variable, length, label, type, .keep_all = TRUE) %>% 278 | select(-dataset, -unique) 279 | } 280 | 281 | # Get missing columns 282 | missing <- col_vars()$.var_spec %>% 283 | discard(~. %in% names(out)) 284 | out %>% 285 | `is.na<-`(missing) %>% 286 | distinct() %>% 287 | ungroup() %>% 288 | mutate(length = as.integer(length)) 289 | } 290 | 291 | #' Spec to value_spec 292 | #' 293 | #' Creates the value_spec from a list of datasets (optionally filtered by the 294 | #' sheet input). The named vector `cols` is used to determine which is the 295 | #' correct sheet and renames the columns 296 | #' 297 | #' @param doc Named list of datasets @seealso [read_all_sheets()] for exact 298 | #' format 299 | #' @param cols Named vector of column names. The column names can be regular 300 | #' expressions for more flexibility. But, the names must follow the given 301 | #' pattern 302 | #' @param sheet Regular expression for the sheet name 303 | #' @param where_sep_sheet Boolean value to control if the where information in a 304 | #' separate dataset. If the where information is on a separate sheet, set to 305 | #' true and provide the column information with the `where_cols` inputs. 306 | #' @param where_cols Named list with an id and where field. All columns in the 307 | #' where field will be collapsed together 308 | #' @param var_sheet Name of sheet with the Variable information on it. Metacore 309 | #' expects each variable will have a row in the value_spec. Because many 310 | #' specification only have information in the value tab this is added. If the 311 | #' information already exists in the value tab of your specification set to 312 | #' NULL 313 | #' 314 | #' @return a dataset formatted for the metacore object 315 | #' @export 316 | #' 317 | #' @family spec builders 318 | spec_type_to_value_spec <- function(doc, cols = c("dataset" = "[D|d]ataset|[D|d]omain", 319 | "variable" = "[N|n]ame|[V|v]ariables?", 320 | "origin" = "[O|o]rigin", 321 | "type" = "[T|t]ype", 322 | "code_id" = "[C|c]odelist|Controlled Term", 323 | "sig_dig" = "[S|s]ignificant", 324 | "where" = "[W|w]here", 325 | "derivation_id" = "[M|m]ethod", 326 | "predecessor" = "[P|p]redecessor"), 327 | sheet = NULL, 328 | where_sep_sheet = TRUE, 329 | where_cols = c("id" = "ID", 330 | "where" = c("Variable", "Comparator", "Value")), 331 | var_sheet = "[V|v]ar"){ 332 | name_check <- names(cols) %in% c("variable", "origin", "code_id", "sig_dig", 333 | "type", "dataset", "where", "derivation_id", 334 | "predecessor") %>% 335 | all() 336 | 337 | if(!name_check| is.null(names(cols))){ 338 | stop("Supplied column vector must be named using the following names: 339 | 'dataset', 'variable', 'origin', 'code_id', 'type', 'where', 'sig_dig', 'derivation_id', 340 | 'predecessor' 341 | If derivation_id is not avaliable it can be excluded and dataset.variable will be used. 342 | 343 | If the where information is on a seperate sheet, put the column with cross ref as where.") 344 | } 345 | 346 | # Select a subset of sheets if specified 347 | if(!is.null(sheet)){ 348 | sheet_ls <- str_subset(names(doc), sheet) 349 | doc <- doc[sheet_ls] 350 | } 351 | 352 | out <- create_tbl(doc, cols) 353 | 354 | # Does a var sheet exsist? 355 | if(!is.null(var_sheet)){ 356 | var_sheet <- names(doc) %>% 357 | keep(~str_detect(., var_sheet)) 358 | } 359 | 360 | # If so, add any variables not in the value sheet 361 | if(length(var_sheet) > 0){ 362 | var_out <- doc[var_sheet] %>% 363 | map_dfr(function(x){ 364 | var_out <- x %>% 365 | select_rename_w_dups(cols) %>% 366 | mutate(where = "TRUE") 367 | if(nrow(out) > 0){ 368 | var_out %>% 369 | anti_join(out, by = "variable") 370 | } else { 371 | var_out 372 | } 373 | }) 374 | 375 | # THIS ISN'T VERY PRETTY, IF SOMEONE HAS A BETTER IDEA PLEASE FIX 376 | # Needed in cause the value sheet is empty 377 | if(nrow(out) > 0 & nrow(var_out) > 0){ 378 | out <- bind_rows(out, var_out) 379 | } else if(nrow(var_out) > 0) { 380 | out <- var_out 381 | } else { 382 | out 383 | } 384 | 385 | } 386 | 387 | if(where_sep_sheet & "where" %in% names(out)){ 388 | where_df <- create_tbl(doc, where_cols) %>% 389 | mutate( 390 | where_new = pmap_chr(., function(...) { 391 | # Without c_across this gets a little weird 392 | # Use pmap and steal out the arg names 393 | vars <- list(...) 394 | # Filter down to only args that start with where 395 | wheres <- as.character(vars[which(str_starts(names(vars), 'where'))]) 396 | # collapse it together 397 | paste(wheres, collapse=" ") 398 | }) 399 | ) %>% 400 | select(id, where_new) 401 | out <- out %>% 402 | left_join(where_df, by = c("where" = "id")) %>% 403 | select(-where, where = where_new) 404 | } else if(where_sep_sheet) { 405 | warning("Not able to add where infromation from seperate sheet cause a where column is needed to cross-reference the information", 406 | call. = FALSE) 407 | } 408 | 409 | if(!"derivation_id" %in% names(cols)){ 410 | out <- out %>% 411 | mutate(derivation_id = 412 | if_else(str_to_lower(.data$origin) == "assigned", 413 | paste0(dataset, ".", variable), 414 | paste0("pred.", dataset, ".", variable))) 415 | } 416 | 417 | # Get missing columns 418 | missing <- col_vars()$.value_spec %>% 419 | discard(~. %in% names(out)) 420 | 421 | out %>% 422 | `is.na<-`(missing) %>% 423 | distinct() %>% 424 | mutate(sig_dig = as.integer(.data$sig_dig), 425 | derivation_id = case_when( 426 | !is.na(.data$derivation_id) ~ .data$derivation_id, 427 | str_to_lower(.data$origin) == "predecessor" ~ paste0("pred.", as.character(.data$predecessor)), 428 | str_to_lower(.data$origin) == "assigned" ~ paste0(.data$dataset, ".", .data$variable)) 429 | ) %>% 430 | select(-.data$predecessor) 431 | 432 | } 433 | 434 | #' Spec to codelist 435 | #' 436 | #' Creates the value_spec from a list of datasets (optionally filtered by the 437 | #' sheet input). The named vector `*_cols` is used to determine which is the 438 | #' correct sheet and renames the columns. 439 | #' @param doc Named list of datasets @seealso [read_all_sheets()] for exact 440 | #' format 441 | #' @param codelist_cols Named vector of column names that make up the codelist. 442 | #' The column names can be regular expressions for more flexibility. But, the 443 | #' names must follow the given pattern 444 | #' @param permitted_val_cols Named vector of column names that make up the 445 | #' permitted value The column names can be regular expressions for more 446 | #' flexibility. This is optional, can be left as null if there isn't a 447 | #' permitted value sheet 448 | #' @param dict_cols Named vector of column names that make up the dictionary 449 | #' value The column names can be regular expressions for more flexibility. 450 | #' This is optional, can be left as null if there isn't a permitted value 451 | #' sheet 452 | #' @param sheets Optional, regular expressions of the sheets 453 | #' @param simplify Boolean value, if true will convert code/decode pairs that 454 | #' are all equal to a permitted value list. True by default 455 | #' 456 | #' @return a dataset formatted for the metacore object 457 | #' @export 458 | #' 459 | #' @family spec builders 460 | spec_type_to_codelist <- function(doc, codelist_cols = c("code_id" = "ID", 461 | "name" = "[N|n]ame", 462 | "code" = "^[C|c]ode|^[T|t]erm", 463 | "decode" = "[D|d]ecode"), 464 | permitted_val_cols = NULL, 465 | dict_cols = c("code_id" = "ID", 466 | "name" = "[N|n]ame", 467 | "dictionary" = "[D|d]ictionary", 468 | "version" = "[V|v]ersion"), 469 | sheets = NULL, simplify = FALSE){ 470 | if(is.null(codelist_cols)){ 471 | stop("Codelist column names must be provided", call. = FALSE) 472 | } else { 473 | name_check <- names(codelist_cols) %in% c("code_id", "name", "code", "decode") %>% 474 | all() 475 | if(!name_check| is.null(names(codelist_cols))){ 476 | stop("Supplied column vector for codelist_cols must be named using the following names: 477 | 'code_id', 'name', 'code', 'decode'", 478 | call. = FALSE 479 | ) 480 | } 481 | } 482 | 483 | if (!is.null(permitted_val_cols)){ 484 | name_check <- names(permitted_val_cols) %in% c("code_id", "name", "code") %>% 485 | all() 486 | if(!name_check){ 487 | stop("Supplied column vector for permitted_val_cols must be named using the following names: 488 | 'code_id', 'name', 'code'", 489 | call. = FALSE) 490 | } 491 | } 492 | if(!is.null(dict_cols)){ 493 | name_check <- names(dict_cols) %in% c("code_id", "name", "dictionary", "version") %>% 494 | all() 495 | if(!name_check){ 496 | stop("Supplied column vector for `dict_cols` must be named using the following names: 497 | 'code_id', 'name', 'dictionary', 'version', 498 | If a dictionary sheet isn't avaliable set `dict_cols` to NULL", 499 | call. = FALSE) 500 | } 501 | } 502 | 503 | # Select a subset of sheets if specified 504 | if(!is.null(sheets)){ 505 | sheet_ls <- str_subset(names(doc), sheets) 506 | doc <- doc[sheet_ls] 507 | } 508 | 509 | # Create the base table with codes and decodes (min req output) 510 | cd_out <- create_tbl(doc, codelist_cols) %>% 511 | group_by(code_id) %>% 512 | mutate(type = case_when(simplify & all(code == decode) ~ "permitted_val", 513 | TRUE ~ "code_decode")) %>% 514 | nest(codes = c(code, decode)) %>% 515 | mutate(codes = if_else(type == "permitted_val", 516 | lapply(codes, function(df) df %>% pull(code)), 517 | codes)) 518 | # If available get a permitted value sheet 519 | if(!is.null(permitted_val_cols)){ 520 | pv_out <- create_tbl(doc, permitted_val_cols) %>% 521 | mutate(type = "permitted_val") %>% 522 | group_by(code_id) %>% 523 | nest(codes = c(code, decode)) 524 | cd_out <- bind_rows(cd_out, pv_out) 525 | } 526 | # Add dictionary if avaliable 527 | if(!is.null(dict_cols)){ 528 | dic_out <- create_tbl(doc, dict_cols) %>% 529 | mutate(type = "external_library") %>% 530 | group_by(code_id) %>% 531 | nest(codes = c(dictionary, version)) 532 | cd_out <- bind_rows(cd_out, dic_out) 533 | } 534 | # Get missing columns 535 | missing <- col_vars()$.codelist %>% 536 | discard(~. %in% names(cd_out)) 537 | 538 | cd_out %>% 539 | `is.na<-`(missing) %>% 540 | distinct() %>% 541 | filter(!is.na(code_id)) %>% 542 | ungroup() 543 | } 544 | 545 | #' Spec to derivation 546 | #' 547 | #' Creates the derivation table from a list of datasets (optionally filtered by 548 | #' the sheet input). The named vector `cols` is used to determine which is the 549 | #' correct sheet and renames the columns. The derivation will be used for 550 | #' "derived" origins, the comments for "assigned" origins, and predecessor for 551 | #' "predecessor" origins. 552 | #' @param doc Named list of datasets @seealso [read_all_sheets()] for exact 553 | #' format 554 | #' @param cols Named vector of column names. The column names can be regular 555 | #' expressions for more flexibility. But, the names must follow the given 556 | #' pattern 557 | #' @param var_cols Named vector of the name(s) of the origin, predecessor and 558 | #' comment columns. These do not have to be on the specified sheet. 559 | #' @param sheet Regular expression for the sheet name 560 | #' 561 | #' @return a dataset formatted for the metacore object 562 | #' @export 563 | #' 564 | #' @family spec builders 565 | #' @importFrom purrr quietly 566 | spec_type_to_derivations <- function(doc, cols = c("derivation_id" = "ID", 567 | "derivation" = "[D|d]efinition|[D|d]escription"), 568 | sheet = "Method|Derivations?", 569 | var_cols = c("dataset" = "[D|d]ataset|[D|d]omain", 570 | "variable" = "[N|n]ame|[V|v]ariables?", 571 | "origin" = "[O|o]rigin", 572 | "predecessor" = "[P|p]redecessor", 573 | "comment" = "[C|c]omment")){ 574 | 575 | name_check <- names(cols) %in% c("derivation_id", "derivation") %>% 576 | all() 577 | if(!name_check| is.null(names(cols))){ 578 | stop("Supplied column vector must be named using the following names: 579 | 'derivation_id', 'derivation'") 580 | } 581 | 582 | name_check <- names(var_cols) %in% c('dataset', 'variable', 'origin', 'predecessor', 'comment') %>% 583 | all() 584 | if(!name_check| is.null(names(var_cols))){ 585 | stop("Supplied variable column vector must be named using the following names: 586 | 'dataset', 'variable', 'origin', 'predecessor', 'comment'") 587 | } 588 | # Get the predecessor 589 | ls_derivations <- quietly(create_tbl)(doc, var_cols)$result 590 | if(class(ls_derivations)[1] == "list"){ 591 | ls_derivations <- ls_derivations %>% 592 | reduce(bind_rows) 593 | # Get the comments 594 | if(any(str_detect(names(doc), "[C|c]omment"))){ 595 | comments <- doc[str_detect(names(doc), "[C|c]omment")][[1]] |> 596 | select(matches("ID|Description")) 597 | with_comments <- ls_derivations |> 598 | filter(str_to_lower(.data$origin) == "assigned") |> 599 | left_join(comments, by = c("comment" = "ID" )) |> 600 | mutate(comment = .data$Description) |> 601 | select(-.data$Description) 602 | ls_derivations <- ls_derivations |> 603 | filter(str_to_lower(.data$origin) != "assigned") |> 604 | bind_rows(with_comments) 605 | } 606 | } 607 | 608 | other_derivations <- ls_derivations %>% 609 | mutate( 610 | derivation_id = case_when( 611 | str_to_lower(.data$origin) == "predecessor" ~ paste0("pred.", as.character(.data$predecessor)), 612 | str_to_lower(.data$origin) == "assigned" ~ paste0(.data$dataset, ".", .data$variable), 613 | TRUE ~ NA_character_ 614 | ), 615 | derivation = case_when( 616 | str_to_lower(.data$origin) == "predecessor" ~ as.character(.data$predecessor), 617 | str_to_lower(.data$origin) == "assigned" ~ .data$comment, 618 | TRUE ~ NA_character_ 619 | )) %>% 620 | filter(!is.na(.data$derivation_id)) %>% 621 | select(.data$derivation, .data$derivation_id) 622 | 623 | # Select a subset of sheets if specified 624 | if(!is.null(sheet)){ 625 | sheet_ls <- str_subset(names(doc), sheet) 626 | doc <- doc[sheet_ls] 627 | } 628 | out <- create_tbl(doc, cols) 629 | 630 | # Get missing columns 631 | missing <- col_vars()$.derivations %>% 632 | discard(~. %in% names(out)) 633 | 634 | 635 | out %>% 636 | `is.na<-`(missing) %>% 637 | bind_rows(other_derivations) %>% 638 | distinct() %>% 639 | filter(!is.na(derivation_id)) 640 | } 641 | ### Helper Functions 642 | 643 | #' Create table 644 | #' 645 | #' This function creates a table from excel sheets. This is mainly used 646 | #' internally for building spec readers, but is exported so others who need to 647 | #' build spec readers can use it. 648 | #' @param doc list of sheets from a excel doc 649 | #' @param cols vector of regex to get a datasets base on which columns it has. 650 | #' If the vector is named it will also rename the columns 651 | #' 652 | #' @return dataset (or list of datasets if not specific enough) 653 | #' @export 654 | create_tbl <- function(doc, cols){ 655 | matches <- doc %>% 656 | keep(function(x){ 657 | cols %>% 658 | map_lgl(~any(str_detect(names(x), .))) %>% 659 | all() 660 | }) 661 | if(length(matches) == 0) { 662 | # Get which variable can't be matches 663 | mismatch_per_sheet <- doc %>% 664 | map(function(x){ 665 | cols %>% 666 | map_lgl(~any(str_detect(names(x), .))) %>% 667 | discard(~.) # Remove the matched values 668 | }) 669 | # Find the closest sheet by looking for the sheet(s) with the fewest mismatches 670 | mis_lens <- mismatch_per_sheet %>% 671 | map_int(length) 672 | closest_sheets <- mis_lens %>% 673 | keep(~ . == min(mis_lens)) %>% 674 | names() 675 | # Get the name of the sheets and which columns don't match 676 | sheets_to_error <- mismatch_per_sheet %>% 677 | keep(names(.) %in% closest_sheets) 678 | 679 | # Write out the error 680 | sheets_to_error %>% 681 | map2_chr(names(sheets_to_error), function(vars, sheet_name){ 682 | paste0("Sheet '", sheet_name, "' is the closest match, but unable to match the following column(s)\n", 683 | paste(names(vars), collapse = "\n")) 684 | }) %>% 685 | paste0(collapse = "\n") %>% 686 | paste0("Unable to identify a sheet with all columns.\n", . ) %>% 687 | stop(call. = FALSE) 688 | 689 | } else if(length(matches) == 1){ 690 | # Check names and write a better warning message if names don't work 691 | ds_nm <- matches[[1]] %>% 692 | names() 693 | nm_test <- cols %>% 694 | map_int(~sum(str_detect(ds_nm, .))) %>% 695 | keep(~ . != 1) 696 | if(length(nm_test) > 0) { 697 | # See if an exact match will 698 | test_exact <- cols[names(nm_test)] %>% 699 | paste0("^", ., "$") %>% 700 | map_int(~sum(str_detect(ds_nm, .))) %>% 701 | keep(~ . != 1) 702 | if(length(test_exact) == 0){ 703 | cols[names(nm_test)] <- cols[names(nm_test)] %>% 704 | paste0("^", ., "$") 705 | } else { 706 | str_c(names(nm_test), " matches ",nm_test, " columns") %>% 707 | str_c(collapse = "\n ") %>% 708 | paste0("Unable to rename the following columns in ", names(matches[1]), ":\n ", ., 709 | "\nPlease check your regular expression ") %>% 710 | stop(call. = FALSE) 711 | } 712 | } 713 | 714 | # This needs to be done columnwise to allow for duplicate selection of the same column 715 | select_rename_w_dups(matches[[1]], cols) 716 | 717 | } else { 718 | sheets_mats <- matches %>% 719 | names() 720 | paste("Column names are not specific enough to identify a single sheet. The following", 721 | length(sheets_mats), 722 | "match the criteria set:", paste(sheets_mats, collapse = ", ")) %>% 723 | warning(., call. = FALSE) 724 | matches %>% 725 | map(~select_rename_w_dups(., cols)) 726 | } 727 | } 728 | 729 | 730 | #' Yes No to True False 731 | #' 732 | #' @param x takes in a vector to convert 733 | #' 734 | #' @return returns a logical vector or normal vector with warning 735 | #' @noRd 736 | #' 737 | yn_to_tf <- function(x){ 738 | if(all(is.na(x) | str_detect(x, regex("^y$|^n$|^yes$|^no$", ignore_case = T)))){ 739 | case_when(str_detect(x, regex("^y$|^yes$", ignore_case = T)) ~ TRUE, 740 | str_detect(x, regex("^n$|^no$", ignore_case = T)) ~ FALSE, 741 | is.na(x) ~ NA) 742 | } else if(is.logical(x)){ 743 | x 744 | } else { 745 | warning("Keep column needs to be True or False, please correct before converting to a Metacore object", 746 | call. = FALSE) 747 | x 748 | } 749 | } 750 | 751 | 752 | #' Select in a dataset with renames 753 | #' 754 | #' This works like select, but if there are duplicates it won't cause issues 755 | #' 756 | #' @param .data dataset to select columns and rename 757 | #' @param cols named vector 758 | #' 759 | #' @return dataset 760 | #' @noRd 761 | #' @importFrom purrr safely 762 | select_rename_w_dups <- function(.data, cols){ 763 | pull_safe <- safely(~select(.x, matches(.y, ignore.case = FALSE))) 764 | cols %>% 765 | map_dfr(function(col){ 766 | out <- pull_safe(.data, col) %>% 767 | .$result 768 | if(ncol(out) == 1){ 769 | out <- out %>% pull(1) 770 | } else { 771 | out <- NULL 772 | } 773 | out 774 | }) 775 | } 776 | -------------------------------------------------------------------------------- /R/utils-pipe.R: -------------------------------------------------------------------------------- 1 | #' Pipe operator 2 | #' 3 | #' See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details. 4 | #' 5 | #' @name %>% 6 | #' @rdname pipe 7 | #' @keywords internal 8 | #' @export 9 | #' @importFrom magrittr %>% 10 | #' @usage lhs \%>\% rhs 11 | NULL 12 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | #' Add a label to a vector 2 | #' 3 | #' @param x Vector to add label to 4 | #' @param label Label to add to vector 5 | #' @return Labeled vector 6 | #' @noRd 7 | 8 | add_lab <- function(x, label) { 9 | if (length(label) == 0) { 10 | label <- NULL 11 | } 12 | attr(x, "label") <- label 13 | x 14 | } 15 | 16 | #' Add Labels to Dataframe 17 | #' @param .data Dataframe that you want to add labels to 18 | #' @param ... Labeled vector of labels, with the name of the element equal to 19 | #' the name of the column and the element equaling the label, or comma-separated name-value pair 20 | #' @importFrom purrr map2 21 | #' @importFrom tibble tibble as_tibble 22 | #' @importFrom dplyr filter pull 23 | #' @return Dataframe with labels 24 | #' @noRd 25 | add_labs <- function(.data,...) { 26 | name_list <- c(...) 27 | df <- tibble(col = names(name_list), lab = name_list) 28 | .data %>% 29 | purrr::map2(names(.data), function(x, name) { 30 | label <- df %>% 31 | filter(col == name) %>% 32 | pull(lab) %>% 33 | unname() 34 | if(length(label) > 0) { 35 | add_lab(x, label) 36 | } else { 37 | x 38 | } 39 | }) %>% 40 | as_tibble() 41 | } 42 | 43 | 44 | #' Column Validation Function 45 | #' 46 | #' @param .data the dataframe to check the column for 47 | #' @param col the column to test 48 | #' @param func the function to use to assert column structure 49 | #' @param any_na_acceptable boolean, testing if the column can have missing 50 | #' @param nm name of column to check (for warning and error clarification) 51 | #' 52 | check_structure <- function(.data, col, func, any_na_acceptable, nm) { 53 | 54 | column <- as_string(col) 55 | 56 | vec <- .data %>% pull(!!col) 57 | 58 | if(any(is.na(vec)) & !any_na_acceptable) { 59 | error_message <- paste(column, "from the", nm, "table contains missing values. Actual values are needed.") 60 | warning_string <- NULL 61 | } else if (all(is.na(vec))){ 62 | warning_string <- paste(column, "from the", nm, 63 | "table only contain missing values.") 64 | error_message <- NULL 65 | } else { 66 | 67 | failures <- vec %>% 68 | discard(~do.call(func, list(.))) %>% 69 | unique() 70 | 71 | all_fails <- paste(" ", failures, collapse = "\n") 72 | error_message <- NULL 73 | 74 | if (length(failures) > 0 ) { 75 | 76 | if (is.primitive(func)) { 77 | 78 | assertion_func <- prim_name(func) 79 | warning_string <- paste0(nm, "$", column, " fails ", assertion_func, " check \n") 80 | 81 | } else { 82 | warning_string <- paste0("The following words in ", nm, "$", column, " are not allowed: \n", all_fails, "\n") 83 | } 84 | 85 | } else { 86 | warning_string <- NULL 87 | } 88 | 89 | } 90 | 91 | list(warning = warning_string, error = error_message) 92 | } 93 | 94 | #' Check Words in Column 95 | #' 96 | #' @param ... permissible words in the column 97 | #' @param col the column to check for specific words 98 | check_words <- function(..., col) { 99 | accepted_words <- unlist(c(...)) 100 | expr <- expr(function(col) col %in% !!accepted_words) 101 | make_function(body = expr, env = parent.frame())() 102 | } 103 | 104 | make_function <- function(args = pairlist(), body, env = parent.frame()) { 105 | eval(call("function", args, body), env) 106 | } 107 | 108 | 109 | #' Get path to metacore example 110 | #' 111 | #' metacore comes bundled with a number of sample files in its `inst/extdata` 112 | #' directory. This function make them easy to access. When testing or writing 113 | #' examples in other packages, it is best to use the 'pilot_ADaM.rda' example as 114 | #' it loads fastest. 115 | #' @param file Name of file. If `NULL`, the example files will be listed. 116 | #' @export 117 | #' @examples 118 | #' metacore_example() 119 | #' metacore_example("mock_spec.xlsx") 120 | metacore_example <- function(file = NULL) { 121 | if (is.null(file)) { 122 | dir(system.file("extdata", package = "metacore")) 123 | } else { 124 | system.file("extdata", file, package = "metacore", mustWork = TRUE) 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /R/validators.R: -------------------------------------------------------------------------------- 1 | #' Checks the variables in ds_var are in var_spec 2 | #' 3 | #' @param ds_vars ds_var table 4 | #' @param var_spec var_spec table 5 | #' 6 | #' @return writes warning to console if there is an issue 7 | #' @noRd 8 | ds_vars_check <- function(ds_vars, var_spec){ 9 | var_check <- anti_join(ds_vars, var_spec, by = "variable") 10 | 11 | if(var_check %>% nrow()){ 12 | var_ls <- var_check %>% 13 | pull(.data$variable) %>% 14 | unique() 15 | 16 | var_check_dbl <- ds_vars %>% 17 | filter(.data$variable %in% var_ls) %>% 18 | mutate(var_name = paste0(.data$dataset, ".", .data$variable)) %>% 19 | anti_join(var_spec, by = c("var_name" = "variable")) %>% 20 | pull(.data$variable) %>% 21 | unique() 22 | 23 | if(var_check_dbl %>% length() != 0){ 24 | cat("\n") 25 | warning( 26 | "The following variable(s) do not have labels and lengths: ", 27 | paste(" ", var_check_dbl, sep = "\n "), 28 | "\n\n", 29 | call. = FALSE 30 | ) 31 | } 32 | } 33 | } 34 | 35 | 36 | #' Check Values 37 | #' 38 | #' 39 | #' Check the value spec table works with the ds_var tables. All variables in the 40 | #' ds_var should be in the value_spec and all variables in value_spec should be 41 | #' in ds_vars 42 | #' @param ds_vars ds_vars table 43 | #' @param value_spec value_spec table 44 | #' 45 | #' @return writes warning to console if there is an issue 46 | #' @noRd 47 | value_check <- function(ds_vars, value_spec){ 48 | value_vars <- value_spec %>% 49 | distinct(.data$dataset, .data$variable) 50 | 51 | #Check the variables in ds_vars that don't have value specs 52 | not_in_val <- anti_join(ds_vars, value_vars, by = c("dataset", "variable")) 53 | if(nrow(not_in_val) != 0){ 54 | variables <- not_in_val %>% 55 | mutate(full = str_c(.data$dataset, .data$variable, sep = ".")) %>% 56 | pull(.data$full) %>% 57 | str_c(collapse = ", ") 58 | message <- paste("The following variables are in the ds_vars table, but don't have value specs:\n", 59 | variables, "\n\n") 60 | warning(message, call. = FALSE) 61 | } 62 | # Check the variables in value spec that aren't in ds_vars 63 | not_in_ds <- anti_join(value_vars, ds_vars, by = c("dataset", "variable")) 64 | if(nrow(not_in_ds) != 0){ 65 | variables <- not_in_ds %>% 66 | pull(.data$variable) %>% 67 | str_c(collapse = ", ") 68 | message <- paste("The following variables are have value specifications, but aren't in the ds_vars table:\n", 69 | variables, "\n\n") 70 | warning(message, call. = FALSE) 71 | } 72 | } 73 | 74 | 75 | #' Derivation check 76 | #' 77 | #' @param value_spec value_spec table 78 | #' @param derivations derivation table 79 | #' 80 | #' @return writes warning to console if there is an issue 81 | #' @noRd 82 | derivation_check <- function(value_spec, derivations){ 83 | deriv_vars <- value_spec %>% 84 | filter(!is.na(.data$derivation_id)) %>% 85 | distinct(.data$variable,.data$ derivation_id) 86 | 87 | #Check the variables that don't have derivations in derivations 88 | not_in_val <- anti_join(deriv_vars, derivations, by = c("derivation_id")) 89 | if(nrow(not_in_val) != 0){ 90 | variables <- not_in_val %>% 91 | pull(.data$variable) %>% 92 | str_c(collapse = "\n ") 93 | message <- paste("The following variables have derivation ids not found in the derivations table:\n", 94 | variables, "\n\n") 95 | warning(message, call. = FALSE) 96 | } 97 | # Check the derivations in deriavtion that aren't in value spec 98 | not_in_deriv <- anti_join(derivations, deriv_vars, by = c("derivation_id")) 99 | if(nrow(not_in_deriv) != 0){ 100 | deriv <- not_in_deriv %>% 101 | mutate(message = paste0(.data$derivation_id, ": ", .data$derivation)) %>% 102 | pull(.data$message) %>% 103 | str_c(collapse = "\n ") 104 | message <- paste("The following derivations are never used:\n", 105 | deriv, "\n\n") 106 | warning(message, call. = FALSE) 107 | } 108 | 109 | } 110 | 111 | #' Codelist Check 112 | #' 113 | #' @param value_spec value spec table 114 | #' @param codelist codelist table 115 | #' 116 | #' @return writes warning to console if there is an issue 117 | #' @noRd 118 | codelist_check <- function(value_spec, codelist){ 119 | code_vars <- value_spec %>% 120 | filter(!is.na(.data$code_id)) %>% 121 | distinct(.data$variable, .data$code_id) 122 | 123 | #Check the variables in don't codelists have codelist 124 | not_in_val <- anti_join(code_vars, codelist, by = c("code_id")) 125 | if(nrow(not_in_val)){ 126 | variables <- not_in_val %>% 127 | pull(.data$variable) %>% 128 | str_c(collapse = "\n ") 129 | message <- paste("The following variables have code ids not found in the codelist(s):\n", 130 | variables, "\n") 131 | warning(message, call. = FALSE) 132 | } 133 | # Check the code_ids in codelist that aren't in value spec 134 | not_in_cl <- anti_join(codelist, code_vars, by = c("code_id")) 135 | if(nrow(not_in_cl)){ 136 | cl_nm <- not_in_cl %>% 137 | pull(.data$name) %>% 138 | str_c(collapse = "\n ") 139 | message <- paste("The following codelist(s) are never used:\n", 140 | cl_nm, "\n\n") 141 | warning(message, call. = FALSE) 142 | } 143 | } 144 | 145 | 146 | #' Check Supp 147 | #' 148 | #' 149 | #' Check the supp table works with the ds_var tables. All variables in the 150 | #' ds_var with a TRUE supp flag should be in the supp and all variables in supp 151 | #' should be in ds_vars 152 | #' @param ds_vars ds_vars table 153 | #' @param supp supp table 154 | #' 155 | #' @return writes warning to console if there is an issue 156 | #' @noRd 157 | supp_check <- function(ds_vars, supp){ 158 | dist_test <- supp %>% 159 | distinct(.data$dataset, .data$variable) %>% 160 | nrow() == nrow(supp) 161 | if(!dist_test){ 162 | warning("Supp table contains non-unique dataset/variable combinations") 163 | } 164 | 165 | ds_vars <- ds_vars %>% 166 | filter(.data$supp_flag) 167 | 168 | #Check the variables in ds_vars that don't have value specs 169 | not_in_supp <- anti_join(ds_vars, supp, by = c("dataset", "variable")) 170 | if(nrow(not_in_supp) != 0){ 171 | variables <- not_in_supp %>% 172 | mutate(full = str_c(.data$dataset, .data$variable, sep = ".")) %>% 173 | pull(.data$full) %>% 174 | str_c(collapse = ", ") 175 | message <- paste("The following variables are in the ds_vars table and tagged as supplement, but don't have supp specs:\n", 176 | variables, "\n\n") 177 | warning(message, call. = FALSE) 178 | } 179 | # Check the variables in value spec that aren't in ds_vars 180 | not_in_ds <- anti_join(supp, ds_vars, by = c("dataset", "variable")) 181 | if(nrow(not_in_ds) != 0){ 182 | variables <- not_in_ds %>% 183 | pull(.data$variable) %>% 184 | str_c(collapse = ", ") 185 | message <- paste("The following variables are have supp specifications, but aren't in the ds_vars table:\n", 186 | variables, "\n\n") 187 | warning(message, call. = FALSE) 188 | } 189 | } 190 | 191 | 192 | #' Column Names by dataset 193 | #' 194 | #' @return list of column names by dataset 195 | #' @noRd 196 | col_vars <- function(){ 197 | list(.ds_spec = c("dataset", "structure", "label"), 198 | .ds_vars = c("dataset", "variable", "key_seq", "order","keep", "core", "supp_flag"), 199 | .var_spec = c("variable", "length", "label", "type", "common", "format"), 200 | .value_spec = c("dataset", "variable", "type", "origin","sig_dig", "code_id", "where", "derivation_id"), 201 | .derivations = c("derivation_id", "derivation"), 202 | .codelist= c("code_id", "name","type", "codes"), 203 | .supp = c("dataset", "variable", "idvar", "qeval")) 204 | } 205 | 206 | 207 | #' Check Variable names 208 | #' 209 | #' @param envrionment the private environment of the object 210 | #' 211 | #' @return warning messages to the console if there is an issue 212 | #' @noRd 213 | var_name_check <- function(envrionment){ 214 | # Set the name as they should be 215 | col_names <- col_vars() 216 | # Get the tables and table names from the environment 217 | tbl_name <- ls(envrionment, all.names = TRUE) 218 | tbls <- map(tbl_name, get, envir = envrionment) 219 | # Checks is names match the table above, returns T if so F else. If the names 220 | # don't match, will also produce a warning of what the names should be 221 | map2_lgl(tbl_name, tbls, function(name, tbl){ 222 | name 223 | if(is.null(tbl)){ 224 | # Checks for null tables 225 | print_message <- name %>% 226 | str_remove("[:punct:]") %>% 227 | paste("is null") 228 | warning(print_message, call. = FALSE) 229 | FALSE 230 | } else if(!setequal(names(tbl),col_names[[name]])){ 231 | # writes a message if the column names don't match 232 | print_message <- name %>% 233 | str_remove("[:punct:]") %>% 234 | paste0("'", ., "' has incorrect column names. It should be:\n", 235 | str_c(col_names[[name]], collapse = ", "), "\n") 236 | warning(print_message, call. = FALSE) 237 | FALSE 238 | } else { 239 | TRUE 240 | } 241 | }) %>% 242 | all() 243 | 244 | } 245 | 246 | 247 | 248 | #' Column Data Check <- lol horrible name 249 | #' 250 | #' @return a data frame of the datasets, column 251 | #' @noRd 252 | #' 253 | all_message <- function() { 254 | tribble( 255 | ~dataset, ~var, ~test, ~any_na_acceptable, 256 | "ds_spec", "dataset", is.character, FALSE, 257 | "ds_spec", "structure", is.character, TRUE, 258 | "ds_spec", "label", is.character, TRUE, 259 | "ds_vars", "dataset", is.character, FALSE, 260 | "ds_vars", "variable", is.character, FALSE, 261 | "ds_vars", "key_seq", is.numeric, TRUE, 262 | "ds_vars", "order", is.numeric, TRUE, 263 | "ds_vars", "keep", is.logical, TRUE, 264 | "ds_vars", "core", check_words("Expected", "Required", "Permissible", "Conditionally Required", "Conditionally Expected", NA), TRUE, 265 | "ds_vars", "supp_flag", is.logical, TRUE, 266 | "var_spec", "variable", is.character, FALSE, 267 | "var_spec", "type", is.character, TRUE, 268 | "var_spec", "length", is.numeric, TRUE, 269 | "var_spec", "label", is.character, TRUE, 270 | "var_spec", "format", is.character, TRUE, 271 | "var_spec", "common", is.logical, TRUE, 272 | "value_spec", "type", is.character, TRUE, 273 | "value_spec", "sig_dig", is.integer, TRUE, 274 | "value_spec", "origin", function(x){str_detect(x, "collected|derived|assigned|protocol|predecessor|crf.*")||is.na(x)}, TRUE, 275 | "value_spec", "code_id", is.character, TRUE, 276 | "value_spec", "dataset", is.character, FALSE, 277 | "value_spec", "where", is.character, TRUE, 278 | "value_spec", "derivation_id", is.character, TRUE, 279 | "derivations", "derivation_id", is.character, FALSE, 280 | "derivations", "derivation", is.character, TRUE, 281 | "codelist", "code_id", is.character, FALSE, 282 | "codelist", "name", is.character, TRUE, 283 | "codelist", "codes", function(x){!is.null(x)}, TRUE, 284 | "codelist", "type", is.character, TRUE, 285 | "supp", "dataset", is.character, FALSE, 286 | "supp", "variable", is.character, FALSE, 287 | "supp", "idvar", is.character, TRUE, 288 | "supp", "qeval", is.character, TRUE, 289 | ) 290 | } 291 | 292 | 293 | #' Check all data frames include the correct types of columns 294 | #' 295 | #' This function checks for vector types and accepted words 296 | #' 297 | #' @param ds_spec dataset specification 298 | #' @param ds_vars dataset variables 299 | #' @param var_spec variable specification 300 | #' @param value_spec value specification 301 | #' @param derivations derivation information 302 | #' @param codelist codelist information 303 | #' @param supp supp information 304 | #' 305 | check_columns <- function(ds_spec, ds_vars, var_spec, value_spec, derivations, codelist, supp) { 306 | 307 | 308 | messages <- purrr::pmap(all_message(), 309 | ~check_structure( 310 | get(..1), sym(..2), ..3, ..4, ..1) 311 | ) 312 | 313 | # errors 314 | errors <- map(messages, "error") %>% 315 | compact() %>% 316 | paste0(., collapse = "\n\n") 317 | if(errors != "") 318 | stop(paste0(errors, "\n\n"), call. = FALSE) 319 | 320 | # warnings 321 | warnings <- map(messages, "warning") %>% 322 | compact() %>% 323 | paste0(., collapse = "\n\n") 324 | if(warnings != "") 325 | warning(paste0(warnings, "\n\n"), call. = FALSE) 326 | 327 | 328 | 329 | } 330 | 331 | #' Is metacore object 332 | #' 333 | #' @param x object to check 334 | #' 335 | #' @return `TRUE` if metacore, `FALSE` if not 336 | #' @export 337 | #' 338 | #' @examples 339 | #' # Loads in a metacore obj called metacore 340 | #' load(metacore_example("pilot_ADaM.rda")) 341 | #' is_metacore(metacore) 342 | #' 343 | is_metacore <- function(x){ 344 | inherits(x, "Metacore") 345 | } 346 | -------------------------------------------------------------------------------- /R/xml_builders.R: -------------------------------------------------------------------------------- 1 | #' Define XML to DataDef Object 2 | #' 3 | #' Given a path, this function converts the define xml to a DataDef Object 4 | #' 5 | #' @param path location of the define xml as a string 6 | #' @param quiet Option to quietly load in, this will suppress warnings, but not errors 7 | #' 8 | #' @return DataDef Object 9 | #' @export 10 | #' @importFrom xml2 read_xml xml_find_all xml_attr xml_ns_strip 11 | define_to_metacore <- function(path, quiet = FALSE){ 12 | 13 | xml <- read_xml(path) 14 | xml_ns_strip(xml) 15 | 16 | define_version <- xml_find_all(xml, "//MetaDataVersion") %>% 17 | xml_attr("DefineVersion") %>% 18 | as.numeric_version() 19 | 20 | 21 | ds_spec <- xml_to_ds_spec(xml) 22 | ds_vars <- xml_to_ds_vars(xml) 23 | var_spec <- xml_to_var_spec(xml) 24 | value_spec <- xml_to_value_spec(xml) 25 | code_list <- xml_to_codelist(xml) 26 | derivations <- xml_to_derivations(xml) 27 | if(!quiet){ 28 | out <- metacore(ds_spec, ds_vars, var_spec, value_spec, derivations, codelist = code_list) 29 | } else{ 30 | out<- suppressWarnings(metacore(ds_spec, ds_vars, var_spec, value_spec, derivations, codelist = code_list)) 31 | message("Loading in metacore object with suppressed warnings") 32 | } 33 | out 34 | } 35 | 36 | 37 | #' XML to Data Set Spec 38 | #' 39 | #' Creates a dataset specification, which has the domain name and label for each dataset 40 | #' @param doc xml document 41 | #' 42 | #' @return data frame with the data set specifications 43 | #' @family xml builder 44 | #' @export 45 | #' @importFrom xml2 xml_attr xml_find_first xml_text 46 | xml_to_ds_spec <- function(doc) { 47 | # Read in the dataset level nodes 48 | xml_find_all(doc, "//MetaDataVersion/ItemGroupDef[contains(@OID, 'IG')]") %>% 49 | map_dfr(function(node){ 50 | tibble( 51 | dataset = xml_attr(node, "Name"), 52 | structure = xml_attr(node, "Structure"), 53 | label = xml_find_first(node, "./Description") %>% xml_text() 54 | ) 55 | }) 56 | } 57 | 58 | 59 | #' XML to Data Set Var table 60 | #' 61 | #' Creates the ds_vars table, which acts as a key between the datasets and the var spec 62 | #' @param doc xml document 63 | #' 64 | #' @return data frame with the dataset and variables 65 | #' @family xml builder 66 | #' @export 67 | #' 68 | xml_to_ds_vars <- function(doc) { 69 | # Each dataset is an ItemGroupDef 70 | xml_find_all(doc, "//ItemGroupDef") %>% 71 | map_dfr(function(node){ 72 | # Each Variable is a Item Ref 73 | child_node <- xml_find_all(node, "./ItemRef") 74 | tibble( 75 | dataset = xml_attr(node, "Name"), 76 | oid = xml_attr(child_node, "ItemOID"), 77 | mandatory = xml_attr(child_node, "Mandatory"), 78 | key_seq = xml_attr(child_node, "KeySequence") %>% 79 | as.integer(), 80 | order = xml_attr(child_node, "OrderNumber") %>% 81 | as.integer() 82 | ) 83 | }) %>% 84 | mutate( 85 | variable = id_to_var(.data$oid), 86 | keep = .data$mandatory == "Yes", 87 | core = NA_character_, 88 | supp_flag = NA 89 | ) %>% 90 | select(.data$dataset, .data$variable, .data$key_seq, 91 | .data$order, .data$keep, .data$core, .data$supp_flag, 92 | -.data$mandatory, -.data$oid) 93 | } 94 | 95 | 96 | #' XML to variable spec 97 | #' 98 | #' Takes a define xml and returns a dataset with specifications for each 99 | #' variable. The variable will just be the variable, unless the specification 100 | #' for that variable differ between datasets 101 | #' 102 | #' @param doc define xml document 103 | #' 104 | #' @return data frame with variable, length, label columns 105 | #' @family xml builder 106 | #' @export 107 | #' @importFrom xml2 xml_attr xml_find_all xml_text 108 | xml_to_var_spec <- function(doc) { 109 | 110 | # Gets the name, id, and length from the variable node and the description from the child 111 | var_info <- xml_find_all(doc, "//ItemDef") %>% 112 | map_dfr(function(node){ 113 | tibble( 114 | oid = xml_attr(node,"OID") %>% as.character(), 115 | variable = xml_attr(node, "Name") %>% as.character(), 116 | type = xml_attr(node, "DataType"), 117 | length = xml_attr(node, "Length") %>% as.integer(), 118 | format = xml_attr(node, "DisplayFormat"), 119 | label = xml_find_first(node, "./Description/TranslatedText") %>% 120 | xml_text() 121 | ) 122 | }) 123 | 124 | possible_vars <- xml_find_all(doc, "//ItemGroupDef/ItemRef") %>% 125 | map_chr(function(node){ 126 | oid = xml_attr(node, "ItemOID") 127 | }) 128 | 129 | # Get for each variable, get the number of distinct lengths and labels 130 | dist_df <- var_info %>% 131 | filter(.data$oid %in% possible_vars) %>% 132 | distinct(.data$variable, .data$length, .data$label, .data$type, .keep_all = TRUE) %>% 133 | group_by(.data$variable) %>% 134 | mutate( 135 | n = n(), 136 | common = NA 137 | ) %>% 138 | ungroup() 139 | 140 | # For variables with more than one distinct label, this gets all the full 141 | # variable names with that root. Sometimes 3 variables will have the same root 142 | # (i.e. ARMCD), 2 of them will match, but one of them won't. This means the 143 | # two matching will have been collapsed to one in the distinct and we have to 144 | # bring back the one that got dropped. Cause all of them need to be DS.var 145 | full_name_vars <- dist_df %>% 146 | filter(n > 1) %>% 147 | select(.data$variable) %>% 148 | inner_join(var_info, by = "variable") %>% 149 | mutate(variable = str_remove(.data$oid, "^IT\\.")) %>% 150 | distinct() 151 | 152 | # Combine the variables that need full names with the variables that don't 153 | dist_df %>% 154 | filter(n == 1) %>% 155 | bind_rows(full_name_vars) %>% 156 | select(.data$variable, .data$type, .data$length, .data$label, 157 | .data$format, .data$common, -.data$n, -.data$oid) 158 | } 159 | 160 | 161 | #' XML to value spec 162 | #' 163 | #' Takes a define xml and pulls out the value level metadata including codelist_id's, 164 | #' defines_id's, and where clause. There is one row per variable expect when there 165 | #' is a where clause, at which point there is one row per value. 166 | #' @param doc xml document 167 | #' 168 | #' @return tibble with the value level information 169 | #' @family xml builder 170 | #' @export 171 | #' 172 | #' @importFrom xml2 xml_attr xml_find_first xml_parent xml_find_all 173 | xml_to_value_spec <- function(doc) { 174 | # Get information in the item definition 175 | item_def <- xml_find_all(doc, "//ItemDef") %>% 176 | map_dfr(function(node){ 177 | tibble( 178 | oid = xml_attr(node,"OID") %>% as.character(), 179 | variable = xml_attr(node, "Name") %>% as.character(), 180 | type = xml_attr(node, "DataType"), 181 | sig_dig = xml_attr(node, "SignificantDigits") %>% as.integer(), 182 | origin = xml_find_first(node, "./def:Origin") %>% xml_attr("Type"), 183 | page_num = xml_find_first(node, "./def:Origin/def:DocumentRef/def:PDFPageRef") %>% xml_attr("PageRefs"), 184 | predecessor = xml_find_first(node, "./def:Origin") %>% xml_text(), 185 | comment_id = xml_attr(node,"CommentOID"), 186 | code_id = xml_find_first(node, "CodeListRef") %>% xml_attr("CodeListOID"), 187 | varname = xml_attr(node, "SASFieldName") %>% as.character() 188 | ) 189 | }) %>% 190 | mutate( 191 | origin = if_else(.data$origin == "Collected" & !is.na(.data$page_num), 192 | paste0(.data$origin,", page_num = ", .data$page_num), 193 | .data$origin) 194 | ) %>% 195 | select(-.data$page_num) 196 | 197 | # Pull the information from the item reference only for dataset variable, not where level information 198 | derivations <- xml_find_all(doc, "//ItemGroupDef/ItemRef") %>% 199 | map_dfr(function(node){ 200 | tibble( 201 | oid = xml_attr(node, "ItemOID") %>% as.character(), 202 | dataset = xml_parent(node) %>% xml_attr("Name") %>% as.character(), 203 | derivation_id = xml_attr(node, "MethodOID") 204 | ) 205 | }) 206 | # Combine all the item information but 207 | item_info <- left_join(derivations, item_def, by = "oid") 208 | 209 | where_to_merge <- xml_find_all(doc, "//def:ValueListDef/ItemRef") %>% 210 | map_dfr(function(node){ 211 | tibble( 212 | oid = xml_parent(node) %>% xml_attr("OID") %>% as.character(), 213 | item_oid = xml_attr(node, "ItemOID"), 214 | ord = xml_attr(node, "OrderNumber"), 215 | where_oid = xml_find_all(node, "./def:WhereClauseRef") %>% 216 | xml_attr("WhereClauseOID"), 217 | derivation_id = xml_attr(node, "MethodOID") 218 | ) 219 | } 220 | ) 221 | 222 | where_eqs <- xml_find_all(doc, "//def:WhereClauseDef[@OID]/RangeCheck") %>% 223 | map_dfr(function(node){ 224 | tibble( 225 | where_oid = xml_parent(node) %>% xml_attr("OID"), 226 | left = xml_attr(node, "ItemOID"), 227 | test = xml_attr(node, "Comparator"), 228 | right = xml_find_all(node, "./CheckValue") %>% xml_text() 229 | ) 230 | } 231 | ) 232 | # create 0x4 tibble if where_eqs is 0x0 233 | # tmp workaround until below bug is resolved in purrr 234 | # https://github.com/tidyverse/purrr/issues/824 235 | if(nrow(where_eqs) == 0){ 236 | where_eqs <- tibble(where_oid=character(), 237 | left=character(), 238 | test=character(), 239 | right = character()) 240 | } 241 | 242 | if(nrow(where_to_merge) == 0){ 243 | where_eqs <- where_eqs %>% 244 | mutate(item_oid = .data$left, 245 | derivation_id = paste0("MT", str_remove(.data$left, "IT"), ".", .data$right), 246 | ord = NA, 247 | oid = .data$left) %>% 248 | left_join(item_def, by = c("oid")) %>% 249 | left_join(select(derivations, -.data$derivation_id), by = c("oid")) 250 | 251 | } else{ 252 | where_eqs<- full_join(where_to_merge, where_eqs, by = "where_oid") %>% 253 | left_join(item_def, by = c("item_oid" = "oid")) %>% 254 | # Allow for merging with the derivations to get the dataset 255 | mutate(oid = paste0("IT", str_remove(.data$oid, "^VL")), 256 | variable = .data$varname) %>% 257 | left_join(select(derivations, -.data$derivation_id), by = c("oid")) 258 | } 259 | 260 | all_where_eqs <- where_eqs %>% 261 | group_by(.data$where_oid) %>% 262 | mutate(var = str_extract(.data$left, "\\w*$"), 263 | right = paste0("'", .data$right, "'"), 264 | test = case_when(.data$test == "EQ" ~ "==", 265 | .data$test == "LT" ~ "<", 266 | .data$test == "LE" ~ "<=", 267 | .data$test == "GT" ~ ">", 268 | .data$test == "GE" ~ ">=", 269 | .data$test == "NE" ~ "!=", 270 | TRUE ~ .data$test), 271 | eq = case_when( test == "IN" ~ paste(.data$var, "%in%", "c(", 272 | paste(.data$right, collapse = ","), 273 | ")"), 274 | test == "NOTIN" ~ paste("!", .data$var, "%in%", "c(", 275 | paste(.data$right, collapse = ","), 276 | ")"), 277 | TRUE ~ paste(.data$var, .data$test, .data$right, collapse = " & ")) 278 | ) %>% 279 | select(-.data$left, -.data$var, -.data$test, -.data$right) %>% 280 | distinct() %>% 281 | group_by(.data$item_oid, .data$derivation_id) %>% 282 | mutate(full_eq = str_c(.data$eq, collapse = "||")) %>% 283 | filter(!is.na(.data$item_oid)) %>% 284 | ungroup() %>% 285 | select(-.data$eq, where = .data$full_eq, .data$derivation_id, 286 | -.data$where_oid, -.data$ord, -.data$item_oid) 287 | 288 | val_spec <- item_info %>% 289 | anti_join(all_where_eqs, by = c("oid")) %>% #remove any variables with a where 290 | bind_rows(all_where_eqs) %>% 291 | mutate(derivation_id = case_when( 292 | .data$origin == "Predecessor" & !is.na(.data$predecessor) ~ .data$predecessor, 293 | .data$origin == "Assigned" & !is.na(.data$comment_id) ~ .data$comment_id, 294 | TRUE ~ .data$derivation_id)) %>% 295 | select(.data$dataset, .data$variable, .data$code_id, .data$derivation_id, 296 | .data$type, .data$origin, .data$where, .data$sig_dig, 297 | -.data$predecessor, -.data$comment_id, -.data$varname, 298 | -.data$oid) 299 | 300 | val_spec 301 | } 302 | 303 | 304 | #' XML to code list 305 | #' 306 | #' Reads in a define xml and creates a code_list table. The code_list table is a 307 | #' nested tibble where each row is a code list or permitted value list. The code 308 | #' column contains a vector of a tibble depending on if it is a permitted values 309 | #' or code list 310 | #' 311 | #' @param doc xml document 312 | #' 313 | #' @return a tibble containing the code list and permitted value information 314 | #' @family xml builder 315 | #' @export 316 | xml_to_codelist <- function(doc) { 317 | cl_pv <- xml_find_all(doc, "//CodeList") %>% 318 | map_dfr(function(node){ 319 | #Values within the code/decode 320 | node_children_EI <- xml_find_all(node, "./EnumeratedItem") 321 | node_children_CL <- xml_find_all(node, "./CodeListItem") 322 | 323 | decodes <- c(xml_find_all(node_children_EI, "./Decode") %>% xml_text(), 324 | xml_find_all(node_children_CL, "./Decode") %>% xml_text()) 325 | if(length(decodes) == 0){ 326 | decodes <- NA_character_ 327 | } 328 | 329 | tibble( 330 | code_id = xml_attr(node, "OID"), 331 | name = xml_attr(node, "Name"), 332 | code = c(xml_attr(node_children_EI, "CodedValue"), 333 | xml_attr(node_children_CL, "CodedValue")), 334 | decode = decodes 335 | ) 336 | }) %>% 337 | group_by(code_id) %>% 338 | mutate(type = if_else(all(is.na(decode)), 339 | "permitted_val", "code_decode")) 340 | 341 | permitted_val <- cl_pv %>% 342 | filter(type == "permitted_val") %>% 343 | select(-.data$decode) %>% 344 | nest(codes = c(.data$code)) 345 | 346 | code_decode <- cl_pv %>% 347 | filter(type == "code_decode") %>% 348 | nest(codes = c(.data$code, .data$decode)) 349 | 350 | 351 | external_libs <- xml_find_all(doc, "//CodeList/ExternalCodeList") %>% 352 | map_dfr(function(node){ 353 | tibble( 354 | code_id = xml_parent(node) %>% xml_attr("OID"), 355 | name = xml_parent(node) %>% xml_attr("Name"), 356 | dictionary = xml_attr(node, "Dictionary"), 357 | version = xml_attr(node, "Version"), 358 | type = "external_library" 359 | ) 360 | }) 361 | if(nrow(external_libs) > 0){ 362 | external_libs <- external_libs |> 363 | nest(codes = c(.data$dictionary, .data$version)) 364 | } 365 | 366 | 367 | # Combinging the code decode with the permitted values 368 | bind_rows(code_decode, permitted_val, external_libs) %>% 369 | ungroup() 370 | } 371 | 372 | 373 | 374 | #' XML to derivation table 375 | #' 376 | #' This reads in a xml document and gets all the derivations/comments. These can 377 | #' be cross referenced to variables using the derivation_id's 378 | #' @param doc xml document 379 | #' 380 | #' @return dataframe with derivation id's and derivations 381 | #' @family xml builder 382 | #' @export 383 | #' 384 | xml_to_derivations <- function(doc) { 385 | 386 | derivation <- 387 | xml_find_all(doc, "//MethodDef") %>% 388 | map_dfr(function(node){ 389 | tibble(derivation_id = xml_attr(node, "OID"), 390 | derivation = xml_find_first(node, "./Description/TranslatedText") %>% 391 | xml_text()) 392 | }) 393 | 394 | comment <- 395 | xml_find_all(doc, "//def:CommentDef") %>% 396 | map_dfr(function(node){ 397 | tibble(derivation_id = xml_attr(node, "OID"), 398 | derivation = xml_find_first(node, "./Description/TranslatedText") %>% 399 | xml_text()) 400 | }) 401 | 402 | 403 | predecessor <- xml_find_all(doc, "//ItemDef") %>% 404 | map_dfr(function(node){ 405 | tibble( 406 | derivation_id = xml_find_first(node, "./def:Origin") %>% xml_text(), 407 | derivation = derivation_id 408 | ) 409 | }) %>% 410 | filter(!is.na(.data$derivation) & str_length(.data$derivation) > 0) 411 | 412 | bind_rows(derivation, 413 | comment, 414 | predecessor) %>% 415 | distinct() 416 | } 417 | -------------------------------------------------------------------------------- /R/xml_helpers.R: -------------------------------------------------------------------------------- 1 | 2 | #' id to dataset 3 | #' 4 | #' @param id vector of id's 5 | #' 6 | #' @return vector of datasets 7 | #' @noRd 8 | id_to_ds <- function(id) { 9 | id %>% str_extract("(?<=^IT\\.)[:alnum:]+(?=\\..*)") 10 | } 11 | 12 | #' id to variable 13 | #' 14 | #' @param id vector of id's 15 | #' @param ds vector of ds's 16 | #' 17 | #' @return vector of variable names 18 | #' @noRd 19 | id_to_var <- function(id) { 20 | ds <- id %>% str_extract("(?<=^IT\\.)[:alnum:]+(?=\\..*)") 21 | extract <- if_else(is.na(ds), "(?<=^IT\\.)[:alnum:]*", 22 | str_c("(?<=^IT\\.", ds, "\\.)[:alnum:]*") 23 | ) 24 | id %>% 25 | str_extract(extract) 26 | } 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | #' @import R6 2 | #' @importFrom tidyr replace_na nest unnest 3 | #' @importFrom tidyselect everything matches 4 | #' @importFrom dplyr right_join n_distinct row_number case_when n if_else arrange rowwise anti_join inner_join bind_rows distinct mutate full_join left_join group_by summarise filter pull select group_split ungroup 5 | #' @importFrom tibble tibble tribble 6 | #' @importFrom stringr str_replace str_count str_remove_all str_remove str_detect str_subset str_length str_replace_na str_replace_all str_extract str_c str_trim str_split str_starts regex 7 | #' @importFrom rlang !! as_string expr sym prim_name 8 | #' @importFrom purrr keep discard reduce compact map map_chr map_lgl map_dfr map_int map2_lgl map2_chr pmap_chr 9 | #' @importFrom stats var na.omit 10 | #' @importFrom readxl excel_sheets read_excel 11 | #' @importFrom tibble tibble as_tibble 12 | #' @importFrom readxl excel_sheets read_excel 13 | 14 | NULL 15 | 16 | globalVariables(c("private", 17 | "self", 18 | "keep", 19 | ".", 20 | ".data", 21 | "code", 22 | "code_id", 23 | "codes", 24 | "core", 25 | "dataset", 26 | "decode", 27 | "derivation_id", 28 | "dictionary", 29 | "id", 30 | "key_seq", 31 | "lab", 32 | "label", 33 | "matches", 34 | "spec_type_to_code_list", 35 | "type", 36 | "variable", 37 | "where", 38 | "where_new", 39 | "var1", 40 | "n_lab")) 41 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, include = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "man/figures/README-" 12 | ) 13 | ``` 14 | 15 | # metacore 16 | 17 | 18 | [](https://RValidationHub.slack.com) 19 | [![R build status](https://github.com/atorus-research/metacore/workflows/R-CMD-check/badge.svg)](https://github.com/atorus-research/xportr/actions?workflow=R-CMD-check) 20 | [](https://app.codecov.io/gh/atorus-research/metacore) 21 | [](https://github.com/atorus-research/metacore/blob/master/LICENSE) 22 | [![Lifecycle: experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental-1) 23 | [![CRAN status](https://www.r-pkg.org/badges/version/metacore)](https://CRAN.R-project.org/package=metacore) 24 | 25 | 26 | Programming for clinical trial data analysis tends to be very standardized. With data standards such as [CDISC](https://www.cdisc.org/), expectations tend to be very clearly defined. Within these programming activities, there's ample room for the use of metadata. Metadata can be used for several different purposes, such as applying dataset attributes, establishing sort sequences, working with controlled terminology, and more. Despite CDISC standards, organizations tend to have their own means of storing metadata, be it in excel spreadsheets, databases, and more. 27 | 28 | The purpose of metacore is to establish a common foundation for the use of metadata within an R session. This is done by creating an R object that can hold the necessary data in a standardized, immutable structure (using R6) that makes it easy to extract out necessary information when needed. Users can read in their metadata from their various sources. To make this easy, we've provided some helper functions - and even have readers that can read directly from Define.xml 2.0. By establishing a common and consistent object in memory, further packages that support these work flows can have a common foundation upon which tools can be built that leverage metadata in the future. This reduces the need to hold different data structures containing metadata and instead allows programs to pull this information from a centralized source. 29 | 30 | ## Installation 31 | 32 | You can install the current development version of metacore from [github](https://github.com/atorus-research/metacore) with: 33 | 34 | ``` r 35 | devtools::install_github("atorus-research/metacore") 36 | ``` 37 | 38 | ## Structure 39 | 40 | A metacore object is made-up of 6 different tables, which are connected with a series of identifying columns. The goal of these tables is to normalize the information as much as possible, while keeping together like information. Each table has a basic theme to make them easier to remember. They are as follows: 41 | 42 | - **ds_spec**: Contains dataset level information 43 | 44 | - **ds_vars**: Bridges the dataset and variable level information 45 | 46 | - **var_spec**: Contains variable level information 47 | 48 | - **value_spec**: Contains value level information 49 | 50 | - **derivations**: Contains all derivations 51 | 52 | - **codelist**: Contains information about code/decodes, permitted values and external libraries 53 | 54 | - **supp**: Contains information specific to supplemental variables 55 | 56 | Here is a schema of how all this fits together: 57 | 58 | ![](man/figures/schema-colors.png "man/figures/Metacore Schema") 59 | 60 | ### ds_spec 61 | 62 | This table covers the basic information about each dataset. There is only a single row per dataset, with the following information: 63 | 64 | - *dataset*: The abbreviated name of the dataset (e.g. AE) 65 | 66 | - *structure*: Value structure of the dataset as a sting 67 | 68 | - *label*: Dataset label 69 | 70 | ### ds_vars 71 | 72 | This table contains the information that bridges between purely dataset level and purely variable level. There is one row per dataset per variable: 73 | 74 | - *dataset*: The abbreviated name of the dataset. This will match to the name in **ds_spec** 75 | 76 | - *variable*: Variable name 77 | 78 | - *key_seq*: Sequence key, which are the variables used to order a dataset. This is a column of integers, where 1 is the first sorting variable and 2 is the second etc. If the variable is not used in sorting it will be left `NA` 79 | 80 | - *order*: Order sets the order of the columns to appear in the dataset. This is also a numeric value 81 | 82 | - *keep*: Logical value about if the variable needs to be kept 83 | 84 | - *core*: ADaM core, which should be one of the following values: "Expected", "Required", "Permissible", "Conditionally Required", "Conditionally Expected", or NA. For more information about core see [CDISC](https://www.cdisc.org/standards/foundational/adam) 85 | 86 | - *supp_flag*: Logical to determine if the variable is in the supplemental datasets 87 | 88 | ### var_spec 89 | 90 | This table contains the purely variable level information. The goal is there is a single row per variable, which is common across all datasets. This helps ensure variables follow the CDISC standard. But, this isn't always possible, so if information for a given variable differs across datasets, the variable will be recorded as dataset.variable in the variable column. 91 | 92 | - *variable*: Variable name, which should match the name in **ds_spec**. Unless the variable needs to be duplicated, then the name will be a combination of the the dataset name and variable name from **ds_spec** (dataset.variable) 93 | 94 | - *type*: Variable class 95 | 96 | - *length*: Variable length (while not relevant to R datasets, this is important for when creating XPT files) 97 | 98 | - *label*: Variable label 99 | 100 | - *common*: Common across ADaM datasets 101 | 102 | - *format*: Variable format 103 | 104 | ### value_spec 105 | 106 | This table contains the information the information at the value level. There will be at least one row per dataset/variable combination. There is more than one row per dataset/variable combination if the combination has values which have differing metadata. For instance LBORRES that are different data types depending on the value. The information contained are as follows: 107 | 108 | - *dataset*: The abbreviated name of the dataset. This will match to the name in **ds_spec** 109 | 110 | - *variable*: Variable name. This will match to the name in **ds_vars** 111 | 112 | - *type*: String of the value type 113 | 114 | - *origin*: Origin of the value 115 | 116 | - *sig_dig*: Significant digits of the value 117 | 118 | - *code_id*: ID for the code list to match the id in the **codelist** table 119 | 120 | - *where*: Value of the variable 121 | 122 | - *derivation_id*: ID for the derivation to match with the **derivation** table 123 | 124 | ### derivations 125 | 126 | This table has all the derivation information, with one row per derivation ID and the following information: 127 | 128 | - *derivation_id*: The ID, which should match to **value_spec** 129 | 130 | - *derivation*: Text describing the derivation 131 | 132 | ### codelist 133 | 134 | This table contains the code lists, permitted value lists, and external libraries nested within a tibble. There is only a single row per list/library, with the following information: 135 | 136 | - *code_id*: the ID used to identify the code list. This should be the same as the *code_id* in **val_spec** 137 | 138 | - *name*: Name of the code list 139 | 140 | - *code*: A list of tibbles (for code / decode combinations) and vectors (for permitted values and libraries), which contain all the codes 141 | 142 | - *type*: An indicator of if the information in the code column is a code/decode table, permitted value, or external library 143 | 144 | 145 | 146 | ### supp 147 | 148 | This table contains the information needed to create supplemental tables. If you want to add a variable which will go into a supplemental qualifier then you can create as normal (i.e. label information going to the **var_spec** table and derivation and origin going into the **value_spec** table), but you need to flag it as supplemental in the **ds_vars** table and add a row to the **supp** table. There is only a single row per dataset/variable, with the following information: 149 | 150 | - *dataset*: The abbreviated name of the dataset. This will match to the name in **ds_spec** 151 | 152 | - *variable*: Variable name. This will match to the name in **ds_spec** 153 | 154 | - *idvar*: ID variable used for the supplemental variable. Can be left missing if not needed 155 | 156 | - *qeval*: Evaluator for the supplemental variable 157 | 158 | To get more information about the metacore objects and how to build a specification reader, please see our vignettes. 159 | 160 | 161 | ![](man/figures/labeled_schema.png "man/figures/Metacore Schema") 162 | 163 | ## Future Development 164 | 165 | This is an alpha release of this package, so if you have ideas on future improvements please add them to the issue log. 166 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # metacore 5 | 6 | 7 | 8 | [](https://RValidationHub.slack.com) 9 | [![R build 10 | status](https://github.com/atorus-research/metacore/workflows/R-CMD-check/badge.svg)](https://github.com/atorus-research/xportr/actions?workflow=R-CMD-check) 11 | [](https://app.codecov.io/gh/atorus-research/metacore) 12 | [](https://github.com/atorus-research/metacore/blob/master/LICENSE) 13 | [![Lifecycle: 14 | experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental-1) 15 | [![CRAN 16 | status](https://www.r-pkg.org/badges/version/metacore)](https://CRAN.R-project.org/package=metacore) 17 | 18 | 19 | Programming for clinical trial data analysis tends to be very 20 | standardized. With data standards such as 21 | [CDISC](https://www.cdisc.org/), expectations tend to be very clearly 22 | defined. Within these programming activities, there’s ample room for the 23 | use of metadata. Metadata can be used for several different purposes, 24 | such as applying dataset attributes, establishing sort sequences, 25 | working with controlled terminology, and more. Despite CDISC standards, 26 | organizations tend to have their own means of storing metadata, be it in 27 | excel spreadsheets, databases, and more. 28 | 29 | The purpose of metacore is to establish a common foundation for the use 30 | of metadata within an R session. This is done by creating an R object 31 | that can hold the necessary data in a standardized, immutable structure 32 | (using R6) that makes it easy to extract out necessary information when 33 | needed. Users can read in their metadata from their various sources. To 34 | make this easy, we’ve provided some helper functions - and even have 35 | readers that can read directly from Define.xml 2.0. By establishing a 36 | common and consistent object in memory, further packages that support 37 | these work flows can have a common foundation upon which tools can be 38 | built that leverage metadata in the future. This reduces the need to 39 | hold different data structures containing metadata and instead allows 40 | programs to pull this information from a centralized source. 41 | 42 | ## Installation 43 | 44 | You can install the current development version of metacore from 45 | [github](https://github.com/atorus-research/metacore) with: 46 | 47 | ``` r 48 | devtools::install_github("atorus-research/metacore") 49 | ``` 50 | 51 | ## Structure 52 | 53 | A metacore object is made-up of 6 different tables, which are connected 54 | with a series of identifying columns. The goal of these tables is to 55 | normalize the information as much as possible, while keeping together 56 | like information. Each table has a basic theme to make them easier to 57 | remember. They are as follows: 58 | 59 | - **ds_spec**: Contains dataset level information 60 | 61 | - **ds_vars**: Bridges the dataset and variable level information 62 | 63 | - **var_spec**: Contains variable level information 64 | 65 | - **value_spec**: Contains value level information 66 | 67 | - **derivations**: Contains all derivations 68 | 69 | - **codelist**: Contains information about code/decodes, permitted 70 | values and external libraries 71 | 72 | - **supp**: Contains information specific to supplemental variables 73 | 74 | Here is a schema of how all this fits together: 75 | 76 | ![](man/figures/schema-colors.png "man/figures/Metacore Schema") 77 | 78 | ### ds_spec 79 | 80 | This table covers the basic information about each dataset. There is 81 | only a single row per dataset, with the following information: 82 | 83 | - *dataset*: The abbreviated name of the dataset (e.g. AE) 84 | 85 | - *structure*: Value structure of the dataset as a sting 86 | 87 | - *label*: Dataset label 88 | 89 | ### ds_vars 90 | 91 | This table contains the information that bridges between purely dataset 92 | level and purely variable level. There is one row per dataset per 93 | variable: 94 | 95 | - *dataset*: The abbreviated name of the dataset. This will match to the 96 | name in **ds_spec** 97 | 98 | - *variable*: Variable name 99 | 100 | - *key_seq*: Sequence key, which are the variables used to order a 101 | dataset. This is a column of integers, where 1 is the first sorting 102 | variable and 2 is the second etc. If the variable is not used in 103 | sorting it will be left `NA` 104 | 105 | - *order*: Order sets the order of the columns to appear in the dataset. 106 | This is also a numeric value 107 | 108 | - *keep*: Logical value about if the variable needs to be kept 109 | 110 | - *core*: ADaM core, which should be one of the following values: 111 | “Expected”, “Required”, “Permissible”, “Conditionally Required”, 112 | “Conditionally Expected”, or NA. For more information about core see 113 | [CDISC](https://www.cdisc.org/standards/foundational/adam) 114 | 115 | - *supp_flag*: Logical to determine if the variable is in the 116 | supplemental datasets 117 | 118 | ### var_spec 119 | 120 | This table contains the purely variable level information. The goal is 121 | there is a single row per variable, which is common across all datasets. 122 | This helps ensure variables follow the CDISC standard. But, this isn’t 123 | always possible, so if information for a given variable differs across 124 | datasets, the variable will be recorded as dataset.variable in the 125 | variable column. 126 | 127 | - *variable*: Variable name, which should match the name in **ds_spec**. 128 | Unless the variable needs to be duplicated, then the name will be a 129 | combination of the the dataset name and variable name from **ds_spec** 130 | (dataset.variable) 131 | 132 | - *type*: Variable class 133 | 134 | - *length*: Variable length (while not relevant to R datasets, this is 135 | important for when creating XPT files) 136 | 137 | - *label*: Variable label 138 | 139 | - *common*: Common across ADaM datasets 140 | 141 | - *format*: Variable format 142 | 143 | ### value_spec 144 | 145 | This table contains the information the information at the value level. 146 | There will be at least one row per dataset/variable combination. There 147 | is more than one row per dataset/variable combination if the combination 148 | has values which have differing metadata. For instance LBORRES that are 149 | different data types depending on the value. The information contained 150 | are as follows: 151 | 152 | - *dataset*: The abbreviated name of the dataset. This will match to the 153 | name in **ds_spec** 154 | 155 | - *variable*: Variable name. This will match to the name in **ds_vars** 156 | 157 | - *type*: String of the value type 158 | 159 | - *origin*: Origin of the value 160 | 161 | - *sig_dig*: Significant digits of the value 162 | 163 | - *code_id*: ID for the code list to match the id in the **codelist** 164 | table 165 | 166 | - *where*: Value of the variable 167 | 168 | - *derivation_id*: ID for the derivation to match with the 169 | **derivation** table 170 | 171 | ### derivations 172 | 173 | This table has all the derivation information, with one row per 174 | derivation ID and the following information: 175 | 176 | - *derivation_id*: The ID, which should match to **value_spec** 177 | 178 | - *derivation*: Text describing the derivation 179 | 180 | ### codelist 181 | 182 | This table contains the code lists, permitted value lists, and external 183 | libraries nested within a tibble. There is only a single row per 184 | list/library, with the following information: 185 | 186 | - *code_id*: the ID used to identify the code list. This should be the 187 | same as the *code_id* in **val_spec** 188 | 189 | - *name*: Name of the code list 190 | 191 | - *code*: A list of tibbles (for code / decode combinations) and vectors 192 | (for permitted values and libraries), which contain all the codes 193 | 194 | - *type*: An indicator of if the information in the code column is a 195 | code/decode table, permitted value, or external library 196 | 197 | ### supp 198 | 199 | This table contains the information needed to create supplemental 200 | tables. If you want to add a variable which will go into a supplemental 201 | qualifier then you can create as normal (i.e. label information going to 202 | the **var_spec** table and derivation and origin going into the 203 | **value_spec** table), but you need to flag it as supplemental in the 204 | **ds_vars** table and add a row to the **supp** table. There is only a 205 | single row per dataset/variable, with the following information: 206 | 207 | - *dataset*: The abbreviated name of the dataset. This will match to the 208 | name in **ds_spec** 209 | 210 | - *variable*: Variable name. This will match to the name in **ds_spec** 211 | 212 | - *idvar*: ID variable used for the supplemental variable. Can be left 213 | missing if not needed 214 | 215 | - *qeval*: Evaluator for the supplemental variable 216 | 217 | To get more information about the metacore objects and how to build a 218 | specification reader, please see our vignettes. 219 | 220 | ![](man/figures/labeled_schema.png "man/figures/Metacore Schema") 221 | 222 | ## Future Development 223 | 224 | This is an alpha release of this package, so if you have ideas on future 225 | improvements please add them to the issue log. 226 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | destination: docs 2 | 3 | template: 4 | params: 5 | bootswatch: yeti 6 | opengraph: 7 | image: 8 | src: man/figures/metacore.PNG 9 | alt: "metacore Hex Sticker" 10 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## R CMD check results 2 | There were no ERRORs or WARNINGs. 3 | 4 | ## Downstream dependencies 5 | I have checked with the dependencies for this package 6 | -------------------------------------------------------------------------------- /inst/extdata/SDTM_spec_CDISC_pilot.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atorus-research/metacore/4b251e077374152bfa7032cbcbb50cd2912aec1c/inst/extdata/SDTM_spec_CDISC_pilot.xlsx -------------------------------------------------------------------------------- /inst/extdata/mock_spec.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atorus-research/metacore/4b251e077374152bfa7032cbcbb50cd2912aec1c/inst/extdata/mock_spec.xlsx -------------------------------------------------------------------------------- /inst/extdata/p21_mock.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atorus-research/metacore/4b251e077374152bfa7032cbcbb50cd2912aec1c/inst/extdata/p21_mock.xlsx -------------------------------------------------------------------------------- /inst/extdata/pilot_ADaM.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atorus-research/metacore/4b251e077374152bfa7032cbcbb50cd2912aec1c/inst/extdata/pilot_ADaM.rda -------------------------------------------------------------------------------- /inst/extdata/pilot_SDTM.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atorus-research/metacore/4b251e077374152bfa7032cbcbb50cd2912aec1c/inst/extdata/pilot_SDTM.rda -------------------------------------------------------------------------------- /man/MetaCore_filter.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/metacore.R 3 | \name{MetaCore_filter} 4 | \alias{MetaCore_filter} 5 | \title{Select method to subset by a single dataframe} 6 | \usage{ 7 | MetaCore_filter(value) 8 | } 9 | \arguments{ 10 | \item{value}{the dataframe to subset by} 11 | } 12 | \description{ 13 | Select method to subset by a single dataframe 14 | } 15 | -------------------------------------------------------------------------------- /man/check_columns.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/validators.R 3 | \name{check_columns} 4 | \alias{check_columns} 5 | \title{Check all data frames include the correct types of columns} 6 | \usage{ 7 | check_columns( 8 | ds_spec, 9 | ds_vars, 10 | var_spec, 11 | value_spec, 12 | derivations, 13 | codelist, 14 | supp 15 | ) 16 | } 17 | \arguments{ 18 | \item{ds_spec}{dataset specification} 19 | 20 | \item{ds_vars}{dataset variables} 21 | 22 | \item{var_spec}{variable specification} 23 | 24 | \item{value_spec}{value specification} 25 | 26 | \item{derivations}{derivation information} 27 | 28 | \item{codelist}{codelist information} 29 | 30 | \item{supp}{supp information} 31 | } 32 | \description{ 33 | This function checks for vector types and accepted words 34 | } 35 | -------------------------------------------------------------------------------- /man/check_structure.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{check_structure} 4 | \alias{check_structure} 5 | \title{Column Validation Function} 6 | \usage{ 7 | check_structure(.data, col, func, any_na_acceptable, nm) 8 | } 9 | \arguments{ 10 | \item{.data}{the dataframe to check the column for} 11 | 12 | \item{col}{the column to test} 13 | 14 | \item{func}{the function to use to assert column structure} 15 | 16 | \item{any_na_acceptable}{boolean, testing if the column can have missing} 17 | 18 | \item{nm}{name of column to check (for warning and error clarification)} 19 | } 20 | \description{ 21 | Column Validation Function 22 | } 23 | -------------------------------------------------------------------------------- /man/check_words.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{check_words} 4 | \alias{check_words} 5 | \title{Check Words in Column} 6 | \usage{ 7 | check_words(..., col) 8 | } 9 | \arguments{ 10 | \item{...}{permissible words in the column} 11 | 12 | \item{col}{the column to check for specific words} 13 | } 14 | \description{ 15 | Check Words in Column 16 | } 17 | -------------------------------------------------------------------------------- /man/checks.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/checks.R 3 | \name{check_inconsistent_labels} 4 | \alias{check_inconsistent_labels} 5 | \alias{check_inconsistent_types} 6 | \alias{check_inconsistent_formats} 7 | \title{Optional checks to consistency of metadata} 8 | \usage{ 9 | check_inconsistent_labels(metacore) 10 | 11 | check_inconsistent_types(metacore) 12 | 13 | check_inconsistent_formats(metacore) 14 | } 15 | \arguments{ 16 | \item{metacore}{metacore object to check} 17 | } 18 | \value{ 19 | If all variables are consistent it will return a message. If there 20 | are inconsistencies it will return a message and a dataset of the variables 21 | with inconsistencies. 22 | } 23 | \description{ 24 | These functions check to see if values (e.g labels, formats) 25 | that should be consistent for a variable across all data are actually 26 | consistent. 27 | } 28 | \examples{ 29 | ## EXAMPLE WITH DUPLICATES 30 | # Loads in a metacore obj called metacore 31 | load(metacore_example("pilot_ADaM.rda")) 32 | check_inconsistent_labels(metacore) 33 | 34 | check_inconsistent_types(metacore) 35 | 36 | ## EXAMPLE WITHOUT DUPLICATES 37 | # Loads in a metacore obj called metacore 38 | load(metacore_example("pilot_SDTM.rda")) 39 | check_inconsistent_labels(metacore) 40 | 41 | check_inconsistent_formats(metacore) 42 | 43 | check_inconsistent_types(metacore) 44 | } 45 | -------------------------------------------------------------------------------- /man/create_tbl.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/spec_builder.R 3 | \name{create_tbl} 4 | \alias{create_tbl} 5 | \title{Create table} 6 | \usage{ 7 | create_tbl(doc, cols) 8 | } 9 | \arguments{ 10 | \item{doc}{list of sheets from a excel doc} 11 | 12 | \item{cols}{vector of regex to get a datasets base on which columns it has. 13 | If the vector is named it will also rename the columns} 14 | } 15 | \value{ 16 | dataset (or list of datasets if not specific enough) 17 | } 18 | \description{ 19 | This function creates a table from excel sheets. This is mainly used 20 | internally for building spec readers, but is exported so others who need to 21 | build spec readers can use it. 22 | } 23 | -------------------------------------------------------------------------------- /man/define_to_metacore.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/xml_builders.R 3 | \name{define_to_metacore} 4 | \alias{define_to_metacore} 5 | \title{Define XML to DataDef Object} 6 | \usage{ 7 | define_to_metacore(path, quiet = FALSE) 8 | } 9 | \arguments{ 10 | \item{path}{location of the define xml as a string} 11 | 12 | \item{quiet}{Option to quietly load in, this will suppress warnings, but not errors} 13 | } 14 | \value{ 15 | DataDef Object 16 | } 17 | \description{ 18 | Given a path, this function converts the define xml to a DataDef Object 19 | } 20 | -------------------------------------------------------------------------------- /man/figures/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atorus-research/metacore/4b251e077374152bfa7032cbcbb50cd2912aec1c/man/figures/.DS_Store -------------------------------------------------------------------------------- /man/figures/labeled-code_list.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atorus-research/metacore/4b251e077374152bfa7032cbcbb50cd2912aec1c/man/figures/labeled-code_list.png -------------------------------------------------------------------------------- /man/figures/labeled-derivation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atorus-research/metacore/4b251e077374152bfa7032cbcbb50cd2912aec1c/man/figures/labeled-derivation.png -------------------------------------------------------------------------------- /man/figures/labeled-ds_spec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atorus-research/metacore/4b251e077374152bfa7032cbcbb50cd2912aec1c/man/figures/labeled-ds_spec.png -------------------------------------------------------------------------------- /man/figures/labeled-ds_vars.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atorus-research/metacore/4b251e077374152bfa7032cbcbb50cd2912aec1c/man/figures/labeled-ds_vars.png -------------------------------------------------------------------------------- /man/figures/labeled-supp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atorus-research/metacore/4b251e077374152bfa7032cbcbb50cd2912aec1c/man/figures/labeled-supp.png -------------------------------------------------------------------------------- /man/figures/labeled-value_spec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atorus-research/metacore/4b251e077374152bfa7032cbcbb50cd2912aec1c/man/figures/labeled-value_spec.png -------------------------------------------------------------------------------- /man/figures/labeled-var_spec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atorus-research/metacore/4b251e077374152bfa7032cbcbb50cd2912aec1c/man/figures/labeled-var_spec.png -------------------------------------------------------------------------------- /man/figures/labeled_schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atorus-research/metacore/4b251e077374152bfa7032cbcbb50cd2912aec1c/man/figures/labeled_schema.png -------------------------------------------------------------------------------- /man/figures/metacore.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atorus-research/metacore/4b251e077374152bfa7032cbcbb50cd2912aec1c/man/figures/metacore.PNG -------------------------------------------------------------------------------- /man/figures/schema-colors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atorus-research/metacore/4b251e077374152bfa7032cbcbb50cd2912aec1c/man/figures/schema-colors.png -------------------------------------------------------------------------------- /man/get_control_term.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/metacore.R 3 | \name{get_control_term} 4 | \alias{get_control_term} 5 | \title{Get Control Term} 6 | \usage{ 7 | get_control_term(metacode, variable, dataset = NULL) 8 | } 9 | \arguments{ 10 | \item{metacode}{metacore object} 11 | 12 | \item{variable}{A variable name to get the controlled terms for. This can 13 | either be a string or just the name of the variable} 14 | 15 | \item{dataset}{A dataset name. This is not required if there is only one set 16 | of control terminology across all datasets} 17 | } 18 | \value{ 19 | a vector for permitted values and a 2-column tibble for codelists 20 | } 21 | \description{ 22 | Returns the control term (a vector for permitted values and a tibble for code 23 | lists) for a given variable. The dataset can be optionally specified if there 24 | is different control terminology for different datasets 25 | } 26 | \examples{ 27 | \dontrun{ 28 | meta_ex <- spec_to_metacore(metacore_example("p21_mock.xlsx")) 29 | get_control_term(meta_ex, QVAL, SUPPAE) 30 | get_control_term(meta_ex, "QVAL", "SUPPAE") 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /man/get_keys.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/metacore.R 3 | \name{get_keys} 4 | \alias{get_keys} 5 | \title{Get Dataset Keys} 6 | \usage{ 7 | get_keys(metacode, dataset) 8 | } 9 | \arguments{ 10 | \item{metacode}{metacore object} 11 | 12 | \item{dataset}{A dataset name} 13 | } 14 | \value{ 15 | a 2-column tibble with dataset key variables and key sequence 16 | } 17 | \description{ 18 | Returns the dataset keys for a given dataset 19 | } 20 | \examples{ 21 | \dontrun{ 22 | meta_ex <- spec_to_metacore(metacore_example("p21_mock.xlsx")) 23 | get_keys(meta_ex, "AE") 24 | get_keys(meta_ex, AE) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /man/is_metacore.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/validators.R 3 | \name{is_metacore} 4 | \alias{is_metacore} 5 | \title{Is metacore object} 6 | \usage{ 7 | is_metacore(x) 8 | } 9 | \arguments{ 10 | \item{x}{object to check} 11 | } 12 | \value{ 13 | \code{TRUE} if metacore, \code{FALSE} if not 14 | } 15 | \description{ 16 | Is metacore object 17 | } 18 | \examples{ 19 | # Loads in a metacore obj called metacore 20 | load(metacore_example("pilot_ADaM.rda")) 21 | is_metacore(metacore) 22 | 23 | } 24 | -------------------------------------------------------------------------------- /man/load_metacore.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/metacore.R 3 | \name{load_metacore} 4 | \alias{load_metacore} 5 | \title{load metacore object} 6 | \usage{ 7 | load_metacore(path = NULL) 8 | } 9 | \arguments{ 10 | \item{path}{location of the metacore object to load into memory} 11 | } 12 | \value{ 13 | metacore object in memory 14 | } 15 | \description{ 16 | load metacore object 17 | } 18 | -------------------------------------------------------------------------------- /man/metacore.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/metacore.R 3 | \name{metacore} 4 | \alias{metacore} 5 | \title{R6 Class wrapper to create your own metacore object} 6 | \usage{ 7 | metacore( 8 | ds_spec = tibble(dataset = character(), structure = character(), label = character()), 9 | ds_vars = tibble(dataset = character(), variable = character(), keep = logical(), 10 | key_seq = integer(), order = integer(), core = character(), supp_flag = logical()), 11 | var_spec = tibble(variable = character(), label = character(), length = integer(), type 12 | = character(), common = character(), format = character()), 13 | value_spec = tibble(dataset = character(), variable = character(), where = character(), 14 | type = character(), sig_dig = integer(), code_id = character(), origin = character(), 15 | derivation_id = integer()), 16 | derivations = tibble(derivation_id = integer(), derivation = character()), 17 | codelist = tibble(code_id = character(), name = character(), type = character(), codes 18 | = list()), 19 | supp = tibble(dataset = character(), variable = character(), idvar = character(), qeval 20 | = character()) 21 | ) 22 | } 23 | \arguments{ 24 | \item{ds_spec}{contains each dataset in the study, with the labels for each} 25 | 26 | \item{ds_vars}{information on what variables are in each dataset + plus dataset specific variable information} 27 | 28 | \item{var_spec}{variable information that is shared across all datasets} 29 | 30 | \item{value_spec}{parameter specific information, as data is long the specs for wbc might be difference the hgb} 31 | 32 | \item{derivations}{contains derivation, it allows for different variables to have the same derivation} 33 | 34 | \item{codelist}{contains the code/decode information} 35 | 36 | \item{supp}{contains the idvar and qeval information for supplemental variables} 37 | } 38 | \description{ 39 | R6 Class wrapper to create your own metacore object 40 | } 41 | \concept{Metacore} 42 | -------------------------------------------------------------------------------- /man/metacore_example.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{metacore_example} 4 | \alias{metacore_example} 5 | \title{Get path to metacore example} 6 | \usage{ 7 | metacore_example(file = NULL) 8 | } 9 | \arguments{ 10 | \item{file}{Name of file. If \code{NULL}, the example files will be listed.} 11 | } 12 | \description{ 13 | metacore comes bundled with a number of sample files in its \code{inst/extdata} 14 | directory. This function make them easy to access. When testing or writing 15 | examples in other packages, it is best to use the 'pilot_ADaM.rda' example as 16 | it loads fastest. 17 | } 18 | \examples{ 19 | metacore_example() 20 | metacore_example("mock_spec.xlsx") 21 | } 22 | -------------------------------------------------------------------------------- /man/pipe.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils-pipe.R 3 | \name{\%>\%} 4 | \alias{\%>\%} 5 | \title{Pipe operator} 6 | \usage{ 7 | lhs \%>\% rhs 8 | } 9 | \description{ 10 | See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details. 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /man/read_all_sheets.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/spec_builder.R 3 | \name{read_all_sheets} 4 | \alias{read_all_sheets} 5 | \title{Read in all Sheets} 6 | \usage{ 7 | read_all_sheets(path) 8 | } 9 | \arguments{ 10 | \item{path}{string of the file path} 11 | } 12 | \value{ 13 | a list of datasets 14 | } 15 | \description{ 16 | Given a path to a file, this function reads in all sheets of an excel file 17 | } 18 | -------------------------------------------------------------------------------- /man/save_metacore.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/metacore.R 3 | \name{save_metacore} 4 | \alias{save_metacore} 5 | \title{save metacore object} 6 | \usage{ 7 | save_metacore(metacore_object, path = NULL) 8 | } 9 | \arguments{ 10 | \item{metacore_object}{the metacore object in memory to save to disc} 11 | 12 | \item{path}{file path and file name to save metacore object} 13 | } 14 | \value{ 15 | an .rda file 16 | } 17 | \description{ 18 | save metacore object 19 | } 20 | -------------------------------------------------------------------------------- /man/select_dataset.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/metacore.R 3 | \name{select_dataset} 4 | \alias{select_dataset} 5 | \title{Select metacore object to single dataset} 6 | \usage{ 7 | select_dataset(.data, dataset, simplify = FALSE) 8 | } 9 | \arguments{ 10 | \item{.data}{the metacore object of dataframes} 11 | 12 | \item{dataset}{the specific dataset to subset by} 13 | 14 | \item{simplify}{return a single dataframe} 15 | } 16 | \value{ 17 | a filtered subset of the metacore object 18 | } 19 | \description{ 20 | Select metacore object to single dataset 21 | } 22 | -------------------------------------------------------------------------------- /man/spec_to_metacore.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/spec_builder.R 3 | \name{spec_to_metacore} 4 | \alias{spec_to_metacore} 5 | \title{Specification document to metacore object} 6 | \usage{ 7 | spec_to_metacore(path, quiet = FALSE, where_sep_sheet = TRUE) 8 | } 9 | \arguments{ 10 | \item{path}{string of file location} 11 | 12 | \item{quiet}{Option to quietly load in, this will suppress warnings, but not 13 | errors} 14 | 15 | \item{where_sep_sheet}{Option to tell if the where is in a separate sheet, 16 | like in older p21 specs or in a single sheet like newer p21 specs} 17 | } 18 | \value{ 19 | given a spec document it returns a metacore object 20 | } 21 | \description{ 22 | This function takes the location of an excel specification document and reads 23 | it in as a meta core object. At the moment it only supports specification in 24 | the format of pinnacle 21 specifications. But, the section level spec builder can 25 | be used as building blocks for bespoke specification documents. 26 | } 27 | -------------------------------------------------------------------------------- /man/spec_type.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/spec_builder.R 3 | \name{spec_type} 4 | \alias{spec_type} 5 | \title{Check the type of spec document} 6 | \usage{ 7 | spec_type(path) 8 | } 9 | \arguments{ 10 | \item{path}{file location as a string} 11 | } 12 | \value{ 13 | returns string indicating the type of spec document 14 | } 15 | \description{ 16 | Check the type of spec document 17 | } 18 | -------------------------------------------------------------------------------- /man/spec_type_to_codelist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/spec_builder.R 3 | \name{spec_type_to_codelist} 4 | \alias{spec_type_to_codelist} 5 | \title{Spec to codelist} 6 | \usage{ 7 | spec_type_to_codelist( 8 | doc, 9 | codelist_cols = c(code_id = "ID", name = "[N|n]ame", code = "^[C|c]ode|^[T|t]erm", 10 | decode = "[D|d]ecode"), 11 | permitted_val_cols = NULL, 12 | dict_cols = c(code_id = "ID", name = "[N|n]ame", dictionary = "[D|d]ictionary", version 13 | = "[V|v]ersion"), 14 | sheets = NULL, 15 | simplify = FALSE 16 | ) 17 | } 18 | \arguments{ 19 | \item{doc}{Named list of datasets @seealso \code{\link[=read_all_sheets]{read_all_sheets()}} for exact 20 | format} 21 | 22 | \item{codelist_cols}{Named vector of column names that make up the codelist. 23 | The column names can be regular expressions for more flexibility. But, the 24 | names must follow the given pattern} 25 | 26 | \item{permitted_val_cols}{Named vector of column names that make up the 27 | permitted value The column names can be regular expressions for more 28 | flexibility. This is optional, can be left as null if there isn't a 29 | permitted value sheet} 30 | 31 | \item{dict_cols}{Named vector of column names that make up the dictionary 32 | value The column names can be regular expressions for more flexibility. 33 | This is optional, can be left as null if there isn't a permitted value 34 | sheet} 35 | 36 | \item{sheets}{Optional, regular expressions of the sheets} 37 | 38 | \item{simplify}{Boolean value, if true will convert code/decode pairs that 39 | are all equal to a permitted value list. True by default} 40 | } 41 | \value{ 42 | a dataset formatted for the metacore object 43 | } 44 | \description{ 45 | Creates the value_spec from a list of datasets (optionally filtered by the 46 | sheet input). The named vector \verb{*_cols} is used to determine which is the 47 | correct sheet and renames the columns. 48 | } 49 | \seealso{ 50 | Other spec builders: 51 | \code{\link{spec_type_to_derivations}()}, 52 | \code{\link{spec_type_to_ds_spec}()}, 53 | \code{\link{spec_type_to_ds_vars}()}, 54 | \code{\link{spec_type_to_value_spec}()}, 55 | \code{\link{spec_type_to_var_spec}()} 56 | } 57 | \concept{spec builders} 58 | -------------------------------------------------------------------------------- /man/spec_type_to_derivations.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/spec_builder.R 3 | \name{spec_type_to_derivations} 4 | \alias{spec_type_to_derivations} 5 | \title{Spec to derivation} 6 | \usage{ 7 | spec_type_to_derivations( 8 | doc, 9 | cols = c(derivation_id = "ID", derivation = "[D|d]efinition|[D|d]escription"), 10 | sheet = "Method|Derivations?", 11 | var_cols = c(dataset = "[D|d]ataset|[D|d]omain", variable = "[N|n]ame|[V|v]ariables?", 12 | origin = "[O|o]rigin", predecessor = "[P|p]redecessor", comment = "[C|c]omment") 13 | ) 14 | } 15 | \arguments{ 16 | \item{doc}{Named list of datasets @seealso \code{\link[=read_all_sheets]{read_all_sheets()}} for exact 17 | format} 18 | 19 | \item{cols}{Named vector of column names. The column names can be regular 20 | expressions for more flexibility. But, the names must follow the given 21 | pattern} 22 | 23 | \item{sheet}{Regular expression for the sheet name} 24 | 25 | \item{var_cols}{Named vector of the name(s) of the origin, predecessor and 26 | comment columns. These do not have to be on the specified sheet.} 27 | } 28 | \value{ 29 | a dataset formatted for the metacore object 30 | } 31 | \description{ 32 | Creates the derivation table from a list of datasets (optionally filtered by 33 | the sheet input). The named vector \code{cols} is used to determine which is the 34 | correct sheet and renames the columns. The derivation will be used for 35 | "derived" origins, the comments for "assigned" origins, and predecessor for 36 | "predecessor" origins. 37 | } 38 | \seealso{ 39 | Other spec builders: 40 | \code{\link{spec_type_to_codelist}()}, 41 | \code{\link{spec_type_to_ds_spec}()}, 42 | \code{\link{spec_type_to_ds_vars}()}, 43 | \code{\link{spec_type_to_value_spec}()}, 44 | \code{\link{spec_type_to_var_spec}()} 45 | } 46 | \concept{spec builders} 47 | -------------------------------------------------------------------------------- /man/spec_type_to_ds_spec.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/spec_builder.R 3 | \name{spec_type_to_ds_spec} 4 | \alias{spec_type_to_ds_spec} 5 | \title{Spec to ds_spec} 6 | \usage{ 7 | spec_type_to_ds_spec( 8 | doc, 9 | cols = c(dataset = "[N|n]ame|[D|d]ataset|[D|d]omain", structure = "[S|s]tructure", 10 | label = "[L|l]abel|[D|d]escription"), 11 | sheet = NULL 12 | ) 13 | } 14 | \arguments{ 15 | \item{doc}{Named list of datasets @seealso \code{\link[=read_all_sheets]{read_all_sheets()}} for exact 16 | format} 17 | 18 | \item{cols}{Named vector of column names. The column names can be regular 19 | expressions for more flexibility. But, the names must follow the given pattern} 20 | 21 | \item{sheet}{Regular expression for the sheet name} 22 | } 23 | \value{ 24 | a dataset formatted for the metacore object 25 | } 26 | \description{ 27 | Creates the ds_spec from a list of datasets (optionally filtered by the sheet 28 | input). The named vector \code{cols} is used to determine which is the correct 29 | sheet and renames the columns 30 | } 31 | \seealso{ 32 | Other spec builders: 33 | \code{\link{spec_type_to_codelist}()}, 34 | \code{\link{spec_type_to_derivations}()}, 35 | \code{\link{spec_type_to_ds_vars}()}, 36 | \code{\link{spec_type_to_value_spec}()}, 37 | \code{\link{spec_type_to_var_spec}()} 38 | } 39 | \concept{spec builders} 40 | -------------------------------------------------------------------------------- /man/spec_type_to_ds_vars.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/spec_builder.R 3 | \name{spec_type_to_ds_vars} 4 | \alias{spec_type_to_ds_vars} 5 | \title{Spec to ds_vars} 6 | \usage{ 7 | spec_type_to_ds_vars( 8 | doc, 9 | cols = c(dataset = "[D|d]ataset|[D|d]omain", variable = 10 | "[V|v]ariable [[N|n]ame]?|[V|v]ariables?", order = 11 | "[V|v]ariable [O|o]rder|[O|o]rder", keep = "[K|k]eep|[M|m]andatory"), 12 | key_seq_sep_sheet = TRUE, 13 | key_seq_cols = c(dataset = "Dataset", key_seq = "Key Variables"), 14 | sheet = "[V|v]ar|Datasets" 15 | ) 16 | } 17 | \arguments{ 18 | \item{doc}{Named list of datasets @seealso \code{\link[=read_all_sheets]{read_all_sheets()}} for exact 19 | format} 20 | 21 | \item{cols}{Named vector of column names. The column names can be regular 22 | expressions for more flexibility. But, the names must follow the given 23 | pattern} 24 | 25 | \item{key_seq_sep_sheet}{A boolean to indicate if the key sequence is on a 26 | separate sheet. If set to false add the key_seq column name to the \code{cols} 27 | vector.} 28 | 29 | \item{key_seq_cols}{names vector to get the key_sequence for each dataset} 30 | 31 | \item{sheet}{Regular expression for the sheet names} 32 | } 33 | \value{ 34 | a dataset formatted for the metacore object 35 | } 36 | \description{ 37 | Creates the ds_vars from a list of datasets (optionally filtered by the sheet 38 | input). The named vector \code{cols} is used to determine which is the correct 39 | sheet and renames the columns 40 | } 41 | \seealso{ 42 | Other spec builders: 43 | \code{\link{spec_type_to_codelist}()}, 44 | \code{\link{spec_type_to_derivations}()}, 45 | \code{\link{spec_type_to_ds_spec}()}, 46 | \code{\link{spec_type_to_value_spec}()}, 47 | \code{\link{spec_type_to_var_spec}()} 48 | } 49 | \concept{spec builders} 50 | -------------------------------------------------------------------------------- /man/spec_type_to_value_spec.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/spec_builder.R 3 | \name{spec_type_to_value_spec} 4 | \alias{spec_type_to_value_spec} 5 | \title{Spec to value_spec} 6 | \usage{ 7 | spec_type_to_value_spec( 8 | doc, 9 | cols = c(dataset = "[D|d]ataset|[D|d]omain", variable = "[N|n]ame|[V|v]ariables?", 10 | origin = "[O|o]rigin", type = "[T|t]ype", code_id = "[C|c]odelist|Controlled Term", 11 | sig_dig = "[S|s]ignificant", where = "[W|w]here", derivation_id = "[M|m]ethod", 12 | predecessor = "[P|p]redecessor"), 13 | sheet = NULL, 14 | where_sep_sheet = TRUE, 15 | where_cols = c(id = "ID", where = c("Variable", "Comparator", "Value")), 16 | var_sheet = "[V|v]ar" 17 | ) 18 | } 19 | \arguments{ 20 | \item{doc}{Named list of datasets @seealso \code{\link[=read_all_sheets]{read_all_sheets()}} for exact 21 | format} 22 | 23 | \item{cols}{Named vector of column names. The column names can be regular 24 | expressions for more flexibility. But, the names must follow the given 25 | pattern} 26 | 27 | \item{sheet}{Regular expression for the sheet name} 28 | 29 | \item{where_sep_sheet}{Boolean value to control if the where information in a 30 | separate dataset. If the where information is on a separate sheet, set to 31 | true and provide the column information with the \code{where_cols} inputs.} 32 | 33 | \item{where_cols}{Named list with an id and where field. All columns in the 34 | where field will be collapsed together} 35 | 36 | \item{var_sheet}{Name of sheet with the Variable information on it. Metacore 37 | expects each variable will have a row in the value_spec. Because many 38 | specification only have information in the value tab this is added. If the 39 | information already exists in the value tab of your specification set to 40 | NULL} 41 | } 42 | \value{ 43 | a dataset formatted for the metacore object 44 | } 45 | \description{ 46 | Creates the value_spec from a list of datasets (optionally filtered by the 47 | sheet input). The named vector \code{cols} is used to determine which is the 48 | correct sheet and renames the columns 49 | } 50 | \seealso{ 51 | Other spec builders: 52 | \code{\link{spec_type_to_codelist}()}, 53 | \code{\link{spec_type_to_derivations}()}, 54 | \code{\link{spec_type_to_ds_spec}()}, 55 | \code{\link{spec_type_to_ds_vars}()}, 56 | \code{\link{spec_type_to_var_spec}()} 57 | } 58 | \concept{spec builders} 59 | -------------------------------------------------------------------------------- /man/spec_type_to_var_spec.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/spec_builder.R 3 | \name{spec_type_to_var_spec} 4 | \alias{spec_type_to_var_spec} 5 | \title{Spec to var_spec} 6 | \usage{ 7 | spec_type_to_var_spec( 8 | doc, 9 | cols = c(variable = "[N|n]ame|[V|v]ariables?", length = "[L|l]ength", label = 10 | "[L|l]abel", type = "[T|t]ype", dataset = "[D|d]ataset|[D|d]omain", format = 11 | "[F|f]ormat"), 12 | sheet = "[V|v]ar" 13 | ) 14 | } 15 | \arguments{ 16 | \item{doc}{Named list of datasets @seealso \code{\link[=read_all_sheets]{read_all_sheets()}} for exact 17 | format} 18 | 19 | \item{cols}{Named vector of column names. The column names can be regular 20 | expressions for more flexibility. But, the names must follow the given pattern} 21 | 22 | \item{sheet}{Regular expression for the sheet name} 23 | } 24 | \value{ 25 | a dataset formatted for the metacore object 26 | } 27 | \description{ 28 | Creates the var_spec from a list of datasets (optionally filtered by the sheet 29 | input). The named vector \code{cols} is used to determine which is the correct 30 | sheet and renames the columns. (Note: the keep column will be converted logical) 31 | } 32 | \seealso{ 33 | Other spec builders: 34 | \code{\link{spec_type_to_codelist}()}, 35 | \code{\link{spec_type_to_derivations}()}, 36 | \code{\link{spec_type_to_ds_spec}()}, 37 | \code{\link{spec_type_to_ds_vars}()}, 38 | \code{\link{spec_type_to_value_spec}()} 39 | } 40 | \concept{spec builders} 41 | -------------------------------------------------------------------------------- /man/xml_to_codelist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/xml_builders.R 3 | \name{xml_to_codelist} 4 | \alias{xml_to_codelist} 5 | \title{XML to code list} 6 | \usage{ 7 | xml_to_codelist(doc) 8 | } 9 | \arguments{ 10 | \item{doc}{xml document} 11 | } 12 | \value{ 13 | a tibble containing the code list and permitted value information 14 | } 15 | \description{ 16 | Reads in a define xml and creates a code_list table. The code_list table is a 17 | nested tibble where each row is a code list or permitted value list. The code 18 | column contains a vector of a tibble depending on if it is a permitted values 19 | or code list 20 | } 21 | \seealso{ 22 | Other xml builder: 23 | \code{\link{xml_to_derivations}()}, 24 | \code{\link{xml_to_ds_spec}()}, 25 | \code{\link{xml_to_ds_vars}()}, 26 | \code{\link{xml_to_value_spec}()}, 27 | \code{\link{xml_to_var_spec}()} 28 | } 29 | \concept{xml builder} 30 | -------------------------------------------------------------------------------- /man/xml_to_derivations.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/xml_builders.R 3 | \name{xml_to_derivations} 4 | \alias{xml_to_derivations} 5 | \title{XML to derivation table} 6 | \usage{ 7 | xml_to_derivations(doc) 8 | } 9 | \arguments{ 10 | \item{doc}{xml document} 11 | } 12 | \value{ 13 | dataframe with derivation id's and derivations 14 | } 15 | \description{ 16 | This reads in a xml document and gets all the derivations/comments. These can 17 | be cross referenced to variables using the derivation_id's 18 | } 19 | \seealso{ 20 | Other xml builder: 21 | \code{\link{xml_to_codelist}()}, 22 | \code{\link{xml_to_ds_spec}()}, 23 | \code{\link{xml_to_ds_vars}()}, 24 | \code{\link{xml_to_value_spec}()}, 25 | \code{\link{xml_to_var_spec}()} 26 | } 27 | \concept{xml builder} 28 | -------------------------------------------------------------------------------- /man/xml_to_ds_spec.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/xml_builders.R 3 | \name{xml_to_ds_spec} 4 | \alias{xml_to_ds_spec} 5 | \title{XML to Data Set Spec} 6 | \usage{ 7 | xml_to_ds_spec(doc) 8 | } 9 | \arguments{ 10 | \item{doc}{xml document} 11 | } 12 | \value{ 13 | data frame with the data set specifications 14 | } 15 | \description{ 16 | Creates a dataset specification, which has the domain name and label for each dataset 17 | } 18 | \seealso{ 19 | Other xml builder: 20 | \code{\link{xml_to_codelist}()}, 21 | \code{\link{xml_to_derivations}()}, 22 | \code{\link{xml_to_ds_vars}()}, 23 | \code{\link{xml_to_value_spec}()}, 24 | \code{\link{xml_to_var_spec}()} 25 | } 26 | \concept{xml builder} 27 | -------------------------------------------------------------------------------- /man/xml_to_ds_vars.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/xml_builders.R 3 | \name{xml_to_ds_vars} 4 | \alias{xml_to_ds_vars} 5 | \title{XML to Data Set Var table} 6 | \usage{ 7 | xml_to_ds_vars(doc) 8 | } 9 | \arguments{ 10 | \item{doc}{xml document} 11 | } 12 | \value{ 13 | data frame with the dataset and variables 14 | } 15 | \description{ 16 | Creates the ds_vars table, which acts as a key between the datasets and the var spec 17 | } 18 | \seealso{ 19 | Other xml builder: 20 | \code{\link{xml_to_codelist}()}, 21 | \code{\link{xml_to_derivations}()}, 22 | \code{\link{xml_to_ds_spec}()}, 23 | \code{\link{xml_to_value_spec}()}, 24 | \code{\link{xml_to_var_spec}()} 25 | } 26 | \concept{xml builder} 27 | -------------------------------------------------------------------------------- /man/xml_to_value_spec.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/xml_builders.R 3 | \name{xml_to_value_spec} 4 | \alias{xml_to_value_spec} 5 | \title{XML to value spec} 6 | \usage{ 7 | xml_to_value_spec(doc) 8 | } 9 | \arguments{ 10 | \item{doc}{xml document} 11 | } 12 | \value{ 13 | tibble with the value level information 14 | } 15 | \description{ 16 | Takes a define xml and pulls out the value level metadata including codelist_id's, 17 | defines_id's, and where clause. There is one row per variable expect when there 18 | is a where clause, at which point there is one row per value. 19 | } 20 | \seealso{ 21 | Other xml builder: 22 | \code{\link{xml_to_codelist}()}, 23 | \code{\link{xml_to_derivations}()}, 24 | \code{\link{xml_to_ds_spec}()}, 25 | \code{\link{xml_to_ds_vars}()}, 26 | \code{\link{xml_to_var_spec}()} 27 | } 28 | \concept{xml builder} 29 | -------------------------------------------------------------------------------- /man/xml_to_var_spec.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/xml_builders.R 3 | \name{xml_to_var_spec} 4 | \alias{xml_to_var_spec} 5 | \title{XML to variable spec} 6 | \usage{ 7 | xml_to_var_spec(doc) 8 | } 9 | \arguments{ 10 | \item{doc}{define xml document} 11 | } 12 | \value{ 13 | data frame with variable, length, label columns 14 | } 15 | \description{ 16 | Takes a define xml and returns a dataset with specifications for each 17 | variable. The variable will just be the variable, unless the specification 18 | for that variable differ between datasets 19 | } 20 | \seealso{ 21 | Other xml builder: 22 | \code{\link{xml_to_codelist}()}, 23 | \code{\link{xml_to_derivations}()}, 24 | \code{\link{xml_to_ds_spec}()}, 25 | \code{\link{xml_to_ds_vars}()}, 26 | \code{\link{xml_to_value_spec}()} 27 | } 28 | \concept{xml builder} 29 | -------------------------------------------------------------------------------- /metacore.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 3 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | LineEndingConversion: Posix 18 | 19 | BuildType: Package 20 | PackageUseDevtools: Yes 21 | PackageInstallArgs: --no-multiarch --with-keep.source 22 | PackageRoxygenize: rd,collate,namespace 23 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(metacore) 3 | 4 | test_check("metacore") 5 | -------------------------------------------------------------------------------- /tests/testthat/example_spec.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atorus-research/metacore/4b251e077374152bfa7032cbcbb50cd2912aec1c/tests/testthat/example_spec.xlsx -------------------------------------------------------------------------------- /tests/testthat/metacore-figure/unnamed-chunk-1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atorus-research/metacore/4b251e077374152bfa7032cbcbb50cd2912aec1c/tests/testthat/metacore-figure/unnamed-chunk-1-1.png -------------------------------------------------------------------------------- /tests/testthat/metacore-figure/unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atorus-research/metacore/4b251e077374152bfa7032cbcbb50cd2912aec1c/tests/testthat/metacore-figure/unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /tests/testthat/spec_no_val.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atorus-research/metacore/4b251e077374152bfa7032cbcbb50cd2912aec1c/tests/testthat/spec_no_val.xlsx -------------------------------------------------------------------------------- /tests/testthat/test-checks.R: -------------------------------------------------------------------------------- 1 | test_that("Test label Checks", { 2 | load(metacore_example("pilot_ADaM.rda")) 3 | man_label <- tibble::tribble( 4 | ~variable, ~label, ~n_vars, ~ls_of_vars, 5 | "ABLFL" , "ABLFL" , 1L, "ADADAS.ABLFL", 6 | "ABLFL" , "Analysis Baseline Flag" , 1L, "ADNPIX.ABLFL" , 7 | "ABLFL" , "Baseline Record Flag" , 6L, c("ADLBC.ABLFL", "ADLBCPV.ABLFL", "ADLBH.ABLFL", "ADLBHPV.ABLFL", "ADLBHY.ABLFL", "ADVS.ABLFL"), 8 | "ANL01FL", "Analysis Flag 01" , 5L, c("ADLBC.ANL01FL", "ADLBCPV.ANL01FL", "ADLBH.ANL01FL", "ADLBHPV.ANL01FL", "ADVS.ANL01FL"), 9 | "ANL01FL", "Analysis Record Flag 01" , 3L, c("ADADAS.ANL01FL", "ADCIBC.ANL01FL", "ADNPIX.ANL01FL"), 10 | "CHG" , "Baseline Value" , 1L, "ADADAS.CHG", 11 | "CHG" , "Change from Baseline" , 6L, c("ADLBC.CHG", "ADLBCPV.CHG", "ADLBH.CHG", "ADLBHPV.CHG", "ADNPIX.CHG", "ADVS.CHG"), 12 | "ITTFL" , "Intent-to-Treat Population Flag" , 3L, c("ADADAS.ITTFL", "ADCIBC.ITTFL", "ADNPIX.ITTFL"), 13 | "ITTFL" , "Intent-To-Treat Population Flag" , 1L, "ADSL.ITTFL" 14 | ) %>% 15 | dplyr::arrange(variable, label) 16 | label_df <- check_inconsistent_labels(metacore) %>% 17 | dplyr::arrange(variable, label) 18 | expect_equal(label_df, man_label) 19 | 20 | load(metacore_example("pilot_SDTM.rda")) 21 | expect_message(check_inconsistent_labels(metacore), 22 | "No mismatch labels detected") 23 | 24 | expect_error(check_inconsistent_labels("metacore"), 25 | "Expects a metacore object") 26 | }) 27 | test_that("Check formats and types", { 28 | load(metacore_example("pilot_ADaM.rda")) 29 | expect_message(check_inconsistent_formats(metacore), 30 | "No mismatch formats detected" 31 | ) 32 | man_types <- tibble::tribble( 33 | ~variable, ~type, ~n_vars, ~ls_of_vars, 34 | "AVAL" , "float" , 6L, c("ADLBC.AVAL", "ADLBCPV.AVAL", "ADLBH.AVAL", "ADLBHPV.AVAL", "ADLBHY.AVAL", "ADVS.AVAL"), 35 | "AVAL", "integer", 4L, c("ADADAS.AVAL", "ADCIBC.AVAL", "ADNPIX.AVAL", "ADTTE.AVAL"), 36 | "BASE", "float" , 6L, c("ADLBC.BASE", "ADLBCPV.BASE", "ADLBH.BASE","ADLBHPV.BASE", "ADLBHY.BASE","ADVS.BASE"), 37 | "BASE" , "integer", 2L, c("ADADAS.BASE", "ADNPIX.BASE"), 38 | "CHG", "float" , 5L, c("ADLBC.CHG", "ADLBCPV.CHG", "ADLBH.CHG", "ADLBHPV.CHG", "ADVS.CHG"), 39 | "CHG", "integer", 2L, c("ADADAS.CHG", "ADNPIX.CHG"), 40 | "PCHG", "float" , 1L, "ADVS.PCHG", 41 | "PCHG", "integer", 2L, c("ADADAS.PCHG", "ADNPIX.PCHG"), 42 | ) 43 | 44 | type_df <- check_inconsistent_types(metacore) 45 | 46 | expect_equal(type_df, man_types) 47 | }) 48 | 49 | -------------------------------------------------------------------------------- /tests/testthat/test-metacore.R: -------------------------------------------------------------------------------- 1 | # where should this function go 2 | empty_df <- function(nms, fill) { 3 | df <- as.data.frame(matrix(fill,1,length(nms))) 4 | names(df) <- nms 5 | return(df) 6 | } 7 | 8 | dfs <- purrr::map(col_vars(), ~ empty_df(.x, fill = "A")) %>% 9 | setNames(c("ds_spec", 10 | "ds_vars", 11 | "var_spec", 12 | "value_spec", 13 | "derivations", 14 | "codelist", 15 | "supp")) 16 | dfs$ds_vars <- dfs$ds_vars %>% 17 | mutate(supp_flag = FALSE) 18 | 19 | # function from the withr package 20 | with_dir <- function (new, code) { 21 | old <- setwd(dir = new) 22 | on.exit(setwd(old)) 23 | force(code) 24 | } 25 | 26 | 27 | test_that("readonly function factory", { 28 | a <- readonly("a") 29 | expect_equal(class(a), "function") 30 | expect_equal(attr(a, "name"), "a") 31 | }) 32 | 33 | test_that("metacore wrapper function works", { 34 | wrapper <- suppressWarnings(do.call(metacore, dfs[1:7])) 35 | 36 | r6 <- suppressWarnings( 37 | MetaCore$new(dfs$ds_spec, 38 | dfs$ds_vars, 39 | dfs$var_spec, 40 | dfs$value_spec, 41 | dfs$derivations, 42 | dfs$codelist, 43 | dfs$supp) 44 | ) 45 | 46 | expect_equal(wrapper, r6) 47 | 48 | expect_warning(define_to_metacore(metacore_example("ADaM_define.xml"))) 49 | expect_warning(spec_to_metacore(metacore_example("p21_mock.xlsx"))) 50 | }) 51 | 52 | 53 | test_that("Can pass metacore NULL df's", { 54 | wrapper <- suppressWarnings(metacore(dfs$ds_spec, NULL, dfs$var_spec, 55 | dfs$value_spec, dfs$derivations, dfs$codelist, dfs$supp)) 56 | dummy <- list(character(), character(), numeric(), numeric(), 57 | logical(), character(), logical()) 58 | names(dummy) <- c("dataset", "variable", "key_seq", "order", 59 | "keep", "core", "supp_flag") 60 | dummy <- as_tibble(dummy) 61 | #Because of the labels the dfs are slightly different so checking 62 | # the insides match 63 | expect_equal(names(wrapper$ds_vars), names(dummy)) 64 | expect_equal(map_chr(wrapper$ds_vars, mode), 65 | map_chr(dummy, mode)) 66 | }) 67 | 68 | test_that("subsetting works", { 69 | test <- spec_to_metacore(metacore_example("p21_mock.xlsx"), quiet = TRUE) 70 | subset <- test %>% select_dataset("DM") 71 | expect_equal(unique(subset$ds_spec$dataset), "DM") 72 | }) 73 | 74 | test_that("save_metacore creates .rds with no file path", { 75 | wrapper <- suppressWarnings(do.call(metacore, dfs[1:7])) 76 | my_temp_dir <- tempdir() 77 | with_dir(my_temp_dir, save_metacore(wrapper)) 78 | expect_true("wrapper.rds" %in% list.files(my_temp_dir)) 79 | unlink(my_temp_dir) 80 | }) 81 | 82 | test_that("save_metacore replaces file path", { 83 | wrapper <- suppressWarnings(do.call(metacore, dfs[1:7])) 84 | my_temp_dir <- tempdir() 85 | save_metacore(wrapper, file.path(my_temp_dir, "wrapper.csv")) 86 | expect_true("wrapper.rds" %in% list.files(my_temp_dir)) 87 | unlink(my_temp_dir) 88 | }) 89 | 90 | test_that("save_metacore uses file path", { 91 | wrapper <- suppressWarnings(do.call(metacore, dfs[1:7])) 92 | my_temp_dir <- tempdir() 93 | save_metacore(wrapper, file.path(my_temp_dir, "wrapper.rds")) 94 | expect_true("wrapper.rds" %in% list.files(my_temp_dir)) 95 | unlink(my_temp_dir) 96 | }) 97 | 98 | test_that("load_metacore loads .rds", { 99 | wrapper <- suppressWarnings(do.call(metacore, dfs[1:7])) 100 | my_temp_dir <- tempdir() 101 | save_metacore(wrapper, file.path(my_temp_dir, "wrapper.rds")) 102 | wrapper <- load_metacore(file.path(my_temp_dir, "wrapper.rds")) 103 | expect_equal(class(wrapper), c("Metacore", "R6")) 104 | unlink(my_temp_dir) 105 | }) 106 | 107 | test_that("load metacore fails with no path", { 108 | expect_error(load_metacore()) 109 | }) 110 | 111 | test_that("load metacore fails with no path and rdss in wd", { 112 | wrapper <- suppressWarnings(do.call(metacore, dfs[1:7])) 113 | my_temp_dir <- tempdir() 114 | save_metacore(wrapper, file.path(my_temp_dir, "wrapper.rds")) 115 | expect_error( 116 | with_dir(my_temp_dir, load_metacore()) 117 | ) 118 | unlink(my_temp_dir) 119 | }) 120 | 121 | test_that("pulling out control terminology works", { 122 | test <- spec_to_metacore(metacore_example("p21_mock.xlsx"), quiet = TRUE) 123 | #Testing Errors 124 | ## Not specific enough 125 | expect_error(get_control_term(test, QVAL)) 126 | ## Wrong Dataset name 127 | expect_error(get_control_term(test, QVAL, LB)) 128 | ## Wrong variable name 129 | expect_error(get_control_term(test, QVA)) 130 | expect_equal( 131 | get_control_term(test, QVAL, SUPPAE), 132 | tibble(code = c("N", "Y"), decode = c("No", "Yes")) 133 | ) 134 | expect_equal( 135 | get_control_term(test, "QVAL", "SUPPAE"), 136 | tibble(code = c("N", "Y"), decode = c("No", "Yes")) 137 | ) 138 | }) 139 | 140 | test_that("get_keys works", { 141 | test <- spec_to_metacore(metacore_example("p21_mock.xlsx"), quiet = TRUE) 142 | #Testing Errors 143 | ## Domain not in ds_vars table 144 | expect_error(get_keys(test, DS)) 145 | ## Missing dataset name 146 | expect_error(get_keys(test)) 147 | #Testing Correct Output 148 | expect_equal( 149 | get_keys(test, DM), 150 | tibble(variable = c("STUDYID", "USUBJID"), key_seq = c(1L, 2L)) %>% 151 | add_labs(variable = "Variable Name", 152 | key_seq = "Sequence Key") 153 | ) 154 | }) 155 | -------------------------------------------------------------------------------- /tests/testthat/test-utils.R: -------------------------------------------------------------------------------- 1 | test_df <- data.frame( 2 | a = c("blah", NA), 3 | b = c("SpecificWord", "Incorrect") 4 | ) 5 | 6 | test_that("check_structure function works", { 7 | # error when NA not permissible 8 | 9 | expect_equal(check_structure(test_df, "a", is.character, FALSE, "test_df")$error, 10 | "a from the test_df table contains missing values. Actual values are needed.") 11 | 12 | expect_equal(check_structure(test_df, "a", is.logical, TRUE, "test_df")$warning, 13 | "test_df$a fails is.logical check \n" 14 | ) 15 | 16 | expect_null(check_structure(test_df, "b", check_words("SpecificWord", "Incorrect"), TRUE, "test_df")$warning) 17 | expect_equal(check_structure(test_df, "b", check_words("SpecificWord"), TRUE, "test_df")$warning, 18 | "The following words in test_df$b are not allowed: \n Incorrect\n") 19 | 20 | }) 21 | 22 | test_that("check_words creates function", { 23 | expect_true(is.function(check_words("word"))) 24 | }) 25 | 26 | 27 | test_that("add_lab function adds label", { 28 | a <- add_lab("a", "mylabel") 29 | expect_equal(attr(a, "label"), "mylabel") 30 | }) 31 | 32 | test_that("label set to null if not supplied", { 33 | a <- add_lab("a", character(0)) 34 | expect_null(attr(a, "label")) 35 | }) 36 | 37 | test_that("add labels adds multiple labels", { 38 | a <- data.frame(a = "a") %>% 39 | add_labs("a" = "label_a") 40 | expect_equal(attr(a$a, "label"), "label_a") 41 | }) 42 | 43 | test_that("add labels adds NULL to missing labels", { 44 | a <- data.frame(a = "a", b = "b") %>% 45 | add_labs("a" = "label_a", "b" = character(0)) 46 | expect_null(attr(a$b, "label")) 47 | }) 48 | 49 | test_that("metacore example returns file options", { 50 | expect_equal(sort(metacore_example()), 51 | sort(c("ADaM_define.xml", "mock_spec.xlsx", "p21_mock.xlsx", "pilot_ADaM.rda", 52 | "pilot_SDTM.rda", 53 | "SDTM_define.xml", "SDTM_spec_CDISC_pilot.xlsx"))) 54 | }) 55 | -------------------------------------------------------------------------------- /tests/testthat/test-validators.R: -------------------------------------------------------------------------------- 1 | # where should this function go 2 | empty_df <- function(nms, fill) { 3 | df <- as.data.frame(matrix(fill,1,length(nms))) 4 | names(df) <- nms 5 | return(df) 6 | } 7 | 8 | # both of these functions only work 9 | # when the data def object is loaded 10 | # and i think its checking the wrong thing 11 | 12 | test_that("specific words and primitive columns fail when character", { 13 | 14 | dfs <- purrr::map(col_vars(), ~ empty_df(.x, fill = "A")) %>% 15 | setNames(c("ds_spec", 16 | "ds_vars", 17 | "var_spec", 18 | "value_spec", 19 | "derivations", 20 | "codelist", 21 | "supp")) 22 | 23 | expect_warning(do.call(check_columns, dfs)) 24 | }) 25 | 26 | 27 | test_that("NA columns fail", { 28 | 29 | dfs <- purrr::map(col_vars(), ~ empty_df(.x, fill = NA)) %>% 30 | setNames(c("ds_spec", 31 | "ds_vars", 32 | "var_spec", 33 | "value_spec", 34 | "derivations", 35 | "codelist", 36 | "supp")) 37 | 38 | expect_error(do.call(check_columns, dfs)) 39 | }) 40 | 41 | 42 | test_that("NA columns fail", { 43 | 44 | dfs <- purrr::map(col_vars(), ~ empty_df(.x, fill = "A")) %>% 45 | setNames(c("ds_spec", 46 | "ds_vars", 47 | "var_spec", 48 | "value_spec", 49 | "derivations", 50 | "codelist", 51 | "supp")) 52 | 53 | dfs$ds_spec$label <- NA 54 | 55 | expect_warning(do.call(check_columns, dfs)) 56 | }) 57 | 58 | test_that("all_message dataframe contains 6 datasets", { 59 | expect_equal(all_message() %>% 60 | distinct(dataset) %>% 61 | nrow(), 7) 62 | }) 63 | 64 | test_that("check cross-reference tests", { 65 | dfs <- purrr::map(col_vars(), ~ empty_df(.x, fill = "A")) %>% 66 | setNames(c("ds_spec", 67 | "ds_vars", 68 | "var_spec", 69 | "value_spec", 70 | "derivations", 71 | "codelist", 72 | "supp")) 73 | 74 | dfs$var_spec <- dfs$var_spec %>% 75 | mutate(variable = "B") 76 | dfs$derivations <- dfs$derivations %>% 77 | mutate(derivation_id = "C") 78 | dfs$codelist <- dfs$codelist %>% 79 | mutate(code_id = "D") 80 | expect_warning(do.call(metacore, dfs[1:7])) 81 | }) 82 | 83 | test_that("test for incorrect column names", { 84 | dfs <- purrr::map(col_vars(), ~ empty_df(.x, fill = NA)) %>% 85 | setNames(c("ds_spec", 86 | "ds_vars", 87 | "var_spec", 88 | "value_spec", 89 | "derivations", 90 | "codelist", 91 | "supp")) 92 | 93 | dfs$codelist <- dfs$codelist %>% 94 | mutate(codelist2 = "A") 95 | expect_warning(do.call(metacore, dfs[1:7])) 96 | }) 97 | 98 | test_that("check object works", { 99 | load(metacore_example("pilot_ADaM.rda")) 100 | metacore %>% 101 | is_metacore() %>% 102 | expect_equal(TRUE) 103 | 104 | is_metacore("THIS IS NOT A THING") %>% 105 | expect_equal(FALSE) 106 | }) 107 | -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /vignettes/Building_Specification_Readers.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Building Specification Readers" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{Building Specification Readers} 6 | %\VignetteEncoding{UTF-8} 7 | %\VignetteEngine{knitr::rmarkdown} 8 | editor_options: 9 | chunk_output_type: console 10 | --- 11 | 12 | ```{r, include = FALSE} 13 | knitr::opts_chunk$set( 14 | collapse = TRUE, 15 | comment = "#>" 16 | ) 17 | ``` 18 | 19 | ```{r setup} 20 | library(metacore) 21 | library(dplyr) 22 | library(purrr) 23 | library(stringr) 24 | ``` 25 | 26 | The first thing to do when trying to build a specification reader is to try the default. By default metacore can read in specifications that are in the Pinnacle 21 specification format. If your document isn't in that format, it is still worth trying the default readers, as the error messages can be helpful. 27 | 28 | ```{r, error=TRUE} 29 | spec_to_metacore(metacore_example("mock_spec.xlsx")) 30 | ``` 31 | 32 | As we can see, the mock spec we are using here doesn't match the format. Therefore we will have to build a bespoke reader. Before we start, it is important to understand the structure of the metacore object. Each object acts as its own database for all dataset related metadata. The object has 7 tables and their general purpose are as follows: 33 | 34 | - **ds_spec**: Contains dataset level information 35 | 36 | - **ds_vars**: Bridges the dataset and variable level information 37 | 38 | - **var_spec**: Contains variable level information 39 | 40 | - **value_spec**: Contains value level information 41 | 42 | - **derivations**: Contains all derivations 43 | 44 | - **codelist**: Contains information about code/decodes, permitted values and external libraries 45 | 46 | - **supp**: Contains information specific to supplemental variables 47 | 48 | Here is a schema of how all this fits together 49 | 50 | ![](../man/figures/labeled_schema.png "Metacore Schema") 51 | 52 | ds_spec is connected to ds_vars by the 'dataset' variable and ds_vars is connected to var_spec by the 'variable' variable, etc. For more information on the make-up of metacore objects please see the README. 53 | 54 | Now that we understand what makes a metacore object, we can start to develop the reader. 55 | 56 | First, we need to identify what type of specification format you have. At the moment we support a specification where each tab contains information relevant to a different table, such as a domain tab, a variable tab, etc. To test this you can use the `spec_type` function. 57 | 58 | ```{r} 59 | metacore:::spec_type(metacore_example("mock_spec.xlsx")) 60 | ``` 61 | 62 | Given we have the type style of specification, we can attempt to run with the lower level built-in specification to metacore build. There are 6 lower level specification builders to match each of the 6 datasets needed, spec_type_to\_\*. Even if these fail, the error messages should help identify the issues. 63 | 64 | But, before we start any of that, we need to read in our document using the `read_all_sheets` function. This function reads in a multisheet excel file into a named list, where the name of each dataset is the name of the tab. The lower level specification builders do assume the provided `doc` is a named list. This mock specification has 5 tabs, domain, variables, value level metadata, controlled terms, and computational method. So it looks like we might need to split the information in these tabs to get the 6 tables needed for the metacore object. 65 | 66 | ```{r} 67 | doc <- read_all_sheets(metacore_example("mock_spec.xlsx")) 68 | doc %>% map(head) 69 | 70 | ``` 71 | 72 | Let's start with making the ds_spec (dataset specification) table using `spec_type_to_ds_spec`. The ds_spec table is made of 3 columns: the dataset name, the dataset structure, and the dataset label. If we look at our specification document, it looks like all this information is in the Domains tab. Now we know what we need, we can start building the table by trying the `spec_type_to_ds_spec` function. 73 | 74 | This function takes in our named list of datasets (doc), a named vector of columns (cols) and a sheet name (sheet). But, only doc is needed, the other inputs have defaults. So we can try with just the default and see what we get. 75 | 76 | ```{r, error=TRUE} 77 | spec_type_to_ds_spec(doc) 78 | ``` 79 | 80 | The error tells us there is an issue with the label column in the Domains table. Meaning, we need to change the default vector for the cols input because the default regular expression isn't specific enough. First, let's check the column names in the Domain tab 81 | 82 | ```{r} 83 | doc$Domains %>% names() 84 | ``` 85 | 86 | If we look at the default input for cols, `"label" = "[L|l]abel|[D|d]escription"`, we can see the label is matching to the `Label` and the `Description` columns . 87 | 88 | We only need the Domain `Name`, `Label`, and `Data Structure` columns. So we can update the expressions to be more specific. 89 | 90 | ```{r} 91 | ds_spec <- spec_type_to_ds_spec(doc, 92 | cols = c("dataset" = "Name", 93 | "structure" = "Data Structure", 94 | "label" = "Label")) 95 | head(ds_spec) 96 | ``` 97 | 98 | Regular expressions are used to match the columns, so if you needed a more flexible input, you could do that. Now, we have the ds_spec table we can move on to the ds_vars table. 99 | 100 | The ds_vars table has 7 columns: 101 | 102 | - dataset: dataset name 103 | 104 | - variable: variable name 105 | 106 | - key_seq: integers controlling the sort order of each dataset 107 | 108 | - order: integer controlling the column order of each dataset 109 | 110 | - keep: boolean to determine if the variable is needed 111 | 112 | - core: ADaM core (Expected, Required, Permissible) 113 | 114 | - supp_flag: boolean to determine if the variable is in the supplementals 115 | 116 | When we look back at our specification document we can see all this information is in the variable tab. The inputs for the `spec_type_to_ds_vars` function are the same as before, but with slightly different defaults. By default ds_vars only checks sheets labeled "Variable" (this is because all the settings are defaulted to read in P21 formatted specs). But, those default work for our specifications cause all the information is in the variable tab; so we can try with just the defaults again. 117 | 118 | ```{r, error=TRUE} 119 | spec_type_to_ds_vars(doc) 120 | ``` 121 | 122 | This error means it is trying to match the sheet entitled Variable, the variable column matches to two different columns. This is the same error we had before. We just need to have a quick look at the columns and adjust the regular expression to be more specific. Additionally, for the key sequence variable isn't in the variable tab. We saw this information above in the domain tab. So we will need to do two things to fix this. First, adjust the dataset name in the `key_seq_cols` argument. Second, change the sheets to include the variable and the domain sheet. 123 | 124 | ```{r} 125 | doc$Variables %>% head() 126 | 127 | ds_vars<- spec_type_to_ds_vars(doc, cols = c("dataset" = "Domain", 128 | "variable" = "[V|v]ariable [N|n]ame", 129 | "order" = "[V|v]ariable [O|o]rder", 130 | "keep" = "[M|m]andatory"), 131 | key_seq_cols = c("dataset" = "Domain Name", 132 | "key_seq" = "Key"), 133 | sheet = "[V|v]ar|Domains") 134 | 135 | head(ds_vars) 136 | ``` 137 | 138 | The next table we have is var_spec, the table of variable level metadata. var_spec is separate from ds_vars because, in accordance with CDISC standards, labels and lengths should be the same for a given variable across all datasets. So, we are able to normalize the data to only have one row per variable, which ensures this rule and helps reduce the size of the object. There are 6 columns in var_spec: 139 | 140 | - variable: variable name 141 | 142 | - length: variable length 143 | 144 | - label: variable label 145 | 146 | - type: variable Class 147 | 148 | - format: variable format 149 | 150 | - common: boolean if variable is common across ADaM 151 | 152 | Looking back at our specification we see this will also be built using the Variable tab. So, we know we need to edit the regular expression for the variable to make it more specific. Additionally, if you look at the default for `cols` you see there is a dataset input. This is because some standards aren't 100% consistent, some variables (e.g. visit) have different lengths depending on the dataset. **So to accommodate this some of the variables in var_spec are in the ds.variable format**. These builders will do this conversion for you , but the dataset is needed. The other thing the builders can automatically deal with is the common variable. If given a dataset column, the builder function will automatically figure out which variables are common to all dataset. This is good because we don't have a common variable in our specs. 153 | 154 | ```{r} 155 | var_spec <- spec_type_to_var_spec(doc, cols = c("variable" = "Variable Name", 156 | "length" = "[L|l]ength", 157 | "label" = "[L|l]abel", 158 | "type" = "[T|t]ype", 159 | "dataset" = "[D|d]ataset|[D|d]omain", 160 | "format" = "Format")) 161 | head(var_spec) 162 | ``` 163 | 164 | There is one issue here: the format column is also the codelist names. This is because the information came from the "Controlled Term or Format" column of my spec document. So the final step of preparing var_spec table is to remove the controlled terms. It is easy here because all the formats end in a full stop (.), but the controlled terms don't. 165 | 166 | ```{r} 167 | var_spec <- var_spec %>% 168 | mutate(format = if_else(str_detect(format, "\\."), format, "")) 169 | ``` 170 | 171 | The next dataset is value_spec, which contains the value level metadata. It is made up of 8 columns: 172 | 173 | - dataset: dataset name 174 | 175 | - variable: variable name 176 | 177 | - origin: origin of data 178 | 179 | - type: value type 180 | 181 | - sig_dig: significant digits of the value 182 | 183 | - code_id: id used to cross-reference the code/decode 184 | 185 | - where: value of the variable 186 | 187 | - derivation_id: id used to cross-reference the derivation 188 | 189 | By default, `spec_type_to_value_spec` is set up to have the where information on a different sheet because that is the format of a P21 spec, but in our spec we don't have that. In fact, we don't have any value level metadata in our spec. But, that is fine - the default builders will just pull what information it can from the variable tab. Additionally this spec doesn't have a predecessor column, so we can just use the method column. 190 | 191 | ```{r} 192 | value_spec <- spec_type_to_value_spec(doc, cols = c("dataset" = "VLM Name|Domain", 193 | "variable" = "VLM Name|Variable Name", 194 | "origin" = "[O|o]rigin", 195 | "type" = "[T|t]ype", 196 | "code_id" = "Controlled Term", 197 | "where" = "Parameter Code", 198 | "derivation_id" = "Method", 199 | "predecessor" = "Method"), 200 | where_sep_sheet = FALSE) 201 | head(value_spec) 202 | ``` 203 | 204 | The derivation table is relatively simple by comparison. It just has two columns, the derivation id and the derivation. But, the derivation comes from the supplied derivation, predecessor, or comment column depending on the origin. In this mock we don't have a predecessor column so we can set to comment as well. 205 | 206 | ```{r} 207 | derivation <- spec_type_to_derivations(doc, cols = c("derivation_id" = "Name", 208 | "derivation" = "[D|d]efinition|[D|d]escription"), 209 | var_cols = c("dataset" = "Domain Name", 210 | "variable" = "Variable Name|VLM", 211 | "origin" = "[O|o]rigin", 212 | "predecessor" = "Comment", 213 | "comment" = "Comment")) 214 | head(derivation) 215 | ``` 216 | 217 | The final table is codelist. This table contains all the code/decode pairs, all lists of permitted values and information about external libraries. What is somewhat special about the structure of this table is there isn't just a code and a decode column, but rather a codes column that contains a list of code/decode tables, permitted value vectors and external dictionary vectors. So there is one row per code (i.e. a row for country and one for yes/no codes etc.). This structure makes it easier to see all the codes at once and allows some code to be numeric and others to be character. 218 | 219 | By default the `spec_type_to_codelist` function expects codelists and external dictionaries. But, in the specification we only have codelist so `dict_cols` needs to be set to null. 220 | 221 | ```{r} 222 | codelist <- spec_type_to_codelist(doc, codelist_cols = c("code_id" = "Codelist Code", 223 | "name" = "Codelist Name", 224 | "code" = "Coded Value", 225 | "decode" = "Decoded Value"), 226 | simplify = TRUE, 227 | dict_cols = NULL) 228 | head(codelist) 229 | ``` 230 | 231 | Now we have all the tables we need we can make the metacore object 232 | 233 | ```{r} 234 | metacore(ds_spec, ds_vars, var_spec, value_spec, 235 | derivation, codelist) 236 | ``` 237 | 238 | And we're good to go! 239 | -------------------------------------------------------------------------------- /vignettes/Example.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Example" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{Example} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | --- 9 | 10 | ```{r, include = FALSE} 11 | knitr::opts_chunk$set( 12 | collapse = TRUE, 13 | comment = "#>" 14 | ) 15 | ``` 16 | 17 | ```{r setup} 18 | library(metacore) 19 | library(xml2) 20 | ``` 21 | 22 | [CDISC](https://www.cdisc.org/) standards provide a standard for submission of data set metadata through a document known as define.xml. The define provides a great deal of useful information that is both machine readable and can be viewed through your web browser. While many organizations wait to produce a define until the datasets are finalized, it can still be advantageous to be able to read metadata directly from a define. For this purpose, we developed readers that can go directly from a define.xml to a metacore object. 23 | 24 | To do this, we've built separate reader function for each of the metacore tables. For more information on the structure of the metacore tables, check out the README. 25 | 26 | We start by reading the define from disk using the `xmlTreeParse()` function from the `XML` package. 27 | 28 | ```{r} 29 | doc <- read_xml(metacore_example("SDTM_define.xml")) 30 | xml_ns_strip(doc) 31 | ``` 32 | 33 | 34 | Next, we use the metacore readers for each of the separate tables necessary for a metacore object. 35 | 36 | ```{r} 37 | ds_spec2 <- xml_to_ds_spec(doc) 38 | ds_vars <- xml_to_ds_vars(doc) 39 | var_spec <- xml_to_var_spec(doc) 40 | value_spec <- xml_to_value_spec(doc) 41 | code_list <- xml_to_codelist(doc) 42 | derivations <- xml_to_derivations(doc) 43 | ``` 44 | 45 | Great! Now we're ready to create our metacore object. 46 | 47 | ```{r} 48 | test <- metacore(ds_spec2, ds_vars, var_spec, value_spec, derivations, code_list) 49 | ``` 50 | 51 | Something to note about a metacore object is that it inherently holds all data from your source of metadata, be it your specification, define.xml, database, etc. So that means you have _all_ the metadata. In your program, it's likely that you'll just want to keep metadata relevant to the dataset you're currently programming. We've made process easily, with functions that filter metadata down to information only relevant to a specific dataset. 52 | 53 | ```{r} 54 | # a metacore object with all your dataframes 55 | subset <- test %>% select_dataset("DM") 56 | subset$ds_spec 57 | 58 | # a simplified dataframe 59 | subset_t <- test %>% select_dataset("DM", simplify = TRUE) 60 | ``` 61 | 62 | As can be seen above, the metacore object can be filtered directly, or by using the `simplify = TRUE` argument, a simplified data frame can be returned. 63 | 64 | ```{r} 65 | subset_t 66 | ``` 67 | --------------------------------------------------------------------------------