├── .Rbuildignore ├── .github ├── .gitignore └── workflows │ └── pkgdown.yaml ├── .gitignore ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── NEWS.md ├── R ├── clinspacy.R ├── cui2vec_data.R ├── mtsamples.R ├── utils-data-table.R └── utils-pipe.R ├── README.Rmd ├── README.html ├── README.md ├── _pkgdown.yml ├── clinspacy.Rproj ├── cran-comments.md ├── data ├── cui2vec_definitions.rda └── mtsamples.rda ├── docs ├── 404.html ├── LICENSE-text.html ├── LICENSE.html ├── articles │ ├── index.html │ ├── using_embeddings_for_machine_learning_on_clinical_text.html │ └── using_embeddings_for_machine_learning_on_clinical_text_files │ │ └── figure-html │ │ ├── unnamed-chunk-7-1.png │ │ └── unnamed-chunk-8-1.png ├── authors.html ├── bootstrap-toc.css ├── bootstrap-toc.js ├── docsearch.css ├── docsearch.js ├── index.html ├── link.svg ├── news │ └── index.html ├── pkgdown.css ├── pkgdown.js ├── pkgdown.yml └── reference │ ├── Rplot001.png │ ├── bind_clinspacy.html │ ├── bind_clinspacy_embeddings.html │ ├── clinspacy.html │ ├── clinspacy_init.html │ ├── clinspacy_single.html │ ├── cui2vec_definitions.html │ ├── cui2vec_embeddings.html │ ├── dataset_cui2vec_definitions.html │ ├── dataset_cui2vec_embeddings.html │ ├── dataset_mtsamples.html │ ├── index.html │ ├── mtsamples.html │ └── pipe.html ├── man ├── bind_clinspacy.Rd ├── bind_clinspacy_embeddings.Rd ├── clinspacy.Rd ├── clinspacy_init.Rd ├── dataset_cui2vec_definitions.Rd ├── dataset_cui2vec_embeddings.Rd ├── dataset_mtsamples.Rd └── pipe.Rd └── vignettes ├── .gitignore └── using_embeddings_for_machine_learning_on_clinical_text.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^LICENSE\.md$ 4 | ^README\.Rmd$ 5 | ^README\.html$ 6 | ^\.github$ 7 | ^data$ 8 | ^_pkgdown\.yml$ 9 | ^docs$ 10 | ^pkgdown$ 11 | ^vignettes$ 12 | ^cran-comments\.md$ 13 | ^CRAN-RELEASE$ 14 | 
-------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: master 4 | 5 | name: pkgdown 6 | 7 | jobs: 8 | pkgdown: 9 | runs-on: macOS-latest 10 | env: 11 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 12 | steps: 13 | - uses: actions/checkout@v2 14 | 15 | - uses: r-lib/actions/setup-r@master 16 | 17 | - uses: r-lib/actions/setup-pandoc@master 18 | 19 | - name: Query dependencies 20 | run: | 21 | install.packages('remotes') 22 | saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) 23 | writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") 24 | shell: Rscript {0} 25 | 26 | - name: Cache R packages 27 | uses: actions/cache@v1 28 | with: 29 | path: ${{ env.R_LIBS_USER }} 30 | key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} 31 | restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- 32 | 33 | - name: Install dependencies 34 | run: | 35 | remotes::install_deps(dependencies = TRUE) 36 | install.packages("pkgdown") 37 | shell: Rscript {0} 38 | 39 | - name: Install package 40 | run: R CMD INSTALL . 
41 | 42 | - name: Deploy package 43 | run: | 44 | git config --local user.email "actions@github.com" 45 | git config --local user.name "GitHub Actions" 46 | Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)' 47 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | data 6 | inst/doc 7 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: clinspacy 2 | Type: Package 3 | Title: Clinical Natural Language Processing using 'spaCy', 'scispaCy', and 'medspaCy' 4 | Version: 1.0.2.9000 5 | Authors@R: c(person("Karandeep", "Singh", 6 | email = "kdpsingh@umich.edu", role = c("aut", "cre")), 7 | person("Benjamin", "Kompa", role = c("aut")), 8 | person("Andrew", "Beam", role = c("aut")), 9 | person("Allen", "Schmaltz", role = c("aut"))) 10 | Description: Performs biomedical named entity recognition, 11 | Unified Medical Language System (UMLS) concept mapping, and negation 12 | detection using the Python 'spaCy', 'scispaCy', and 'medspaCy' packages, and 13 | transforms extracted data into a wide format for inclusion in machine 14 | learning models. The development of the 'scispaCy' package is described by 15 | Neumann (2019) . The 'medspacy' package uses 16 | 'ConText', an algorithm for determining the context of clinical statements 17 | described by Harkema (2009) . Clinspacy 18 | also supports entity embeddings from 'scispaCy' and UMLS 'cui2vec' concept 19 | embeddings developed by Beam (2018) . 
20 | License: MIT + file LICENSE 21 | Encoding: UTF-8 22 | LazyData: true 23 | Imports: 24 | reticulate (>= 1.19), 25 | data.table, 26 | assertthat, 27 | rappdirs, 28 | utils, 29 | magrittr 30 | RoxygenNote: 7.1.1 31 | URL: https://github.com/ML4LHS/clinspacy 32 | BugReports: https://github.com/ML4LHS/clinspacy/issues 33 | Depends: 34 | R (>= 2.10) 35 | Suggests: 36 | knitr, 37 | rmarkdown 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2021 2 | COPYRIGHT HOLDER: Karandeep Singh, Benjamin Kompa, Andrew Beam, Allen Schmaltz 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2021 Karandeep Singh, Benjamin Kompa, Andrew Beam, Allen Schmaltz 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export("%>%") 4 | export(bind_clinspacy) 5 | export(bind_clinspacy_embeddings) 6 | export(clinspacy) 7 | export(clinspacy_init) 8 | export(dataset_cui2vec_definitions) 9 | export(dataset_cui2vec_embeddings) 10 | export(dataset_mtsamples) 11 | import(data.table) 12 | importFrom(magrittr,"%>%") 13 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # clinspacy (development version) 2 | 3 | # clinspacy 0.1.0.9002 4 | * Added a `NEWS.md` file to track changes to the package. 5 | * bind_* functions no longer run clinspacy_init() -- this should speed up load times 6 | 7 | # clinspacy 0.2.0.9000 8 | * Added `NA` to `semantic_types` argument for both `clinspacy()` and `clinspacy_single()` to prevent tokens from being discarded if they do not match a listed semantic type. 
9 | * Moved `clinspacy_single()` logic into `lapply()` instead of gradually building a list using a `for` loop for boost in speed 10 | * Moved progress bar to `clinspacy()` so that it iterates over documents rather than tokens 11 | 12 | # clinspacy 0.2.0 (2021-02-22) 13 | * Changed lifecycle badge to stable 14 | 15 | # clinspacy 1.0.0 (2021-02-23) 16 | * Bumped version number to 1.0.0 since it's ready for CRAN submission 17 | 18 | # clinspacy 1.0.1 (2021-03-08) 19 | * Bug fix: removed unnecessary arguments and some clean up in prep for CRAN submission 20 | 21 | # clinspacy 1.0.2 (2021-03-18) 22 | * Fixed documentation prior to CRAN submission based on feedback 23 | * Bug fix: Specified version numbers for spaCy (2.3.0), scispaCy (0.2.5), and medspaCy (0.1.0.2) to ensure that the versions are compatible with one another 24 | * Bug fix: spaCy 2.3.0 must be installed from conda-forge (`pip` set to `FALSE`) because the source fails to build properly on Windows even with Visual C++ build tools installed 25 | * Update: Switched to using medspaCy instead of its individual components because medspaCy 0.1.0.2 is compatible with spaCy 2.3.0 (an older version was not). 26 | * Bug fix: changed `section_title` to `section_category` due to updates in medspaCy sectionizer API 27 | * Known issue: After first running `clinspacy_init()` on Windows, sometimes it cannot find `numpy`. This is a known issue with `reticulate` [https://github.com/rstudio/reticulate/issues/367](https://github.com/rstudio/reticulate/issues/367). 
Restarting the R session and re-running `clinspacy_init()` appears to fix the issue 28 | -------------------------------------------------------------------------------- /R/cui2vec_data.R: -------------------------------------------------------------------------------- 1 | #' Cui2vec concept embeddings 2 | #' 3 | #' This dataset contains Unified Medical Language System (UMLS) concept 4 | #' embeddings from Andrew Beam's 5 | #' \href{https://github.com/beamandrew/cui2vec}{cui2vec R package}. There are 6 | #' 500 embeddings included for each concept. 7 | #' 8 | #' This dataset is not viewable until it has been downloaded, which will occur 9 | #' the very first time you run \code{clinspacy_init()} after installing this 10 | #' package. 11 | #' 12 | #' Citation 13 | #' 14 | #' Beam, A.L., Kompa, B., Schmaltz, A., Fried, I., Griffin, W., Palmer, N.P., 15 | #' Shi, X., Cai, T., and Kohane, I.S., 2019. Clinical Concept Embeddings 16 | #' Learned from Massive Sources of Multimodal Medical Data. arXiv preprint 17 | #' arXiv:1804.01486. 18 | #' 19 | #' License 20 | #' 21 | #' This data is made available under a 22 | #' \href{https://creativecommons.org/licenses/by/4.0/}{CC BY 4.0 license}. The 23 | #' only change made to the original dataset is the renaming of columns. 24 | #' 25 | #' @format A data frame with 109053 rows and 501 variables: \describe{ 26 | #' \item{cui}{A Unified Medical Language System (UMLS) Concept Unique 27 | #' Identifier (CUI)} \item{emb_001}{Concept embedding vector #1} 28 | #' \item{emb_002}{Concept embedding vector #2} \item{...}{and so on...} 29 | #' \item{emb_500}{Concept embedding vector #500} } 30 | #' @source \url{https://figshare.com/s/00d69861786cd0156d81} 31 | #' @return Returns the cui2vec UMLS embeddings as a data frame. 
#' @export
dataset_cui2vec_embeddings <- function() {
  # Download-once cache: the .rds file lives in the per-user data directory
  # reported by rappdirs and is fetched only if it is not already there.
  source_file <- 'https://github.com/ML4LHS/clinspacy/releases/download/v0.1.0/cui2vec_embeddings.rds'
  data_dir <- rappdirs::user_data_dir('clinspacy')
  destination_file <- file.path(data_dir, 'cui2vec_embeddings.rds')

  if (!file.exists(destination_file)) {
    if (!dir.exists(data_dir)) {
      dir.create(data_dir, recursive = TRUE)
    }

    # Download to a sibling ".part" file and move it into place only after the
    # transfer succeeds. Writing straight to destination_file would let an
    # interrupted download leave a truncated file that every later call would
    # mistake for a valid cache.
    partial_file <- paste0(destination_file, '.part')
    on.exit(unlink(partial_file), add = TRUE)

    message('Downloading the cui2vec_embeddings dataset...')
    # mode = 'wb' keeps the transfer binary on every platform.
    status <- utils::download.file(source_file, partial_file, mode = 'wb')
    if (status != 0 || !file.rename(partial_file, destination_file)) {
      stop('Failed to download the cui2vec_embeddings dataset from ',
           source_file, call. = FALSE)
    }
  }

  readRDS(destination_file)
}
48 | 49 | #' Cui2vec concept definitions 50 | #' 51 | #' This dataset contains definitions for the Unified Medical Language System 52 | #' (UMLS) Concept Unique Identifiers (CUIs). These come from Andrew Beam's 53 | #' \href{https://github.com/beamandrew/cui2vec}{cui2vec R package}. 54 | #' 55 | #' License 56 | #' 57 | #' This data is made available under a 58 | #' \href{https://github.com/beamandrew/cui2vec/blob/master/LICENSE.md}{MIT 59 | #' license}. The data is copyrighted in 2019 by Benjamin Kompa, Andrew Beam, and 60 | #' Allen Schmaltz. The only change made to the original dataset is the renaming 61 | #' of columns. 62 | #' 63 | #' @format A data frame with 3053795 rows and 3 variables: \describe{ 64 | #' \item{cui}{A Unified Medical Language System (UMLS) Concept Unique 65 | #' Identifier (CUI)} \item{semantic_type}{Semantic type of the CUI} 66 | #' \item{definition}{Definition of the CUI} } 67 | #' @source \url{https://github.com/beamandrew/cui2vec} 68 | #' @return Returns the cui2vec UMLS definitions as a data frame. 
#' @export
dataset_cui2vec_definitions <- function() {
  # Download-once cache: the .rds file lives in the per-user data directory
  # reported by rappdirs and is fetched only if it is not already there.
  source_file <- 'https://github.com/ML4LHS/clinspacy/releases/download/v0.1.0/cui2vec_definitions.rds'
  data_dir <- rappdirs::user_data_dir('clinspacy')
  destination_file <- file.path(data_dir, 'cui2vec_definitions.rds')

  if (!file.exists(destination_file)) {
    if (!dir.exists(data_dir)) {
      dir.create(data_dir, recursive = TRUE)
    }

    # Download to a sibling ".part" file and move it into place only after the
    # transfer succeeds. Writing straight to destination_file would let an
    # interrupted download leave a truncated file that every later call would
    # mistake for a valid cache.
    partial_file <- paste0(destination_file, '.part')
    on.exit(unlink(partial_file), add = TRUE)

    message('Downloading the cui2vec_definitions dataset...')
    # mode = 'wb' keeps the transfer binary on every platform.
    status <- utils::download.file(source_file, partial_file, mode = 'wb')
    if (status != 0 || !file.rename(partial_file, destination_file)) {
      stop('Failed to download the cui2vec_definitions dataset from ',
           source_file, call. = FALSE)
    }
  }

  readRDS(destination_file)
}
85 | -------------------------------------------------------------------------------- /R/mtsamples.R: -------------------------------------------------------------------------------- 1 | #' Medical transcription samples. 2 | #' 3 | #' This dataset contains sample medical transcriptions for various medical 4 | #' specialties. 5 | #' 6 | #' Acknowledgements 7 | #' 8 | #' This data was scraped from 9 | #' \href{https://mtsamples.com}{https://mtsamples.com} by Tara Boyle. 10 | #' 11 | #' License This data is made available under a 12 | #' \href{https://creativecommons.org/share-your-work/public-domain/cc0/}{CC0: 13 | #' Public Domain license}. 14 | #' 15 | #' @format A data frame with 4999 rows and 6 variables: \describe{ 16 | #' \item{note_id}{A unique identifier for each note} \item{description}{A 17 | #' description or chief concern} \item{medical_specialty}{Medical specialty of 18 | #' the note} \item{sample_name}{mtsamples.com note name} 19 | #' \item{transcription}{Transcription of note text} \item{keywords}{Keywords} 20 | #' } 21 | #' @source \url{https://www.kaggle.com/tboyle10/medicaltranscriptions/data} 22 | #' @return Returns the mtsamples dataset as a data frame. 
#' @export
dataset_mtsamples <- function() {
  # Download-once cache: the .rds file lives in the per-user data directory
  # reported by rappdirs and is fetched only if it is not already there.
  source_file <- 'https://github.com/ML4LHS/clinspacy/releases/download/v0.1.0/mtsamples.rds'
  data_dir <- rappdirs::user_data_dir('clinspacy')
  destination_file <- file.path(data_dir, 'mtsamples.rds')

  if (!file.exists(destination_file)) {
    if (!dir.exists(data_dir)) {
      dir.create(data_dir, recursive = TRUE)
    }

    # Download to a sibling ".part" file and move it into place only after the
    # transfer succeeds. Writing straight to destination_file would let an
    # interrupted download leave a truncated file that every later call would
    # mistake for a valid cache.
    partial_file <- paste0(destination_file, '.part')
    on.exit(unlink(partial_file), add = TRUE)

    message('Downloading the mtsamples dataset...')
    # mode = 'wb' keeps the transfer binary on every platform.
    status <- utils::download.file(source_file, partial_file, mode = 'wb')
    if (status != 0 || !file.rename(partial_file, destination_file)) {
      stop('Failed to download the mtsamples dataset from ',
           source_file, call. = FALSE)
    }
  }

  readRDS(destination_file)
}
39 | -------------------------------------------------------------------------------- /R/utils-data-table.R: -------------------------------------------------------------------------------- 1 | # data.table is generally careful to minimize the scope for namespace 2 | # conflicts (i.e., functions with the same name as in other packages); 3 | # a more conservative approach using @importFrom should be careful to 4 | # import any needed data.table special symbols as well, e.g., if you 5 | # run DT[ , .N, by='grp'] in your package, you'll need to add 6 | # @importFrom data.table .N to prevent the NOTE from R CMD check. 7 | # See ?data.table::`special-symbols` for the list of such symbols 8 | # data.table defines; see the 'Importing data.table' vignette for more 9 | # advice (vignette('datatable-importing', 'data.table')). 10 | # 11 | #' @import data.table 12 | NULL 13 | -------------------------------------------------------------------------------- /R/utils-pipe.R: -------------------------------------------------------------------------------- 1 | #' Pipe operator 2 | #' 3 | #' See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details. 4 | #' 5 | #' @name %>% 6 | #' @rdname pipe 7 | #' @param lhs A value or the magrittr placeholder. 8 | #' @param rhs A function call using the magrittr semantics. 
9 | #' @keywords internal 10 | #' @export 11 | #' @importFrom magrittr %>% 12 | #' @return Returns rhs(lhs). 13 | #' @usage lhs \%>\% rhs 14 | NULL 15 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, include = FALSE} 8 | knitr::opts_chunk$set( 9 | message = FALSE, 10 | warning = FALSE, 11 | collapse = TRUE, 12 | comment = "#>", 13 | fig.path = "man/figures/README-", 14 | out.width = "100%" 15 | ) 16 | ``` 17 | 18 | # clinspacy 19 | 20 | 21 | [![Lifecycle: stable](https://img.shields.io/badge/lifecycle-stable-brightgreen.svg)](https://lifecycle.r-lib.org/articles/stages.html#stable) 22 | 23 | 24 | The goal of clinspacy is to perform biomedical named entity recognition, Unified Medical Language System (UMLS) concept mapping, and negation detection using the Python spaCy, scispacy, and medspacy packages. 25 | 26 | ## Installation 27 | 28 | 29 | You can install the CRAN version of clinspacy with: 30 | 31 | ``` 32 | install.packages('clinspacy') 33 | ``` 34 | 35 | You can install the GitHub version of clinspacy with: 36 | 37 | ``` 38 | remotes::install_github('ML4LHS/clinspacy', INSTALL_opts = '--no-multiarch') 39 | ``` 40 | 41 | ## How to load clinspacy 42 | 43 | ```{r} 44 | library(clinspacy) 45 | ``` 46 | 47 | 48 | ## Initiating clinspacy 49 | 50 | *Note: the very first time you run `clinspacy_init()` or `clinspacy()` after installing the package, you may receive an error stating that `spaCy` was unable to be imported because it was not found. Restarting your R session should resolve the issue.* 51 | 52 | Initiating clinspacy is optional. If you do not initiate the package using `clinspacy_init()`, it will be automatically initiated without the UMLS linker. 
The UMLS linker takes up ~12 GB of RAM, so if you would like to use the linker, you can initiate clinspacy with the linker. The linker can still be added on later by reinitiating with the `use_linker` argument set to `TRUE`. 53 | 54 | ```{r} 55 | clinspacy_init() # This is optional! The default functionality is to initiate clinspacy without the UMLS linker 56 | ``` 57 | 58 | ## Named entity recognition (without the UMLS linker) 59 | 60 | The `clinspacy()` function can take a single string, a character vector, or a data frame. It can output either a data frame or a file name. 61 | 62 | ### A single character string as input 63 | 64 | ```{r} 65 | clinspacy('This patient has diabetes and CKD stage 3 but no HTN.') 66 | 67 | clinspacy('HISTORY: He presents with chest pain. PMH: HTN. MEDICATIONS: This patient with diabetes is taking omeprazole, aspirin, and lisinopril 10 mg but is not taking albuterol anymore as his asthma has resolved. ALLERGIES: penicillin.', verbose = FALSE) 68 | ``` 69 | 70 | 71 | ### A character vector as input 72 | 73 | ```{r} 74 | clinspacy(c('This pt has CKD and HTN', 'Pt only has CKD but no HTN'), 75 | verbose = FALSE) 76 | ``` 77 | 78 | ### A data frame as input 79 | 80 | ```{r} 81 | data.frame(text = c('This pt has CKD and HTN', 'Diabetes is present'), 82 | stringsAsFactors = FALSE) %>% 83 | clinspacy(df_col = 'text', verbose = FALSE) 84 | ``` 85 | 86 | ### Saving the output to file 87 | 88 | The `output_file` can then be piped into `bind_clinspacy()` or `bind_clinspacy_embeddings()`. This saves a lot of time because you can try different strategies of subsetting in both of these functions without needing to re-process the original data. 
89 | 90 | ```{r} 91 | if (!dir.exists(rappdirs::user_data_dir('clinspacy'))) { 92 | dir.create(rappdirs::user_data_dir('clinspacy'), recursive = TRUE) 93 | } 94 | 95 | mtsamples = dataset_mtsamples() 96 | 97 | mtsamples[1:5,] 98 | 99 | clinspacy_output_file = 100 | mtsamples[1:5, 1:2] %>% 101 | clinspacy(df_col = 'description', 102 | verbose = FALSE, 103 | output_file = file.path(rappdirs::user_data_dir('clinspacy'), 104 | 'output.csv'), 105 | overwrite = TRUE) 106 | 107 | clinspacy_output_file 108 | ``` 109 | 110 | ## Binding named entities to a data frame (without the UMLS linker) 111 | 112 | Negated concepts, as identified by the medspacy cycontext flag, are ignored by default and do not count towards the frequencies. However, you can now change the subsetting criteria. 113 | 114 | Note that you now need to re-provide the original dataset to the `bind_clinspacy()` function. 115 | 116 | ```{r} 117 | mtsamples[1:5, 1:2] %>% 118 | clinspacy(df_col = 'description', verbose = FALSE) %>% 119 | bind_clinspacy(mtsamples[1:5, 1:2]) 120 | ``` 121 | 122 | ### We can also store the intermediate result so that bind_clinspacy() does not need to re-process the text. 123 | 124 | ```{r} 125 | clinspacy_output_data = 126 | mtsamples[1:5, 1:2] %>% 127 | clinspacy(df_col = 'description', verbose = FALSE) 128 | 129 | clinspacy_output_data %>% 130 | bind_clinspacy(mtsamples[1:5, 1:2]) 131 | 132 | clinspacy_output_data %>% 133 | bind_clinspacy(mtsamples[1:5, 1:2], 134 | cs_col = 'entity') 135 | 136 | clinspacy_output_data %>% 137 | bind_clinspacy(mtsamples[1:5, 1:2], 138 | subset = 'is_uncertain == FALSE & is_negated == FALSE') 139 | ``` 140 | 141 | ### We can also re-use the output file we had created earlier and pipe this directly into bind_clinspacy(). 
142 | 143 | ```{r} 144 | clinspacy_output_file 145 | 146 | clinspacy_output_file %>% 147 | bind_clinspacy(mtsamples[1:5, 1:2]) 148 | 149 | clinspacy_output_file %>% 150 | bind_clinspacy(mtsamples[1:5, 1:2], 151 | cs_col = 'entity') 152 | 153 | clinspacy_output_file %>% 154 | bind_clinspacy(mtsamples[1:5, 1:2], 155 | subset = 'is_uncertain == FALSE & is_negated == FALSE') 156 | ``` 157 | 158 | 159 | ## Binding entity embeddings to a data frame (without the UMLS linker) 160 | 161 | With the UMLS linker disabled, 200-dimensional entity embeddings can be extracted from the scispacy Python package. For this to work, you must set `return_scispacy_embeddings` to `TRUE` when running `clinspacy()`. It's also a good idea to write the output directly to file because the embeddings can be quite large. 162 | 163 | ```{r} 164 | clinspacy_output_file = 165 | mtsamples[1:5, 1:2] %>% 166 | clinspacy(df_col = 'description', 167 | return_scispacy_embeddings = TRUE, 168 | verbose = FALSE, 169 | output_file = file.path(rappdirs::user_data_dir('clinspacy'), 170 | 'output.csv'), 171 | overwrite = TRUE) 172 | 173 | clinspacy_output_file %>% 174 | bind_clinspacy_embeddings(mtsamples[1:5, 1:2]) 175 | 176 | ``` 177 | 178 | ## Adding the UMLS linker 179 | 180 | The UMLS linker can be turned on (and off) even if `clinspacy_init()` has already been called. The first time you turn it on, it takes a while because the linker needs to be loaded into memory. On subsequent removal and addition, this occurs much more quickly because the linker is only removed/added to the pipeline and does not need to be reloaded into memory. 181 | 182 | ```{r} 183 | clinspacy_init(use_linker = TRUE) 184 | ``` 185 | 186 | ## Named entity recognition (with the UMLS linker) 187 | 188 | By turning on the UMLS linker, you can restrict the results by semantic type. 
In general, restricting the result in `clinspacy()` is not a good idea because you can always subset the results later within `bind_clinspacy()` and `bind_clinspacy_embeddings()`. 189 | 190 | ```{r} 191 | clinspacy('This patient has diabetes and CKD stage 3 but no HTN.') 192 | 193 | clinspacy('This patient with diabetes is taking omeprazole, aspirin, and lisinopril 10 mg but is not taking albuterol anymore as his asthma has resolved.', 194 | semantic_types = 'Pharmacologic Substance') 195 | 196 | clinspacy('This patient with diabetes is taking omeprazole, aspirin, and lisinopril 10 mg but is not taking albuterol anymore as his asthma has resolved.', 197 | semantic_types = 'Disease or Syndrome') 198 | ``` 199 | 200 | ## Binding UMLS concept unique identifiers to a data frame (with the UMLS linker) 201 | 202 | This function binds columns containing concept unique identifiers with which scispacy has 99% confidence of being present with values containing frequencies. Negated concepts, as identified by the medspacy cycontext is_negated flag, are ignored and do not count towards the frequencies. However, this behavior can be changed using the `subset` argument. 203 | 204 | Note that by turning on the UMLS linker, you can restrict the results by semantic type. 
205 | 206 | ```{r} 207 | clinspacy_output_file = 208 | mtsamples[1:5, 1:2] %>% 209 | clinspacy(df_col = 'description', 210 | return_scispacy_embeddings = TRUE, # only so we can retrieve these below 211 | verbose = FALSE, 212 | output_file = file.path(rappdirs::user_data_dir('clinspacy'), 213 | 'output.csv'), 214 | overwrite = TRUE) 215 | 216 | clinspacy_output_file %>% 217 | bind_clinspacy(mtsamples[1:5, 1:2]) 218 | 219 | clinspacy_output_file %>% 220 | bind_clinspacy( 221 | mtsamples[1:5, 1:2], 222 | subset = 'is_negated == FALSE & semantic_type == "Diagnostic Procedure"' 223 | ) 224 | ``` 225 | 226 | ## Binding concept embeddings to a data frame (with the UMLS linker) 227 | 228 | The default embeddings are from the scispacy Python package. If you want to use the cui2vec embeddings (only available with the linker enabled), you need to set the `type` argument to `cui2vec`. Up to 500-dimensional embeddings can be returned. 229 | 230 | Note that by turning on the UMLS linker, you can restrict the results by semantic type (with either type of embedding). 231 | 232 | ### Scispacy embeddings (with the UMLS linker) 233 | 234 | With the UMLS linker enabled, you can restrict by semantic type when obtaining scispacy embeddings. 235 | 236 | Note: The mean embeddings may be slightly different than if the linker was disabled because entities may be captured twice (as entities may map to multiple concepts). Thus, if you do not need to restrict by semantic type, the recommended setting is to turn the UMLS linker off by re-running `clinspacy_init(use_linker = FALSE)` (note that `use_linker = FALSE` is the default in `clinspacy_init()`). 
237 | 238 | 239 | ```{r} 240 | clinspacy_output_file %>% 241 | bind_clinspacy_embeddings(mtsamples[1:5, 1:2]) 242 | 243 | clinspacy_output_file %>% 244 | bind_clinspacy_embeddings( 245 | mtsamples[1:5, 1:2], 246 | subset = 'is_negated == FALSE & semantic_type == "Diagnostic Procedure"' 247 | ) 248 | ``` 249 | 250 | 251 | ### Cui2vec embeddings (with the UMLS linker) 252 | 253 | These are only available with the UMLS linker enabled. 254 | 255 | ```{r} 256 | clinspacy_output_file %>% 257 | bind_clinspacy_embeddings(mtsamples[1:5, 1:2], 258 | type = 'cui2vec') 259 | 260 | clinspacy_output_file %>% 261 | bind_clinspacy_embeddings( 262 | mtsamples[1:5, 1:2], 263 | type = 'cui2vec', 264 | subset = 'is_negated == FALSE & semantic_type == "Diagnostic Procedure"' 265 | ) 266 | ``` 267 | 268 | # UMLS CUI definitions 269 | 270 | ```{r} 271 | cui2vec_definitions = dataset_cui2vec_definitions() 272 | head(cui2vec_definitions) 273 | ``` 274 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kdpsingh/clinspacy/2efc73a3dbd152cb57aa65e263ae37310e068c61/_pkgdown.yml -------------------------------------------------------------------------------- /clinspacy.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | -------------------------------------------------------------------------------- /cran-comments.md: 
-------------------------------------------------------------------------------- 1 | ## Test environments 2 | * local R installation, R 3.6.0 3 | * win-builder (devel) 4 | 5 | ## R CMD check results 6 | 7 | 0 errors | 0 warnings | 1 note 8 | 9 | * This is a new release. 10 | 11 | ## Resubmission 12 | 13 | * I corrected all references to software and APIs in the DESCRIPTION to have single quotes, including converting spaCy to 'spaCy', scispaCy to 'scispaCy', and medspaCy to 'medspaCy'. I made this change to both the title and description sections. 14 | 15 | * I added references to the DESCRIPTION file (in the description section) to relevant methods papers related to the underlying 'scispaCy' and 'medspaCy' packages used by this package, as well as references to the 'cui2vec' study. 16 | 17 | * I added all missing \value and \argument tags to clinspacy_init.Rd, dataset_cui2vec_definitions.Rd, dataset_cui2vec_embeddings.Rd, dataset_mtsamples.Rd, and pipe.Rd 18 | 19 | * I am confirming that none of the functions write to the user's home directory, the working directory, or the package directory. Any files that are written by functions are written to the appropriate OS-specific app directory folder, which is identified using the rappdirs package. 20 | 21 | * Due to the inclusion of cui2vec data in this package (which is also licensed through an MIT license), I have added Benjamin Kompa, Andrew Beam, and Allen Schmaltz as authors on this package (with their permission) and listed them as copyright holders in the LICENSE file. 22 | 23 | * I discovered an issue related to the Python spaCy, scispaCy, and medspaCy packages that clinspacy depends upon. It has now been corrected by setting strict version requirements for Python dependencies so that recent package updates should not affect clinspacy. 
24 | -------------------------------------------------------------------------------- /data/cui2vec_definitions.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kdpsingh/clinspacy/2efc73a3dbd152cb57aa65e263ae37310e068c61/data/cui2vec_definitions.rda -------------------------------------------------------------------------------- /data/mtsamples.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kdpsingh/clinspacy/2efc73a3dbd152cb57aa65e263ae37310e068c61/data/mtsamples.rda -------------------------------------------------------------------------------- /docs/404.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Page not found (404) • clinspacy 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 55 | 56 | 57 | 58 | 59 | 60 | 61 |
62 |
63 | 117 | 118 | 119 | 120 |
121 | 122 |
123 |
124 | 127 | 128 | Content not found. Please use links in the navbar. 129 | 130 |
131 | 132 | 137 | 138 |
139 | 140 | 141 | 142 |
143 | 146 | 147 |
148 |

Site built with pkgdown 1.6.1.

149 |
150 | 151 |
152 |
153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /docs/LICENSE-text.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | License • clinspacy 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 55 | 56 | 57 | 58 | 59 | 60 | 61 |
62 |
63 | 117 | 118 | 119 | 120 |
121 | 122 |
123 |
124 | 127 | 128 |
YEAR: 2021
129 | COPYRIGHT HOLDER: Karandeep Singh, Benjamin Kompa, Andrew Beam, Allen Schmaltz
130 | 
131 | 132 |
133 | 134 | 139 | 140 |
141 | 142 | 143 | 144 |
145 | 148 | 149 |
150 |

Site built with pkgdown 1.6.1.

151 |
152 | 153 |
154 |
155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /docs/LICENSE.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | MIT License • clinspacy 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 55 | 56 | 57 | 58 | 59 | 60 | 61 |
62 |
63 | 117 | 118 | 119 | 120 |
121 | 122 |
123 |
124 | 127 | 128 |
129 | 130 |

Copyright (c) 2021 Karandeep Singh, Benjamin Kompa, Andrew Beam, Allen Schmaltz

131 |

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

132 |

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

133 |

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

134 |
135 | 136 |
137 | 138 | 143 | 144 |
145 | 146 | 147 | 148 |
149 | 152 | 153 |
154 |

Site built with pkgdown 1.6.1.

155 |
156 | 157 |
158 |
159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | -------------------------------------------------------------------------------- /docs/articles/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Articles • clinspacy 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 55 | 56 | 57 | 58 | 59 | 60 | 61 |
62 |
63 | 117 | 118 | 119 | 120 |
121 | 122 |
123 |
124 | 127 | 128 |
129 |

All vignettes

130 |

131 | 132 |
133 |
Using Embeddings for Machine Learning on Clinical Text
134 |
135 |
136 |
137 |
138 |
139 | 140 | 141 |
142 | 145 | 146 |
147 |

Site built with pkgdown 1.6.1.

148 |
149 | 150 |
151 |
152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | -------------------------------------------------------------------------------- /docs/articles/using_embeddings_for_machine_learning_on_clinical_text_files/figure-html/unnamed-chunk-7-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kdpsingh/clinspacy/2efc73a3dbd152cb57aa65e263ae37310e068c61/docs/articles/using_embeddings_for_machine_learning_on_clinical_text_files/figure-html/unnamed-chunk-7-1.png -------------------------------------------------------------------------------- /docs/articles/using_embeddings_for_machine_learning_on_clinical_text_files/figure-html/unnamed-chunk-8-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kdpsingh/clinspacy/2efc73a3dbd152cb57aa65e263ae37310e068c61/docs/articles/using_embeddings_for_machine_learning_on_clinical_text_files/figure-html/unnamed-chunk-8-1.png -------------------------------------------------------------------------------- /docs/authors.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Authors • clinspacy 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 55 | 56 | 57 | 58 | 59 | 60 | 61 |
62 |
63 | 117 | 118 | 119 | 120 |
121 | 122 |
123 |
124 | 127 | 128 |
    129 |
  • 130 |

    Karandeep Singh. Author, maintainer. 131 |

    132 |
  • 133 |
  • 134 |

    Benjamin Kompa. Author. 135 |

    136 |
  • 137 |
  • 138 |

    Andrew Beam. Author. 139 |

    140 |
  • 141 |
  • 142 |

    Allen Schmaltz. Author. 143 |

    144 |
  • 145 |
146 | 147 |
148 | 149 |
150 | 151 | 152 | 153 |
154 | 157 | 158 |
159 |

Site built with pkgdown 1.6.1.

160 |
161 | 162 |
163 |
164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | -------------------------------------------------------------------------------- /docs/bootstrap-toc.css: -------------------------------------------------------------------------------- 1 | /*! 2 | * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) 3 | * Copyright 2015 Aidan Feldman 4 | * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ 5 | 6 | /* modified from https://github.com/twbs/bootstrap/blob/94b4076dd2efba9af71f0b18d4ee4b163aa9e0dd/docs/assets/css/src/docs.css#L548-L601 */ 7 | 8 | /* All levels of nav */ 9 | nav[data-toggle='toc'] .nav > li > a { 10 | display: block; 11 | padding: 4px 20px; 12 | font-size: 13px; 13 | font-weight: 500; 14 | color: #767676; 15 | } 16 | nav[data-toggle='toc'] .nav > li > a:hover, 17 | nav[data-toggle='toc'] .nav > li > a:focus { 18 | padding-left: 19px; 19 | color: #563d7c; 20 | text-decoration: none; 21 | background-color: transparent; 22 | border-left: 1px solid #563d7c; 23 | } 24 | nav[data-toggle='toc'] .nav > .active > a, 25 | nav[data-toggle='toc'] .nav > .active:hover > a, 26 | nav[data-toggle='toc'] .nav > .active:focus > a { 27 | padding-left: 18px; 28 | font-weight: bold; 29 | color: #563d7c; 30 | background-color: transparent; 31 | border-left: 2px solid #563d7c; 32 | } 33 | 34 | /* Nav: second level (shown on .active) */ 35 | nav[data-toggle='toc'] .nav .nav { 36 | display: none; /* Hide by default, but at >768px, show it */ 37 | padding-bottom: 10px; 38 | } 39 | nav[data-toggle='toc'] .nav .nav > li > a { 40 | padding-top: 1px; 41 | padding-bottom: 1px; 42 | padding-left: 30px; 43 | font-size: 12px; 44 | font-weight: normal; 45 | } 46 | nav[data-toggle='toc'] .nav .nav > li > a:hover, 47 | nav[data-toggle='toc'] .nav .nav > li > a:focus { 48 | padding-left: 29px; 49 | } 50 | nav[data-toggle='toc'] .nav .nav > .active > a, 51 | nav[data-toggle='toc'] .nav .nav > .active:hover > a, 52 | 
nav[data-toggle='toc'] .nav .nav > .active:focus > a { 53 | padding-left: 28px; 54 | font-weight: 500; 55 | } 56 | 57 | /* from https://github.com/twbs/bootstrap/blob/e38f066d8c203c3e032da0ff23cd2d6098ee2dd6/docs/assets/css/src/docs.css#L631-L634 */ 58 | nav[data-toggle='toc'] .nav > .active > ul { 59 | display: block; 60 | } 61 | -------------------------------------------------------------------------------- /docs/bootstrap-toc.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) 3 | * Copyright 2015 Aidan Feldman 4 | * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ 5 | (function() { 6 | 'use strict'; 7 | 8 | window.Toc = { 9 | helpers: { 10 | // return all matching elements in the set, or their descendants 11 | findOrFilter: function($el, selector) { 12 | // http://danielnouri.org/notes/2011/03/14/a-jquery-find-that-also-finds-the-root-element/ 13 | // http://stackoverflow.com/a/12731439/358804 14 | var $descendants = $el.find(selector); 15 | return $el.filter(selector).add($descendants).filter(':not([data-toc-skip])'); 16 | }, 17 | 18 | generateUniqueIdBase: function(el) { 19 | var text = $(el).text(); 20 | var anchor = text.trim().toLowerCase().replace(/[^A-Za-z0-9]+/g, '-'); 21 | return anchor || el.tagName.toLowerCase(); 22 | }, 23 | 24 | generateUniqueId: function(el) { 25 | var anchorBase = this.generateUniqueIdBase(el); 26 | for (var i = 0; ; i++) { 27 | var anchor = anchorBase; 28 | if (i > 0) { 29 | // add suffix 30 | anchor += '-' + i; 31 | } 32 | // check if ID already exists 33 | if (!document.getElementById(anchor)) { 34 | return anchor; 35 | } 36 | } 37 | }, 38 | 39 | generateAnchor: function(el) { 40 | if (el.id) { 41 | return el.id; 42 | } else { 43 | var anchor = this.generateUniqueId(el); 44 | el.id = anchor; 45 | return anchor; 46 | } 47 | }, 48 | 49 | createNavList: function() { 50 | 
return $(''); 51 | }, 52 | 53 | createChildNavList: function($parent) { 54 | var $childList = this.createNavList(); 55 | $parent.append($childList); 56 | return $childList; 57 | }, 58 | 59 | generateNavEl: function(anchor, text) { 60 | var $a = $(''); 61 | $a.attr('href', '#' + anchor); 62 | $a.text(text); 63 | var $li = $('
  • '); 64 | $li.append($a); 65 | return $li; 66 | }, 67 | 68 | generateNavItem: function(headingEl) { 69 | var anchor = this.generateAnchor(headingEl); 70 | var $heading = $(headingEl); 71 | var text = $heading.data('toc-text') || $heading.text(); 72 | return this.generateNavEl(anchor, text); 73 | }, 74 | 75 | // Find the first heading level (`

    `, then `

    `, etc.) that has more than one element. Defaults to 1 (for `

    `). 76 | getTopLevel: function($scope) { 77 | for (var i = 1; i <= 6; i++) { 78 | var $headings = this.findOrFilter($scope, 'h' + i); 79 | if ($headings.length > 1) { 80 | return i; 81 | } 82 | } 83 | 84 | return 1; 85 | }, 86 | 87 | // returns the elements for the top level, and the next below it 88 | getHeadings: function($scope, topLevel) { 89 | var topSelector = 'h' + topLevel; 90 | 91 | var secondaryLevel = topLevel + 1; 92 | var secondarySelector = 'h' + secondaryLevel; 93 | 94 | return this.findOrFilter($scope, topSelector + ',' + secondarySelector); 95 | }, 96 | 97 | getNavLevel: function(el) { 98 | return parseInt(el.tagName.charAt(1), 10); 99 | }, 100 | 101 | populateNav: function($topContext, topLevel, $headings) { 102 | var $context = $topContext; 103 | var $prevNav; 104 | 105 | var helpers = this; 106 | $headings.each(function(i, el) { 107 | var $newNav = helpers.generateNavItem(el); 108 | var navLevel = helpers.getNavLevel(el); 109 | 110 | // determine the proper $context 111 | if (navLevel === topLevel) { 112 | // use top level 113 | $context = $topContext; 114 | } else if ($prevNav && $context === $topContext) { 115 | // create a new level of the tree and switch to it 116 | $context = helpers.createChildNavList($prevNav); 117 | } // else use the current $context 118 | 119 | $context.append($newNav); 120 | 121 | $prevNav = $newNav; 122 | }); 123 | }, 124 | 125 | parseOps: function(arg) { 126 | var opts; 127 | if (arg.jquery) { 128 | opts = { 129 | $nav: arg 130 | }; 131 | } else { 132 | opts = arg; 133 | } 134 | opts.$scope = opts.$scope || $(document.body); 135 | return opts; 136 | } 137 | }, 138 | 139 | // accepts a jQuery object, or an options object 140 | init: function(opts) { 141 | opts = this.helpers.parseOps(opts); 142 | 143 | // ensure that the data attribute is in place for styling 144 | opts.$nav.attr('data-toggle', 'toc'); 145 | 146 | var $topContext = this.helpers.createChildNavList(opts.$nav); 147 | var topLevel = 
this.helpers.getTopLevel(opts.$scope); 148 | var $headings = this.helpers.getHeadings(opts.$scope, topLevel); 149 | this.helpers.populateNav($topContext, topLevel, $headings); 150 | } 151 | }; 152 | 153 | $(function() { 154 | $('nav[data-toggle="toc"]').each(function(i, el) { 155 | var $nav = $(el); 156 | Toc.init($nav); 157 | }); 158 | }); 159 | })(); 160 | -------------------------------------------------------------------------------- /docs/docsearch.css: -------------------------------------------------------------------------------- 1 | /* Docsearch -------------------------------------------------------------- */ 2 | /* 3 | Source: https://github.com/algolia/docsearch/ 4 | License: MIT 5 | */ 6 | 7 | .algolia-autocomplete { 8 | display: block; 9 | -webkit-box-flex: 1; 10 | -ms-flex: 1; 11 | flex: 1 12 | } 13 | 14 | .algolia-autocomplete .ds-dropdown-menu { 15 | width: 100%; 16 | min-width: none; 17 | max-width: none; 18 | padding: .75rem 0; 19 | background-color: #fff; 20 | background-clip: padding-box; 21 | border: 1px solid rgba(0, 0, 0, .1); 22 | box-shadow: 0 .5rem 1rem rgba(0, 0, 0, .175); 23 | } 24 | 25 | @media (min-width:768px) { 26 | .algolia-autocomplete .ds-dropdown-menu { 27 | width: 175% 28 | } 29 | } 30 | 31 | .algolia-autocomplete .ds-dropdown-menu::before { 32 | display: none 33 | } 34 | 35 | .algolia-autocomplete .ds-dropdown-menu [class^=ds-dataset-] { 36 | padding: 0; 37 | background-color: rgb(255,255,255); 38 | border: 0; 39 | max-height: 80vh; 40 | } 41 | 42 | .algolia-autocomplete .ds-dropdown-menu .ds-suggestions { 43 | margin-top: 0 44 | } 45 | 46 | .algolia-autocomplete .algolia-docsearch-suggestion { 47 | padding: 0; 48 | overflow: visible 49 | } 50 | 51 | .algolia-autocomplete .algolia-docsearch-suggestion--category-header { 52 | padding: .125rem 1rem; 53 | margin-top: 0; 54 | font-size: 1.3em; 55 | font-weight: 500; 56 | color: #00008B; 57 | border-bottom: 0 58 | } 59 | 60 | .algolia-autocomplete 
.algolia-docsearch-suggestion--wrapper { 61 | float: none; 62 | padding-top: 0 63 | } 64 | 65 | .algolia-autocomplete .algolia-docsearch-suggestion--subcategory-column { 66 | float: none; 67 | width: auto; 68 | padding: 0; 69 | text-align: left 70 | } 71 | 72 | .algolia-autocomplete .algolia-docsearch-suggestion--content { 73 | float: none; 74 | width: auto; 75 | padding: 0 76 | } 77 | 78 | .algolia-autocomplete .algolia-docsearch-suggestion--content::before { 79 | display: none 80 | } 81 | 82 | .algolia-autocomplete .ds-suggestion:not(:first-child) .algolia-docsearch-suggestion--category-header { 83 | padding-top: .75rem; 84 | margin-top: .75rem; 85 | border-top: 1px solid rgba(0, 0, 0, .1) 86 | } 87 | 88 | .algolia-autocomplete .ds-suggestion .algolia-docsearch-suggestion--subcategory-column { 89 | display: block; 90 | padding: .1rem 1rem; 91 | margin-bottom: 0.1; 92 | font-size: 1.0em; 93 | font-weight: 400 94 | /* display: none */ 95 | } 96 | 97 | .algolia-autocomplete .algolia-docsearch-suggestion--title { 98 | display: block; 99 | padding: .25rem 1rem; 100 | margin-bottom: 0; 101 | font-size: 0.9em; 102 | font-weight: 400 103 | } 104 | 105 | .algolia-autocomplete .algolia-docsearch-suggestion--text { 106 | padding: 0 1rem .5rem; 107 | margin-top: -.25rem; 108 | font-size: 0.8em; 109 | font-weight: 400; 110 | line-height: 1.25 111 | } 112 | 113 | .algolia-autocomplete .algolia-docsearch-footer { 114 | width: 110px; 115 | height: 20px; 116 | z-index: 3; 117 | margin-top: 10.66667px; 118 | float: right; 119 | font-size: 0; 120 | line-height: 0; 121 | } 122 | 123 | .algolia-autocomplete .algolia-docsearch-footer--logo { 124 | background-image: url("data:image/svg+xml;utf8,"); 125 | background-repeat: no-repeat; 126 | background-position: 50%; 127 | background-size: 100%; 128 | overflow: hidden; 129 | text-indent: -9000px; 130 | width: 100%; 131 | height: 100%; 132 | display: block; 133 | transform: translate(-8px); 134 | } 135 | 136 | .algolia-autocomplete 
.algolia-docsearch-suggestion--highlight { 137 | color: #FF8C00; 138 | background: rgba(232, 189, 54, 0.1) 139 | } 140 | 141 | 142 | .algolia-autocomplete .algolia-docsearch-suggestion--text .algolia-docsearch-suggestion--highlight { 143 | box-shadow: inset 0 -2px 0 0 rgba(105, 105, 105, .5) 144 | } 145 | 146 | .algolia-autocomplete .ds-suggestion.ds-cursor .algolia-docsearch-suggestion--content { 147 | background-color: rgba(192, 192, 192, .15) 148 | } 149 | -------------------------------------------------------------------------------- /docs/docsearch.js: -------------------------------------------------------------------------------- 1 | $(function() { 2 | 3 | // register a handler to move the focus to the search bar 4 | // upon pressing shift + "/" (i.e. "?") 5 | $(document).on('keydown', function(e) { 6 | if (e.shiftKey && e.keyCode == 191) { 7 | e.preventDefault(); 8 | $("#search-input").focus(); 9 | } 10 | }); 11 | 12 | $(document).ready(function() { 13 | // do keyword highlighting 14 | /* modified from https://jsfiddle.net/julmot/bL6bb5oo/ */ 15 | var mark = function() { 16 | 17 | var referrer = document.URL ; 18 | var paramKey = "q" ; 19 | 20 | if (referrer.indexOf("?") !== -1) { 21 | var qs = referrer.substr(referrer.indexOf('?') + 1); 22 | var qs_noanchor = qs.split('#')[0]; 23 | var qsa = qs_noanchor.split('&'); 24 | var keyword = ""; 25 | 26 | for (var i = 0; i < qsa.length; i++) { 27 | var currentParam = qsa[i].split('='); 28 | 29 | if (currentParam.length !== 2) { 30 | continue; 31 | } 32 | 33 | if (currentParam[0] == paramKey) { 34 | keyword = decodeURIComponent(currentParam[1].replace(/\+/g, "%20")); 35 | } 36 | } 37 | 38 | if (keyword !== "") { 39 | $(".contents").unmark({ 40 | done: function() { 41 | $(".contents").mark(keyword); 42 | } 43 | }); 44 | } 45 | } 46 | }; 47 | 48 | mark(); 49 | }); 50 | }); 51 | 52 | /* Search term highlighting ------------------------------*/ 53 | 54 | function matchedWords(hit) { 55 | var words = []; 56 | 57 | var 
hierarchy = hit._highlightResult.hierarchy; 58 | // loop to fetch from lvl0, lvl1, etc. 59 | for (var idx in hierarchy) { 60 | words = words.concat(hierarchy[idx].matchedWords); 61 | } 62 | 63 | var content = hit._highlightResult.content; 64 | if (content) { 65 | words = words.concat(content.matchedWords); 66 | } 67 | 68 | // return unique words 69 | var words_uniq = [...new Set(words)]; 70 | return words_uniq; 71 | } 72 | 73 | function updateHitURL(hit) { 74 | 75 | var words = matchedWords(hit); 76 | var url = ""; 77 | 78 | if (hit.anchor) { 79 | url = hit.url_without_anchor + '?q=' + escape(words.join(" ")) + '#' + hit.anchor; 80 | } else { 81 | url = hit.url + '?q=' + escape(words.join(" ")); 82 | } 83 | 84 | return url; 85 | } 86 | -------------------------------------------------------------------------------- /docs/link.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 8 | 12 | 13 | -------------------------------------------------------------------------------- /docs/news/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Changelog • clinspacy 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 55 | 56 | 57 | 58 | 59 | 60 | 61 |
    62 |
    63 | 117 | 118 | 119 | 120 |
    121 | 122 |
    123 |
    124 | 128 | 129 |
    130 |

    131 | clinspacy 0.1.0.9002

    132 |
      133 |
    • Added a NEWS.md file to track changes to the package.
    • 134 |
    • bind_* functions no longer run clinspacy_init() – this should speed up load times
    • 135 |
    136 |
    137 |
    138 |

    139 | clinspacy 0.2.0.9000

    140 |
      141 |
    • Added NA to semantic_types argument for both clinspacy() and clinspacy_single() to prevent tokens from being discarded if they do not match a listed semantic type.
    • 142 |
    • Moved clinspacy_single() logic into lapply() instead of gradually building a list using a for loop, for a boost in speed
    • 143 |
    • Moved progress bar to clinspacy() so that it iterates over documents rather than tokens
    • 144 |
    145 |
    146 |
    147 |

    148 | clinspacy 0.2.0 (2021-02-22)

    149 |
      150 |
    • Changed lifecycle badge to stable
    • 151 |
    152 |
    153 |
    154 |

    155 | clinspacy 1.0.0 (2021-02-23)

    156 |
      157 |
    • Bumped version number to 1.0.0 since it’s ready for CRAN submission
    • 158 |
    159 |
    160 |
    161 |

    162 | clinspacy 1.0.1 (2021-03-08)

    163 |
      164 |
    • Bug fix: removed unnecessary arguments and did some cleanup in preparation for CRAN submission
    • 165 |
    166 |
    167 |
    168 |

    169 | clinspacy 1.0.2 (2021-03-18)

    170 |
      171 |
    • Fixed documentation prior to CRAN submission based on feedback
    • 172 |
    • Bug fix: Specified version numbers for spaCy (2.3.0), scispaCy (0.2.5), and medspaCy (0.1.0.2) to ensure that the versions are compatible with one another
    • 173 |
    • Bug fix: spaCy 2.3.0 must be installed from conda-forge (pip set to FALSE) because the source fails to build properly on Windows even with Visual C++ build tools installed
    • 174 |
    • Update: Switched to using medspaCy instead of its individual components because medspaCy 0.1.0.2 is compatible with spaCy 2.3.0 (an older version was not).
    • 175 |
    • Bug fix: changed section_title to section_category due to updates in medspaCy sectionizer API
    • 176 |
    • Known issue: After first running clinspacy_init() on Windows, sometimes it cannot find numpy. This is a known issue with reticulate (https://github.com/rstudio/reticulate/issues/367). Restarting the R session and re-running clinspacy_init() appears to fix the issue.
    • 177 |
    178 |
    179 |
    180 | 181 | 186 | 187 |
    188 | 189 | 190 |
    191 | 194 | 195 |
    196 |

    Site built with pkgdown 1.6.1.

    197 |
    198 | 199 |
    200 |
    201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | -------------------------------------------------------------------------------- /docs/pkgdown.css: -------------------------------------------------------------------------------- 1 | /* Sticky footer */ 2 | 3 | /** 4 | * Basic idea: https://philipwalton.github.io/solved-by-flexbox/demos/sticky-footer/ 5 | * Details: https://github.com/philipwalton/solved-by-flexbox/blob/master/assets/css/components/site.css 6 | * 7 | * .Site -> body > .container 8 | * .Site-content -> body > .container .row 9 | * .footer -> footer 10 | * 11 | * Key idea seems to be to ensure that .container and __all its parents__ 12 | * have height set to 100% 13 | * 14 | */ 15 | 16 | html, body { 17 | height: 100%; 18 | } 19 | 20 | body { 21 | position: relative; 22 | } 23 | 24 | body > .container { 25 | display: flex; 26 | height: 100%; 27 | flex-direction: column; 28 | } 29 | 30 | body > .container .row { 31 | flex: 1 0 auto; 32 | } 33 | 34 | footer { 35 | margin-top: 45px; 36 | padding: 35px 0 36px; 37 | border-top: 1px solid #e5e5e5; 38 | color: #666; 39 | display: flex; 40 | flex-shrink: 0; 41 | } 42 | footer p { 43 | margin-bottom: 0; 44 | } 45 | footer div { 46 | flex: 1; 47 | } 48 | footer .pkgdown { 49 | text-align: right; 50 | } 51 | footer p { 52 | margin-bottom: 0; 53 | } 54 | 55 | img.icon { 56 | float: right; 57 | } 58 | 59 | img { 60 | max-width: 100%; 61 | } 62 | 63 | /* Fix bug in bootstrap (only seen in firefox) */ 64 | summary { 65 | display: list-item; 66 | } 67 | 68 | /* Typographic tweaking ---------------------------------*/ 69 | 70 | .contents .page-header { 71 | margin-top: calc(-60px + 1em); 72 | } 73 | 74 | dd { 75 | margin-left: 3em; 76 | } 77 | 78 | /* Section anchors ---------------------------------*/ 79 | 80 | a.anchor { 81 | margin-left: -30px; 82 | display:inline-block; 83 | width: 30px; 84 | height: 30px; 85 | visibility: hidden; 86 | 87 | background-image: url(./link.svg); 88 | background-repeat: 
no-repeat; 89 | background-size: 20px 20px; 90 | background-position: center center; 91 | } 92 | 93 | .hasAnchor:hover a.anchor { 94 | visibility: visible; 95 | } 96 | 97 | @media (max-width: 767px) { 98 | .hasAnchor:hover a.anchor { 99 | visibility: hidden; 100 | } 101 | } 102 | 103 | 104 | /* Fixes for fixed navbar --------------------------*/ 105 | 106 | .contents h1, .contents h2, .contents h3, .contents h4 { 107 | padding-top: 60px; 108 | margin-top: -40px; 109 | } 110 | 111 | /* Navbar submenu --------------------------*/ 112 | 113 | .dropdown-submenu { 114 | position: relative; 115 | } 116 | 117 | .dropdown-submenu>.dropdown-menu { 118 | top: 0; 119 | left: 100%; 120 | margin-top: -6px; 121 | margin-left: -1px; 122 | border-radius: 0 6px 6px 6px; 123 | } 124 | 125 | .dropdown-submenu:hover>.dropdown-menu { 126 | display: block; 127 | } 128 | 129 | .dropdown-submenu>a:after { 130 | display: block; 131 | content: " "; 132 | float: right; 133 | width: 0; 134 | height: 0; 135 | border-color: transparent; 136 | border-style: solid; 137 | border-width: 5px 0 5px 5px; 138 | border-left-color: #cccccc; 139 | margin-top: 5px; 140 | margin-right: -10px; 141 | } 142 | 143 | .dropdown-submenu:hover>a:after { 144 | border-left-color: #ffffff; 145 | } 146 | 147 | .dropdown-submenu.pull-left { 148 | float: none; 149 | } 150 | 151 | .dropdown-submenu.pull-left>.dropdown-menu { 152 | left: -100%; 153 | margin-left: 10px; 154 | border-radius: 6px 0 6px 6px; 155 | } 156 | 157 | /* Sidebar --------------------------*/ 158 | 159 | #pkgdown-sidebar { 160 | margin-top: 30px; 161 | position: -webkit-sticky; 162 | position: sticky; 163 | top: 70px; 164 | } 165 | 166 | #pkgdown-sidebar h2 { 167 | font-size: 1.5em; 168 | margin-top: 1em; 169 | } 170 | 171 | #pkgdown-sidebar h2:first-child { 172 | margin-top: 0; 173 | } 174 | 175 | #pkgdown-sidebar .list-unstyled li { 176 | margin-bottom: 0.5em; 177 | } 178 | 179 | /* bootstrap-toc tweaks 
------------------------------------------------------*/ 180 | 181 | /* All levels of nav */ 182 | 183 | nav[data-toggle='toc'] .nav > li > a { 184 | padding: 4px 20px 4px 6px; 185 | font-size: 1.5rem; 186 | font-weight: 400; 187 | color: inherit; 188 | } 189 | 190 | nav[data-toggle='toc'] .nav > li > a:hover, 191 | nav[data-toggle='toc'] .nav > li > a:focus { 192 | padding-left: 5px; 193 | color: inherit; 194 | border-left: 1px solid #878787; 195 | } 196 | 197 | nav[data-toggle='toc'] .nav > .active > a, 198 | nav[data-toggle='toc'] .nav > .active:hover > a, 199 | nav[data-toggle='toc'] .nav > .active:focus > a { 200 | padding-left: 5px; 201 | font-size: 1.5rem; 202 | font-weight: 400; 203 | color: inherit; 204 | border-left: 2px solid #878787; 205 | } 206 | 207 | /* Nav: second level (shown on .active) */ 208 | 209 | nav[data-toggle='toc'] .nav .nav { 210 | display: none; /* Hide by default, but at >768px, show it */ 211 | padding-bottom: 10px; 212 | } 213 | 214 | nav[data-toggle='toc'] .nav .nav > li > a { 215 | padding-left: 16px; 216 | font-size: 1.35rem; 217 | } 218 | 219 | nav[data-toggle='toc'] .nav .nav > li > a:hover, 220 | nav[data-toggle='toc'] .nav .nav > li > a:focus { 221 | padding-left: 15px; 222 | } 223 | 224 | nav[data-toggle='toc'] .nav .nav > .active > a, 225 | nav[data-toggle='toc'] .nav .nav > .active:hover > a, 226 | nav[data-toggle='toc'] .nav .nav > .active:focus > a { 227 | padding-left: 15px; 228 | font-weight: 500; 229 | font-size: 1.35rem; 230 | } 231 | 232 | /* orcid ------------------------------------------------------------------- */ 233 | 234 | .orcid { 235 | font-size: 16px; 236 | color: #A6CE39; 237 | /* margins are required by official ORCID trademark and display guidelines */ 238 | margin-left:4px; 239 | margin-right:4px; 240 | vertical-align: middle; 241 | } 242 | 243 | /* Reference index & topics ----------------------------------------------- */ 244 | 245 | .ref-index th {font-weight: normal;} 246 | 247 | .ref-index td 
{vertical-align: top; min-width: 100px} 248 | .ref-index .icon {width: 40px;} 249 | .ref-index .alias {width: 40%;} 250 | .ref-index-icons .alias {width: calc(40% - 40px);} 251 | .ref-index .title {width: 60%;} 252 | 253 | .ref-arguments th {text-align: right; padding-right: 10px;} 254 | .ref-arguments th, .ref-arguments td {vertical-align: top; min-width: 100px} 255 | .ref-arguments .name {width: 20%;} 256 | .ref-arguments .desc {width: 80%;} 257 | 258 | /* Nice scrolling for wide elements --------------------------------------- */ 259 | 260 | table { 261 | display: block; 262 | overflow: auto; 263 | } 264 | 265 | /* Syntax highlighting ---------------------------------------------------- */ 266 | 267 | pre { 268 | word-wrap: normal; 269 | word-break: normal; 270 | border: 1px solid #eee; 271 | } 272 | 273 | pre, code { 274 | background-color: #f8f8f8; 275 | color: #333; 276 | } 277 | 278 | pre code { 279 | overflow: auto; 280 | word-wrap: normal; 281 | white-space: pre; 282 | } 283 | 284 | pre .img { 285 | margin: 5px 0; 286 | } 287 | 288 | pre .img img { 289 | background-color: #fff; 290 | display: block; 291 | height: auto; 292 | } 293 | 294 | code a, pre a { 295 | color: #375f84; 296 | } 297 | 298 | a.sourceLine:hover { 299 | text-decoration: none; 300 | } 301 | 302 | .fl {color: #1514b5;} 303 | .fu {color: #000000;} /* function */ 304 | .ch,.st {color: #036a07;} /* string */ 305 | .kw {color: #264D66;} /* keyword */ 306 | .co {color: #888888;} /* comment */ 307 | 308 | .message { color: black; font-weight: bolder;} 309 | .error { color: orange; font-weight: bolder;} 310 | .warning { color: #6A0366; font-weight: bolder;} 311 | 312 | /* Clipboard --------------------------*/ 313 | 314 | .hasCopyButton { 315 | position: relative; 316 | } 317 | 318 | .btn-copy-ex { 319 | position: absolute; 320 | right: 0; 321 | top: 0; 322 | visibility: hidden; 323 | } 324 | 325 | .hasCopyButton:hover button.btn-copy-ex { 326 | visibility: visible; 327 | } 328 | 329 | /* 
headroom.js ------------------------ */ 330 | 331 | .headroom { 332 | will-change: transform; 333 | transition: transform 200ms linear; 334 | } 335 | .headroom--pinned { 336 | transform: translateY(0%); 337 | } 338 | .headroom--unpinned { 339 | transform: translateY(-100%); 340 | } 341 | 342 | /* mark.js ----------------------------*/ 343 | 344 | mark { 345 | background-color: rgba(255, 255, 51, 0.5); 346 | border-bottom: 2px solid rgba(255, 153, 51, 0.3); 347 | padding: 1px; 348 | } 349 | 350 | /* vertical spacing after htmlwidgets */ 351 | .html-widget { 352 | margin-bottom: 10px; 353 | } 354 | 355 | /* fontawesome ------------------------ */ 356 | 357 | .fab { 358 | font-family: "Font Awesome 5 Brands" !important; 359 | } 360 | 361 | /* don't display links in code chunks when printing */ 362 | /* source: https://stackoverflow.com/a/10781533 */ 363 | @media print { 364 | code a:link:after, code a:visited:after { 365 | content: ""; 366 | } 367 | } 368 | -------------------------------------------------------------------------------- /docs/pkgdown.js: -------------------------------------------------------------------------------- 1 | /* http://gregfranko.com/blog/jquery-best-practices/ */ 2 | (function($) { 3 | $(function() { 4 | 5 | $('.navbar-fixed-top').headroom(); 6 | 7 | $('body').css('padding-top', $('.navbar').height() + 10); 8 | $(window).resize(function(){ 9 | $('body').css('padding-top', $('.navbar').height() + 10); 10 | }); 11 | 12 | $('[data-toggle="tooltip"]').tooltip(); 13 | 14 | var cur_path = paths(location.pathname); 15 | var links = $("#navbar ul li a"); 16 | var max_length = -1; 17 | var pos = -1; 18 | for (var i = 0; i < links.length; i++) { 19 | if (links[i].getAttribute("href") === "#") 20 | continue; 21 | // Ignore external links 22 | if (links[i].host !== location.host) 23 | continue; 24 | 25 | var nav_path = paths(links[i].pathname); 26 | 27 | var length = prefix_length(nav_path, cur_path); 28 | if (length > max_length) { 29 | max_length = 
length; 30 | pos = i; 31 | } 32 | } 33 | 34 | // Add class to parent
  • , and enclosing
  • if in dropdown 35 | if (pos >= 0) { 36 | var menu_anchor = $(links[pos]); 37 | menu_anchor.parent().addClass("active"); 38 | menu_anchor.closest("li.dropdown").addClass("active"); 39 | } 40 | }); 41 | 42 | function paths(pathname) { 43 | var pieces = pathname.split("/"); 44 | pieces.shift(); // always starts with / 45 | 46 | var end = pieces[pieces.length - 1]; 47 | if (end === "index.html" || end === "") 48 | pieces.pop(); 49 | return(pieces); 50 | } 51 | 52 | // Returns -1 if not found 53 | function prefix_length(needle, haystack) { 54 | if (needle.length > haystack.length) 55 | return(-1); 56 | 57 | // Special case for length-0 haystack, since for loop won't run 58 | if (haystack.length === 0) { 59 | return(needle.length === 0 ? 0 : -1); 60 | } 61 | 62 | for (var i = 0; i < haystack.length; i++) { 63 | if (needle[i] != haystack[i]) 64 | return(i); 65 | } 66 | 67 | return(haystack.length); 68 | } 69 | 70 | /* Clipboard --------------------------*/ 71 | 72 | function changeTooltipMessage(element, msg) { 73 | var tooltipOriginalTitle=element.getAttribute('data-original-title'); 74 | element.setAttribute('data-original-title', msg); 75 | $(element).tooltip('show'); 76 | element.setAttribute('data-original-title', tooltipOriginalTitle); 77 | } 78 | 79 | if(ClipboardJS.isSupported()) { 80 | $(document).ready(function() { 81 | var copyButton = ""; 82 | 83 | $(".examples, div.sourceCode").addClass("hasCopyButton"); 84 | 85 | // Insert copy buttons: 86 | $(copyButton).prependTo(".hasCopyButton"); 87 | 88 | // Initialize tooltips: 89 | $('.btn-copy-ex').tooltip({container: 'body'}); 90 | 91 | // Initialize clipboard: 92 | var clipboardBtnCopies = new ClipboardJS('[data-clipboard-copy]', { 93 | text: function(trigger) { 94 | return trigger.parentNode.textContent; 95 | } 96 | }); 97 | 98 | clipboardBtnCopies.on('success', function(e) { 99 | changeTooltipMessage(e.trigger, 'Copied!'); 100 | e.clearSelection(); 101 | }); 102 | 103 | clipboardBtnCopies.on('error', 
function() { 104 | changeTooltipMessage(e.trigger,'Press Ctrl+C or Command+C to copy'); 105 | }); 106 | }); 107 | } 108 | })(window.jQuery || window.$) 109 | -------------------------------------------------------------------------------- /docs/pkgdown.yml: -------------------------------------------------------------------------------- 1 | pandoc: '2.6' 2 | pkgdown: 1.6.1 3 | pkgdown_sha: ~ 4 | articles: 5 | using_embeddings_for_machine_learning_on_clinical_text: using_embeddings_for_machine_learning_on_clinical_text.html 6 | last_built: 2021-03-18T15:55Z 7 | 8 | -------------------------------------------------------------------------------- /docs/reference/Rplot001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kdpsingh/clinspacy/2efc73a3dbd152cb57aa65e263ae37310e068c61/docs/reference/Rplot001.png -------------------------------------------------------------------------------- /docs/reference/bind_clinspacy.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | This function binds columns containing either the lemma of the entity or the 10 | UMLS concept unique identifier (CUI) with frequencies to a data frame. The 11 | resulting data frame can be used to train a machine learning model or for 12 | additional feature selection. — bind_clinspacy • clinspacy 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 49 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 65 | 66 | 67 | 68 | 69 | 70 | 71 |
    72 |
    73 | 127 | 128 | 129 | 130 |
    131 | 132 |
    133 |
    134 | 142 | 143 |
    144 |

    This function binds columns containing either the lemma of the entity or the 145 | UMLS concept unique identifier (CUI) with frequencies to a data frame. The 146 | resulting data frame can be used to train a machine learning model or for 147 | additional feature selection.

    148 |
    149 | 150 |
    bind_clinspacy(
    151 |   clinspacy_output,
    152 |   df,
    153 |   cs_col = NULL,
    154 |   df_id = NULL,
    155 |   subset = "is_negated == FALSE"
    156 | )
    157 | 158 |

    Arguments

    159 | 160 | 161 | 162 | 163 | 165 | 166 | 167 | 168 | 170 | 171 | 172 | 173 | 178 | 179 | 180 | 181 | 187 | 188 | 189 | 190 | 196 | 197 |
    clinspacy_output

    A data.frame or file name containing the output from 164 | clinspacy.

    df

    The data.frame to which you would like to bind the output of 169 | clinspacy.

    cs_col

    Name of the column in the clinspacy_output that you 174 | would like to pivot. For example: "entity", "lemma", 175 | "cui", or "definition". Defaults to "lemma" if 176 | use_linker is set to FALSE and "cui" if 177 | use_linker is set to TRUE.

    df_id

    The name of the id column in the data frame with which 182 | the clinspacy_id column in clinspacy_output will be joined. 183 | If you supplied a df_id in clinspacy, then you must 184 | also supply it here. If you did not supply it in clinspacy, 185 | then it will default to the row number (similar to the behavior in 186 | clinspacy).

    subset

    Logical criteria represented as a string by which the 191 | clinspacy_output will be subsetted prior to building the output data 192 | frame. Defaults to "is_negated == FALSE", which removes negated 193 | concepts prior to generating the output. Any column in 194 | clinspacy_output may be referenced here. To avoid any subsetting, 195 | set this to NULL.

    198 | 199 |

    Value

    200 | 201 |

    A data frame containing the original data frame as well as additional 202 | column names for each lemma or UMLS concept unique identifier found with 203 | values containing frequencies.

    204 | 205 |

    Examples

    206 |
    if (FALSE) { 207 | mtsamples <- dataset_mtsamples() 208 | mtsamples[1:5,] %>% 209 | clinspacy(df_col = 'description') %>% 210 | bind_clinspacy(mtsamples[1:5,]) 211 | } 212 |
    213 |
    214 | 219 |
    220 | 221 | 222 |
    223 | 226 | 227 |
    228 |

    Site built with pkgdown 1.6.1.

    229 |
    230 | 231 |
    232 |
    233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | -------------------------------------------------------------------------------- /docs/reference/clinspacy_init.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Initializes clinspacy. This function is optional to run but gives you more 10 | control over the parameters used by scispacy at initiation. If you do not run 11 | this function, it will be run with default parameters the first time that any 12 | of the package functions are run. — clinspacy_init • clinspacy 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 49 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 65 | 66 | 67 | 68 | 69 | 70 | 71 |
    72 |
    73 | 127 | 128 | 129 | 130 |
    131 | 132 |
    133 |
    134 | 142 | 143 |
    144 |

    Initializes clinspacy. This function is optional to run but gives you more 145 | control over the parameters used by scispacy at initiation. If you do not run 146 | this function, it will be run with default parameters the first time that any 147 | of the package functions are run.

    148 |
    149 | 150 |
    clinspacy_init(
    151 |   miniconda = TRUE,
    152 |   use_linker = FALSE,
    153 |   linker_threshold = 0.99,
    154 |   ...
    155 | )
    156 | 157 |

    Arguments

    158 | 159 | 160 | 161 | 162 | 166 | 167 | 168 | 169 | 171 | 172 | 173 | 174 | 179 | 180 | 181 | 182 | 184 | 185 |
    miniconda

    Defaults to TRUE, which results in miniconda being installed 163 | (~400 MB) and configured with the "clinspacy" conda environment. If you 164 | want to override this behavior, set miniconda to FALSE and 165 | specify an alternative environment using use_python() or use_conda().

    use_linker

    Defaults to FALSE. To turn on the UMLS linker, set 170 | this to TRUE.

    linker_threshold

    Defaults to 0.99. This argument is only relevant if 175 | use_linker is set to TRUE. It refers to the confidence 176 | threshold value used by the scispacy UMLS entity linker. Note: This can be 177 | lower than the threshold from clinspacy(). The 178 | linker_threshold can only be set once per session.

    ...

    Additional settings available from: 183 | https://github.com/allenai/scispacy.

    186 | 187 |

    Value

    188 | 189 |

    No return value.

    190 | 191 |
    192 | 197 |
    198 | 199 | 200 |
    201 | 204 | 205 |
    206 |

    Site built with pkgdown 1.6.1.

    207 |
    208 | 209 |
    210 |
    211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | -------------------------------------------------------------------------------- /docs/reference/cui2vec_definitions.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Cui2vec concept definitions — cui2vec_definitions • clinspacy 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 58 | 59 | 60 | 61 | 62 | 63 | 64 |
    65 |
    66 | 117 | 118 | 119 | 120 |
    121 | 122 |
    123 |
    124 | 129 | 130 |
    131 |

    This dataset contains definitions for the Unified Medical Language System (UMLS) 132 | Concept Unique Identifiers (CUIs). These come from Andrew Beam's 133 | cui2vec R package.

    134 |
    135 | 136 |
    cui2vec_definitions
    137 | 138 | 139 |

    Format

    140 | 141 |

    A data frame with 3053795 rows and 3 variables:

    142 |
    cui

    A Unified Medical Language System (UMLS) Concept Unique Identifier (CUI)

    143 |
    semantic_type

    Semantic type of the CUI

    144 |
    definition

    Definition of the CUI

    145 | 146 | 147 | 148 |

    Source

    149 | 150 |

    https://github.com/beamandrew/cui2vec

    151 |

    Details

    152 | 153 |

    License

    154 |

    This data is made available under a 155 | MIT license. The data 156 | is copyrighted in 2019 by Benjamin Kompa, Andrew Beam, and Allen Schmaltz. The only change 157 | made to the original dataset is the renaming of columns.

    158 | 159 |
    160 | 165 |
    166 | 167 | 168 |
    169 | 172 | 173 |
    174 |

    Site built with pkgdown 1.5.1.

    175 |
    176 | 177 |
    178 |
    179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | -------------------------------------------------------------------------------- /docs/reference/cui2vec_embeddings.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Cui2vec concept embeddings — cui2vec_embeddings • clinspacy 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 58 | 59 | 60 | 61 | 62 | 63 | 64 |
    65 |
    66 | 117 | 118 | 119 | 120 |
    121 | 122 |
    123 |
    124 | 129 | 130 |
    131 |

    This dataset contains Unified Medical Language System (UMLS) concept embeddings from 132 | Andrew Beam's cui2vec R package. There are 133 | 500 embeddings included for each concept.

    134 |
    135 | 136 |
    cui2vec_embeddings
    137 | 138 | 139 |

    Format

    140 | 141 |

    A data frame with 109053 rows and 501 variables:

    142 |
    cui

    A Unified Medical Language System (UMLS) Concept Unique Identifier (CUI)

    143 |
    emb_001

    Concept embedding vector #1

    144 |
    emb_002

    Concept embedding vector #2

    145 |
    ...

    and so on...

    146 |
    emb_500

    Concept embedding vector #500

    147 | 148 | 149 | 150 |

    Source

    151 | 152 |

    https://figshare.com/s/00d69861786cd0156d81

    153 |

    Details

    154 | 155 |

    This dataset is not viewable until it has been downloaded, which will occur 156 | the very first time you run clinspacy_init() after installing this 157 | package.

    158 |

    Citation

    159 |

    Beam, A.L., Kompa, B., Schmaltz, A., Fried, I., Griffin, W., Palmer, N.P., Shi, X., 160 | Cai, T., and Kohane, I.S., 2019. Clinical Concept Embeddings Learned from Massive 161 | Sources of Multimodal Medical Data. arXiv preprint arXiv:1804.01486.

    162 |

    License

    163 |

    This data is made available under a 164 | CC BY 4.0 license. The only change 165 | made to the original dataset is the renaming of columns.

    166 | 167 |
    168 | 173 |
    174 | 175 | 176 |
    177 | 180 | 181 |
    182 |

    Site built with pkgdown 1.5.1.

    183 |
    184 | 185 |
    186 |
    187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | -------------------------------------------------------------------------------- /docs/reference/dataset_cui2vec_definitions.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Cui2vec concept definitions — dataset_cui2vec_definitions • clinspacy 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 58 | 59 | 60 | 61 | 62 | 63 | 64 |
    65 |
    66 | 120 | 121 | 122 | 123 |
    124 | 125 |
    126 |
    127 | 132 | 133 |
    134 |

    This dataset contains definitions for the Unified Medical Language System 135 | (UMLS) Concept Unique Identifiers (CUIs). These come from Andrew Beam's 136 | cui2vec R package.

    137 |
    138 | 139 |
    dataset_cui2vec_definitions()
    140 | 141 | 142 |

    Format

    143 | 144 |

    A data frame with 3053795 rows and 3 variables:

    145 |
    cui

    A Unified Medical Language System (UMLS) Concept Unique 146 | Identifier (CUI)

    semantic_type

    Semantic type of the CUI

    147 |
    definition

    Definition of the CUI

    148 |
    149 | 150 |

    Source

    151 | 152 |

    https://github.com/beamandrew/cui2vec

    153 |

    Value

    154 | 155 |

    Returns the cui2vec UMLS definitions as a data frame.

    156 |

    Details

    157 | 158 |

    License

    159 |

    This data is made available under a 160 | MIT 161 | license. The data is copyrighted in 2019 by Benjamin Kompa, Andrew Beam, and 162 | Allen Schmaltz. The only change made to the original dataset is the renaming 163 | of columns.

    164 | 165 |
    166 | 171 |
    172 | 173 | 174 |
    175 | 178 | 179 |
    180 |

    Site built with pkgdown 1.6.1.

    181 |
    182 | 183 |
    184 |
    185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | -------------------------------------------------------------------------------- /docs/reference/dataset_cui2vec_embeddings.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Cui2vec concept embeddings — dataset_cui2vec_embeddings • clinspacy 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 59 | 60 | 61 | 62 | 63 | 64 | 65 |
    66 |
    67 | 121 | 122 | 123 | 124 |
    125 | 126 |
    127 |
    128 | 133 | 134 |
    135 |

    This dataset contains Unified Medical Language System (UMLS) concept 136 | embeddings from Andrew Beam's 137 | cui2vec R package. There are 138 | 500 embeddings included for each concept.

    139 |
    140 | 141 |
    dataset_cui2vec_embeddings()
    142 | 143 | 144 |

    Format

    145 | 146 |

    A data frame with 109053 rows and 501 variables:

    147 |
    cui

    A Unified Medical Language System (UMLS) Concept Unique 148 | Identifier (CUI)

    emb_001

    Concept embedding vector #1

    149 |
    emb_002

    Concept embedding vector #2

    ...

    and so on...

    150 |
    emb_500

    Concept embedding vector #500

    151 |
    152 | 153 |

    Source

    154 | 155 |

    https://figshare.com/s/00d69861786cd0156d81

    156 |

    Value

    157 | 158 |

    Returns the cui2vec UMLS embeddings as a data frame.

    159 |

    Details

    160 | 161 |

    This dataset is not viewable until it has been downloaded, which will occur 162 | the very first time you run clinspacy_init() after installing this 163 | package.

    164 |

    Citation

    165 |

    Beam, A.L., Kompa, B., Schmaltz, A., Fried, I., Griffin, W., Palmer, N.P., 166 | Shi, X., Cai, T., and Kohane, I.S., 2019. Clinical Concept Embeddings 167 | Learned from Massive Sources of Multimodal Medical Data. arXiv preprint 168 | arXiv:1804.01486.

    169 |

    License

    170 |

    This data is made available under a 171 | CC BY 4.0 license. The 172 | only change made to the original dataset is the renaming of columns.

    173 | 174 |
    175 | 180 |
    181 | 182 | 183 |
    184 | 187 | 188 |
    189 |

    Site built with pkgdown 1.6.1.

    190 |
    191 | 192 |
    193 |
    194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | -------------------------------------------------------------------------------- /docs/reference/dataset_mtsamples.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Medical transcription samples. — dataset_mtsamples • clinspacy 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 57 | 58 | 59 | 60 | 61 | 62 | 63 |
    64 |
    65 | 119 | 120 | 121 | 122 |
    123 | 124 |
    125 |
    126 | 131 | 132 |
    133 |

    This dataset contains sample medical transcriptions for various medical 134 | specialties.

    135 |
    136 | 137 |
    dataset_mtsamples()
    138 | 139 | 140 |

    Format

    141 | 142 |

    A data frame with 4999 rows and 6 variables:

    143 |
    note_id

    A unique identifier for each note

    description

    A 144 | description or chief concern

    medical_specialty

    Medical specialty of 145 | the note

    sample_name

    mtsamples.com note name

    146 |
    transcription

    Transcription of note text

    keywords

    Keywords

    147 | 148 |
    149 | 150 |

    Source

    151 | 152 |

    https://www.kaggle.com/tboyle10/medicaltranscriptions/data

    153 |

    Value

    154 | 155 |

    Returns the mtsamples dataset as a data frame.

    156 |

    Details

    157 | 158 |

    Acknowledgements

    159 |

    This data was scraped from 160 | https://mtsamples.com by Tara Boyle.

    161 |

    License This data is made available under a 162 | CC0: 163 | Public Domain license.

    164 | 165 |
    166 | 171 |
    172 | 173 | 174 |
    175 | 178 | 179 |
    180 |

    Site built with pkgdown 1.6.1.

    181 |
    182 | 183 |
    184 |
    185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | -------------------------------------------------------------------------------- /docs/reference/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Function reference • clinspacy 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 55 | 56 | 57 | 58 | 59 | 60 | 61 |
    62 |
    63 | 117 | 118 | 119 | 120 |
    121 | 122 |
    123 |
    124 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 153 | 155 | 156 | 157 | 160 | 164 | 165 | 166 | 169 | 173 | 174 | 175 | 178 | 182 | 183 | 184 | 187 | 188 | 189 | 190 | 193 | 194 | 195 | 196 | 199 | 200 | 201 | 202 |
    139 |

    All functions

    140 |

    141 |
    151 |

    clinspacy()

    152 |

    This is the primary function for processing both data frames and character 154 | vectors in the clinspacy package.

    158 |

    bind_clinspacy()

    159 |

    This function binds columns containing either the lemma of the entity or the 161 | UMLS concept unique identifier (CUI) with frequencies to a data frame. The 162 | resulting data frame can be used to train a machine learning model or for 163 | additional feature selection.

    167 |

    clinspacy_init()

    168 |

    Initializes clinspacy. This function is optional to run but gives you more 170 | control over the parameters used by scispacy at initiation. If you do not run 171 | this function, it will be run with default parameters the first time that any 172 | of the package functions are run.

    176 |

    bind_clinspacy_embeddings()

    177 |

    This function binds columns containing entity or concept embeddings to a data 179 | frame. The entity embeddings are derived from the scispacy package, and the 180 | concept embeddings are derived from the 181 | dataset_cui2vec_embeddings dataset included with this package.

    185 |

    dataset_mtsamples()

    186 |

    Medical transcription samples.

    191 |

    dataset_cui2vec_definitions()

    192 |

    Cui2vec concept definitions

    197 |

    dataset_cui2vec_embeddings()

    198 |

    Cui2vec concept embeddings

    203 |
    204 | 205 | 210 |
    211 | 212 | 213 |
    214 | 217 | 218 |
    219 |

    Site built with pkgdown 1.6.1.

    220 |
    221 | 222 |
    223 |
    224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | -------------------------------------------------------------------------------- /docs/reference/mtsamples.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Medical transcription samples. — mtsamples • clinspacy 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 56 | 57 | 58 | 59 | 60 | 61 | 62 |
    63 |
    64 | 115 | 116 | 117 | 118 |
    119 | 120 |
    121 |
    122 | 127 | 128 |
    129 |

    This dataset contains sample medical transcriptions for various medical specialties.

    130 |
    131 | 132 |
    mtsamples
    133 | 134 | 135 |

    Format

    136 | 137 |

    A data frame with 4999 rows and 6 variables:

    138 |
    note_id

    A unique identifier for each note

    139 |
    description

    A description or chief concern

    140 |
    medical_specialty

    Medical specialty of the note

    141 |
    sample_name

    mtsamples.com note name

    142 |
    transcription

    Transcription of note text

    143 |
    keywords

    Keywords

    144 | 145 | 146 | 147 |

    Source

    148 | 149 |

    https://www.kaggle.com/tboyle10/medicaltranscriptions/data

    150 |

    Details

    151 | 152 |

    Acknowledgements

    153 |

    This data was scraped from https://mtsamples.com by Tara Boyle.

    154 |

    License 155 | This data is made available under a 156 | CC0: Public Domain license.

    157 | 158 |
    159 | 164 |
    165 | 166 | 167 |
    168 | 171 | 172 |
    173 |

    Site built with pkgdown 1.5.1.

    174 |
    175 | 176 |
    177 |
    178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | -------------------------------------------------------------------------------- /docs/reference/pipe.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Pipe operator — %>% • clinspacy 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 56 | 57 | 58 | 59 | 60 | 61 | 62 |
    63 |
    64 | 118 | 119 | 120 | 121 |
    122 | 123 |
    124 |
    125 | 130 | 131 |
    132 |

    See magrittr::%>% for details.

    133 |
    134 | 135 |
    lhs %>% rhs
    136 | 137 |

    Arguments

    138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 |
    lhs

    A value or the magrittr placeholder.

    rhs

    A function call using the magrittr semantics.

    149 | 150 |

    Value

    151 | 152 |

    Returns rhs(lhs).

    153 | 154 |
    155 | 160 |
    161 | 162 | 163 | 173 |
    174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | -------------------------------------------------------------------------------- /man/bind_clinspacy.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/clinspacy.R 3 | \name{bind_clinspacy} 4 | \alias{bind_clinspacy} 5 | \title{This function binds columns containing either the lemma of the entity or the 6 | UMLS concept unique identifier (CUI) with frequencies to a data frame. The 7 | resulting data frame can be used to train a machine learning model or for 8 | additional feature selection.} 9 | \usage{ 10 | bind_clinspacy( 11 | clinspacy_output, 12 | df, 13 | cs_col = NULL, 14 | df_id = NULL, 15 | subset = "is_negated == FALSE" 16 | ) 17 | } 18 | \arguments{ 19 | \item{clinspacy_output}{A data.frame or file name containing the output from 20 | \code{\link{clinspacy}}.} 21 | 22 | \item{df}{The data.frame to which you would like to bind the output of 23 | \code{\link{clinspacy}}.} 24 | 25 | \item{cs_col}{Name of the column in the \code{clinspacy_output} that you 26 | would like to pivot. For example: \code{"entity"}, \code{"lemma"}, 27 | \code{"cui"}, or \code{"definition"}. Defaults to \code{"lemma"} if 28 | \code{use_linker} is set to \code{FALSE} and \code{"cui"} if 29 | \code{use_linker} is set to \code{TRUE}.} 30 | 31 | \item{df_id}{The name of the \code{id} column in the data frame with which 32 | the \code{clinspacy_id} column in \code{clinspacy_output} will be joined. 33 | If you supplied a \code{df_id} in \code{\link{clinspacy}}, then you must 34 | also supply it here. 
If you did not supply it in \code{\link{clinspacy}}, 35 | then it will default to the row number (similar to the behavior in 36 | \code{\link{clinspacy}}).} 37 | 38 | \item{subset}{Logical criteria represented as a string by which the 39 | \code{clinspacy_output} will be subsetted prior to building the output data 40 | frame. Defaults to \code{"is_negated == FALSE"}, which removes negated 41 | concepts prior to generating the output. Any column in 42 | \code{clinspacy_output} may be referenced here. To avoid any subsetting, 43 | set this to \code{NULL}.} 44 | } 45 | \value{ 46 | A data frame containing the original data frame as well as additional 47 | column names for each lemma or UMLS concept unique identifier found with 48 | values containing frequencies. 49 | } 50 | \description{ 51 | This function binds columns containing either the lemma of the entity or the 52 | UMLS concept unique identifier (CUI) with frequencies to a data frame. The 53 | resulting data frame can be used to train a machine learning model or for 54 | additional feature selection. 55 | } 56 | \examples{ 57 | \dontrun{ 58 | mtsamples <- dataset_mtsamples() 59 | mtsamples[1:5,] \%>\% 60 | clinspacy(df_col = 'description') \%>\% 61 | bind_clinspacy(mtsamples[1:5,]) 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /man/bind_clinspacy_embeddings.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/clinspacy.R 3 | \name{bind_clinspacy_embeddings} 4 | \alias{bind_clinspacy_embeddings} 5 | \title{This function binds columns containing entity or concept embeddings to a data 6 | frame.
The entity embeddings are derived from the scispacy package, and the 7 | concept embeddings are derived from the 8 | \code{\link{dataset_cui2vec_embeddings}} dataset included with this package.} 9 | \usage{ 10 | bind_clinspacy_embeddings( 11 | clinspacy_output, 12 | df, 13 | type = "scispacy", 14 | df_id = NULL, 15 | subset = "is_negated == FALSE" 16 | ) 17 | } 18 | \arguments{ 19 | \item{clinspacy_output}{A data.frame or file name containing the output from 20 | \code{\link{clinspacy}}. In order for scispacy embeddings to be available 21 | to \code{\link{bind_clinspacy_embeddings}}, you must set 22 | \code{return_scispacy_embeddings} to \code{TRUE} when running 23 | \code{\link{clinspacy}} so that the embeddings are included within 24 | \code{clinspacy_output}.} 25 | 26 | \item{df}{The data.frame to which you would like to bind the output of 27 | \code{\link{clinspacy}}.} 28 | 29 | \item{type}{The type of embeddings to return. One of \code{scispacy} and 30 | \code{cui2vec}. Whereas \code{cui2vec} embeddings require the UMLS linker 31 | to be enabled, the \code{scispacy} embeddings do not. Defaults to 32 | \code{scispacy}.} 33 | 34 | \item{df_id}{The name of the \code{id} column in the data frame with which 35 | the \code{id} column in \code{clinspacy_output} will be joined. If you 36 | supplied a \code{df_id} in \code{\link{clinspacy}}, then you must also 37 | supply it here. If you did not supply it in \code{\link{clinspacy}}, then 38 | it will default to the row number (similar behavior to in 39 | \code{\link{clinspacy}}).} 40 | 41 | \item{subset}{Logical criteria represented as a string by which the 42 | \code{clinspacy_output} will be subsetted prior to building the output data 43 | frame. Defaults to \code{"is_negated == FALSE"}, which removes negated 44 | concepts prior to generating the output. Any column in 45 | \code{clinspacy_output} may be referenced here. 
To avoid any subsetting, 46 | set this to \code{NULL}.} 47 | } 48 | \value{ 49 | A data frame containing the original data frame as well as the 50 | concept embeddings. For scispacy embeddings, this returns 200 columns of 51 | embeddings. For cui2vec embeddings, this returns 500 columns of embeddings. 52 | The resulting data frame can be used to train a machine learning model. 53 | } 54 | \description{ 55 | The embeddings are derived from Andrew Beam's 56 | \href{https://github.com/beamandrew/cui2vec}{cui2vec R package}. 57 | } 58 | \details{ 59 | Citation 60 | 61 | Beam, A.L., Kompa, B., Schmaltz, A., Fried, I., Griffin, W., Palmer, N.P., 62 | Shi, X., Cai, T., and Kohane, I.S., 2019. Clinical Concept Embeddings 63 | Learned from Massive Sources of Multimodal Medical Data. arXiv preprint 64 | arXiv:1804.01486. 65 | 66 | License 67 | 68 | The cui2vec data is made available under a 69 | \href{https://creativecommons.org/licenses/by/4.0/}{CC BY 4.0 license}. The 70 | only change made to the original dataset is the renaming of columns.
71 | } 72 | \examples{ 73 | \dontrun{ 74 | mtsamples <- dataset_mtsamples() 75 | mtsamples[1:5,] \%>\% 76 | clinspacy(df_col = 'description', return_scispacy_embeddings = TRUE) \%>\% 77 | bind_clinspacy_embeddings(mtsamples[1:5,]) 78 | } 79 | 80 | } 81 | -------------------------------------------------------------------------------- /man/clinspacy.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/clinspacy.R 3 | \name{clinspacy} 4 | \alias{clinspacy} 5 | \title{This is the primary function for processing both data frames and character 6 | vectors in the \code{clinspacy} package.} 7 | \usage{ 8 | clinspacy( 9 | x, 10 | df_col = NULL, 11 | df_id = NULL, 12 | threshold = 0.99, 13 | semantic_types = c(NA, "Acquired Abnormality", "Activity", "Age Group", 14 | "Amino Acid Sequence", "Amino Acid, Peptide, or Protein", "Amphibian", 15 | "Anatomical Abnormality", "Anatomical Structure", "Animal", "Antibiotic", "Archaeon", 16 | "Bacterium", "Behavior", "Biologic Function", "Biologically Active Substance", 17 | "Biomedical Occupation or Discipline", "Biomedical or Dental Material", "Bird", 18 | "Body Location or Region", "Body Part, Organ, or Organ Component", 19 | "Body Space or Junction", "Body Substance", "Body System", "Carbohydrate Sequence", 20 | "Cell", "Cell Component", "Cell Function", "Cell or Molecular Dysfunction", 21 | "Chemical", "Chemical Viewed Functionally", "Chemical Viewed Structurally", 22 | "Classification", "Clinical Attribute", "Clinical Drug", "Conceptual Entity", 23 | "Congenital Abnormality", "Daily or Recreational Activity", "Diagnostic Procedure", 24 | "Disease or Syndrome", "Drug Delivery Device", "Educational Activity", 25 | "Element, Ion, or Isotope", "Embryonic Structure", "Entity", 26 | "Environmental Effect of Humans", "Enzyme", "Eukaryote", "Event", 27 | "Experimental Model of Disease", "Family Group", "Finding", "Fish", 
"Food", 28 | "Fully Formed Anatomical Structure", "Functional Concept", "Fungus", 29 | "Gene or Genome", "Genetic Function", "Geographic Area", 30 | "Governmental or Regulatory Activity", "Group", "Group Attribute", 31 | "Hazardous or Poisonous Substance", "Health Care Activity", 32 | "Health Care Related Organization", "Hormone", "Human", 33 | "Human-caused Phenomenon or Process", "Idea or Concept", "Immunologic Factor", 34 | "Indicator, Reagent, or Diagnostic Aid", "Individual Behavior", 35 | "Injury or Poisoning", "Inorganic Chemical", "Intellectual Product", 36 | "Laboratory or Test Result", "Laboratory Procedure", "Language", "Machine Activity", 37 | "Mammal", "Manufactured Object", "Medical Device", 38 | "Mental or Behavioral Dysfunction", "Mental Process", 39 | "Molecular Biology Research Technique", "Molecular Function", "Molecular Sequence", 40 | "Natural Phenomenon or Process", "Neoplastic Process", 41 | "Nucleic Acid, Nucleoside, or Nucleotide", "Nucleotide Sequence", 42 | "Occupation or Discipline", "Occupational Activity", "Organ or Tissue Function", 43 | "Organic Chemical", "Organism", "Organism Attribute", "Organism Function", 44 | "Organization", "Pathologic Function", "Patient or Disabled Group", 45 | "Pharmacologic Substance", "Phenomenon or Process", "Physical Object", 46 | "Physiologic Function", "Plant", "Population Group", 47 | "Professional or Occupational Group", "Professional Society", "Qualitative Concept", 48 | "Quantitative Concept", "Receptor", "Regulation or Law", "Reptile", 49 | "Research Activity", "Research Device", "Self-help or Relief Organization", 50 | "Sign or Symptom", "Social Behavior", "Spatial Concept", "Substance", 51 | "Temporal Concept", "Therapeutic or Preventive Procedure", "Tissue", "Vertebrate", 52 | "Virus", "Vitamin"), 53 | return_scispacy_embeddings = FALSE, 54 | verbose = TRUE, 55 | output_file = NULL, 56 | overwrite = FALSE 57 | ) 58 | } 59 | \arguments{ 60 | \item{x}{Either a data.frame or a character vector} 
61 | 62 | \item{df_col}{If \code{x} is a data.frame then you must specify the name of 63 | the column containing text as a string.} 64 | 65 | \item{df_id}{If \code{x} is a data.frame then you may *optionally* specify an 66 | \code{id} column to help match up each row of text in the original data 67 | frame with the resulting output. If you do not specify an id, the resulting 68 | output will contain the row number from the original data.frame.} 69 | 70 | \item{threshold}{Defaults to 0.99. The confidence threshold value used by 71 | clinspacy (can be higher than the \code{linker_threshold} from 72 | \code{\link{clinspacy_init}}). Note that whereas the linker_threshold can 73 | only be set once per session, this threshold can be updated during the R 74 | session.} 75 | 76 | \item{semantic_types}{Character vector containing any combination of the 77 | following: c(NA, "Acquired Abnormality", "Activity", "Age Group", "Amino 78 | Acid Sequence", "Amino Acid, Peptide, or Protein", "Amphibian", "Anatomical 79 | Abnormality", "Anatomical Structure", "Animal", "Antibiotic", "Archaeon", 80 | "Bacterium", "Behavior", "Biologic Function", "Biologically Active 81 | Substance", "Biomedical Occupation or Discipline", "Biomedical or Dental 82 | Material", "Bird", "Body Location or Region", "Body Part, Organ, or Organ 83 | Component", "Body Space or Junction", "Body Substance", "Body System", 84 | "Carbohydrate Sequence", "Cell", "Cell Component", "Cell Function", "Cell 85 | or Molecular Dysfunction", "Chemical", "Chemical Viewed Functionally", 86 | "Chemical Viewed Structurally", "Classification", "Clinical Attribute", 87 | "Clinical Drug", "Conceptual Entity", "Congenital Abnormality", "Daily or 88 | Recreational Activity", "Diagnostic Procedure", "Disease or Syndrome", 89 | "Drug Delivery Device", "Educational Activity", "Element, Ion, or Isotope", 90 | "Embryonic Structure", "Entity", "Environmental Effect of Humans", 91 | "Enzyme", "Eukaryote", "Event", "Experimental Model of 
Disease", "Family 92 | Group", "Finding", "Fish", "Food", "Fully Formed Anatomical Structure", 93 | "Functional Concept", "Fungus", "Gene or Genome", "Genetic Function", 94 | "Geographic Area", "Governmental or Regulatory Activity", "Group", "Group 95 | Attribute", "Hazardous or Poisonous Substance", "Health Care Activity", 96 | "Health Care Related Organization", "Hormone", "Human", "Human-caused 97 | Phenomenon or Process", "Idea or Concept", "Immunologic Factor", 98 | "Indicator, Reagent, or Diagnostic Aid", "Individual Behavior", "Injury or 99 | Poisoning", "Inorganic Chemical", "Intellectual Product", "Laboratory or 100 | Test Result", "Laboratory Procedure", "Language", "Machine Activity", 101 | "Mammal", "Manufactured Object", "Medical Device", "Mental or Behavioral 102 | Dysfunction", "Mental Process", "Molecular Biology Research Technique", 103 | "Molecular Function", "Molecular Sequence", "Natural Phenomenon or 104 | Process", "Neoplastic Process", "Nucleic Acid, Nucleoside, or Nucleotide", 105 | "Nucleotide Sequence", "Occupation or Discipline", "Occupational Activity", 106 | "Organ or Tissue Function", "Organic Chemical", "Organism", "Organism 107 | Attribute", "Organism Function", "Organization", "Pathologic Function", 108 | "Patient or Disabled Group", "Pharmacologic Substance", "Phenomenon or 109 | Process", "Physical Object", "Physiologic Function", "Plant", "Population 110 | Group", "Professional or Occupational Group", "Professional Society", 111 | "Qualitative Concept", "Quantitative Concept", "Receptor", "Regulation or 112 | Law", "Reptile", "Research Activity", "Research Device", "Self-help or 113 | Relief Organization", "Sign or Symptom", "Social Behavior", "Spatial 114 | Concept", "Substance", "Temporal Concept", "Therapeutic or Preventive 115 | Procedure", "Tissue", "Vertebrate", "Virus", "Vitamin")} 116 | 117 | \item{return_scispacy_embeddings}{Defaults to \code{FALSE}. 
This is primarily 118 | intended for use by the \code{\link{bind_clinspacy_embeddings}} function to 119 | obtain scispacy embeddings. In order for scispacy embeddings to be 120 | available to \code{\link{bind_clinspacy_embeddings}}, you must set this to 121 | \code{TRUE}.} 122 | 123 | \item{verbose}{Defaults to \code{TRUE}.} 124 | 125 | \item{output_file}{Defaults to \code{NULL}. This is an optional argument that 126 | writes the output to a comma-separated value (CSV) file.} 127 | 128 | \item{overwrite}{Defaults to \code{FALSE}. If \code{output_file} already 129 | exists and \code{overwrite} is set to \code{FALSE}, then you will be 130 | prompted to confirm whether you would like to overwrite the file. If set to 131 | \code{TRUE}, then \code{output_file} will automatically be overwritten.} 132 | } 133 | \value{ 134 | If \code{output_file} is \code{NULL} (the default), then this 135 | function returns a data frame containing the UMLS concept unique 136 | identifiers (cui), entities, lemmatized entities, CyContext negation status 137 | (\code{TRUE} means negated, \code{FALSE} means *not* negated), other 138 | CyContext contexts, and section title from the clinical sectionizer. If 139 | \code{output_file} points to a file name, then the name of the created file 140 | will be returned. 141 | } 142 | \description{ 143 | This is the primary function for processing both data frames and character 144 | vectors in the \code{clinspacy} package. 
145 | } 146 | \examples{ 147 | \dontrun{ 148 | clinspacy('This patient has diabetes and CKD stage 3 but no HTN.') 149 | 150 | clinspacy(c('This pt has CKD and HTN', 'Pt only has CKD but no HTN')) 151 | 152 | data.frame(text = c('This pt has CKD and HTN', 'Diabetes is present'), 153 | stringsAsFactors = FALSE) \%>\% 154 | clinspacy(df_col = 'text') 155 | 156 | if (!dir.exists(rappdirs::user_data_dir('clinspacy'))) { 157 | dir.create(rappdirs::user_data_dir('clinspacy'), recursive = TRUE) 158 | } 159 | 160 | clinspacy(c('This pt has CKD and HTN', 'Has CKD but no HTN'), 161 | output_file = file.path(rappdirs::user_data_dir('clinspacy'), 162 | 'output.csv'), 163 | overwrite = TRUE) 164 | } 165 | 166 | } 167 | -------------------------------------------------------------------------------- /man/clinspacy_init.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/clinspacy.R 3 | \name{clinspacy_init} 4 | \alias{clinspacy_init} 5 | \title{Initializes clinspacy. This function is optional to run but gives you more 6 | control over the parameters used by scispacy at initiation. If you do not run 7 | this function, it will be run with default parameters the first time that any 8 | of the package functions are run.} 9 | \usage{ 10 | clinspacy_init( 11 | miniconda = TRUE, 12 | use_linker = FALSE, 13 | linker_threshold = 0.99, 14 | ... 15 | ) 16 | } 17 | \arguments{ 18 | \item{miniconda}{Defaults to TRUE, which results in miniconda being installed 19 | (~400 MB) and configured with the "clinspacy" conda environment. If you 20 | want to override this behavior, set \code{miniconda} to \code{FALSE} and 21 | specify an alternative environment using use_python() or use_conda().} 22 | 23 | \item{use_linker}{Defaults to \code{FALSE}. To turn on the UMLS linker, set 24 | this to \code{TRUE}.} 25 | 26 | \item{linker_threshold}{Defaults to 0.99. 
This argument is only relevant if 27 | \code{use_linker} is set to \code{TRUE}. It refers to the confidence 28 | threshold value used by the scispacy UMLS entity linker. Note: This can be 29 | lower than the \code{threshold} from \code{\link{clinspacy}}. The 30 | linker_threshold can only be set once per session.} 31 | 32 | \item{...}{Additional settings available from: 33 | \href{https://github.com/allenai/scispacy}{https://github.com/allenai/scispacy}.} 34 | } 35 | \value{ 36 | No return value. 37 | } 38 | \description{ 39 | Initializes clinspacy. This function is optional to run but gives you more 40 | control over the parameters used by scispacy at initiation. If you do not run 41 | this function, it will be run with default parameters the first time that any 42 | of the package functions are run. 43 | } 44 | -------------------------------------------------------------------------------- /man/dataset_cui2vec_definitions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cui2vec_data.R 3 | \name{dataset_cui2vec_definitions} 4 | \alias{dataset_cui2vec_definitions} 5 | \title{Cui2vec concept definitions} 6 | \format{ 7 | A data frame with 3053795 rows and 3 variables: \describe{ 8 | \item{cui}{A Unified Medical Language System (UMLS) Concept Unique 9 | Identifier (CUI)} \item{semantic_type}{Semantic type of the CUI} 10 | \item{definition}{Definition of the CUI} } 11 | } 12 | \source{ 13 | \url{https://github.com/beamandrew/cui2vec} 14 | } 15 | \usage{ 16 | dataset_cui2vec_definitions() 17 | } 18 | \value{ 19 | Returns the cui2vec UMLS definitions as a data frame. 20 | } 21 | \description{ 22 | This dataset contains definitions for the Unified Medical Language System 23 | (UMLS) Concept Unique Identifiers (CUIs). These come from Andrew Beam's 24 | \href{https://github.com/beamandrew/cui2vec}{cui2vec R package}. 
25 | } 26 | \details{ 27 | License 28 | 29 | This data is made available under a 30 | \href{https://github.com/beamandrew/cui2vec/blob/master/LICENSE.md}{MIT 31 | license}. The data is copyrighted in 2019 by Benjamin Kompa, Andrew Beam, and 32 | Allen Schmaltz. The only change made to the original dataset is the renaming 33 | of columns. 34 | } 35 | -------------------------------------------------------------------------------- /man/dataset_cui2vec_embeddings.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cui2vec_data.R 3 | \name{dataset_cui2vec_embeddings} 4 | \alias{dataset_cui2vec_embeddings} 5 | \title{Cui2vec concept embeddings} 6 | \format{ 7 | A data frame with 109053 rows and 501 variables: \describe{ 8 | \item{cui}{A Unified Medical Language System (UMLS) Concept Unique 9 | Identifier (CUI)} \item{emb_001}{Concept embedding vector #1} 10 | \item{emb_002}{Concept embedding vector #2} \item{...}{and so on...} 11 | \item{emb_500}{Concept embedding vector #500} } 12 | } 13 | \source{ 14 | \url{https://figshare.com/s/00d69861786cd0156d81} 15 | } 16 | \usage{ 17 | dataset_cui2vec_embeddings() 18 | } 19 | \value{ 20 | Returns the cui2vec UMLS embeddings as a data frame. 21 | } 22 | \description{ 23 | This dataset contains Unified Medical Language System (UMLS) concept 24 | embeddings from Andrew Beam's 25 | \href{https://github.com/beamandrew/cui2vec}{cui2vec R package}. There are 26 | 500 embeddings included for each concept. 27 | } 28 | \details{ 29 | This dataset is not viewable until it has been downloaded, which will occur 30 | the very first time you run \code{clinspacy_init()} after installing this 31 | package. 32 | 33 | Citation 34 | 35 | Beam, A.L., Kompa, B., Schmaltz, A., Fried, I., Griffin, W., Palmer, N.P., 36 | Shi, X., Cai, T., and Kohane, I.S., 2019. 
Clinical Concept Embeddings 37 | Learned from Massive Sources of Multimodal Medical Data. arXiv preprint 38 | arXiv:1804.01486. 39 | 40 | License 41 | 42 | This data is made available under a 43 | \href{https://creativecommons.org/licenses/by/4.0/}{CC BY 4.0 license}. The 44 | only change made to the original dataset is the renaming of columns. 45 | } 46 | -------------------------------------------------------------------------------- /man/dataset_mtsamples.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mtsamples.R 3 | \name{dataset_mtsamples} 4 | \alias{dataset_mtsamples} 5 | \title{Medical transcription samples.} 6 | \format{ 7 | A data frame with 4999 rows and 6 variables: \describe{ 8 | \item{note_id}{A unique identifier for each note} \item{description}{A 9 | description or chief concern} \item{medical_specialty}{Medical specialty of 10 | the note} \item{sample_name}{mtsamples.com note name} 11 | \item{transcription}{Transcription of note text} \item{keywords}{Keywords} 12 | } 13 | } 14 | \source{ 15 | \url{https://www.kaggle.com/tboyle10/medicaltranscriptions/data} 16 | } 17 | \usage{ 18 | dataset_mtsamples() 19 | } 20 | \value{ 21 | Returns the mtsamples dataset as a data frame. 22 | } 23 | \description{ 24 | This dataset contains sample medical transcriptions for various medical 25 | specialties. 26 | } 27 | \details{ 28 | Acknowledgements 29 | 30 | This data was scraped from 31 | \href{https://mtsamples.com}{https://mtsamples.com} by Tara Boyle. 32 | 33 | License This data is made available under a 34 | \href{https://creativecommons.org/share-your-work/public-domain/cc0/}{CC0: 35 | Public Domain license}. 
36 | } 37 | -------------------------------------------------------------------------------- /man/pipe.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils-pipe.R 3 | \name{\%>\%} 4 | \alias{\%>\%} 5 | \title{Pipe operator} 6 | \usage{ 7 | lhs \%>\% rhs 8 | } 9 | \arguments{ 10 | \item{lhs}{A value or the magrittr placeholder.} 11 | 12 | \item{rhs}{A function call using the magrittr semantics.} 13 | } 14 | \value{ 15 | Returns rhs(lhs). 16 | } 17 | \description{ 18 | See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details. 19 | } 20 | \keyword{internal} 21 | -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /vignettes/using_embeddings_for_machine_learning_on_clinical_text.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Using Embeddings for Machine Learning on Clinical Text" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{Using Embeddings for Machine Learning on Clinical Text} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | --- 9 | 10 | ```{r, include = FALSE} 11 | knitr::opts_chunk$set( 12 | collapse = TRUE, 13 | comment = "#>" 14 | ) 15 | 16 | library(tidyverse) 17 | mtsamples_embeddings = readr::read_csv('Z:/kdpsingh/mtsamples_description.csv') %>% 18 | mutate(is_cardiology_note = 19 | if_else(medical_specialty == 'Cardiovascular / Pulmonary', 20 | 'Yes', 21 | 'No')) %>% 22 | select(contains('emb_'), is_cardiology_note) %>% 23 | na.omit() 24 | ``` 25 | 26 | ```{r setup} 27 | library(tidyverse) 28 | library(tidymodels) 29 | library(clinspacy) 30 | library(runway) 31 | ``` 32 | 33 | # Load in 
the mtsamples data frame 34 | 35 | ```{r} 36 | mtsamples = dataset_mtsamples() 37 | ``` 38 | 39 | # Add clinspacy embeddings to the mtsamples data frame 40 | 41 | Here, we are aiming to predict which of the descriptions refer to 'Cardiovascular / Pulmonary' notes, so we will convert the medical specialty into a binary outcome. We will remove all of the predictor variables other than the embeddings (the columns whose names contain `emb_`). 42 | 43 | ```{r eval=FALSE} 44 | mtsamples_embeddings = 45 | mtsamples %>% 46 | clinspacy(df_col = 'description', 47 | return_scispacy_embeddings = TRUE, 48 | verbose = FALSE) %>% 49 | bind_clinspacy_embeddings(mtsamples) %>% 50 | mutate(is_cardiology_note = 51 | if_else(medical_specialty == 'Cardiovascular / Pulmonary', 52 | 'Yes', 53 | 'No')) %>% 54 | select(contains('emb_'), is_cardiology_note) %>% 55 | na.omit() 56 | ``` 57 | 58 | # Use tidymodels to fit a logistic regression model 59 | 60 | ```{r} 61 | set.seed(1) 62 | 63 | logreg_workflow = workflow() %>% 64 | add_model(logistic_reg() %>% set_engine('glm')) %>% 65 | add_recipe((recipe(is_cardiology_note~., data = mtsamples_embeddings))) 66 | 67 | logreg_result = 68 | fit_resamples(logreg_workflow, 69 | resamples = validation_split(data = mtsamples_embeddings, prop = 2/3), 70 | metrics = metric_set(roc_auc, pr_auc), 71 | control = control_resamples(save_pred = TRUE)) 72 | 73 | logreg_result %>% collect_metrics() 74 | ``` 75 | 76 | # Use tidymodels to fit a random forest model 77 | 78 | ```{r} 79 | set.seed(1) 80 | 81 | rf_workflow = workflow() %>% 82 | add_model(rand_forest(mode = 'classification', trees = 1000) %>% 83 | set_engine('ranger')) %>% 84 | add_recipe((recipe(is_cardiology_note~., data = mtsamples_embeddings))) 85 | 86 | rf_result = 87 | fit_resamples(rf_workflow, 88 | resamples = validation_split(data = mtsamples_embeddings, prop = 2/3), 89 | metrics = metric_set(roc_auc, pr_auc), 90 | control = control_resamples(save_pred = TRUE)) 91 | 92 | rf_result %>% collect_metrics() 93 | ``` 94 | 95 | # Aggregate the predictions 96 | 97 | 
```{r} 98 | combined_predictions = 99 | bind_rows( 100 | logreg_result %>% collect_predictions() %>% mutate(model_name = 'Logistic regression'), 101 | rf_result %>% collect_predictions() %>% mutate(model_name = 'Random forest') 102 | ) 103 | ``` 104 | 105 | # Use runway to compare the two models 106 | 107 | ## What are the performance characteristics of the model? 108 | 109 | ```{r fig.width = 8, fig.height = 6} 110 | combined_predictions %>% 111 | mutate(is_cardiology_note = if_else(is_cardiology_note == 'Yes', 1, 0)) %>% 112 | threshperf_plot_multi(outcome = 'is_cardiology_note', 113 | prediction = '.pred_Yes', 114 | model = 'model_name') 115 | ``` 116 | 117 | ## How well-calibrated is it? 118 | 119 | ```{r fig.width = 6, fig.height = 6} 120 | combined_predictions %>% 121 | mutate(is_cardiology_note = if_else(is_cardiology_note == 'Yes', 1, 0)) %>% 122 | cal_plot_multi(outcome = 'is_cardiology_note', 123 | prediction = '.pred_Yes', 124 | model = 'model_name', 125 | n_bins = 5) 126 | ``` 127 | 128 | --------------------------------------------------------------------------------