├── .gitignore ├── 0-make-embeddings.R ├── 1-compress-icd-10-embeddings.R ├── 2-validation.R ├── 3-create-datasets.R ├── 4-estimate-leading-char.R ├── README.Rmd ├── README.md ├── README_files ├── figure-gfm │ └── unnamed-chunk-2-1.png └── figure-markdown_strict │ └── unnamed-chunk-2-1.png ├── alpha-char-model.R ├── autoencoder.R ├── bmc-bioinformatics-paper ├── bmc_article.bib ├── bmc_article.tex ├── bmcart-biblio.sty ├── bmcart.cls ├── tsne-plot.png └── vancouver.bst ├── category-codes.ssv ├── comparators ├── REAME.md ├── alpha-char-embedding-model.R ├── clinicalbert │ ├── 0-make-embedding.R │ ├── 1-benchmark.R │ └── ref.txt ├── medbert │ ├── 0-make-embedding.R │ ├── 1-benchmark.R │ └── ref.txt ├── pubmedbert-fulltext │ ├── 0-make-embedding.R │ ├── 1-benchmark.R │ └── ref.txt ├── pubmedbert-ms-marco │ ├── 0-make-embedding.R │ ├── 1-benchmark.R │ └── ref.txt ├── setup └── short-code.R ├── embedding-data ├── .gitattributes ├── icd-10-cm-2019-0010.csv.gz ├── icd-10-cm-2019-0050.csv.gz ├── icd-10-cm-2019-0100.csv.gz ├── icd-10-cm-2019-1000.csv.gz ├── icd-10-cm-2020-0010.csv.gz ├── icd-10-cm-2020-0050.csv.gz ├── icd-10-cm-2020-0100.csv.gz ├── icd-10-cm-2020-1000.csv.gz ├── icd-10-cm-2021-0010.csv.gz ├── icd-10-cm-2021-0050.csv.gz ├── icd-10-cm-2021-0100.csv.gz ├── icd-10-cm-2021-1000.csv.gz ├── icd-10-cm-2022-0010.csv.gz ├── icd-10-cm-2022-0050.csv.gz ├── icd-10-cm-2022-0100.csv.gz └── icd-10-cm-2022-1000.csv.gz ├── figure └── unnamed-chunk-2-1.png ├── icd-10-cm-codes ├── icd10cm_codes_2019.txt ├── icd10cm_codes_2020.txt ├── icd10cm_codes_2021.txt └── icd10cm_codes_2022.txt ├── icd-10-cm-embedding.Rproj ├── icd10_dl.rds ├── make-biogpt-conda-env ├── make-download-rds.R ├── model-performance.rds ├── sup-model-perf.rds └── year-validation.rds /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | -------------------------------------------------------------------------------- /0-make-embeddings.R: -------------------------------------------------------------------------------- 1 | # Create a directory called icd-10-embeddings and add BioGPT embedding 2 | # values to it. 3 | 4 | library(reticulate) 5 | library(dplyr) 6 | library(tidyr) 7 | library(purrr) 8 | library(foreach) 9 | library(itertools) 10 | library(readr) 11 | library(tibble) 12 | library(doMC) 13 | library(iterators) 14 | registerDoMC(cores = 2) 15 | 16 | # Use the conda environment created from make-biogpt-conda-env 17 | use_condaenv("biogpt") 18 | 19 | # Import the needed libraries. 20 | torch = import("torch") 21 | BioGptTokenizer = import("transformers")$BioGptTokenizer 22 | BioGptForCausalLM = import("transformers")$BioGptForCausalLM 23 | 24 | # Get the BioGPT tokenizer and model from Huggingface. 25 | tokenizer = BioGptTokenizer$from_pretrained("microsoft/biogpt") 26 | model = BioGptForCausalLM$from_pretrained("microsoft/biogpt") 27 | 28 | # A function to calculate the embedding location. 
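# Mean pooling averages the token-level vectors into a single fixed-length
# vector per description: each token's vector is weighted by the attention
# mask (so padding contributes nothing), summed over the sequence, and
# divided by the number of unmasked tokens. Because the pooling operates
# on the model's output logits, the embedding length equals the BioGPT
# vocabulary size (42,384, per the README) rather than the hidden size.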
29 | mean_pooling = function(model_output, attention_mask) { 30 | #First element of model_output contains all token embeddings 31 | token_embeddings = model_output[1] 32 | input_mask_expanded = attention_mask$unsqueeze(-1L)$expand(token_embeddings$logits$size())$float() 33 | sum_embeddings = torch$sum(torch$multiply(token_embeddings$logits, input_mask_expanded), 1L) 34 | sum_mask = torch$clamp(input_mask_expanded$sum(1L), min=1e-9) 35 | ret = purrr::reduce(sum_embeddings$div(sum_mask)$tolist(), rbind) 36 | rownames(ret) = as.character(seq_len(nrow(ret))) 37 | ret 38 | } 39 | 40 | # A function to embed a set of string. 41 | embed = function(strings, max_len = 256) { 42 | encoded_input = tokenizer( 43 | strings, 44 | padding = TRUE, 45 | truncation = TRUE, 46 | max_length = max_len, 47 | return_tensors = 'pt' 48 | ) 49 | model_output = model( 50 | input_ids = encoded_input$input_ids, 51 | attention_mask = encoded_input$attention_mask 52 | ) 53 | mean_pooling(model_output, encoded_input$attention_mask) |> 54 | (\(x) {rownames(x) = strings; x})() 55 | } 56 | 57 | create_embeddings = function(icd10, dir_name) { 58 | dir.create(dir_name) 59 | 60 | foreach(it = isplitVector(seq_len(nrow(icd10)), chunkSize = 500)) %do% { 61 | icds = icd10[it,] 62 | emb = embed(icds$desc) 63 | icds$emb = map(seq_len(nrow(emb)), ~ emb[.x,]) 64 | walk( 65 | seq_len(nrow(icds)), 66 | ~ saveRDS(icds[.x,], sprintf("%s/%s.rds", dir_name, icds$code[.x])) 67 | ) 68 | print(it[length(it)]) 69 | NULL 70 | } |> unlist() |> invisible() 71 | } 72 | 73 | # The directory where the embeddings will go, by year. 74 | dir.create("icd-10-cm-embeddings") 75 | 76 | # Write the embeddings to their respective years. 77 | 78 | for (year in 2019:2022) { 79 | print(year) 80 | icd10 = sprintf("icd-10-cm-codes/icd10cm_codes_%s.txt", year) |> 81 | read_fwf(fwf_cols(code = 8, desc = 1000)) 82 | 83 | write_dir = file.path("icd-10-cm-embeddings", year) 84 | dir.create(write_dir) 85 | 86 | # Write the code, description, and embedding to a file with one file 87 | # per code. 88 | foreach(it = isplitVector(seq_len(nrow(icd10)), chunkSize = 200)) %do% { 89 | icds = icd10[it,] 90 | emb = embed(icds$desc) 91 | icds$emb = map(seq_len(nrow(emb)), ~ emb[.x,]) 92 | walk( 93 | seq_len(nrow(icds)), 94 | ~ saveRDS(icds[.x,], sprintf("%s/%s.rds", write_dir, icds$code[.x])) 95 | ) 96 | gc() 97 | NULL 98 | } 99 | } 100 | 101 | -------------------------------------------------------------------------------- /1-compress-icd-10-embeddings.R: -------------------------------------------------------------------------------- 1 | library(luz) 2 | library(tidyr) 3 | 4 | source("autoencoder.R") 5 | 6 | embedding_dir = file.path("icd-10-cm-embeddings", "2019") 7 | 8 | fn = file.path(embedding_dir, dir(embedding_dir)) 9 | set.seed(123) 10 | train = sample.int(length(fn), round(0.9 * length(fn))) 11 | test = setdiff(seq_len(length(fn)), train) 12 | 13 | icd10_emb_train = ICD10Embedding(fn[train]) 14 | icd10_emb_test = ICD10Embedding(fn[test]) 15 | 16 | emb_len = icd10_emb_train[1]$x$shape[1] 17 | 18 | # Create the set of parameters we will create autoencoders over. 
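# Four symmetric layer stacks (bottleneck widths of 1000, 100, 50, and
# 10) crossed with three batch sizes give the 12 candidate models fit
# below; the bottleneck width is the compressed embedding dimension.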
19 | 20 | params = expand_grid( 21 | model_layers = list( 22 | c(emb_len, 1000, emb_len), 23 | c(emb_len, 1000, 100, 1000, emb_len), 24 | c(emb_len, 1000, 100, 50, 100, 1000, emb_len), 25 | c(emb_len, 1000, 100, 50, 10, 50, 100, 1000, emb_len)), 26 | batch_size = c(64, 128, 256) 27 | ) 28 | 29 | params$model_name = as.character(seq_len(nrow(params))) 30 | 31 | make_model = function(model_layers, batch_size, model_name, epochs = 30) { 32 | 33 | ret_luz = ICD10AutoEncoder |> 34 | setup( 35 | loss = nn_mse_loss(), 36 | optimizer = optim_adam 37 | # metrics = list( 38 | # luz_metric_mse(), 39 | # luz_metric_data_variation() 40 | # ) 41 | ) |> 42 | set_hparams(layers = model_layers) |> 43 | fit( 44 | data = dataloader( 45 | icd10_emb_train, 46 | batch_size = batch_size, 47 | shuffle = TRUE, 48 | num_workers = 4, 49 | worker_packages = "torch" 50 | ), 51 | epochs = epochs, 52 | valid_data = dataloader( 53 | icd10_emb_test, 54 | batch_size = batch_size, 55 | shuffle = TRUE, 56 | num_workers = 4, 57 | worker_packages = "torch" 58 | ), 59 | callbacks = 60 | list( 61 | luz_callback_keep_best_model() 62 | ) 63 | ) 64 | ret_luz 65 | } 66 | 67 | # The parameters and the models 68 | md = params 69 | 70 | 71 | md$model = map( 72 | seq_len(nrow(params)), 73 | ~ make_model( 74 | params$model_layers[[.x]], 75 | params$batch_size[.x], 76 | params$model_name[.x] 77 | ) 78 | ) 79 | 80 | md$embedding_dim = rep(c(1000, 100, 50, 10), each = 3) 81 | 82 | # Best valid loss index 83 | bvli = map_int(md$model, ~ which.min(unlist(.x$records$metrics$valid))) 84 | 85 | md$best_valid_loss = 86 | map_dbl( 87 | seq_along(md$model), 88 | ~ unlist(md$model[[.x]]$records$metrics$valid[bvli[.x]]) 89 | ) 90 | 91 | md$best_train_loss = 92 | map_dbl( 93 | seq_along(md$model), 94 | ~ unlist(md$model[[.x]]$records$metrics$train[bvli[.x]]) 95 | ) 96 | 97 | 98 | # Save the luz_models 99 | luz_model_dir = "luz-models" 100 | dir.create(luz_model_dir) 101 | md$model_path = NA 102 | for (i in seq_len(nrow(md))) { 103 | model_path = file.path(luz_model_dir, sprintf("luz-model-%02d.luz", i)) 104 | luz_save( 105 | md$model[[i]], 106 | model_path 107 | ) 108 | md$model_path = model_path 109 | } 110 | 111 | md = md |> arrange(best_valid_loss, best_train_loss) 112 | 113 | mdo = md |> 114 | select(embedding_dim, batch_size, starts_with("best")) 115 | 116 | saveRDS(mdo, "model-performance.rds") 117 | 118 | dir.create("autoencoder-models") 119 | 120 | torch_save( 121 | (md |> filter(embedding_dim == 10))$model[[1]]$model, 122 | file.path("autoencoder-models", "icd10cm-0010.pt") 123 | ) 124 | 125 | torch_save( 126 | (md |> filter(embedding_dim == 50))$model[[1]]$model, 127 | file.path("autoencoder-models", "icd10cm-0050.pt") 128 | ) 129 | 130 | torch_save( 131 | (md |> filter(embedding_dim == 100))$model[[1]]$model, 132 | file.path("autoencoder-models", "icd10cm-0100.pt") 133 | ) 134 | 135 | torch_save( 136 | (md |> filter(embedding_dim == 1000))$model[[1]]$model, 137 | file.path("autoencoder-models", "icd10cm-1000.pt") 138 | ) 139 | 140 | -------------------------------------------------------------------------------- /2-validation.R: -------------------------------------------------------------------------------- 1 | # Validate the selected models using ICD 10 codes from other years. 
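# For each autoencoder/year pair, we compute the mean squared
# reconstruction error and the mean per-observation variance of the
# embedding values; downstream, their ratio ("cod") is reported -- the
# fraction of embedding variance the autoencoder fails to reproduce,
# so smaller values are better.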
2 | 3 | library(torch) 4 | library(purrr) 5 | library(stringr) 6 | library(readr) 7 | library(tibble) 8 | library(tidyr) 9 | library(progress) 10 | 11 | source("autoencoder.R") 12 | 13 | default_device = "cpu" 14 | if (backends_cudnn_is_available()) { 15 | default_device = "cuda" 16 | } else if (backends_mps_is_available()) { 17 | default_device = "mps" 18 | } else if (backends_mkldnn_is_available()) { 19 | default_device = "mkldnn" 20 | } else if (backends_openmp_is_available()) { 21 | default_device = "openmp" 22 | } else if (backends_mkl_is_available()) { 23 | default_device = "mkl" 24 | } 25 | 26 | ae_model_paths = "autoencoder-models" |> 27 | (\(x) file.path(x, dir(x)))() 28 | 29 | icd10_embedding_paths = file.path("icd-10-cm-embeddings", 2019:2022) |> 30 | map( ~ file.path(.x, dir(.x))) 31 | 32 | xs = tibble( 33 | embed = map(icd10_embedding_paths, ICD10Embedding), 34 | year = 2019:2022 35 | ) 36 | 37 | vds = tibble( 38 | model = map(ae_model_paths, torch_load), 39 | embedding_dim = str_extract(ae_model_paths, "\\d{4}") |> as.integer() 40 | ) 41 | 42 | pred_error = function(d, m, device = default_device) { 43 | m = m$to(device = device) 44 | ret = c() 45 | dl = dataloader(d, batch_size = 100, num_workers = 5) 46 | pb = progress_bar$new( 47 | format = "[:bar] :percent eta: :eta", 48 | total = length(dl) 49 | ) 50 | loop(for (b in dl) { 51 | xt = b$x$to(device = device) 52 | r = torch_mean((xt - m(xt))^2, 2)$to(device = "cpu") |> 53 | as.numeric() 54 | pb$tick() 55 | ret = c(ret, r) 56 | }) 57 | mean(ret) 58 | } 59 | 60 | variation = function(d, device = default_device) { 61 | ret = c() 62 | dl = dataloader(d, batch_size = 100, num_workers = 5) 63 | pb = progress_bar$new( 64 | format = "[:bar] :percent eta: :eta", 65 | total = length(dl) 66 | ) 67 | loop(for (b in dl) { 68 | xt = b$x$to(device = device) 69 | r = torch_var(xt, 2)$to(device = "cpu") |> as.numeric() 70 | pb$tick() 71 | ret = c(ret, r) 72 | }) 73 | mean(ret) 74 | } 75 | 76 | x = expand_grid(vds, xs) 77 | 78 | x$pred_error = 79 | map_dbl( 80 | seq_len(nrow(x)), 81 | ~ {print(.x); pred_error(x$embed[[.x]], x$model[[.x]])}) 82 | 83 | x$variation = 84 | map_dbl( 85 | seq_len(nrow(x)), 86 | ~ {print(.x); variation(x$embed[[.x]])}) 87 | 88 | saveRDS(x, "year-validation-raw.rds") 89 | 90 | x |> 91 | arrange(year) |> 92 | select(-model, -embed) |> 93 | mutate(cod = pred_error / variation) |> 94 | select(year, embedding_dim, pred_error, cod) |> 95 | saveRDS("year-validation.rds") 96 | -------------------------------------------------------------------------------- /3-create-datasets.R: -------------------------------------------------------------------------------- 1 | # Write the datasets. 
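# For every year and bottleneck size, each description's full BioGPT
# embedding is pushed through the compressing half of the corresponding
# autoencoder, and the code, description, and compressed values are
# written to a gzipped CSV in embedding-data/. A final loop also gathers
# the uncompressed embeddings into one CSV per year.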
2 | 3 | library(torch) 4 | library(purrr) 5 | library(stringr) 6 | library(readr) 7 | library(tibble) 8 | library(tidyr) 9 | library(progress) 10 | library(dplyr) 11 | library(itertools) 12 | library(foreach) 13 | #library(doMC) 14 | #registerDoMC(cores = 2) 15 | registerDoSEQ() 16 | 17 | source("autoencoder.R") 18 | 19 | default_device = "cpu" 20 | if (backends_cudnn_is_available()) { 21 | default_device = "cuda" 22 | } else if (backends_mps_is_available()) { 23 | default_device = "mps" 24 | } else if (backends_mkldnn_is_available()) { 25 | default_device = "mkldnn" 26 | } else if (backends_openmp_is_available()) { 27 | default_device = "openmp" 28 | } else if (backends_mkl_is_available()) { 29 | default_device = "mkl" 30 | } 31 | 32 | 33 | ae_model_paths = "autoencoder-models" |> 34 | (\(x) file.path(x, dir(x)))() 35 | 36 | icd10_embedding_paths = file.path("icd-10-cm-embeddings", 2019:2022) |> 37 | map( ~ file.path(.x, dir(.x))) 38 | 39 | icd10_code_paths = 2019:2022 |> 40 | map_chr(~file.path("icd-10-cm-codes", sprintf("icd10cm_codes_%s.txt", .x))) 41 | 42 | xs = tibble( 43 | embed = map(icd10_embedding_paths, ICD10Embedding), 44 | year = 2019:2022, 45 | codes = map(icd10_code_paths, ~read_fwf(.x, fwf_cols(code = 8, desc = 150))) 46 | ) 47 | 48 | vds = tibble( 49 | model = map(ae_model_paths, torch_load), 50 | embedding_dim = str_extract(ae_model_paths, "\\d{4}") |> as.integer() 51 | ) 52 | 53 | get_embedding = function(d, m, device = default_device) { 54 | m = m$to(device = device) 55 | ret = c() 56 | dl = dataloader(d, batch_size = 100, num_workers = 6) 57 | pb = progress_bar$new( format = "[:bar] :percent eta: :eta", 58 | total = length(dl) 59 | ) 60 | loop(for (b in dl) { 61 | xt = b$x$to(device = device) 62 | for (i in seq_len(length(m$decoder) / 2)) { 63 | x = xt 64 | xt = m$decoder[[i]](x) 65 | } 66 | pb$tick() 67 | gc() 68 | ret = rbind(ret, as.matrix(xt$to(device = "cpu"))) 69 | }) 70 | ret = as_tibble(as.data.frame(ret)) 71 | ret 72 | } 73 | 74 | xd = expand_grid(xs, vds) 75 | xd$embedding = map( 76 | seq_len(nrow(xd)), 77 | ~ {print(.x); get_embedding(xd$embed[[.x]], xd$model[[.x]])} 78 | ) 79 | 80 | dir.create("embedding-data") 81 | 82 | for (i in seq_len(nrow(xd))) { 83 | d = bind_cols(xd$codes[[i]], xd$embedding[[i]]) 84 | write_csv( 85 | d, 86 | file.path( 87 | "embedding-data", 88 | sprintf("icd-10-cm-%s-%04d.csv.gz", xd$year[i], xd$embedding_dim[i]) 89 | ) 90 | ) 91 | gc() 92 | } 93 | 94 | for (year in 2019:2022) { 95 | fns = file.path("icd-10-cm-embeddings", year) |> 96 | (\(x) file.path(x, dir(x)))() 97 | 98 | dfs = foreach(it = isplitVector(fns, chunkSize = 1000), 99 | .combine = bind_rows, 100 | .inorder = FALSE, .errorhandling = "remove", 101 | .multicombine = TRUE) %do% { 102 | print(tail(it, 1)) 103 | df = foreach(fn = it, .combine = bind_rows, 104 | .errorhandling = "remove", .multicombine = TRUE) %dopar% { 105 | ret = readRDS(fn) 106 | ret = 107 | bind_cols( 108 | ret[,1:2], 109 | ret$emb[[1]] |> t() |> as.data.frame() 110 | ) 111 | gc() 112 | ret 113 | } 114 | gc() 115 | print(nrow(df)) 116 | df 117 | } 118 | write_csv(dfs, sprintf("embedding-data/icd-10-cm-%s-full.csv", year)) 119 | } 120 | 121 | 122 | -------------------------------------------------------------------------------- /4-estimate-leading-char.R: -------------------------------------------------------------------------------- 1 | library(luz) 2 | library(yardstick) 3 | library(tibble) 4 | library(stringr) 5 | 6 | source("alpha-char-model.R") 7 | 8 | ccc = c( 9 | "^[AB].*", 10 | "(^C|^D[0-4]).*", 
11 | "^D[5-8].*", 12 | "^E[0-8][0-9].*", 13 | "^F.*", 14 | "^G.*", 15 | "^H[0-5][0-9].*", 16 | "^H[6-9][0-9].*", 17 | "^I.*", 18 | "^J.*", 19 | "^K.*", 20 | "^L.*", 21 | "^M.*", 22 | "^N.*", 23 | "^O[0-9].*", 24 | "^P.*", 25 | "^Q.*", 26 | "^R.*", 27 | "^[ST].*", 28 | "^[UVWXY].*", 29 | "^[Z].*" 30 | ) 31 | 32 | get_short_code_impl = function(code) { 33 | which(map_lgl(ccc, ~ grepl(.x, code))) 34 | } 35 | 36 | get_short_code = function(code) { 37 | map_int(code, get_short_code_impl) 38 | } 39 | 40 | emb_data_dir = "embedding-data" 41 | 42 | params = tibble( 43 | embedding_files = 44 | file.path(emb_data_dir, dir(emb_data_dir) |> str_subset("2019")), 45 | emb_dim = 46 | str_extract(embedding_files, "-\\d{4}\\.") |> str_extract("\\d{4}") 47 | ) 48 | 49 | dir.create("luz-supervised-models") 50 | 51 | ms = list() 52 | 53 | for (i in seq_len(nrow(params))) { 54 | 55 | aced = params$embedding_files[i]|> read_csv() 56 | 57 | traini = sample.int(nrow(aced), round(0.9 * nrow(aced))) 58 | testi = setdiff(seq_len(nrow(aced)), traini) 59 | 60 | aced$code = get_short_code(aced$code) 61 | 62 | train = AlphaCharEmbedding(aced[traini, ], sort(unique(aced$code))) 63 | test = AlphaCharEmbedding(aced[testi, ], sort(unique(aced$code))) 64 | 65 | layers = c(train$width(), 100, 100, 21) 66 | batch_size = 64 67 | epochs = 30 68 | 69 | # Cross entropy 70 | loss = function(input, target) { 71 | torch_mean(-torch_sum(target * torch_log(input + 1e-16), 2)) 72 | } 73 | 74 | luz_model = AlphaCodeEstimator |> 75 | setup( 76 | loss = loss, #nn_cross_entropy_loss(26), 77 | optimizer = optim_adam 78 | ) |> 79 | set_hparams(layers = layers) |> 80 | fit( 81 | data = dataloader( 82 | train, 83 | batch_size = batch_size, 84 | shuffle = TRUE, 85 | num_workers = 4, 86 | worker_packages = c("torch", "dplyr") 87 | ), 88 | epochs = epochs, 89 | valid_data = dataloader( 90 | test, 91 | batch_size = batch_size, 92 | shuffle = TRUE, 93 | num_workers = 4, 94 | worker_packages = c("torch", "dplyr") 95 | ), 96 | callbacks = list( 97 | luz_callback_keep_best_model() 98 | ) 99 | ) 100 | 101 | luz_save( 102 | luz_model, 103 | file.path("luz-supervised-models", 104 | sprintf("luz-model-%s.pt", params$emb_dim[i])) 105 | ) 106 | 107 | preds = 108 | predict( 109 | luz_model, 110 | dataloader( 111 | test, 112 | batch_size = batch_size, 113 | num_workers = 4, 114 | worker_packages = c("torch", "dplyr") 115 | ) 116 | ) 117 | 118 | comp = tibble( 119 | obs = aced[testi,]$code |> 120 | factor(levels = 1:21), 121 | pred = preds |> 122 | torch_tensor(device = "cpu") |> 123 | as.matrix() |> 124 | apply(1, which.max) |> 125 | factor(levels = 1:21) 126 | ) 127 | 128 | ms = c(ms, 129 | list( 130 | metric_set(accuracy, bal_accuracy)(comp, truth = obs, estimate = pred) 131 | ) 132 | ) 133 | print(ms) 134 | } 135 | 136 | params$accuracy = c(ms[[3]][1], ms[[6]][1], ms[[9]][1], ms[[12]][1]) 137 | params$bal_accuracy = c(ms[[3]][2], ms[[6]][2], ms[[9]][2], ms[[12]][2]) 138 | 139 | saveRDS(params |> select(-embedding_files), "sup-model-perf.rds") 140 | 141 | 142 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: md_document 3 | --- 4 | 5 | 6 | 7 | ```{r, include = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>" 11 | ) 12 | ``` 13 | 14 | # Compressed, Large-Language-Model Embedded Datasets of ICD-10-CM Descriptions 15 | 16 | ## Citing this work 17 | 18 | ``` 19 | @article{kane2023llm 20 | 
author = {Michael J. Kane and Casey King and Denise Esserman and Nancy K. Latham and Erich J. Greene and David A. Ganz},
21 |   title = {A Compressed Large Language Model Embedding Dataset of ICD 10 CM Descriptions},
22 |   elocation-id = {2023.04.24.23289046},
23 |   year = {2023},
24 |   doi = {10.1101/2023.04.24.23289046},
25 |   publisher = {Cold Spring Harbor Laboratory Press},
26 |   URL = {https://www.medrxiv.org/content/early/2023/05/15/2023.04.24.23289046.1},
27 |   eprint = {https://www.medrxiv.org/content/early/2023/05/15/2023.04.24.23289046.1.full.pdf},
28 |   journal = {medRxiv}
29 | }
30 | ```
31 | 
32 | ## License
33 | 
34 | The code in this repository is licensed under [GPL v2](https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html) and the data
35 | are licensed under [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/).
36 | 
37 | ## Funding
38 | 
39 | This work was supported by the National Institute on Aging of the National Institutes of Health (NIH) through a project grant to Yale University (1R01AG071528). The organizations funding this study had no role in the design or conduct of the study; in the collection, management, analysis, or interpretation of the data; or in the preparation, review, or approval of the manuscript. The content of this publication is solely the responsibility of the authors and does not necessarily represent the official views of the National Institutes of Health, the Department of Veterans Affairs, or the United States government.
40 | 
41 | ## ICD-10-CM Datasets
42 | 
43 | ### 2022
44 | 
45 | 1. [ICD-10-CM, 10-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2022-0010.csv.gz?raw=true)
46 | 1. [ICD-10-CM, 50-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2022-0050.csv.gz?raw=true)
47 | 1. [ICD-10-CM, 100-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2022-0100.csv.gz?raw=true)
48 | 1. [ICD-10-CM, 1000-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2022-1000.csv.gz?raw=true)
50 | 1. [ICD-10-CM, 42,384-dimensions (not compressed)](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2022-full.csv.gz?raw=true)
51 | 
52 | ### 2021
53 | 
54 | 1. [ICD-10-CM, 10-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2021-0010.csv.gz?raw=true)
55 | 1. [ICD-10-CM, 50-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2021-0050.csv.gz?raw=true)
56 | 1. [ICD-10-CM, 100-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2021-0100.csv.gz?raw=true)
57 | 1. [ICD-10-CM, 1000-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2021-1000.csv.gz?raw=true)
58 | 1. [ICD-10-CM, 42,384-dimensions (not compressed)](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2021-full.csv.gz?raw=true)
59 | 
60 | ### 2020
61 | 
62 | 1. [ICD-10-CM, 10-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2020-0010.csv.gz?raw=true)
63 | 1. [ICD-10-CM, 50-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2020-0050.csv.gz?raw=true)
64 | 1. [ICD-10-CM, 100-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2020-0100.csv.gz?raw=true)
65 | 1. [ICD-10-CM, 1000-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2020-1000.csv.gz?raw=true)
66 | 
67 | ### 2019
68 | 
69 | 1. [ICD-10-CM, 10-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2019-0010.csv.gz?raw=true)
70 | 1. [ICD-10-CM, 50-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2019-0050.csv.gz?raw=true)
71 | 1. [ICD-10-CM, 100-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2019-0100.csv.gz?raw=true)
72 | 1. [ICD-10-CM, 1000-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2019-1000.csv.gz?raw=true)
73 | 1. [ICD-10-CM, 42,384-dimensions (not compressed)](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2019-full.csv.gz?raw=true)
74 | 
75 | ## Overview
76 | 
77 | The International Classification of Diseases, 10th Revision, Clinical Modification ([ICD-10-CM](https://www.cdc.gov/nchs/icd/icd-10-cm.htm)) is a standardized classification system used for diagnosing diseases, disorders, and health conditions. It plays a crucial role in analyzing electronic medical records (EMRs) or electronic health records (EHRs). However, the high dimensionality of ICD-10-CM codes and their hierarchical structure make their incorporation into statistical and machine learning analyses challenging. Traditional contrast encoding methods like one-hot and treatment coding may not fully capture the hierarchical information of the codes. Large language models (LLMs) generate contextualized embeddings that capture the semantic relationships between codes more effectively. This repository provides data sets of ICD-10-CM codes mapped to embeddings generated using the [BioGPT Large Language Model](https://academic.oup.com/bib/article/23/6/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9&login=false). The embeddings provide informative input features for machine learning models, and dimension-reduced versions in 1,000, 100, 50, and 10 dimensions are provided. Validation for both the dimension reduction and the representation of the embeddings is shown below. The readily available datasets are anticipated to be highly valuable for researchers incorporating ICD-10-CM codes into their analyses, retaining contextual information, and enabling more advanced analyses in the field.
78 | 
79 | The data sets and the code used to generate them are available at https://github.com/kaneplusplus/icd-10-cm-embedding. The data are licensed under the [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode). The code is
80 | licensed under [GPL-v2](https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html).
81 | 
82 | ## Model description and performance
83 | 
84 | The data provided are generated by embedding the ICD-10-CM descriptions using the BioGPT-Large model, which tokenizes each textual phrase and maps the tokens to unique vocabulary IDs, resulting in a sequence of continuous embedding vectors.
The embeddings are then contextualized by passing them through the model's layers with an attention mask. The resulting embeddings lie in a 42,384-dimensional space and are compressed using an autoencoder with fully connected layers of decreasing and then increasing sizes up to the output layer. The autoencoder structure is the same for models with larger dimensions, with only the appropriate layers retained.
85 | 
86 | ### Validating the dimension reduction
87 | 
88 | ```{r model_perf, message = FALSE, warning = FALSE, echo = FALSE, fig.cap = "The autoencoder performance diagnostics ordered by increasing validation loss."}
89 | library(dplyr)
90 | x = readRDS("model-performance.rds")
91 | x = x |>
92 |   mutate(best_valid_loss = round(best_valid_loss, 3),
93 |          best_train_loss = round(best_train_loss, 3)) |>
94 |   select(embedding_dim, batch_size, best_train_loss, best_valid_loss)
95 | names(x) = c("Embedding Dimension", "Batch Size", "Training Loss", "Validation Loss")
96 | knitr::kable(x, caption = "The autoencoder parameters and performance ordered by increasing validation loss.")
97 | ```
98 | 
99 | The autoencoder compressing the LLM embedding was fit on the 2019 ICD-10-CM descriptions
100 | for 30 epochs with batch sizes of 64, 128, and 256; a mean-square error loss between
101 | the embedding and the autoencoder estimate; and a validation data set comprising
102 | a random subset of 10\% of the samples. The model performance is shown above.
103 | Based on these results, the models with the best validation loss were selected for distribution.
104 | 
105 | ```{r autoencoder_perf, echo = FALSE, fig.cap = "The autoencoder year-validation diagnostics ordered by year."}
106 | x = readRDS("year-validation.rds")
107 | x = x |>
108 |   mutate(pred_error = round(pred_error, 3),
109 |          cod = round(cod, 3))
110 | names(x) = c("Year", "Embedding Dimension", "MSE", "Coef. of Determination")
111 | knitr::kable(x, caption = "The autoencoder year validation performance ordered by year.")
112 | ```
113 | 
114 | In addition to the 2019 validation, the models selected for distribution were
115 | tested on the 2020-2022 data sets to ensure their performance is comparable
116 | across years. It should be noted that the ICD-10-CM codes do not vary much from
117 | one year to the next,
118 | so we should not expect large differences. As expected, the mean square error
119 | and coefficients of determination are similar to the 2019 data.
120 | 
121 | ### Validating the embedding representation
122 | 
123 | To validate the compressed embeddings, the hierarchical information in the ICD-10-CM codes was used to ensure that relevant relationships were preserved. The leading letter and first two numeric values categorize codes, allowing a supervised model to estimate the categories at a rate higher than chance. The training data used a one-hot encoding of the ICD-10-CM categories as the dependent variable and the compressed embedding values as the independent variables. The model consisted of two hidden layers with 100 nodes each, used categorical cross-entropy as the loss function, and was trained for 30 epochs; performance was evaluated in terms of accuracy and balanced accuracy. Lower-dimensional embeddings lose more predictive information, as is typical for this type of problem.
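For reference, a minimal sketch of this classifier in R `torch` is shown below. It mirrors `AlphaCodeEstimator` in `alpha-char-model.R` and the settings in `4-estimate-leading-char.R` (linear layers chained into a softmax over the 21 leading-character categories); the standalone module and the embedding dimension of 50 are illustrative, not the exact training code.

```r
library(torch)

# Linear layers chained into a softmax output, as in AlphaCodeEstimator.
net = nn_module(
  initialize = function(emb_dim, n_classes = 21) {
    self$fc1 = nn_linear(emb_dim, 100)
    self$fc2 = nn_linear(100, 100)
    self$out = nn_linear(100, n_classes)
  },
  forward = function(x) {
    x = self$fc1(x)
    x = self$fc2(x)
    x = self$out(x)
    nnf_softmax(x, dim = 2)
  }
)

# Categorical cross-entropy against the one-hot targets.
loss = function(input, target) {
  torch_mean(-torch_sum(target * torch_log(input + 1e-16), 2))
}

m = net(emb_dim = 50)
m(torch_randn(8, 50))  # an 8 x 21 tensor of class probabilities
```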
124 | 
125 | ```{r sm_perf, echo = FALSE, fig.cap = "The supervised model performance."}
126 | x = readRDS("sup-model-perf.rds")
127 | x = x |>
128 |   mutate(accuracy = round(accuracy, 3),
129 |          bal_accuracy = round(bal_accuracy, 3),
130 |          emb_dim = as.integer(emb_dim))
131 | names(x) = c("Embedding Dimension", "Accuracy", "Balanced Accuracy")
132 | knitr::kable(x, caption = "The supervised models' performance ordered by increasing embedding dimension.")
133 | ```
134 | 
135 | Of note, the goal in presenting these results is not necessarily to
136 | maximize the prediction accuracy. Rather, it is to show that the embedding retains the
137 | hierarchical information in the ICD-10-CM codes. Some of the codes correspond to
138 | conditions that could be classified in several ways, and as a result coding
139 | for at least some of the conditions might be considered non-systematic.
140 | 
141 | ## An example using the embedding data in R
142 | 
143 | To conclude, we present a simple example of how one might use the embedding
144 | information in the R programming environment. Suppose we would like to
145 | visualize the ICD-10-CM codes beginning with G (diseases of the nervous system),
146 | I (diseases of the circulatory system), J (diseases of the respiratory system),
147 | and K (diseases of the digestive system) to better understand the relationships
148 | between these categories or specific conditions in the 50-dimensional
149 | embedding. For convenience, the project's page includes an `.rds` file
150 | containing the available embeddings along with their URLs, which can be
151 | retrieved from the R console. The code categories can then be visualized
152 | by performing another dimension reduction (in this case with the
153 | Rtsne package) to 2 dimensions and presenting the result
154 | in a scatter plot as shown below.
155 | 
156 | 
157 | ```{r message = FALSE, eval = TRUE, warning = FALSE}
158 | library(dplyr)
159 | library(ggplot2)
160 | library(readr)
161 | library(Rtsne)
162 | library(stringr)
163 | 
164 | # Download the locations of the embeddings.
165 | tf = tempfile()
166 | download.file(
167 |   "https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/icd10_dl.rds?raw=true",
168 |   tf
169 | )
170 | dl = readRDS(tf)
171 | 
172 | # Download and read the 2019, 50-dimensional embeddings.
173 | tf = tempfile()
174 | download.file(
175 |   dl$url[dl$year == 2019 & dl$emb_dim == 50],
176 |   tf
177 | )
178 | 
179 | icd10s = read_csv(tf) |>
180 |   filter(str_detect(code, "^(G|I|J|K)")) |>
181 |   mutate(desc = tolower(desc)) |>
182 |   mutate(`Leading Letter` = str_sub(code, 1, 1))
183 | 
184 | # Fit tSNE to the embedding.
185 | tsne_fit = icd10s |>
186 |   select(starts_with("V")) |>
187 |   scale() |>
188 |   Rtsne(perplexity = 10)
189 | 
190 | # Bind the tSNE values to the data set.
191 | icd10p = bind_cols(
192 |   icd10s |>
193 |     select(-starts_with("V")),
194 |   tsne_fit$Y |>
195 |     as.data.frame() |>
196 |     rename(tSNE1="V1", tSNE2="V2") |>
197 |     as_tibble()
198 | )
199 | 
200 | # Visualize the results.
201 | ggplot(icd10p, aes(x = tSNE1, y = tSNE2, color = `Leading Letter`)) +
202 |   geom_point() +
203 |   theme_minimal()
204 | ```
205 | 
206 | The visualization shows that a subset of the circulatory diseases (I) and
207 | nervous system diseases (G) are well-differentiated from other conditions. It
208 | also shows overlap between other conditions related to K (digestive diseases),
209 | J (respiratory diseases), and I (circulatory).
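The embeddings also support simple similarity queries. As an illustrative sketch, reusing the `icd10s` tibble from above (the `nearest_codes()` helper below is ours, not part of the repository), codes can be ranked by the cosine similarity of their embeddings:

```r
# Cosine similarity between one code's embedding and all of the others.
emb = icd10s |>
  select(starts_with("V")) |>
  as.matrix()
rownames(emb) = icd10s$code

nearest_codes = function(target_code, k = 5) {
  v = emb[target_code, ]
  sims = as.numeric(emb %*% v) /
    (sqrt(rowSums(emb^2)) * sqrt(sum(v^2)))
  icd10s |>
    mutate(similarity = sims) |>
    filter(code != target_code) |>
    arrange(-similarity) |>
    select(code, desc, similarity) |>
    head(k)
}

# For example, the five codes closest to I10 (essential hypertension).
nearest_codes("I10")
```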
210 | 
211 | ## A SAS example
212 | 
213 | ```sas
214 | /* Options */
215 | %let dlyear=2019;    /* code year; can be 2019, 2020, 2021, 2022 */
216 | %let dldim=50;       /* encoding dimensions; can be 1000, 100, 50, 10 */
217 | %let tempdir=D:;     /* directory for temporary file */
218 | %let pathsep=\;      /* path separator; \ for Windows, / for *NIX */
219 | %let dsname=icd10cm; /* name for the final dataset */
220 | %let target=icd-10-cm-&dlyear-%sysfunc(putn(&dldim,z4.)).csv.gz;
221 | %let tempfile=&tempdir&pathsep&target;
222 | 
223 | /* Download gzipped file to a temp location */
224 | /* -- filename url and filename zip methods don't stack */
225 | filename rawdl "&tempfile";
226 | proc http
227 |   url="https://github.com/kaneplusplus/icd-10-cm-embedding/raw/main/embedding-data/&target"
228 |   out=rawdl;
229 | run;
230 | 
231 | /* Read the downloaded temp file into a dataset */
232 | filename codes ZIP "&tempfile" GZIP;
233 | %macro vlist;
234 |   %local i;
235 |   %do i=1 %to &dldim; V&i %end;
236 | %mend;
237 | data &dsname;
238 |   informat code $8. desc $256. %vlist best.;
239 |   infile codes delimiter=',' firstobs=2 dsd;
240 |   input code $ desc $ %vlist;
241 | run;
242 | ```
243 | 
244 | ## Reproducing these results
245 | 
246 | R version: >= 4.2
247 | 
248 | R package dependencies:
249 | 
250 | - `arrow`
251 | - `torch`
252 | - `reticulate`
253 | - `dplyr`
254 | - `tidyr`
255 | - `purrr`
256 | - `foreach`
257 | - `itertools`
258 | - `readr`
259 | - `luz`
261 | - `tibble`
262 | - `progress`
263 | - `stringr`
264 | - `yardstick`
265 | 
266 | Scripts
267 | 
268 | - `0-make-embeddings.R`
269 |   - Purpose - create the BioGPT-Large embeddings of the ICD-10-CM descriptions
270 |   - Dependencies
271 |     - A conda environment with the `torch` and `transformers` packages (see the `make-biogpt-conda-env` script)
272 |   - Inputs
273 |     - `icd-10-cm-codes/icd10cm_codes_2019.txt`
274 |     - `icd-10-cm-codes/icd10cm_codes_2020.txt`
275 |     - `icd-10-cm-codes/icd10cm_codes_2021.txt`
276 |     - `icd-10-cm-codes/icd10cm_codes_2022.txt`
277 |   - Outputs
278 |     - An `icd-10-cm-embeddings` directory with subdirectories corresponding to each year, and subsubdirectories with files whose names correspond to the ICD-10-CM code holding R .rds files with the code, description, and BioGPT embedding values stored as a `data.frame`.
279 | - `1-compress-icd-10-embeddings.R`
280 |   - Purpose - fit the autoencoders that compress the BioGPT-Large embeddings
281 |   - Dependencies
282 |     - R files: `autoencoder.R`
283 |   - Inputs
284 |     - Files in the `icd-10-cm-embeddings/2019` directory.
285 |   - Outputs
286 |     - `model-performance.rds` holding the model performance table, and the fitted models used to create the compressed embeddings in the `autoencoder-models` directory.
287 | - `2-validation.R`
288 |   - Purpose - validate the selected autoencoders using ICD-10-CM codes from other years
289 |   - Dependencies
290 |     - R files: `autoencoder.R`
291 |   - Inputs
292 |     - Files in the `autoencoder-models` directory.
293 |     - Files in the `icd-10-cm-embeddings` directory for all years (2019-2022).
294 |   - Outputs
295 |     - `year-validation.rds` holding a data frame of the autoencoder year-validation model performance.
296 | - `3-create-datasets.R`
297 |   - Purpose - create the compressed embedding data sets for distribution
298 |   - Dependencies
299 |     - R files: `autoencoder.R`
300 |   - Inputs
301 |     - Files in the `autoencoder-models` and `icd-10-cm-embeddings` (2019-2022) directories.
302 |   - Outputs
303 |     - Files in the `embedding-data` directory holding the embedding values as .csv files for all year-dimension combinations.
304 | - `4-estimate-leading-char.R`
305 |   - Purpose - estimate each code's leading-character category from the compressed embeddings
306 |   - Dependencies
307 |     - R files: `alpha-char-model.R`
308 |   - Inputs
309 |     - Files in the `embedding-data` directory.
310 |   - Outputs
311 |     - Files in the `luz-supervised-models` directory holding the `luz` package representation of the fitted models.
312 |     - The `sup-model-perf.rds` file containing a `data.frame` summarizing the supervised model performance.
313 | 
314 | © Michael J. Kane (kaneplusplus at proton mail dot com)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | output: md_document
3 | ---
4 | 
5 | 
6 | 
7 | 
8 | 
9 | # Compressed, Large-Language-Model Embedded Datasets of ICD-10-CM Descriptions
10 | 
11 | ## Citing this work
12 | 
13 | ```
14 | @article{kane2023llm,
15 |   author = {Michael J. Kane and Casey King and Denise Esserman and Nancy K. Latham and Erich J. Greene and David A. Ganz},
16 |   title = {A Compressed Large Language Model Embedding Dataset of ICD 10 CM Descriptions},
17 |   elocation-id = {2023.04.24.23289046},
18 |   year = {2023},
19 |   doi = {10.1101/2023.04.24.23289046},
20 |   publisher = {Cold Spring Harbor Laboratory Press},
21 |   URL = {https://www.medrxiv.org/content/early/2023/05/15/2023.04.24.23289046.1},
22 |   eprint = {https://www.medrxiv.org/content/early/2023/05/15/2023.04.24.23289046.1.full.pdf},
23 |   journal = {medRxiv}
24 | }
25 | ```
26 | 
27 | ## License
28 | 
29 | The code in this repository is licensed under [GPL v2](https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html) and the data
30 | are licensed under [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/).
31 | 
32 | ## Funding
33 | 
34 | This work was supported by the National Institute on Aging of the National Institutes of Health (NIH) through a project grant to Yale University (1R01AG071528). The organizations funding this study had no role in the design or conduct of the study; in the collection, management, analysis, or interpretation of the data; or in the preparation, review, or approval of the manuscript. The content of this publication is solely the responsibility of the authors and does not necessarily represent the official views of the National Institutes of Health, the Department of Veterans Affairs, or the United States government.
35 | 
36 | ## ICD-10-CM Datasets
37 | 
38 | ### 2022
39 | 
40 | 1. [ICD-10-CM, 10-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2022-0010.csv.gz?raw=true)
41 | 1. [ICD-10-CM, 50-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2022-0050.csv.gz?raw=true)
42 | 1. [ICD-10-CM, 100-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2022-0100.csv.gz?raw=true)
43 | 1. [ICD-10-CM, 1000-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2022-1000.csv.gz?raw=true)
45 | 1. [ICD-10-CM, 42,384-dimensions (not compressed)](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2022-full.csv.gz?raw=true)
46 | 
47 | ### 2021
48 | 
49 | 1. [ICD-10-CM, 10-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2021-0010.csv.gz?raw=true)
50 | 1. [ICD-10-CM, 50-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2021-0050.csv.gz?raw=true)
51 | 1. [ICD-10-CM, 100-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2021-0100.csv.gz?raw=true)
52 | 1. [ICD-10-CM, 1000-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2021-1000.csv.gz?raw=true)
53 | 1. [ICD-10-CM, 42,384-dimensions (not compressed)](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2021-full.csv.gz?raw=true)
54 | 
55 | ### 2020
56 | 
57 | 1. [ICD-10-CM, 10-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2020-0010.csv.gz?raw=true)
58 | 1. [ICD-10-CM, 50-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2020-0050.csv.gz?raw=true)
59 | 1. [ICD-10-CM, 100-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2020-0100.csv.gz?raw=true)
60 | 1. [ICD-10-CM, 1000-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2020-1000.csv.gz?raw=true)
61 | 
62 | ### 2019
63 | 
64 | 1. [ICD-10-CM, 10-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2019-0010.csv.gz?raw=true)
65 | 1. [ICD-10-CM, 50-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2019-0050.csv.gz?raw=true)
66 | 1. [ICD-10-CM, 100-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2019-0100.csv.gz?raw=true)
67 | 1. [ICD-10-CM, 1000-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2019-1000.csv.gz?raw=true)
68 | 1. [ICD-10-CM, 42,384-dimensions (not compressed)](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2019-full.csv.gz?raw=true)
69 | 
70 | ## Overview
71 | 
72 | The International Classification of Diseases, 10th Revision, Clinical Modification ([ICD-10-CM](https://www.cdc.gov/nchs/icd/icd-10-cm.htm)) is a standardized classification system used for diagnosing diseases, disorders, and health conditions. It plays a crucial role in analyzing electronic medical records (EMRs) or electronic health records (EHRs). However, the high dimensionality of ICD-10-CM codes and their hierarchical structure make their incorporation into statistical and machine learning analyses challenging. Traditional contrast encoding methods like one-hot and treatment coding may not fully capture the hierarchical information of the codes. Large language models (LLMs) generate contextualized embeddings that capture the semantic relationships between codes more effectively. This repository provides data sets of ICD-10-CM codes mapped to embeddings generated using the [BioGPT Large Language Model](https://academic.oup.com/bib/article/23/6/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9&login=false). The embeddings provide informative input features for machine learning models, and dimension-reduced versions in 1,000, 100, 50, and 10 dimensions are provided. Validation for both the dimension reduction and the representation of the embeddings is shown below.
The readily available datasets are anticipated to be highly valuable for researchers incorporating ICD-10-CM codes into their analyses, retaining contextual information, and enabling more advanced analyses in the field.
73 | 
74 | The data sets and the code used to generate them are available at https://github.com/kaneplusplus/icd-10-cm-embedding. The data are licensed under the [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode). The code is
75 | licensed under [GPL-v2](https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html).
76 | 
77 | ## Model description and performance
78 | 
79 | The data provided are generated by embedding the ICD-10-CM descriptions using the BioGPT-Large model, which tokenizes each textual phrase and maps the tokens to unique vocabulary IDs, resulting in a sequence of continuous embedding vectors. The embeddings are then contextualized by passing them through the model's layers with an attention mask. The resulting embeddings lie in a 42,384-dimensional space and are compressed using an autoencoder with fully connected layers of decreasing and then increasing sizes up to the output layer. The autoencoder structure is the same for models with larger dimensions, with only the appropriate layers retained.
80 | 
81 | ### Validating the dimension reduction
82 | 
83 | 
84 | 
85 | Table: The autoencoder parameters and performance ordered by increasing validation loss.
86 | 
87 | | Embedding Dimension| Batch Size| Training Loss| Validation Loss|
88 | |-------------------:|----------:|-------------:|---------------:|
89 | | 50| 256| 0.390| 0.377|
90 | | 100| 64| 0.710| 0.435|
91 | | 50| 128| 0.470| 0.438|
92 | | 1000| 128| 1.106| 0.479|
93 | | 1000| 64| 1.299| 0.484|
94 | | 10| 128| 0.614| 0.605|
95 | | 10| 256| 0.610| 0.610|
96 | | 10| 64| 0.693| 0.634|
97 | | 100| 128| 11.647| 0.653|
98 | | 50| 64| 0.757| 0.658|
99 | | 100| 256| 1.418| 0.805|
100 | | 1000| 256| 0.863| 0.847|
101 | 
102 | 
103 | 
104 | The autoencoder compressing the LLM embedding was fit on the 2019 ICD-10-CM descriptions
105 | for 30 epochs with batch sizes of 64, 128, and 256; a mean-square error loss between
106 | the embedding and the autoencoder estimate; and a validation data set comprising
107 | a random subset of 10\% of the samples. The model performance is shown above.
108 | Based on these results, the models with the best validation loss were selected for distribution.
109 | 
110 | 
111 | 
112 | Table: The autoencoder year validation performance ordered by year.
113 | 
114 | | Year| Embedding Dimension|   MSE| Coef. of Determination|
115 | |----:|-------------------:|-----:|----------------------:|
116 | | 2019| 10| 0.600| 0.087|
117 | | 2019| 50| 0.372| 0.054|
118 | | 2019| 100| 0.431| 0.062|
119 | | 2019| 1000| 0.473| 0.068|
120 | | 2020| 10| 0.601| 0.087|
121 | | 2020| 50| 0.373| 0.054|
122 | | 2020| 100| 0.431| 0.062|
123 | | 2020| 1000| 0.474| 0.068|
124 | | 2021| 10| 0.602| 0.087|
125 | | 2021| 50| 0.374| 0.054|
126 | | 2021| 100| 0.432| 0.062|
127 | | 2021| 1000| 0.475| 0.069|
128 | | 2022| 10| 0.602| 0.087|
129 | | 2022| 50| 0.374| 0.054|
130 | | 2022| 100| 0.433| 0.063|
131 | | 2022| 1000| 0.475| 0.069|
132 | 
133 | 
134 | 
135 | In addition to the 2019 validation, the models selected for distribution were
136 | tested on the 2020-2022 data sets to ensure their performance is comparable
137 | across years.
It should be noted that the ICD-10-CM codes do not vary much from
138 | one year to the next,
139 | so we should not expect large differences. As expected, the mean square error
140 | and coefficients of determination are similar to the 2019 data.
141 | 
142 | ### Validating the embedding representation
143 | 
144 | To validate the compressed embeddings, the hierarchical information in the ICD-10-CM codes was used to ensure that relevant relationships were preserved. The leading letter and first two numeric values categorize codes, allowing a supervised model to estimate the categories at a rate higher than chance. The training data used a one-hot encoding of the ICD-10-CM categories as the dependent variable and the compressed embedding values as the independent variables. The model consisted of two hidden layers with 100 nodes each, used categorical cross-entropy as the loss function, and was trained for 30 epochs; performance was evaluated in terms of accuracy and balanced accuracy. Lower-dimensional embeddings lose more predictive information, as is typical for this type of problem.
145 | 
146 | 
147 | 
148 | Table: The supervised models' performance ordered by increasing embedding dimension.
149 | 
150 | | Embedding Dimension| Accuracy| Balanced Accuracy|
151 | |-------------------:|--------:|-----------------:|
152 | | 10| 0.815| 0.698|
153 | | 50| 0.925| 0.873|
154 | | 100| 0.935| 0.891|
155 | | 1000| 0.960| 0.927|
156 | 
157 | 
158 | 
159 | Of note, the goal in presenting these results is not necessarily to
160 | maximize the prediction accuracy. Rather, it is to show that the embedding retains the
161 | hierarchical information in the ICD-10-CM codes. Some of the codes correspond to
162 | conditions that could be classified in several ways, and as a result coding
163 | for at least some of the conditions might be considered non-systematic.
164 | 
165 | ## An example using the embedding data in R
166 | 
167 | To conclude, we present a simple example of how one might use the embedding
168 | information in the R programming environment. Suppose we would like to
169 | visualize the ICD-10-CM codes beginning with G (diseases of the nervous system),
170 | I (diseases of the circulatory system), J (diseases of the respiratory system),
171 | and K (diseases of the digestive system) to better understand the relationships
172 | between these categories or specific conditions in the 50-dimensional
173 | embedding. For convenience, the project's page includes an `.rds` file
174 | containing the available embeddings along with their URLs, which can be
175 | retrieved from the R console. The code categories can then be visualized
176 | by performing another dimension reduction (in this case with the
177 | Rtsne package) to 2 dimensions and presenting the result
178 | in a scatter plot as shown below.
179 | 
180 | 
181 | 
182 | ```r
183 | library(dplyr)
184 | library(ggplot2)
185 | library(readr)
186 | library(Rtsne)
187 | library(stringr)
188 | 
189 | # Download the locations of the embeddings.
190 | tf = tempfile()
191 | download.file(
192 |   "https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/icd10_dl.rds?raw=true",
193 |   tf
194 | )
195 | dl = readRDS(tf)
196 | 
197 | # Download and read the 2019, 50-dimensional embeddings.
198 | tf = tempfile()
199 | download.file(
200 |   dl$url[dl$year == 2019 & dl$emb_dim == 50],
201 |   tf
202 | )
203 | 
204 | icd10s = read_csv(tf) |>
205 |   filter(str_detect(code, "^(G|I|J|K)")) |>
206 |   mutate(desc = tolower(desc)) |>
207 |   mutate(`Leading Letter` = str_sub(code, 1, 1))
208 | 
209 | # Fit tSNE to the embedding.
210 | tsne_fit = icd10s |>
211 |   select(starts_with("V")) |>
212 |   scale() |>
213 |   Rtsne(perplexity = 10)
214 | 
215 | # Bind the tSNE values to the data set.
216 | icd10p = bind_cols(
217 |   icd10s |>
218 |     select(-starts_with("V")),
219 |   tsne_fit$Y |>
220 |     as.data.frame() |>
221 |     rename(tSNE1="V1", tSNE2="V2") |>
222 |     as_tibble()
223 | )
224 | 
225 | # Visualize the results.
226 | ggplot(icd10p, aes(x = tSNE1, y = tSNE2, color = `Leading Letter`)) +
227 |   geom_point() +
228 |   theme_minimal()
229 | ```
230 | 
231 | ![plot of chunk unnamed-chunk-2](figure/unnamed-chunk-2-1.png)
232 | 
233 | The visualization shows that a subset of the circulatory diseases (I) and
234 | nervous system diseases (G) are well-differentiated from other conditions. It
235 | also shows overlap between other conditions related to K (digestive diseases),
236 | J (respiratory diseases), and I (circulatory).
237 | 
238 | ## A SAS example
239 | 
240 | ```sas
241 | /* Options */
242 | %let dlyear=2019;    /* code year; can be 2019, 2020, 2021, 2022 */
243 | %let dldim=50;       /* encoding dimensions; can be 1000, 100, 50, 10 */
244 | %let tempdir=D:;     /* directory for temporary file */
245 | %let pathsep=\;      /* path separator; \ for Windows, / for *NIX */
246 | %let dsname=icd10cm; /* name for the final dataset */
247 | %let target=icd-10-cm-&dlyear-%sysfunc(putn(&dldim,z4.)).csv.gz;
248 | %let tempfile=&tempdir&pathsep&target;
249 | 
250 | /* Download gzipped file to a temp location */
251 | /* -- filename url and filename zip methods don't stack */
252 | filename rawdl "&tempfile";
253 | proc http
254 |   url="https://github.com/kaneplusplus/icd-10-cm-embedding/raw/main/embedding-data/&target"
255 |   out=rawdl;
256 | run;
257 | 
258 | /* Read the downloaded temp file into a dataset */
259 | filename codes ZIP "&tempfile" GZIP;
260 | %macro vlist;
261 |   %local i;
262 |   %do i=1 %to &dldim; V&i %end;
263 | %mend;
264 | data &dsname;
265 |   informat code $8. desc $256. %vlist best.;
266 |   infile codes delimiter=',' firstobs=2 dsd;
267 |   input code $ desc $ %vlist;
268 | run;
269 | ```
270 | 
271 | ## Reproducing these results
272 | 
273 | R version: >= 4.2
274 | 
275 | R package dependencies:
276 | 
277 | - `arrow`
278 | - `torch`
279 | - `reticulate`
280 | - `dplyr`
281 | - `tidyr`
282 | - `purrr`
283 | - `foreach`
284 | - `itertools`
285 | - `readr`
286 | - `luz`
288 | - `tibble`
289 | - `progress`
290 | - `stringr`
291 | - `yardstick`
292 | 
293 | Scripts
294 | 
295 | - `0-make-embeddings.R`
296 |   - Purpose - create the BioGPT-Large embeddings of the ICD-10-CM descriptions
297 |   - Dependencies
298 |     - A conda environment with the `torch` and `transformers` packages (see the `make-biogpt-conda-env` script)
299 |   - Inputs
300 |     - `icd-10-cm-codes/icd10cm_codes_2019.txt`
301 |     - `icd-10-cm-codes/icd10cm_codes_2020.txt`
302 |     - `icd-10-cm-codes/icd10cm_codes_2021.txt`
303 |     - `icd-10-cm-codes/icd10cm_codes_2022.txt`
304 |   - Outputs
305 |     - An `icd-10-cm-embeddings` directory with subdirectories corresponding to each year, and subsubdirectories with files whose names correspond to the ICD-10-CM code holding R .rds files with the code, description, and BioGPT embedding values stored as a `data.frame`.
306 | - `1-compress-icd-10-embeddings.R`
307 |   - Purpose - fit the autoencoders that compress the BioGPT-Large embeddings
308 |   - Dependencies
309 |     - R files: `autoencoder.R`
310 |   - Inputs
311 |     - Files in the `icd-10-cm-embeddings/2019` directory.
312 |   - Outputs
313 |     - `model-performance.rds` holding the model performance table, and the fitted models used to create the compressed embeddings in the `autoencoder-models` directory.
314 | - `2-validation.R`
315 |   - Purpose - validate the selected autoencoders using ICD-10-CM codes from other years
316 |   - Dependencies
317 |     - R files: `autoencoder.R`
318 |   - Inputs
319 |     - Files in the `autoencoder-models` directory.
320 |     - Files in the `icd-10-cm-embeddings` directory for all years (2019-2022).
321 |   - Outputs
322 |     - `year-validation.rds` holding a data frame of the autoencoder year-validation model performance.
323 | - `3-create-datasets.R`
324 |   - Purpose - create the compressed embedding data sets for distribution
325 |   - Dependencies
326 |     - R files: `autoencoder.R`
327 |   - Inputs
328 |     - Files in the `autoencoder-models` and `icd-10-cm-embeddings` (2019-2022) directories.
329 |   - Outputs
330 |     - Files in the `embedding-data` directory holding the embedding values as .csv files for all year-dimension combinations.
331 | - `4-estimate-leading-char.R`
332 |   - Purpose - estimate each code's leading-character category from the compressed embeddings
333 |   - Dependencies
334 |     - R files: `alpha-char-model.R`
335 |   - Inputs
336 |     - Files in the `embedding-data` directory.
337 |   - Outputs
338 |     - Files in the `luz-supervised-models` directory holding the `luz` package representation of the fitted models.
339 |     - The `sup-model-perf.rds` file containing a `data.frame` summarizing the supervised model performance.
340 | 
341 | © Michael J. Kane (kaneplusplus at proton mail dot com)
--------------------------------------------------------------------------------
/README_files/figure-gfm/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kaneplusplus/icd-10-cm-embedding/a63d21c7d8f90419515bcfc2b0fce4281a6f1e62/README_files/figure-gfm/unnamed-chunk-2-1.png
--------------------------------------------------------------------------------
/README_files/figure-markdown_strict/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kaneplusplus/icd-10-cm-embedding/a63d21c7d8f90419515bcfc2b0fce4281a6f1e62/README_files/figure-markdown_strict/unnamed-chunk-2-1.png
--------------------------------------------------------------------------------
/alpha-char-model.R:
--------------------------------------------------------------------------------
1 | library(torch)
2 | library(tibble)
3 | library(dplyr)
4 | library(purrr)
5 | library(foreach)
6 | library(readr)
7 | 
8 | # A torch dataset pairing each compressed embedding (x, the V* columns)
9 | # with a one-hot encoding of its leading-character category (y).
10 | AlphaCharEmbedding = dataset(
11 |   name = "AlphaCharEmbedding",
12 |   initialize = function(icd10_emb, output_levels) {
13 |     self$x = icd10_emb |>
14 |       collect()
15 |     lc = contr.treatment(output_levels, contrasts = FALSE)
16 |     adf = map_dfr(self$x$code, ~ lc[.x,])
17 |     names(adf) = paste0("alpha_", names(adf))
18 |     self$x = bind_cols(adf, icd10_emb)
19 |     self$v_width = ncol(self$x |> select(starts_with("V")))
20 |   },
21 |   width = function() {
22 |     self$v_width
23 |   },
24 |   .getitem = function(x) {
25 |     list(
26 |       x = torch_tensor(select(self$x[x,], starts_with("V")) |> unlist()),
27 |       y = torch_tensor(select(self$x[x,], starts_with("alpha")) |> unlist())
28 |     )
29 |   },
30 |   .length = function() {
31 |     nrow(self$x)
32 |   }
33 | )
34 | 
35 | # A feed-forward classifier: chained linear layers with a softmax output.
36 | AlphaCodeEstimator =
nn_module(
34 |   initialize = function(layers) {
35 |     self$feature_net = nn_module_list(
36 |       foreach(i = seq_along(layers)[-1]) %do% {
37 |         nn_linear(layers[i-1], layers[i])
38 |       }
39 |     )
40 |   },
41 |   forward = function(x) {
42 |     for (i in seq_along(self$feature_net)) {
43 |       x = self$feature_net[[i]](x)
44 |     }
45 |     nnf_softmax(x, dim = 2)
46 |   }
47 | )
48 | 
49 | 
--------------------------------------------------------------------------------
/autoencoder.R:
--------------------------------------------------------------------------------
1 | library(torch)
2 | library(tibble)
3 | library(dplyr)
4 | library(purrr)
5 | library(foreach)
6 | 
7 | ICD10Embedding = dataset(
8 |   name = "ICD10Embedding",
9 |   initialize = function(files, device = "mps") {
10 |     self$files = files
11 |     self$device = device
12 |   },
13 |   .getitem = function(i) {
14 |     ret = readRDS(self$files[i])
15 |     x = torch_tensor(ret$emb[[1]], device = self$device,
16 |                      dtype = torch_float())
17 |     list(x = x, y = x$clone())
18 |   },
19 |   .length = function() {
20 |     length(self$files)
21 |   }
22 | )
23 | 
24 | ICD10AutoEncoder = nn_module(
25 |   initialize = function(layers) {
26 |     if (length(layers) %% 2 != 1) {
27 |       stop("The number of layers must be odd.")
28 |     }
29 |     encoder_layers = layers[1:ceiling(length(layers) / 2)]
30 |     decoder_layers = layers[length(encoder_layers):length(layers)]
31 |     self$encoder = nn_module_list(
32 |       foreach(i = seq_along(encoder_layers)[-1]) %do% {
33 |         nn_linear(encoder_layers[i-1], encoder_layers[i])
34 |       }
35 |     )
36 |     self$decoder = nn_module_list(
37 |       foreach(i = seq_along(decoder_layers)[-1]) %do% {
38 |         nn_linear(decoder_layers[i-1], decoder_layers[i])
39 |       }
40 |     )
41 |   },
42 |   run_forward = function(x, m) {
43 |     for (i in seq_along(m)) {
44 |       x = m[[i]](x)
45 |     }
46 |     x
47 |   },
48 |   encode = function(x) {
49 |     self$run_forward(x, self$encoder)
50 |   },
51 |   decode = function(x) {
52 |     self$run_forward(x, self$decoder)
53 |   },
54 |   forward = function(x) {
55 |     x |>
56 |       self$encode() |>
57 |       self$decode()
58 |   }
59 | )
60 | 
61 | 
--------------------------------------------------------------------------------
/bmc-bioinformatics-paper/bmc_article.bib:
--------------------------------------------------------------------------------
1 | % bmc_article.bib
2 | %
3 | % An example of bibtex entries.
4 | % Entries taken from BMC instructions for authors page.
5 | 6 | % uncomment next line to make author-year bibliography 7 | % @settings{label, options="nameyear"} 8 | 9 | @article{mikolov2013, 10 | title={Efficient estimation of word representations in vector space}, 11 | author={Mikolov, Tomas and Chen, Kai and Corrado, Greg and Dean, Jeffrey}, 12 | journal={arXiv preprint arXiv:1301.3781}, 13 | year={2013} 14 | } 15 | 16 | @inproceedings{cui2vec, 17 | title={Clinical concept embeddings learned from massive sources of multimodal medical data}, 18 | author={Beam, Andrew L and Kompa, Benjamin and Schmaltz, Allen and Fried, Inbar and Weber, Griffin and Palmer, Nathan and Shi, Xu and Cai, Tianxi and Kohane, Isaac S}, 19 | booktitle={Pacific Symposium on Biocomputing 2020}, 20 | pages={295--306}, 21 | year={2019}, 22 | organization={World Scientific} 23 | } 24 | 25 | @article{vaswani2017, 26 | title={Attention is all you need}, 27 | author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia}, 28 | journal={Advances in neural information processing systems}, 29 | volume={30}, 30 | year={2017} 31 | } 32 | 33 | @article{church2017, 34 | title={Word2Vec}, 35 | author={Church, Kenneth Ward}, 36 | journal={Natural Language Engineering}, 37 | volume={23}, 38 | number={1}, 39 | pages={155--162}, 40 | year={2017}, 41 | publisher={Cambridge University Press} 42 | } 43 | 44 | @INPROCEEDINGS{medbert, 45 | author={Vasantharajan, Charangan and Tun, Kyaw Zin and Thi-Nga, Ho and Jain, Sparsh and Rong, Tong and Siong, Chng Eng}, 46 | booktitle={2022 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)}, 47 | title={MedBERT: A Pre-trained Language Model for Biomedical Named Entity Recognition}, 48 | year={2022}, 49 | volume={}, 50 | number={}, 51 | pages={1482-1488}, 52 | doi={10.23919/APSIPAASC55919.2022.9980157} 53 | } 54 | 55 | @inproceedings{balancedaccuracy, 56 | title={The balanced accuracy and its posterior distribution}, 57 | author={Brodersen, Kay Henning and Ong, Cheng Soon and Stephan, Klaas Enno and Buhmann, Joachim M}, 58 | booktitle={2010 20th international conference on pattern recognition}, 59 | pages={3121--3124}, 60 | year={2010}, 61 | organization={IEEE} 62 | } 63 | 64 | @article{msmarco, 65 | title={Ms marco: A human-generated machine reading comprehension dataset}, 66 | author={Nguyen, Tri and Rosenberg, Mir and Song, Xia and Gao, Jianfeng and Tiwary, Saurabh and Majumder, Rangan and Deng, Li}, 67 | year={2016} 68 | } 69 | 70 | @article{umls, 71 | title={The unified medical language system (UMLS): integrating biomedical terminology}, 72 | author={Bodenreider, Olivier}, 73 | journal={Nucleic acids research}, 74 | volume={32}, 75 | number={suppl\_1}, 76 | pages={D267--D270}, 77 | year={2004}, 78 | publisher={Oxford University Press} 79 | } 80 | 81 | @article{pubmedbertqa, 82 | title={Improved Methods To Aid Unsupervised Evidence-Based Fact Checking For Online Health News}, 83 | author={Deka, Pritam and Jurek-Loughrey, Anna and Deepak, P}, 84 | journal={Journal of Data Intelligence}, 85 | volume={3}, 86 | number={4}, 87 | pages={474--504}, 88 | year={2022} 89 | } 90 | 91 | @inproceedings{pubmedbertfull, 92 | title = "Self-Alignment Pretraining for Biomedical Entity Representations", 93 | author = "Liu, Fangyu and 94 | Shareghi, Ehsan and 95 | Meng, Zaiqiao and 96 | Basaldella, Marco and 97 | Collier, Nigel", 98 | booktitle = "Proceedings of the 2021 Conference of the North American Chapter of the 
Association for Computational Linguistics: Human Language Technologies",
99 |     month = jun,
100 |     year = "2021",
101 |     address = "Online",
102 |     publisher = "Association for Computational Linguistics",
103 |     url = "https://www.aclweb.org/anthology/2021.naacl-main.334",
104 |     pages = "4228--4238",
105 |     abstract = "Despite the widespread success of self-supervised learning via masked language models (MLM), accurately capturing fine-grained semantic relationships in the biomedical domain remains a challenge. This is of paramount importance for entity-level tasks such as entity linking where the ability to model entity relations (especially synonymy) is pivotal. To address this challenge, we propose SapBERT, a pretraining scheme that self-aligns the representation space of biomedical entities. We design a scalable metric learning framework that can leverage UMLS, a massive collection of biomedical ontologies with 4M+ concepts. In contrast with previous pipeline-based hybrid systems, SapBERT offers an elegant one-model-for-all solution to the problem of medical entity linking (MEL), achieving a new state-of-the-art (SOTA) on six MEL benchmarking datasets. In the scientific domain, we achieve SOTA even without task-specific supervision. With substantial improvement over various domain-specific pretrained MLMs such as BioBERT, SciBERT and PubMedBERT, our pretraining scheme proves to be both effective and robust.",
106 | }
107 | 
108 | 
109 | @misc{icd10cm,
110 |   title = {{ICD-10-CM}},
111 |   author = {{The Centers for Disease Control and Prevention (CDC)}},
112 |   url = {https://www.cdc.gov/nchs/icd/icd-10-cm.htm},
113 |   note = {Accessed: 2023-04-15}
114 | }
115 | 
116 | @article{rasmy2021,
117 |   title={Med-BERT: pretrained contextualized embeddings on large-scale structured electronic health records for disease prediction},
118 |   author={Rasmy, Laila and Xiang, Yang and Xie, Ziqian and Tao, Cui and Zhi, Degui},
119 |   journal={NPJ digital medicine},
120 |   volume={4},
121 |   number={1},
122 |   pages={86},
123 |   year={2021},
124 |   publisher={Nature Publishing Group UK London}
125 | }
126 | 
127 | @article{lee2020,
128 |   title={BioBERT: a pre-trained biomedical language representation model for biomedical text mining},
129 |   author={Lee, Jinhyuk and Yoon, Wonjin and Kim, Sungdong and Kim, Donghyeon and Kim, Sunkyu and So, Chan Ho and Kang, Jaewoo},
130 |   journal={Bioinformatics},
131 |   volume={36},
132 |   number={4},
133 |   pages={1234--1240},
134 |   year={2020},
135 |   publisher={Oxford University Press}
136 | }
137 | 
138 | @article{raffel2020,
139 |   title={Exploring the limits of transfer learning with a unified text-to-text transformer},
140 |   author={Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J},
141 |   journal={The Journal of Machine Learning Research},
142 |   volume={21},
143 |   number={1},
144 |   pages={5485--5551},
145 |   year={2020},
146 |   publisher={JMLR.org}
147 | }
148 | 
149 | @article{radford2018,
150 |   title={Improving language understanding with unsupervised learning},
151 |   author={Radford, Alec and Narasimhan, Karthik and Salimans, Tim and Sutskever, Ilya},
152 |   year={2018},
153 |   publisher={Technical report, OpenAI}
154 | }
155 | 
156 | @article{devlin2018,
157 |   title={Bert: Pre-training of deep bidirectional transformers for language understanding},
158 |   author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
159 |   journal={arXiv preprint 
arXiv:1810.04805}, 160 | year={2018} 161 | } 162 | 163 | @article{huang2019, 164 | title={Clinicalbert: Modeling clinical notes and predicting hospital readmission}, 165 | author={Huang, Kexin and Altosaar, Jaan and Ranganath, Rajesh}, 166 | journal={arXiv preprint arXiv:1904.05342}, 167 | year={2019} 168 | } 169 | 170 | @article{alsentzer2019, 171 | title={Publicly available clinical BERT embeddings}, 172 | author={Alsentzer, Emily and Murphy, John R and Boag, Willie and Weng, Wei-Hung and Jin, Di and Naumann, Tristan and McDermott, Matthew}, 173 | journal={arXiv preprint arXiv:1904.03323}, 174 | year={2019} 175 | } 176 | 177 | @inproceedings{med2vec, 178 | title={Multi-layer representation learning for medical concepts}, 179 | author={Choi, Edward and Bahadori, Mohammad Taha and Searles, Elizabeth and Coffey, Catherine and Thompson, Michael and Bost, James and Tejedor-Sojo, Javier and Sun, Jimeng}, 180 | booktitle={proceedings of the 22nd ACM SIGKDD international conference on knowledge discovery and data mining}, 181 | pages={1495--1504}, 182 | year={2016} 183 | } 184 | 185 | @article{ehr2vec, 186 | title={EHR2Vec: representation learning of medical concepts from temporal patterns of clinical notes based on self-attention mechanism}, 187 | author={Wang, Li and Wang, Qinghua and Bai, Heming and Liu, Cong and Liu, Wei and Zhang, Yuanpeng and Jiang, Lei and Xu, Huji and Wang, Kai and Zhou, Yunyun}, 188 | journal={Frontiers in Genetics}, 189 | volume={11}, 190 | pages={630}, 191 | year={2020}, 192 | publisher={Frontiers Media SA} 193 | } 194 | 195 | @inproceedings{inpatient2vec, 196 | title={Inpatient2vec: Medical representation learning for inpatients}, 197 | author={Wang, Ying and Xu, Xiao and Jin, Tao and Li, Xiang and Xie, Guotong and Wang, Jianmin}, 198 | booktitle={2019 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)}, 199 | pages={1113--1117}, 200 | year={2019}, 201 | organization={IEEE} 202 | } 203 | 204 | @Manual{rcore, 205 | title = {R: A Language and Environment for Statistical Computing}, 206 | author = {{R Core Team}}, 207 | organization = {R Foundation for Statistical Computing}, 208 | address = {Vienna, Austria}, 209 | year = {2023}, 210 | url = {https://www.R-project.org/}, 211 | } 212 | 213 | @Manual{dplyr, 214 | title = {dplyr: A Grammar of Data Manipulation}, 215 | author = {Hadley Wickham and Romain François and Lionel Henry and Kirill Müller and Davis Vaughan}, 216 | year = {2023}, 217 | note = {R package version 1.1.1}, 218 | url = {https://CRAN.R-project.org/package=dplyr}, 219 | } 220 | 221 | @Book{ggplot2, 222 | author = {Hadley Wickham}, 223 | title = {ggplot2: Elegant Graphics for Data Analysis}, 224 | publisher = {{Springer-Verlag}}, 225 | address = {New York}, 226 | year = {2016}, 227 | isbn = {978-3-319-24277-4}, 228 | url = {https://ggplot2.tidyverse.org} 229 | } 230 | 231 | @article{pubmed, 232 | title = {PubMed 2.0}, 233 | author = {White, Jacob}, 234 | journal = {Medical reference services quarterly}, 235 | volume = {39}, 236 | number = {4}, 237 | pages = {382--387}, 238 | year = {2020}, 239 | publisher = {Taylor \& Francis} 240 | } 241 | 242 | @article{pubmedcentral, 243 | title = {PubMed Central: The GenBank of the published literature}, 244 | author = {Roberts, Richard J}, 245 | journal = {Proceedings of the National Academy of Sciences}, 246 | volume = {98}, 247 | number = {2}, 248 | pages = {381--382}, 249 | year = {2001}, 250 | publisher = {National Acad Sciences} 251 | } 252 | 253 | @article{mimiciii, 254 | title = {MIMIC-III, 
a freely accessible critical care database}, 255 | author = {Johnson, Alistair EW and Pollard, Tom J and Shen, Lu and Lehman, Li-wei H and Feng, Mengling and Ghassemi, Mohammad and Moody, Benjamin and Szolovits, Peter and Anthony Celi, Leo and Mark, Roger G}, 256 | journal = {Scientific data}, 257 | volume = {3}, 258 | number = {1}, 259 | pages = {1--9}, 260 | year = {2016}, 261 | publisher = {Nature Publishing Group} 262 | } 263 | 264 | @Manual{readr, 265 | title = {readr: Read Rectangular Text Data}, 266 | author = {Hadley Wickham and Jim Hester and Jennifer Bryan}, 267 | year = {2023}, 268 | note = {R package version 2.1.4}, 269 | url = {https://CRAN.R-project.org/package=readr}, 270 | } 271 | 272 | @Manual{Rtsne, 273 | title = {{Rtsne}: T-Distributed Stochastic Neighbor Embedding using Barnes-Hut 274 | Implementation}, 275 | author = {Jesse H. Krijthe}, 276 | year = {2015}, 277 | note = {R package version 0.16}, 278 | url = {https://github.com/jkrijthe/Rtsne}, 279 | } 280 | 281 | @Manual{stringr, 282 | title = {stringr: Simple, Consistent Wrappers for Common String Operations}, 283 | author = {Hadley Wickham}, 284 | year = {2023}, 285 | note = {https://stringr.tidyverse.org, 286 | https://github.com/tidyverse/stringr}, 287 | } 288 | 289 | @article{luo2022, 290 | title={BioGPT: generative pre-trained transformer for biomedical text generation and mining}, 291 | author={Luo, Renqian and Sun, Liai and Xia, Yingce and Qin, Tao and Zhang, Sheng and Poon, Hoifung and Liu, Tie-Yan}, 292 | journal={Briefings in Bioinformatics}, 293 | volume={23}, 294 | number={6}, 295 | year={2022}, 296 | publisher={Oxford Academic} 297 | } 298 | 299 | @article{icd10, 300 | title={International classification of diseases 10th revision (ICD-10)}, 301 | author={DiSantostefano, Jan}, 302 | journal={The Journal for Nurse Practitioners}, 303 | volume={5}, 304 | number={1}, 305 | pages={56--57}, 306 | year={2009}, 307 | publisher={Elsevier} 308 | } 309 | 310 | @article{blank, 311 | author = {}, 312 | title = {}, 313 | journal = {}, 314 | year = {}, 315 | month = {}, 316 | volume = {}, 317 | number = {}, 318 | pages = {}, 319 | note = {} 320 | } 321 | 322 | % Article within a journal 323 | @article{koon, 324 | author = {Koonin, E V and Altschul, S F and P Bork}, 325 | title = {BRCA1 protein products: functional motifs}, 326 | journal = {Nat. 
Genet.}, 327 | year = {1996}, 328 | volume = {13}, 329 | pages = {266-267} 330 | } 331 | 332 | %%%%%%%% 333 | % Article within conference proceedings 334 | @inproceedings{xjon, 335 | author = {X Jones}, 336 | title = {Zeolites and synthetic mechanisms}, 337 | booktitle = {Proceedings of the First National Conference on 338 | Porous Sieves: 27-30 June 1996; Baltimore}, 339 | year = {1996}, 340 | editor = {Y Smith}, 341 | pages = {16-27}, 342 | } 343 | 344 | %%%%%%%% 345 | % Book chapter, or article within a book 346 | @incollection{schn, 347 | author = {E Schnepf}, 348 | title = {From prey via endosymbiont to plastids: 349 | comparative studies in dinoflagellates}, 350 | booktitle = {Origins of Plastids}, 351 | editor = {R A Lewin}, 352 | publisher = {Chapman and Hall}, 353 | pages = {53-76}, 354 | year = {1993}, 355 | address = {New York}, 356 | edition = {2nd} 357 | } 358 | 359 | %%%%%%%% 360 | % Complete book 361 | @book{marg, 362 | author = {L Margulis}, 363 | title = {Origin of Eukaryotic Cells}, 364 | publisher = {Yale University Press}, 365 | year = {1970}, 366 | address = {New Haven} 367 | } 368 | 369 | 370 | %%%%%%%% 371 | % PHD Thesis 372 | @phdthesis{koha, 373 | author = {R Kohavi}, 374 | title = {Wrappers for performance enhancement and 375 | obvious decision graphs}, 376 | school = {Stanford University, Computer Science Department}, 377 | year = {1995} 378 | } 379 | 380 | %%%%%%%% 381 | % Miscellaneous: webpage link/urL, etc/ 382 | @misc{issnic, 383 | author = {{ISSN International Centre}}, 384 | title = {The ISSN register}, 385 | url = {http://www.issn.org}, 386 | year = {2006}, 387 | urldate={Accessed 20 Feb 2007} 388 | } 389 | 390 | -------------------------------------------------------------------------------- /bmc-bioinformatics-paper/bmc_article.tex: -------------------------------------------------------------------------------- 1 | %% BioMed_Central_Tex_Template_v1.06 2 | %% % 3 | % bmc_article.tex ver: 1.06 % 4 | % % 5 | 6 | %%IMPORTANT: do not delete the first line of this template 7 | %%It must be present to enable the BMC Submission system to 8 | %%recognise this template!! 9 | 10 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 11 | %% %% 12 | %% LaTeX template for BioMed Central %% 13 | %% journal article submissions %% 14 | %% %% 15 | %% <8 June 2012> %% 16 | %% %% 17 | %% %% 18 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 19 | 20 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 21 | %% %% 22 | %% For instructions on how to fill out this Tex template %% 23 | %% document please refer to Readme.html and the instructions for %% 24 | %% authors page on the biomed central website %% 25 | %% https://www.biomedcentral.com/getpublished %% 26 | %% %% 27 | %% Please do not use \input{...} to include other tex files. %% 28 | %% Submit your LaTeX manuscript as one .tex document. %% 29 | %% %% 30 | %% All additional figures and files should be attached %% 31 | %% separately and not embedded in the \TeX\ document itself. %% 32 | %% %% 33 | %% BioMed Central currently use the MikTex distribution of %% 34 | %% TeX for Windows) of TeX and LaTeX. 
This is available from %% 35 | %% https://miktex.org/ %% 36 | %% %% 37 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 38 | 39 | %%% additional documentclass options: 40 | % [doublespacing] 41 | % [linenumbers] - put the line numbers on margins 42 | 43 | %%% loading packages, author definitions 44 | 45 | %\documentclass[twocolumn]{bmcart}% uncomment this for twocolumn layout and comment line below 46 | \documentclass{bmcart} 47 | 48 | %%% Load packages 49 | \usepackage{amsthm,amsmath} 50 | \usepackage{fancyvrb} 51 | \DefineVerbatimEnvironment{Code}{Verbatim}{} 52 | \DefineVerbatimEnvironment{CodeInput}{Verbatim}{fontshape=sl} 53 | \DefineVerbatimEnvironment{CodeOutput}{Verbatim}{} 54 | \newenvironment{CodeChunk}{}{} 55 | %\RequirePackage[numbers]{natbib} 56 | %\RequirePackage[authoryear]{natbib}% uncomment this for author-year bibliography 57 | %\RequirePackage{hyperref} 58 | \usepackage[utf8]{inputenc} %unicode support 59 | \usepackage{graphicx} 60 | %\usepackage[applemac]{inputenc} %applemac support if unicode package fails 61 | %\usepackage[latin1]{inputenc} %UNIX support if unicode package fails 62 | 63 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 64 | %% %% 65 | %% If you wish to display your graphics for %% 66 | %% your own use using includegraphic or %% 67 | %% includegraphics, then comment out the %% 68 | %% following two lines of code. %% 69 | %% NB: These line *must* be included when %% 70 | %% submitting to BMC. %% 71 | %% All figure files must be submitted as %% 72 | %% separate graphics through the BMC %% 73 | %% submission process, not included in the %% 74 | %% submitted article. %% 75 | %% %% 76 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 77 | 78 | %\def\includegraphic{} 79 | %\def\includegraphics{} 80 | 81 | %%% Put your definitions there: 82 | \startlocaldefs 83 | \endlocaldefs 84 | 85 | %%% Begin ... 86 | \begin{document} 87 | 88 | %%% Start of article front matter 89 | \begin{frontmatter} 90 | 91 | \begin{fmbox} 92 | \dochead{Databases} 93 | 94 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 95 | %% %% 96 | %% Enter the title of your article here %% 97 | %% %% 98 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 99 | 100 | \title{A Compressed Large Language Model Embedding Dataset of ICD 10 CM 101 | Descriptions} 102 | 103 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 104 | %% %% 105 | %% Enter the authors here %% 106 | %% %% 107 | %% Specify information, if available, %% 108 | %% in the form: %% 109 | %% ={,} %% 110 | %% = %% 111 | %% Comment or delete the keys which are %% 112 | %% not used. Repeat \author command as much %% 113 | %% as required. %% 114 | %% %% 115 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 116 | 117 | \author[ 118 | addressref={aff1}, % id's of addresses, e.g. 
{aff1,aff2} 119 | corref={aff1}, % id of corresponding address, if any 120 | % noteref={n1}, % id's of article notes, if any 121 | email={michael.kane@yale.edu} % email address 122 | ]{\inits{M.J.}\fnm{Michael J.} \snm{Kane}} 123 | \author[ 124 | addressref={aff2,aff3}, 125 | email={casey.king@yale.edu} 126 | ]{\inits{C.}\fnm{Casey} \snm{King}} 127 | \author[ 128 | addressref={aff1}, 129 | email={denise.esserman@yale.edu} 130 | ]{\inits{D.}\fnm{Denise} \snm{Esserman}} 131 | \author[ 132 | addressref={aff4}, 133 | email={nklatham@bwh.harvard.edu} 134 | ]{\inits{N.K.}\fnm{Nancy K.} \snm{Latham}} 135 | \author[ 136 | addressref={aff1}, 137 | email={erich.greene@yale.edu} 138 | ]{\inits{E.}\fnm{Erich J.} \snm{Greene}} 139 | \author[ 140 | addressref={aff5}, 141 | email={dganz@mednet.ucla.edu} 142 | ]{\inits{D.A.}\fnm{David A.} \snm{Ganz}} 143 | 144 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 145 | %% %% 146 | %% Enter the authors' addresses here %% 147 | %% %% 148 | %% Repeat \address commands as much as %% 149 | %% required. %% 150 | %% %% 151 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 152 | 153 | \address[id=aff1]{% % unique id 154 | \orgdiv{Department of Biostatistics}, % department, if any 155 | \orgname{School of Public Health}, % university, etc 156 | \street{Yale University}, 157 | \city{New Haven}, % city 158 | \cny{USA} % country 159 | } 160 | \address[id=aff2]{% 161 | \orgdiv{The Jackson School of Global Affairs} 162 | \orgname{Yale University}, 163 | %\street{}, 164 | %\postcode{} 165 | \city{New Haven}, 166 | \cny{USA} 167 | } 168 | \address[id=aff3]{% 169 | \orgdiv{US Healthcare and Life Sciences} 170 | \orgname{Microsoft}, 171 | %\street{}, 172 | %\postcode{} 173 | \city{Redmond}, 174 | \cny{USA} 175 | } 176 | \address[id=aff4]{% 177 | \orgdiv{Research Program in Men’s Health: Aging and Metabolism, Boston Claude D. Pepper Older Americans Independence Center for Function Promoting Therapies} 178 | \orgname{Brigham and Women’s Hospital}, 179 | %\street{}, 180 | %\postcode{} 181 | \city{Boston}, 182 | \cny{USA} 183 | } 184 | \address[id=aff5]{% 185 | \orgdiv{Department of Medicine} 186 | \orgname{VA Greater Los Angeles/UCLA}, 187 | %\street{}, 188 | %\postcode{} 189 | \city{Los Angeles}, 190 | \cny{USA} 191 | } 192 | 193 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 194 | %% %% 195 | %% Enter short notes here %% 196 | %% %% 197 | %% Short notes will be after addresses %% 198 | %% on first page. %% 199 | %% %% 200 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 201 | 202 | %\begin{artnotes} 203 | %%\note{Sample of title note} % note to the article 204 | %\note[id=n1]{Equal contributor} % note, connected to author 205 | %\end{artnotes} 206 | 207 | \end{fmbox}% comment this for two column layout 208 | 209 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 210 | %% %% 211 | %% The Abstract begins here %% 212 | %% %% 213 | %% Please refer to the Instructions for %% 214 | %% authors on https://www.biomedcentral.com/ %% 215 | %% and include the section headings %% 216 | %% accordingly for your article type. %% 217 | %% %% 218 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 219 | 220 | \begin{abstractbox} 221 | 222 | \begin{abstract} % abstract 223 | 224 | This paper presents novel datasets providing numerical representations of 225 | ICD-10-CM codes by generating description embeddings using a large language 226 | model followed by a dimension reduction via autoencoder. 
The embeddings
227 | serve as informative input features for machine learning models by capturing
228 | relationships among categories and preserving inherent context information.
229 | The model
230 | generating the data was validated in two ways. First, the dimension
231 | reduction was validated using an autoencoder, and second, a supervised model
232 | was created to estimate the ICD-10-CM hierarchical categories. Results show
233 | that the dimension of the data can be reduced to as few as 10 dimensions
234 | while maintaining the ability to reproduce the original embeddings, with
235 | fidelity decreasing as the dimension of the reduced representation decreases.
236 | Multiple compression levels are provided, allowing users to choose the level
237 | that suits their requirements and to download and use the data without any further setup.
238 | The readily available datasets of ICD-10-CM codes are
239 | anticipated to be highly valuable for researchers in biomedical informatics,
240 | enabling more advanced analyses in the field. This approach has the potential
241 | to significantly improve the utility of ICD-10-CM codes in the biomedical
242 | domain.
243 | 
244 | %\parttitle{First part title} %if any
245 | %Text for this section.
246 | 
247 | %\parttitle{Second part title} %if any
248 | %Text for this section.
249 | \end{abstract}
250 | 
251 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
252 | %%                                          %%
253 | %% The keywords begin here                  %%
254 | %%                                          %%
255 | %% Put each keyword in separate \kwd{}.     %%
256 | %%                                          %%
257 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
258 | 
259 | \begin{keyword}
260 | \kwd{large language model}
261 | \kwd{autoencoder}
262 | \kwd{ICD-10-CM}
263 | \kwd{electronic health records}
264 | \kwd{EHR}
265 | \kwd{NLP}
266 | \end{keyword}
267 | 
268 | % MSC classifications codes, if any
269 | %\begin{keyword}[class=AMS]
270 | %\kwd[Primary ]{}
271 | %\kwd{}
272 | %\kwd[; secondary ]{}
273 | %\end{keyword}
274 | 
275 | \end{abstractbox}
276 | %
277 | %\end{fmbox}% uncomment this for two column layout
278 | 
279 | \end{frontmatter}
280 | 
281 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
282 | %%                                            %%
283 | %% The Main Body begins here                  %%
284 | %%                                            %%
285 | %% Please refer to the instructions for       %%
286 | %% authors on:                                %%
287 | %% https://www.biomedcentral.com/getpublished %%
288 | %% and include the section headings           %%
289 | %% accordingly for your article type.         %%
290 | %%                                            %%
291 | %% See the Results and Discussion section     %%
292 | %% for details on how to create sub-sections  %%
293 | %%                                            %%
294 | %% use \cite{...} to cite references          %%
295 | %%  \cite{koon} and                           %%
296 | %%  \cite{oreg,khar,zvai,xjon,schn,pond}      %%
297 | %%                                            %%
298 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
299 | 
300 | %%%%%%%%%%%%%%%%%%%%%%%%% start of article main body
301 | %
302 | 
303 | %%%%%%%%%%%%%%%%
304 | %% Background %%
305 | %%
306 | 
307 | \section*{Background}
308 | 
309 | The International Classification of Diseases, 10th Revision,
310 | Clinical Modification (ICD-10-CM) \cite{icd10} is a
311 | standardized classification system
312 | for categorizing diseases, disorders, and health conditions. ICD-10 was
313 | developed by the World Health Organization (WHO) and adapted for use in the
314 | United States as ICD-10-CM by the National Center for Health
315 | Statistics (NCHS) \cite{icd10cm}. 
The standard plays a crucial role in 316 | the analysis of 317 | electronic medical records (EMRs) or electronic health records (EHRs) for 318 | several reasons: 319 | \begin{enumerate} 320 | \item{Consistency and Standardization: The ICD-10-CM allows for a consistent 321 | and standardized method of coding and documenting medical conditions across 322 | healthcare providers and facilities. This helps to ensure accurate and 323 | uniform data exchange, analysis, and comparison.} 324 | \item{Data Analysis and Research: The ICD-10-CM codes can be used to analyze 325 | patient data for clinical research, epidemiological studies, and public health 326 | surveillance. It helps to identify trends and patterns in diseases, monitor 327 | the effectiveness of treatments, and develop better prevention and management 328 | strategies.} 329 | \item{Quality Measurement and Improvement: ICD-10-CM codes can be used to 330 | evaluate the quality of care provided by healthcare facilities, monitor 331 | patient outcomes and identify areas for improvement. This information can 332 | be used to enhance the overall healthcare delivery system.} 333 | \item{Reimbursement and Billing: ICD-10-CM codes play a vital role in 334 | healthcare reimbursement by providing a standardized method to classify and 335 | report medical conditions. Insurance companies and other payers use these 336 | codes to determine appropriate payments for medical services rendered.} 337 | \item{Health Policy and Planning: ICD-10-CM codes help health authorities and 338 | policymakers to identify population health needs, allocate resources, and 339 | develop targeted healthcare policies and interventions.} 340 | \end{enumerate} 341 | 342 | While ICD-10-CM codes do provide a consistent and comprehensive set of 343 | categories, their incorporation into statistical and machine learning analyses 344 | can be challenging for several reasons. First, in the 2019 version of the 345 | standard, there were 71,932 categories, increasing to 72,184 categories in 346 | 2020; 72,616 categories in 2021; and 72,750 categories in 2022. As a result, 347 | analyses using these codes, where the set of codes is not restricted to a smaller 348 | set, 349 | must take into account their high dimensionality or will require a large 350 | number of training samples in order to fit consistent models. Second, 351 | categorical variables are usually incorporated into analyses with a contrast 352 | encoding such as treatment, helmert, etc. Contrast numeric 353 | representations are orthogonal or, under appropriate statistical assumptions, 354 | independent with respect to their categories. However, ICD-10-CM codes 355 | represent a hierarchical structure, 356 | where codes are organized into chapters, blocks, and categories based on the 357 | type and anatomical location of the diseases or conditions. Applying 358 | traditional contrast encoding methods may 359 | not fully capture this hierarchical information, potentially resulting in a 360 | loss of valuable context and relationships between codes. 361 | 362 | Researchers have considered alternative encoding methods or feature extraction 363 | techniques that can better represent the hierarchical structure of ICD-10-CM 364 | codes. However, incorporating both hierarchical structure and other contextual 365 | information in a general way can be difficult. 
The previous generation of word
366 | embeddings, which provide vector encodings of words, was shown to be effective for
367 | these types of tasks, with models like \texttt{med2vec} \cite{med2vec}
368 | providing improved abilities to predict patient mortality;
369 | \texttt{inpatient2vec} \cite{inpatient2vec} to predict clinical events;
370 | \texttt{EHR2Vec} \cite{ehr2vec} to help analyze sequences of patient
371 | visits; and \texttt{cui2vec} \cite{cui2vec} to learn medical concepts based on multimodal
372 | clinical data. These models have been foundational in advancing the capabilities of machine learning models in understanding and generating human language.
373 | These models are shallow, two-layer neural networks that are trained to reconstruct linguistic contexts of words. Word embeddings produced by Word2Vec \cite{church2017} and
374 | the previously mentioned variants provide vector representations of words in a continuous vector space where semantically similar words are mapped to nearby points.
375 | Within this class of models there are two main training algorithms: the Continuous Bag of Words and Skip-Gram models \cite{mikolov2013}. The former predicts target words
376 | (e.g., 'apple') from source context words ('the fruit'). The latter performs
377 | the inverse and predicts source context words from the target words; it
378 | tends to perform better on larger datasets and produces higher-quality embeddings for less frequent words.
379 | 
380 | Despite their advantages, word embeddings also have certain
381 | limitations. First, word embeddings are typically generated at the word or
382 | code level, and while word embeddings can capture semantic similarities, they
383 | often struggle to capture hierarchical structure like that found in
384 | ICD-10-CM codes. Second, traditional word embeddings generate a single vector
385 | for each word regardless of context. This means that
386 | the same code can have different meanings depending on where and when it is
387 | used, something these models do not capture. Third, word embeddings
388 | can have difficulty handling rare codes. Word embeddings typically require
389 | a sufficient number of training samples to learn meaningful representations.
390 | For rarely used ICD-10-CM codes, the learned embedding might not be reliable.
391 | Fourth, traditional word embeddings provide static representations and do not change over time. However, in healthcare, the meaning and usage of certain codes can evolve, and these models cannot capture dynamic changes.
392 | Finally, the quality and representativeness of
393 | the word embeddings depend on the training data used to generate them. If the
394 | training data does not adequately cover the entire spectrum of medical
395 | conditions or encounters, the embeddings may not capture all relevant
396 | relationships or information.
397 | 
398 | The Transformer model \cite{vaswani2017} is a more recent architecture primarily designed for handling sequences, and it has become the foundation for many recent models in natural language processing, including the Bidirectional Encoder
399 | Representations from Transformers (BERT) \cite{devlin2018}, the Generative Pre-Trained Transformer (GPT) \cite{radford2018}, and the Text-to-Text-Transfer-Transformer T5 \cite{raffel2020}. The Transformer model's main innovation is its self-attention mechanism, which weighs input elements dynamically based on their content and relationships. 
This allows the model to focus on different parts of the input for different tasks or even different parts of the same task.
400 | 
401 | These models fall under the category of large language models (LLMs) and
402 | address some of the shortcomings of traditional
403 | word embeddings through a combination of advanced techniques and
404 | architectures. Unlike traditional word embeddings that generate static
405 | representations, LLMs generate contextualized embeddings.
406 | These embeddings take into account the surrounding words or tokens, allowing
407 | for a more nuanced representation of words and codes in different contexts.
408 | This helps in capturing the semantic relationships between codes more
409 | effectively. These models are pre-trained on vast amounts of text data,
410 | allowing them to learn general language representations before being
411 | fine-tuned for specific tasks. This pre-training enables the models to
412 | leverage existing knowledge and adapt more effectively to new tasks, even
413 | with limited task-specific data. LLMs can be incrementally
414 | updated or fine-tuned with new data, allowing them to adapt to evolving
415 | medical knowledge and practices more effectively than static word embeddings.
416 | And, while not explicitly designed for hierarchical data like ICD-10-CM codes,
417 | LLMs can implicitly capture aspects of structured hierarchical relationships through
418 | their deep architectures and the context in which codes appear. This can help
419 | capture different levels of granularity and relationships between codes more
420 | effectively than traditional word embeddings.
421 | 
422 | Vector embeddings attempt to optimize the conditional probability of observing
423 | the actual output word given an input word (or vice versa, depending on the
424 | variant used). For instance, in the skip-gram variant, given a word
425 | $w_i$ and a context word $w_j$, the model is trained to maximize the
426 | following
427 | \begin{equation*}
428 | P(w_j \mid w_i) = \frac{e^{{v'_{w_j}}^T v_{w_i}}}{\sum_k e^{{v'_{w_k}}^T v_{w_i}}}
429 | \end{equation*}
430 | where $v_w$ and $v'_w$ represent the ``input'' and ``output'' vector
431 | representations of a word $w$, and the summation in the denominator is over
432 | all words in the vocabulary. The vectors $v_w$ and $v'_w$ are the word
433 | embeddings learned by the model.
434 | 
435 | LLMs also start by converting each word into an
436 | initial word embedding using an embedding matrix. However, these initial
437 | embeddings are then updated based on the context of the word. This is done by
438 | passing the embeddings through several layers of a transformer model, which
439 | uses self-attention mechanisms. The output of the transformer is a contextual
440 | embedding for each word. Mathematically, the self-attention mechanism can be
441 | represented as
442 | \begin{equation*}
443 | \text{Attention}(Q, K, V) = \text{softmax}\left(QK^T/\sqrt{d}\right) V
444 | \end{equation*}
445 | where $Q$, $K$, and $V$ represent the query, key, and value matrices, which
446 | are derived from the input embeddings. The softmax function ensures that the
447 | weights of different words sum to 1, and the $\sqrt{d}$ in the denominator is
448 | a scaling factor that improves the stability of the gradients during training.
449 | The resulting matrix product is a weighted sum of the value vectors, where the
450 | weights depend on the similarity between the query and key vectors. 
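As a concrete illustration of this computation, the following is a minimal
sketch of scaled dot-product attention written in R on toy matrices. The names
\texttt{Q}, \texttt{K}, \texttt{V}, and \texttt{d} follow the equation above;
the inputs are arbitrary illustrative values, not BioGPT internals.

\begin{CodeChunk}
\begin{CodeInput}
# Softmax applied to each row of a matrix; each row sums to 1.
softmax_rows = function(m) exp(m) / rowSums(exp(m))

d = 4                               # embedding dimension
Q = matrix(rnorm(3 * d), nrow = 3)  # 3 query vectors
K = matrix(rnorm(3 * d), nrow = 3)  # 3 key vectors
V = matrix(rnorm(3 * d), nrow = 3)  # 3 value vectors

A = softmax_rows(Q %*% t(K) / sqrt(d))  # attention weights
H = A %*% V  # each output row is a weighted sum of the value vectors
\end{CodeInput}
\end{CodeChunk}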
451 | 
452 | To generate an embedding for a sentence or description, one common approach is to take the average of the contextual embeddings of the words in the sentence:
453 | \begin{equation*}
454 | E(D) = \frac{1}{n} \sum_{i=1}^{n} E(w_i)
455 | \end{equation*}
456 | Here, $E(D)$ is the embedding for the description, $E(w_i)$ is the contextual
457 | embedding for word $w_i$, and the sum is over all $n$ words in the description.
458 | 
459 | The key difference between the two methods is that vector embeddings generate
460 | a single, static embedding for each word, while LLMs generate a dynamic,
461 | context-dependent embedding. This allows an LLM to capture nuances in meaning
462 | that cannot be represented with static embeddings.
463 | 
464 | There are several
465 | BERT or similar transformer-based biomedical models that can be used
466 | to generate embeddings for medical corpora, including ClinicalBERT
467 | \cite{huang2019,alsentzer2019}, BioBERT \cite{lee2020}, and
468 | Med-BERT \cite{rasmy2021}, but to our knowledge none of the current
469 | literature includes the
470 | applications of these models specifically for the purpose of generating
471 | embeddings for ICD-10-CM codes that can be consumed as readily available
472 | data sets. These data sets represent a valuable resource for practitioners
473 | who are interested in an information-rich representation of those codes,
474 | without needing to acquire models, embed data, and process them.
475 | 
476 | %LLM generated embeddings address many of these limitations. They take into
477 | %account the surrounding words or tokens, allowing for a more nuanced
478 | %representation of words and codes in different contexts. This helps in
479 | %capturing the semantic relationships between codes more effectively. These
480 | %models are pre-trained on vast amounts of text data, allowing them to learn
481 | %general language representations before being fine-tuned for specific tasks.
482 | %This pre-training enables the models to leverage existing knowledge and
483 | %adapt more effectively to new tasks, even with limited task-specific data.
484 | %LLMs can be incrementally updated or fine-tuned with new data, allowing them
485 | %to adapt to evolving medical knowledge and practices more effectively than
486 | %static word embeddings. And, while not explicitly designed for hierarchical
487 | %data like ICD-10-CM codes, LLMs can implicitly learn hierarchical
488 | %relationships through their deep architectures and the context in which codes
489 | %appear. This can help capture different levels of granularity and relationships between codes more %effectively than traditional word embeddings.
490 | 
491 | This paper describes data sets provided as \texttt{.csv} files, which are
492 | available online in the form of a crosswalk from ICD-10-CM codes
493 | to embeddings (a numeric vector of values), based on their descriptions. A sample
494 | of five descriptions and their embeddings is provided in the Supplementary Materials. The
495 | embeddings were generated using the BioGPT LLM \cite{luo2022}, which was trained on the biomedical literature including PubMed \cite{pubmed},
496 | PubMed Central \cite{pubmedcentral}, and clinical notes from MIMIC-III
497 | \cite{mimiciii}. This model was shown to be superior to competing models
498 | in the medical domain at encoding context and relational information. Since the dimension of the LLM embedding is relatively
499 | high (42384), we provide dimension-reduced versions in 1000, 100, 50,
500 | and 10 dimensions. 
The model generating the data was validated in two ways.
502 | The first way validates the dimension reduction. The embedding data were
503 | compressed using an autoencoder. The out-of-sample accuracy on a validation
504 | set is examined, as well as the performance of the model for other versions
505 | (by year) of the ICD-10-CM specification. Our results show that we can reduce
506 | the dimension of the data down to as few as 10 dimensions while maintaining
507 | the ability to reproduce the original embeddings, with fidelity decreasing
508 | as the dimension of the reduced representation decreases. The second way validates
509 | the conceptual representation by creating a supervised model to estimate the
510 | ICD-10-CM hierarchical categories. Again, we see that as the dimension of the
511 | compressed representation decreases, the model accuracy decreases. Since
512 | multiple compression levels are provided, users are free to choose whichever
513 | suits their needs, allowing them to trade off accuracy for dimensionality.
514 | 
515 | The paper proceeds as follows. The next section provides a high-level
516 | description of BioGPT and the embedding along with the construction of
517 | the autoencoder used to reduce the dimension of the embedding representation.
518 | That section then provides validation for both the dimension reduction as well
519 | as the representation. The third section provides an example of how to use the
520 | dataset to cluster ICD-10-CM codes using the R programming environment
521 | \cite{rcore}. The final section provides a broader look at the
522 | incorporation of LLM approaches to these types of data.
523 | 
524 | The data sets and code to generate them are available in a public
525 | repository on GitHub
526 | \footnote{https://github.com/kaneplusplus/icd-10-cm-embedding}.
527 | The data are licensed under the Creative Commons Attribution NonCommercial
528 | ShareAlike 4.0 International License
529 | \footnote{https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode}.
530 | The code is licensed under GPL-v2
531 | \footnote{https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html}.
532 | 
533 | \section*{Construction and content}
534 | 
535 | The provided data are generated by embedding ICD-10-CM descriptions using the
536 | BioGPT-Large model, which comprises 1.5 billion parameters and is accessible
537 | via the Hugging Face model repository,\footnote{https://huggingface.co} and
538 | then performing a dimension reduction using an autoencoder.
539 | The embedding process involves tokenizing
540 | textual phrases into tokens (words, subwords, or characters) and mapping them
541 | to unique vocabulary IDs. Token IDs are passed through an embedding layer,
542 | resulting in a sequence of continuous embedding vectors. Positional encodings
543 | are added elementwise to these vectors, enabling the model to capture token
544 | order and relative positions. The embeddings are then contextualized by
545 | passing them through the model's layers. An attention mask selectively
546 | controls information flow in the attention mechanism, allowing the model to
547 | weigh the importance of input tokens when generating contextualized embeddings
548 | in a 42384-dimension space.
549 | 
550 | The embedding is then compressed using an autoencoder. 
The autoencoder used
551 | here is
552 | a series of fully connected layers where the number of hidden nodes is
553 | approximately one order of magnitude smaller than the previous layer and then
554 | an order of magnitude larger until the output layer. For example, the
555 | autoencoder compressing to 10 dimensions has layers of size 42384, 1000,
556 | 100, 50, 10, 50, 100, 1000, 42384. Models compressing to a larger dimension
557 | use the same structure, retaining only the layers down to the target dimension and back up.
558 | A practitioner who would like to make use of these embeddings for their
559 | own modeling task can download
560 | these data, substituting the embedding values for the ICD-10 representation.
561 | The values are information-rich and will be useful in a variety of
562 | supervised and unsupervised tasks involving medical research.
563 | 
564 | \subsection*{Validating the dimension reduction}
565 | 
566 | The autoencoder compressing the LLM embedding was fit on the 2019 ICD-10-CM
567 | descriptions for 20 epochs, with batch sizes 64, 128, and 256. The loss was the
568 | mean-square error between the embedding and the autoencoder estimate, and a
569 | validation data set comprising a random subset of 10\% of the samples was held out. The model
570 | performance is shown in Table \ref{tab:autoencoder_perf}.
571 | Based on these results, the models with the best
572 | validation loss for each of the compressed embedding dimensions were selected
573 | for further validation and eventual distribution. In addition,
574 | benchmarking the validation loss serves two purposes. First, it establishes
575 | a relative measure of performance quantifying the compression loss and allowing
576 | us to pick the best set of model parameters to generate the embedding
577 | data. Second, the validation loss in particular quantifies how much loss
578 | is incurred by new ICD-10-CM codes, showing that the loss is comparable to,
579 | and often less than, the error in the training data.
580 | 
581 | In addition to the 2019 validation, the models selected for distribution were
582 | tested on the 2020-2022 data sets to ensure their performance is comparable
583 | over years. The results are shown in Table \ref{tab:autoencoder_year}.
584 | It should be noted that the ICD-10-CM codes do not vary much from
585 | one year to the next, so we should not expect large differences. As
586 | expected, the mean-square error and coefficients of determination are similar
587 | to the 2019 data. For a given embedding dimension it can be seen that neither
588 | the coefficient of determination nor the mean-square error changes significantly
589 | over years, indicating that the same autoencoder
590 | could likely be used in subsequent years, while incurring similar loss. This
591 | also implies that an incremental approach could be taken in subsequent
592 | years when regenerating the
593 | embeddings, where only new codes would need to be processed.
594 | 
595 | \subsection*{Validating the embedding representation}
596 | 
597 | As a final step in the validation process, we use the fact that, in addition to
598 | the description, the ICD-10-CM codes themselves carry hierarchical information,
599 | which can be used to ensure that conceptual relationships are preserved
600 | in the compressed embeddings. In particular, the leading letter and two
601 | numeric values categorize codes. For example, codes A00-B99 correspond to
602 | infectious and parasitic diseases, C00-D49 correspond to neoplasms, etc.
603 | There are a total of 22 such categories. 
The full table of categories is provided
604 | in the Supplementary Materials. We
605 | can therefore ensure that at least some of the relevant relationships are
606 | preserved in the compressed embedding representation by confirming that
607 | the categories can be estimated at a rate higher than chance using a
608 | supervised model. Furthermore, we can quantify how much relevant predictive
609 | information is lost in lower-dimensional representations.
610 | 
611 | The training data consists of a one-hot encoding of the ICD-10-CM
612 | categories as the dependent variable and the compressed embedding values as
613 | the independent variables. The model consists of two hidden layers with 100
614 | nodes each. The loss function selected was categorical cross-entropy. The
615 | model was trained for 30 epochs with a validation data set comprising 10\% of
616 | the samples, chosen at random.
617 | 
618 | To contextualize the results, we fit the same model to four
619 | BERT embeddings that
620 | have also been trained on biomedical corpora. The first, MedBERT \cite{medbert},
621 | was trained with 57.46M tokens collected from biomedical-related data sources
622 | and biomedical-related articles from Wikipedia. The second,
623 | PubMedBERT-MS-MARCO \cite{pubmedbertqa}, was first trained on PubMed abstracts
624 | and full texts and then fine-tuned using the MS-MARCO data set \cite{msmarco}
625 | to be optimized for information retrieval tasks in the medical/health text
626 | domain. The third, SapBERT-PubMedBERT, was first trained on PubMed abstracts
627 | and text, and then fine-tuned on semantic relationships between relevant
628 | medical entities using UMLS \cite{umls} biomedical ontologies. The fourth,
629 | ClinicalBERT \cite{huang2019}, was initialized from BERT. Training then followed the masked language modeling approach, in which, given a piece of text, some tokens are randomly replaced by special masking tokens (MASKs) and the model is required to predict the original tokens from the surrounding context.
630 | 
631 | The performance in terms of both the out-of-sample accuracy and
632 | the out-of-sample balanced accuracy \cite{balancedaccuracy} is shown in
633 | Table \ref{tab:sup_perf}. The goal
634 | in presenting these results is not necessarily to
635 | maximize the prediction accuracy. Rather, it is to show that the embedding
636 | retains the
637 | hierarchical information in the ICD-10-CM codes. Some of the codes correspond to
638 | conditions that could be classified in several ways, and as a result coding
639 | for at least some of the conditions might be considered non-systematic.
640 | Based on this criterion, we can conclude the embedding does retain much of the
641 | structural and conceptual information denoted in the descriptions, at least in
642 | terms of mapping to key categories of diseases and conditions.
643 | 
644 | The table provides two main results. First, the models using the compressed BioGPT
645 | representation significantly outperform the models based on the BERT
646 | embeddings, with the former outperforming the latter even
647 | after the BioGPT embedding is compressed to 10 dimensions. Second, for the BioGPT compressed
648 | embeddings, greater compression of the data corresponds to a decrease in
649 | the predictive information in the data, as measured by the accuracy. 
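For readers who wish to reproduce a classifier of this shape, the following is
a minimal sketch using the R \texttt{torch} package. The two hidden layers of
100 nodes and the 22 output categories follow the description above; the input
dimension, the activation function, and the loss construction are illustrative
assumptions rather than the exact fitted configuration.

\begin{CodeChunk}
\begin{CodeInput}
library(torch)

# Map a compressed embedding vector to logits over the 22
# ICD-10-CM categories through two hidden layers of 100 nodes.
CategoryNet = nn_module(
  initialize = function(in_dim, n_cat = 22) {
    self$fc1 = nn_linear(in_dim, 100)
    self$fc2 = nn_linear(100, 100)
    self$out = nn_linear(100, n_cat)
  },
  forward = function(x) {
    x |> self$fc1() |> nnf_relu() |>
      self$fc2() |> nnf_relu() |>
      self$out()
  }
)

model = CategoryNet(in_dim = 50)   # e.g., the 50-dimensional embeddings
loss_fn = nn_cross_entropy_loss()  # categorical cross-entropy on the logits
\end{CodeInput}
\end{CodeChunk}

Folding the softmax into the cross-entropy loss, as done here, is
mathematically equivalent to taking a softmax at the output followed by the
negative log-likelihood, and is the numerically stabler formulation.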
650 | 
651 | Since the ICD-10-CM codes are themselves hierarchical, with the category
652 | codes being the broadest level, it is worth pointing out that these results
653 | imply that some aspect of the code hierarchy is preserved in the embedding.
654 | However, the extent to which this hierarchy can be fully recovered remains an
655 | area of limited understanding. A potential avenue for future work could entail
656 | exploring the feasibility of mapping the embedding space to established
657 | ontologies, such as the UMLS.
658 | 
659 | \section*{Conclusions}
660 | 
661 | This paper presents novel datasets offering numerical representations of
662 | ICD-10-CM codes by generating description embeddings using a large language
663 | model and applying autoencoders for dimensionality reduction. The approach is
664 | versatile, capable of handling categorical variables with numerous categories
665 | across various domains. By capturing relationships among categories and
666 | preserving inherent information, the embeddings serve as informative input
667 | features for machine learning models. The readily available datasets are
668 | anticipated to be highly valuable for researchers incorporating ICD-10-CM
669 | codes into their analyses, retaining contextual information. This approach
670 | has the potential to significantly improve the utility of ICD-10-CM codes in
671 | biomedical informatics and enable more advanced analyses in the field.
672 | Data analysts can easily incorporate them into their own analyses by
673 | substituting the embedding values for other, lower-information representations,
674 | including the categorical ones described above, to derive the benefits
675 | of the conceptual information encoded in the embedding.
676 | Future work will address some of the challenges of capturing hierarchical
677 | structure in ICD-10-CM coding systems, experimenting with ontology-based
678 | methods, hierarchical clustering, hierarchical autoencoding, graph neural
679 | networks, and incorporating hierarchical information in training.
680 | 
681 | While this approach is
682 | effective, there are some challenges of which we should be aware. While not
683 | insurmountable, they are as follows:
684 | 
685 | \begin{enumerate}
686 | \item Interpretability: A significant challenge in machine learning, particularly with complex models like large language models and autoencoders, is interpretability. In healthcare, the ability to understand and explain why a model makes a particular prediction is crucial. This could impact patient trust, clinician adoption, and even legal and regulatory compliance. Techniques like LIME (Local Interpretable Model-Agnostic Explanations) or SHAP (SHapley Additive exPlanations) can be used to improve interpretability, but they do not provide perfect solutions and can be computationally expensive.
687 | \item Overfitting: Overfitting is a common issue in machine learning where a model learns the training data too well and performs poorly on unseen data. This can be particularly problematic in healthcare, where the stakes are high. Techniques such as cross-validation, regularization, or dropout layers can be used to prevent overfitting.
688 | \item Data Privacy: Patient data is highly sensitive, and its usage is strictly regulated (e.g., by laws like HIPAA in the US). Even if the data used to generate the embeddings is anonymized, the model must be carefully designed and used to avoid potential privacy leaks. 
689 | \item Generalizability: A model trained on one dataset may not perform well on another due to differences in population characteristics, data collection methods, etc. Ensuring that models generalize well across different settings is a significant challenge.
690 | \item Quality of Input Data: The quality of the embeddings depends heavily on the quality of the input data. If the descriptions associated with the ICD-10-CM codes are inaccurate or not comprehensive, the resulting embeddings may also be flawed. This is a fundamental issue in any data-driven approach: "garbage in, garbage out."
691 | \item Capturing Hierarchical Structure: The ICD-10-CM coding system has a hierarchical structure where certain codes are nested within broader categories. While embeddings generated from code descriptions may capture semantic meaning, they might not adhere to an explicit hierarchical structure imposed by an ontology like UMLS.
692 | \end{enumerate}
693 | 
694 | \section{Example Use of the ICD-10-CM Embedding Data}
695 | 
696 | To illustrate the utility of the data, we present a simple example of how one
697 | might use the embedding information in the R programming environment,
698 | making use of the \texttt{dplyr} \cite{dplyr}, \texttt{ggplot2} \cite{ggplot2},
699 | \texttt{readr} \cite{readr}, \texttt{Rtsne} \cite{Rtsne}, and
700 | \texttt{stringr} \cite{stringr} packages. Suppose
701 | we would like to
702 | visualize the ICD-10-CM codes beginning with
703 | G (diseases of the nervous system),
704 | I (diseases of the circulatory system), J (diseases of the respiratory system),
705 | and K (diseases of the digestive system) to better understand the
706 | contextual relationships
707 | between these categories or specific conditions in the 50-dimensional
708 | embedding. For convenience, the project's page includes an \texttt{.rds} file
709 | containing the available embeddings along with their URLs, which can be
710 | retrieved from the R console. The code categories can then be visualized
711 | by performing another dimension reduction (in this case using the
712 | \texttt{Rtsne} package) to 2 dimensions, which can be presented as a scatter plot.
713 | 
714 | \vspace{2mm}
715 | 
716 | \begin{CodeChunk}
717 | \begin{CodeInput}
718 | library(dplyr)
719 | library(ggplot2)
720 | library(readr)
721 | library(Rtsne)
722 | library(stringr)
723 | 
724 | # Download the locations of the embeddings.
725 | tf = tempfile()
726 | download.file(
727 |   paste0("https://github.com/kaneplusplus/",
728 |          "icd-10-cm-embedding/blob/main/",
729 |          "icd10_dl.rds?raw=true"),
730 |   tf
731 | )
732 | dl = readRDS(tf)
733 | 
734 | # Read in the 2019, 50-dimensional embeddings.
735 | tf = tempfile()
736 | download.file(
737 |   dl$url[dl$year == 2019 & dl$emb_dim == 50],
738 |   tf
739 | )
740 | 
741 | icd10s = read_csv(tf) |>
742 |   filter(str_detect(code, "^(G|I|J|K)")) |>
743 |   mutate(desc = tolower(desc)) |>
744 |   mutate(`Leading Letter` = str_sub(code, 1, 1))
745 | 
746 | # Fit tSNE to the embedding.
747 | tsne_fit = icd10s |>
748 |   select(starts_with("V")) |>
749 |   scale() |>
750 |   Rtsne(perplexity = 10)
751 | 
752 | # Bind the tSNE values to the data set.
753 | icd10p = bind_cols(
754 |   icd10s |>
755 |     select(-starts_with("V")),
756 |   tsne_fit$Y |>
757 |     as.data.frame() |>
758 |     rename(tSNE1="V1", tSNE2="V2") |>
759 |     as_tibble()
760 | )
761 | 
762 | # Visualize the results.
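# (Note: the tSNE axes themselves are arbitrary; only the relative
# distances between points are interpretable.)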
763 | ggplot(icd10p, aes(x = tSNE1, y = tSNE2, color = `Leading Letter`)) +
764 |   geom_point() +
765 |   theme_minimal()
766 | \end{CodeInput}
767 | \end{CodeChunk}
768 | 
769 | \vspace{2mm}
770 | 
771 | The output visualization is presented in Figure 1 %\ref{fig:tsne}
772 | and shows that
773 | a subset of the circulatory diseases (I) and
774 | nervous system diseases (G) are well-differentiated from other conditions. It
775 | also shows overlap between the remaining conditions related to K (digestive
776 | diseases), J (respiratory diseases), and I (circulatory diseases).
777 | 
778 | 
779 | \begin{backmatter}
780 | 
781 | %\section*{Acknowledgements}%% if any
782 | %Text for this section\ldots
783 | 
784 | \section*{Declarations}
785 | 
786 | \subsection*{Funding}%% if any
787 | 
788 | This work was supported by the National Institute on Aging of the
789 | National Institutes of Health (NIH) through a grant to Yale
790 | University (1R01AG071528). The organizations funding this study had no role
791 | in the design or conduct of the study; in the collection, management,
792 | analysis, or interpretation of the data; or in the preparation, review, or
793 | approval of the manuscript. The content of this publication is solely the
794 | responsibility of the authors and does not necessarily represent the official
795 | views of the National Institutes of Health, the Department of Veterans
796 | Affairs, or the United States government.
797 | 
798 | This work was also partially supported by the Yale Clinical and
799 | Translational Science award (UL1 TR001863) and the Yale Claude D. Pepper
800 | Center (P30AG021342).
801 | 
802 | \subsection*{Competing interests}
803 | 
804 | The authors declare that they have no competing interests.
805 | 
806 | \subsection*{Ethics approval}
807 | 
808 | Not applicable.
809 | 
810 | \subsection*{Consent to participate}
811 | 
812 | Not applicable.
813 | 
814 | \subsection*{Consent for publication}
815 | 
816 | Not applicable.
817 | 
818 | %\section*{Abbreviations}%% if any
819 | %Text for this section\ldots
820 | 
821 | \subsection*{Availability of data and materials}%% if any
822 | 
823 | All data presented here, along with documentation for
824 | reproducing the presented materials, are available at
825 | https://github.com/kaneplusplus/icd-10-cm-embedding.
826 | 
827 | \subsection*{Code availability}%% if any
828 | 
829 | All code presented here, along with documentation for
830 | reproducing the presented materials, is available at
831 | https://github.com/kaneplusplus/icd-10-cm-embedding.
832 | 
833 | \subsection*{Authors' contributions}
834 | 
835 | Kane proposed the approach, implemented it, created the datasets, and wrote
836 | the article. Ganz provided direction for the research and manually validated
837 | results. King assessed the model and provided a detailed analysis of the
838 | limitations of vector-based and BERT approaches, a discussion of LLM
839 | limitations, and feedback.
840 | Esserman, Latham, and Greene provided feedback and made suggestions
841 | throughout the entire process.
842 | 
843 | \subsection*{Acknowledgements}
844 | 
845 | Not applicable.
846 | 
847 | %\section*{Ethics approval and consent to participate}%% if any
848 | %Text for this section\ldots
849 | 
850 | %\section*{Consent for publication}%% if any
851 | %Text for this section\ldots
852 | 
853 | %\section*{Authors' contributions}
854 | %Text for this section \ldots
855 | 
856 | %\section*{Authors' information}%% if any
857 | %Text for this section\ldots
858 | 
859 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
860 | %% The Bibliography %%
861 | %% %%
862 | %% Bmc_mathpys.bst will be used to %%
863 | %% create a .BBL file for submission. %%
864 | %% After submission of the .TEX file, %%
865 | %% you will be prompted to submit your .BBL file. %%
866 | %% %%
867 | %% %%
868 | %% Note that the displayed Bibliography will not %%
869 | %% necessarily be rendered by Latex exactly as specified %%
870 | %% in the online Instructions for Authors. %%
871 | %% %%
872 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
873 | 
874 | % if your bibliography is in bibtex format, use those commands:
875 | \bibliographystyle{bmc-mathphys} % Style BST file (bmc-mathphys, vancouver, spbasic).
876 | \bibliography{bmc_article} % Bibliography file (usually '*.bib' )
877 | % for author-year bibliography (bmc-mathphys or spbasic)
878 | % a) write to bib file (bmc-mathphys only)
879 | % @settings{label, options="nameyear"}
880 | % b) uncomment next line
881 | %\nocite{label}
882 | 
883 | % or include bibliography directly:
884 | % \begin{thebibliography}
885 | % \bibitem{b1}
886 | % \end{thebibliography}
887 | 
888 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
889 | %% %%
890 | %% Figures %%
891 | %% %%
892 | %% NB: this is for captions and %%
893 | %% Titles. All graphics must be %%
894 | %% submitted separately and NOT %%
895 | %% included in the Tex document %%
896 | %% %%
897 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
898 | 
899 | %%
900 | %% Do not use \listoffigures as most will be included as separate files
901 | 
902 | \pagebreak
903 | 
904 | \section*{Figures}
905 | 
906 | \begin{figure}[ht!]
907 | \includegraphics[width=\linewidth]{tsne-plot.png}
908 | \caption{A tSNE projection of the 50-dimensional 2019 embeddings for the ICD-10-CM codes beginning with G, I, J, and K, colored by leading letter.}
909 | \label{fig:tsne}
910 | \end{figure}
911 | 
912 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
913 | %% %%
914 | %% Tables %%
915 | %% %%
916 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
917 | 
918 | \pagebreak
919 | %% Use of \listoftables is discouraged.
920 | %%
921 | \section*{Tables}
922 | 
923 | \begin{table}[ht!]
924 | \caption{The autoencoder parameters and performance ordered by increasing validation loss.}
925 | \label{tab:autoencoder_perf}
926 | \begin{tabular}{|r|r|r|r|}
927 | \hline
928 | Embedding Dimension & Batch Size & Training Loss & Validation Loss\\
929 | \hline
930 | 100 & 64 & 0.534 & 0.339\\
931 | \hline
932 | 100 & 128 & 0.487 & 0.381\\
933 | \hline
934 | 50 & 256 & 0.403 & 0.392\\
935 | \hline
936 | 1000 & 64 & 0.542 & 0.402\\
937 | \hline
938 | 100 & 256 & 0.556 & 0.444\\
939 | \hline
940 | 1000 & 128 & 1.073 & 0.486\\
941 | \hline
942 | 10 & 256 & 0.599 & 0.594\\
943 | \hline
944 | 10 & 128 & 0.628 & 0.609\\
945 | \hline
946 | 10 & 64 & 0.679 & 0.641\\
947 | \hline
948 | 50 & 64 & 1.134 & 0.699\\
949 | \hline
950 | 1000 & 256 & 30.435 & 0.803\\
951 | \hline
952 | 50 & 128 & 1.053 & 0.894\\
953 | \hline
954 | \end{tabular}
955 | \end{table}
956 | 
957 | 
958 | \begin{table}[ht!]
959 | \caption{The autoencoder validation performance ordered by year.}
960 | \label{tab:autoencoder_year}
961 | \begin{tabular}{|r|r|r|r|}
962 | \hline
963 | ICD-10-CM Publication Year & Embedding Dimension & Mean Squared Error & Coef. of Determination\\
964 | \hline
965 | 2019 & 10 & 0.593 & 0.086\\
966 | \hline
967 | 2019 & 50 & 0.388 & 0.056\\
968 | \hline
969 | 2019 & 100 & 0.336 & 0.049\\
970 | \hline
971 | 2019 & 1000 & 0.400 & 0.058\\
972 | \hline
973 | 2020 & 10 & 0.593 & 0.086\\
974 | \hline
975 | 2020 & 50 & 0.388 & 0.056\\
976 | \hline
977 | 2020 & 100 & 0.336 & 0.049\\
978 | \hline
979 | 2020 & 1000 & 0.400 & 0.058\\
980 | \hline
981 | 2021 & 10 & 0.594 & 0.086\\
982 | \hline
983 | 2021 & 50 & 0.389 & 0.056\\
984 | \hline
985 | 2021 & 100 & 0.337 & 0.049\\
986 | \hline
987 | 2021 & 1000 & 0.401 & 0.058\\
988 | \hline
989 | 2022 & 10 & 0.595 & 0.086\\
990 | \hline
991 | 2022 & 50 & 0.390 & 0.056\\
992 | \hline
993 | 2022 & 100 & 0.338 & 0.049\\
994 | \hline
995 | 2022 & 1000 & 0.402 & 0.058\\
996 | \hline
997 | \end{tabular}
998 | \end{table}
999 | 
1000 | \begin{table}[ht!]
1001 | \caption{The supervised models' performance ordered by decreasing balanced accuracy.}
1002 | \label{tab:sup_perf}
1003 | \begin{tabular}{|r|r|r|r|}
1004 | \hline
1005 | Model & Embedding Dimension & Accuracy & Balanced Accuracy\\
1006 | \hline
1007 | BioGPT Compressed & 1000 & 0.960 & 0.927\\
1008 | \hline
1009 | BioGPT Compressed & 100 & 0.935 & 0.891\\
1010 | \hline
1011 | BioGPT Compressed & 50 & 0.925 & 0.873\\
1012 | \hline
1013 | BioGPT Compressed & 10 & 0.815 & 0.698\\
1014 | \hline
1015 | ClinicalBERT & 768 & 0.200 & 0.634\\
1016 | \hline
1017 | PubMedBERT-MS-MARCO & 768 & 0.158 & 0.629\\
1018 | \hline
1019 | SapBERT-PubMedBERT & 768 & 0.159 & 0.616\\
1020 | \hline
1021 | MedBERT & 768 & 0.171 & 0.613\\
1022 | \hline
1023 | \end{tabular}
1024 | \end{table}
1025 | 
1026 | 
1027 | \pagebreak
1028 | %% Use of \listoftables is discouraged.
1029 | %%
1030 | 
1031 | \section*{Supplementary Materials}
1032 | 
1033 | \subsection*{Example ICD-10-CM Codes and Descriptions}
1034 | 
1035 | \begin{table}[h]
1036 | %\caption{Example ICD-10-CM codes and descriptions.}
1037 | \begin{tabular}{|l|l|}
1038 | \hline
1039 | Code & Description\\
1040 | \hline
1041 | S3559XS & Injury of other iliac blood vessels, sequela\\
1042 | \hline
1043 | M12262 & Villonodular synovitis (pigmented), left knee\\
1044 | \hline
1045 | S40011S & Contusion of right shoulder, sequela\\
1046 | \hline
1047 | K284 & Chronic or unspecified gastrojejunal ulcer with hemorrhage\\
1048 | \hline
1049 | M90632 & Osteitis deformans in neoplastic diseases, left forearm\\
1050 | \hline
1051 | \end{tabular}
1052 | \end{table}
1053 | %% Use of \listoftables is discouraged.
1054 | %%
1055 | 
1056 | \subsection*{ICD-10-CM Category Codes}
1057 | 
1058 | \begin{table}[ht!]
1059 | %\caption{ICD-10-CM Category Codes.} 1060 | \begin{tabular}{|l|l|l|} 1061 | \hline 1062 | First Code & Last Code & Code Description\\ 1063 | \hline 1064 | A00 & B99 & Certain infectious and parasitic diseases\\ 1065 | \hline 1066 | C00 & D49 & Neoplasms\\ 1067 | \hline 1068 | D50 & D89 & Diseases of the blood and blood-forming organs and certain disorders \\ 1069 | & & involving the immune mechanism\\ 1070 | \hline 1071 | E00 & E89 & Endocrine, nutritional and metabolic diseases\\ 1072 | \hline 1073 | F01 & F99 & Mental, Behavioral and Neurodevelopmental disorders\\ 1074 | \hline 1075 | G00 & G99 & Diseases of the nervous system\\ 1076 | \hline 1077 | H00 & H59 & Diseases of the eye and adnexa\\ 1078 | \hline 1079 | H60 & H95 & Diseases of the ear and mastoid process\\ 1080 | \hline 1081 | I00 & I99 & Diseases of the circulatory system\\ 1082 | \hline 1083 | J00 & J99 & Diseases of the respiratory system\\ 1084 | \hline 1085 | K00 & K95 & Diseases of the digestive system\\ 1086 | \hline 1087 | L00 & L99 & Diseases of the skin and subcutaneous tissue\\ 1088 | \hline 1089 | M00 & M99 & Diseases of the musculoskeletal system and connective tissue\\ 1090 | \hline 1091 | N00 & N99 & Diseases of the genitourinary system\\ 1092 | \hline 1093 | O00 & O9A & Pregnancy, childbirth and the puerperium\\ 1094 | \hline 1095 | P00 & P96 & Certain conditions originating in the perinatal period\\ 1096 | \hline 1097 | Q00 & Q99 & Congenital malformations, deformations and chromosomal abnormalities\\ 1098 | \hline 1099 | R00 & R99 & Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified\\ 1100 | \hline 1101 | S00 & T88 & Injury, poisoning and certain other consequences of external causes\\ 1102 | \hline 1103 | U00 & U85 & Codes for special purposes\\ 1104 | \hline 1105 | V00 & Y99 & External causes of morbidity\\ 1106 | \hline 1107 | Z00 & Z99 & Factors influencing health status and contact with health services\\ 1108 | \hline 1109 | \end{tabular} 1110 | \end{table} 1111 | 1112 | 1113 | 1114 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1115 | %% %% 1116 | %% Additional Files %% 1117 | %% %% 1118 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1119 | 1120 | %\section*{Additional Files} 1121 | % \subsection*{Additional file 1 --- Sample additional file title} 1122 | % Additional file descriptions text (including details of how to 1123 | % view the file, if it is in a non-standard format or the file extension). This might 1124 | % refer to a multi-page table or a figure. 1125 | % 1126 | % \subsection*{Additional file 2 --- Sample additional file title} 1127 | % Additional file descriptions text. 
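
\subsection*{Substituting the Embeddings for Categorical Codes}

As noted in the Conclusions, the embedding values can be substituted for a
categorical code variable in an existing analysis. The following minimal
sketch assumes the objects created in the example above (in particular the
downloaded 2019, 50-dimensional embedding table); the analysis table
\texttt{patient\_data} and its \texttt{icd10} column are hypothetical
stand-ins for an analyst's own data.

\vspace{2mm}

\begin{CodeChunk}
\begin{CodeInput}
library(dplyr)
library(readr)

# A hypothetical analysis data set with an ICD-10-CM code column.
patient_data = tibble(
  id = 1:3,
  icd10 = c("I10", "J449", "K219")
)

# Re-read the embedding table (2019, embedding dimension 50)
# retrieved in the example above.
emb = read_csv(tf)

# Substitute the categorical code with its embedding coordinates
# (columns V1 through V50), dropping the text description.
patient_features = patient_data |>
  left_join(emb, by = c("icd10" = "code")) |>
  select(-desc)
\end{CodeInput}
\end{CodeChunk}

\vspace{2mm}

The columns \texttt{V1} through \texttt{V50} of \texttt{patient\_features}
can then be supplied as model features in place of the original categorical
code variable. Codes absent from a given year's table produce \texttt{NA}
rows in the join, which should be checked before modeling.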
1128 | 
1129 | \end{backmatter}
1130 | \end{document}
1131 | 
--------------------------------------------------------------------------------
/bmc-bioinformatics-paper/bmcart-biblio.sty:
--------------------------------------------------------------------------------
1 | %%
2 | %% LaTeX 2e package for the processing of LaTeX2e files
3 | %% for the BioMed Central
4 | %% Additional commands for the processing of structured reference list
5 | %%
6 | %% Macros written by Vytas Statulevicius, VTeX, Lithuania
7 | %% for the BioMed Central
8 | %% Please submit bugs or your comments to latex-support@vtex.lt
9 | %%
10 | %% The original distribution is located at:
11 | %% http://support.e-publications.org/bmc
12 | %%
13 | %% This class file loads standard "article.cls" with appropriate
14 | %% settings and then redefines layout according to BMC style
15 | %% A lot of effort has gone into making it possible to extract
16 | %% information from the LaTeX file
17 | %%
18 | %% You are free to use this style class as you see fit, provided
19 | %% that you do not make changes to the file.
20 | %% If you DO make changes, you are required to rename this file.
21 | %%
22 | %% It may be distributed under the terms of the LaTeX Project Public
23 | %% License, as described in lppl.txt in the base LaTeX distribution.
24 | %% Either version 1.0 or, at your option, any later version.
25 | %%
26 | %% \CharacterTable
27 | %% {Upper-case \A\B\C\D\E\F\G\H\I\J\K\L\M\N\O\P\Q\R\S\T\U\V\W\X\Y\Z
28 | %% Lower-case \a\b\c\d\e\f\g\h\i\j\k\l\m\n\o\p\q\r\s\t\u\v\w\x\y\z
29 | %% Digits \0\1\2\3\4\5\6\7\8\9
30 | %% Exclamation \! Double quote \" Hash (number) \#
31 | %% Dollar \$ Percent \% Ampersand \&
32 | %% Acute accent \' Left paren \( Right paren \)
33 | %% Asterisk \* Plus \+ Comma \,
34 | %% Minus \- Point \. Solidus \/
35 | %% Colon \: Semicolon \; Less than \<
36 | %% Equals \= Greater than \> Question mark \?
37 | %% Commercial at \@ Left bracket \[ Backslash \\
38 | %% Right bracket \] Circumflex \^ Underscore \_
39 | %% Grave accent \` Left brace \{ Vertical bar \|
40 | %% Right brace \} Tilde \~}
41 | %%
42 | %%
43 | %% Bug fixes and changes:
44 | %% at end of file
45 | 
46 | 
47 | \def\bmc@common@bibl@date{2012/03/06}
48 | 
49 | \NeedsTeXFormat{LaTeX2e}
50 | \ProvidesPackage{bmcart-biblio}[\bmc@common@bibl@date
51 | additional macros for the bibliography tagging A++ XML DTD (VS)]
52 | 
53 | % Default bibliography style:
54 | \def\bibliography@style{10}
55 | \def\bibliography@style@name{BMC Reference Style}
56 | \def\bibliography@key{bmc}
57 | 
58 | % only the first option is executed (i.e., this allows "overriding" the
59 | % option via \documentclass[foo,..])
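% Example (assuming the standard \ProcessOptions* behavior of executing
% global class options before local package options): with
% \documentclass[vancouver]{bmcart} and \usepackage[bmc]{bmcart-biblio},
% the `vancouver' option is executed first, and \only@first below turns
% the later `bmc' option into a no-op, so the Vancouver reference style
% wins.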
60 | 61 | \def\only@first#1{\@ifundefined{o@f@}{\def\o@f@{}#1}{}} 62 | 63 | \DeclareOption{undef} {\only@first{\def\bibliography@style{0}\def\bibliography@key{undef} 64 | \def\bibliography@style@name{undefined}}} 65 | \DeclareOption{basic} {\only@first{\def\bibliography@style{1}\def\bibliography@key{basic} 66 | \def\bibliography@style@name{Basic Springer}}} 67 | \DeclareOption{chemistry}{\only@first{\def\bibliography@style{2}\def\bibliography@key{chemistry} 68 | \def\bibliography@style@name{Chemistry}}} 69 | \DeclareOption{mathphys} {\only@first{\def\bibliography@style{3}\def\bibliography@key{mathphys} 70 | \def\bibliography@style@name{Math and Physical Sciences}}} 71 | \DeclareOption{aps} {\only@first{\def\bibliography@style{4}\def\bibliography@key{aps} 72 | \def\bibliography@style@name{American Physical Society}}} 73 | \DeclareOption{cs} {\only@first{\def\bibliography@style{5}\def\bibliography@key{cs} 74 | \def\bibliography@style@name{Computer Science}}} 75 | \DeclareOption{vancouver}{\only@first{\def\bibliography@style{6}\def\bibliography@key{vancouver} 76 | \def\bibliography@style@name{Vancouver}}} 77 | \DeclareOption{apa} {\only@first{\def\bibliography@style{7}\def\bibliography@key{apa} 78 | \def\bibliography@style@name{APA}}} 79 | \DeclareOption{chicago} {\only@first{\def\bibliography@style{8}\def\bibliography@key{chicago} 80 | \def\bibliography@style@name{Chicago}}} 81 | \DeclareOption{ams} {\only@first{\def\bibliography@style{9}\def\bibliography@key{ams} 82 | \def\bibliography@style@name{AMS}}} 83 | \DeclareOption{bmc} {\only@first{\def\bibliography@style{10}\def\bibliography@key{bmc} 84 | \def\bibliography@style@name{BMC Reference Style}}} 85 | 86 | \ProcessOptions* 87 | 88 | 89 | %%% Common macros: 90 | \def\xml@attr{\@ifnextchar[{\get@attr}{\get@attr[]}} 91 | \def\get@attr[#1]#2{#2} 92 | 93 | \let\betal@style\relax 94 | 95 | \def\common@pub@types{% 96 | \let\binstitute\@firstofone% 97 | \def\betal{{\betal@style et al.}}% 98 | \let\byear\@firstofone% 99 | \let\bfpage\@firstofone% 100 | \let\blpage\@firstofone% 101 | \let\binterref\xml@attr% 102 | \def\burl##1{{\csname burl@style\endcsname\surl{##1}}} 103 | \let\bcomment\@firstofone% 104 | \let\oauthor\@firstofone% 105 | \csname common@pub@types@hook\endcsname% 106 | } 107 | 108 | % Setting a "style" for a command: 109 | % \set@bibl@cmd{bvolume} == \def\bvolume#1{{\bvolume@style #1}} 110 | 111 | \def\set@bibl@cmd#1{\expandafter\def\csname #1\endcsname##1{{\csname #1@style\endcsname##1}}} 112 | 113 | 114 | \AtBeginDocument{\let\doiurlchapter\doiurl} 115 | 116 | %%% BibEditorName 117 | 118 | \def\xml@beditor#1{% 119 | \let\bprefix\@firstofone% 120 | \let\binits\@firstofone% 121 | \let\bsnm\@firstofone% 122 | \let\bparticle\@firstofone% 123 | \let\bsuffix\@firstofone% 124 | \let\bdegs\@firstofone% 125 | #1} 126 | 127 | %%% BibAuthorName 128 | 129 | \def\xml@bauthor#1{% 130 | \let\bprefix\@firstofone% 131 | \let\binits\@firstofone% 132 | % \let\bsnm\@firstofone% 133 | \def\bsnm##1{{\csname bsnm@style\endcsname##1}}% 134 | \let\bparticle\@firstofone% 135 | \let\bsuffix\@firstofone% 136 | \let\bdegs\@firstofone% 137 | #1} 138 | 139 | 140 | %%% bsertitle 141 | 142 | \def\xml@bsertitle{\@ifnextchar[{\@xml@bsertitle}{\@xml@bsertitle[]}} 143 | 144 | \def\@xml@bsertitle[#1]#2{{\csname bsertitle@style\endcsname #2}}% 145 | 146 | \def\xml@batitle{\@ifnextchar[{\@xml@batitle}{\@xml@batitle[]}} 147 | \def\@xml@batitle[#1]#2{{\csname batitle@style\endcsname #2}}% 148 | 149 | \def\xml@bctitle{\@ifnextchar[{\@xml@bctitle}{\@xml@bctitle[]}} 150 | 
\def\@xml@bctitle[#1]#2{{\csname bctitle@style\endcsname #2}}% 151 | 152 | \def\xml@bbtitle{\@ifnextchar[{\@xml@bbtitle}{\@xml@bbtitle[]}} 153 | \def\@xml@bbtitle[#1]#2{{\csname bbtitle@style\endcsname #2}}% 154 | 155 | 156 | 157 | %%% BibArticle 158 | % \begin{barticle} 159 | % ... 160 | % \end{barticle} 161 | 162 | \def\barticle{% 163 | \common@pub@types% 164 | \let\bauthor\xml@bauthor% 165 | \let\batitle\xml@attr% 166 | \set@bibl@cmd{bjtitle}% 167 | \set@bibl@cmd{bvolume}% 168 | \set@bibl@cmd{bissue}% 169 | \let\bnumber\@firstofone% 170 | } 171 | \def\endbarticle{} 172 | 173 | 174 | %%% BibIssue 175 | % \begin{bissue} 176 | % ... 177 | % \end{bissue} 178 | 179 | \def\bissue{% 180 | \common@pub@types% 181 | \let\bauthor\xml@bauthor% 182 | \set@bibl@cmd{bjtitle}% 183 | \set@bibl@cmd{bvolume}% 184 | \set@bibl@cmd{bissue}% 185 | \set@bibl@cmd{bmonth}% 186 | } 187 | \def\endbissue{} 188 | 189 | 190 | %%% BibChapter 191 | % \begin{bchapter} 192 | % ... 193 | % \end{bchapter} 194 | 195 | \def\bchapter{% 196 | \common@pub@types% 197 | \let\bauthor\xml@bauthor% 198 | \let\bctitle\xml@attr% 199 | \let\beditor\xml@beditor% 200 | \let\binstitutionaled\@firstofone% 201 | \def\beds{eds.}% 202 | \let\bbtitle\xml@bbtitle 203 | \let\bedition\xml@attr% 204 | \set@bibl@cmd{bconfname}% 205 | \set@bibl@cmd{bconflocation}% 206 | \set@bibl@cmd{bconfdate}% 207 | \let\bsertitle\xml@bsertitle% 208 | \set@bibl@cmd{bseriesno}% 209 | \let\bpublisher\@firstofone% 210 | \let\blocation\@firstofone% 211 | \let\bisbn\@firstofone% 212 | } 213 | 214 | \def\endbchapter{} 215 | 216 | %%% BibUnstructured 217 | % \begin{botherref} 218 | % ... 219 | % \end{botherref} 220 | 221 | \def\botherref{\let\binterref\xml@attr\let\oauthor\xml@bauthor\let\bauthor\xml@bauthor} 222 | \def\endbotherref{} 223 | 224 | 225 | %%% BibBook 226 | % \begin{bbook} 227 | % ... 228 | % \end{bbook} 229 | 230 | \def\bbook{% 231 | \common@pub@types% 232 | \let\bauthor\xml@bauthor% 233 | \let\beditor\xml@beditor% 234 | \let\binstitutionaled\@firstofone% 235 | \def\beds{eds.}% 236 | \let\bbtitle\xml@bbtitle 237 | \let\bedition\xml@attr% 238 | \let\bconfname\@firstofone% 239 | \let\bconflocation\@firstofone% 240 | \let\bconfdate\@firstofone% 241 | \let\bsertitle\xml@bsertitle% 242 | \set@bibl@cmd{bseriesno}% 243 | \let\bpublisher\@firstofone% 244 | \let\blocation\@firstofone% 245 | \let\bisbn\@firstofone% 246 | }% 247 | 248 | 249 | \def\endbbook{} 250 | 251 | %%% 252 | \let\endbibitem\relax 253 | 254 | %%%%%% vancouver puslapi nr. 
formavimas (i.e., formatting of Vancouver-style page numbers)
255 | %% 17-19 -> 17-9
256 | %% 17-21 -> 17-21
257 | %% 1234-1345 -> 345
258 | 
259 | \def\vcr@bfpage#1{%
260 | #1\nobreak%
261 | \test@if@number{#1}%
262 | \let\blpage\@firstofone%
263 | \if@page@isnumber\ifnum#1>9\def\@vcr@bfpage{#1}\let\blpage\fmt@blpage\fi\fi}
264 | 
265 | 
266 | \def\fmt@blpage#1{%
267 | \test@if@number{#1}%
268 | \if@page@isnumber%
269 | \ifnum\@vcr@bfpage<#1%
270 | \vcr@blpage{#1}%
271 | \else%
272 | #1\@latex@error{sprbibl: [\cur@bibitem] \string\bfpage\space > \string\blpage!}{??}%
273 | \fi%
274 | \else%
275 | #1%
276 | \fi}
277 | 
278 | % algorithm
279 | % a) divide bfpage and blpage by 10
280 | % b) if bfpage=blpage goto d
281 | % c) if bfpage<10 end else goto a
282 | % d) x:= blpage - int(blpage/10^i)*10^i
283 | 
284 | \def\vcr@blpage#1{%
285 | \@tempcnta=\@vcr@bfpage%
286 | \@tempcntb=#1%
287 | \@tempcntc=1%
288 | \def\control@cnt{1}%
289 | \@whilenum\control@cnt>0\do{%
290 | \divide\@tempcnta by10%
291 | \divide\@tempcntb by10%
292 | \multiply\@tempcntc by10%
293 | \ifnum\@tempcnta=\@tempcntb\def\control@cnt{0}\@tempswatrue%
294 | \else%
295 | \ifnum\@tempcnta<10\relax\def\control@cnt{0}\@tempswafalse\fi%
296 | \fi}%
297 | \if@tempswa%
298 | \@tempcnta=#1%
299 | \@tempcntb=#1%
300 | \divide\@tempcntb by\@tempcntc%
301 | \multiply\@tempcntb by\@tempcntc%
302 | \advance\@tempcnta by-\@tempcntb%
303 | \the\@tempcnta%
304 | \else%
305 | #1%
306 | \fi}%
307 | 
308 | \newif\if@page@isnumber
309 | 
310 | \def\test@if@number#1{%
311 | \@page@isnumberfalse%
312 | \setbox\@tempboxa=\hbox{\@tempcnta=0#1\relax\ignorespaces}%
313 | \ifdim\wd\@tempboxa>\z@\else\@page@isnumbertrue\fi}
314 | 
315 | %%%%%% Set bibliography styles:
316 | 
317 | \ifcase\bibliography@style\relax
318 | \message{^^J Reference style: \bibliography@style@name^^J}%
319 | \or % basic
320 | \message{^^J Reference style: \bibliography@style@name^^J}%
321 | \def\common@pub@types@hook{\def\betal{{\betal@style et al}}}%
322 | \or % chemistry
323 | \message{^^J Reference style: \bibliography@style@name^^J}%
324 | \or % mathphys
325 | \message{^^J Reference style: \bibliography@style@name^^J}%
326 | \setattribute{bvolume}{style}{\bfseries}
327 | \or % aps
328 | \message{^^J Reference style: \bibliography@style@name^^J}%
329 | \setattribute{bvolume}{style}{\bfseries}
330 | \setattribute{bbtitle}{style}{\itshape}
331 | \or % cs
332 | \message{^^J Reference style: \bibliography@style@name^^J}%
333 | \setattribute{bvolume}{style}{\bfseries}
334 | \or % vancouver
335 | \message{^^J Reference style: \bibliography@style@name^^J}%
336 | \def\common@pub@types@hook{\let\bfpage\vcr@bfpage}
337 | \or % apa
338 | \message{^^J Reference style: \bibliography@style@name^^J}%
339 | \setattribute{bjtitle}{style}{\itshape}
340 | \setattribute{bvolume}{style}{\itshape}
341 | \setattribute{bbtitle}{style}{\itshape}
342 | \setattribute{bsertitle}{style}{\itshape}
343 | \setattribute{bseriesno}{style}{\itshape}
344 | \or % chicago
345 | \message{^^J Reference style: \bibliography@style@name^^J}%
346 | \setattribute{bbtitle} {style}{\itshape}
347 | \setattribute{bjtitle} {style}{\itshape}
348 | \setattribute{bsertitle}{style}{\itshape}
349 | \or % ams
350 | \message{^^J Reference style: \bibliography@style@name^^J}%
351 | \setattribute{batitle}{style}{\itshape}%
352 | \setattribute{bctitle}{style}{\itshape}%
353 | \g@addto@macro\barticle{\let\batitle\xml@batitle}%
354 | \setattribute{bvolume}{style}{\bfseries}%
355 | \setattribute{bseriesno}{style}{\bfseries}%
356 | 
\g@addto@macro\bbook{\setattribute{bbtitle}{style}{\itshape}}%
357 | \g@addto@macro\bchapter{\let\bctitle\xml@bctitle%
358 | \setattribute{bbtitle}{style}{\upshape}%
359 | }%
360 | \or % bmc
361 | \message{^^J Reference style: \bibliography@style@name^^J}%
362 | \setattribute{bvolume}{style}{\bfseries}%
363 | \setattribute{batitle}{style}{\bfseries\mathversion{bold}}%
364 | \g@addto@macro\barticle{\let\batitle\xml@batitle}%
365 | \setattribute{bjtitle}{style}{\itshape}%
366 | \setattribute{bbtitle}{style}{\itshape}%
367 | \setattribute{bsertitle}{style}{\itshape}
368 | \setattribute{bctitle}{style}{\bfseries\mathversion{bold}}%
369 | \g@addto@macro\bchapter{\let\bctitle\xml@bctitle}%
370 | \fi
371 | 
372 | \endinput
373 | 
--------------------------------------------------------------------------------
/bmc-bioinformatics-paper/tsne-plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kaneplusplus/icd-10-cm-embedding/a63d21c7d8f90419515bcfc2b0fce4281a6f1e62/bmc-bioinformatics-paper/tsne-plot.png
--------------------------------------------------------------------------------
/bmc-bioinformatics-paper/vancouver.bst:
--------------------------------------------------------------------------------
1 | %%
2 | %% This `vancouver.bst' bibliographic style file (for LaTeX/BibTeX) is
3 | %% generated with the docstrip utility and modified manually to meet the
4 | %% ``Uniform Requirements for Manuscripts Submitted to Biomedical Journals''
5 | %% as published in N Engl J Med 1997;336:309-315.
6 | %% (also known as the Vancouver style)
7 | %% This specification may be found on the web page of the
8 | %% International Committee of Medical Journal Editors:
9 | %%
10 | %% http://www.icmje.org
11 | %%
12 | %%-------------------------------------------------------------------
13 | %%
14 | %% Copyright 2004 Folkert van der Beek
15 | %%
16 | %% This work may be distributed and/or modified under the
17 | %% conditions of the LaTeX Project Public License, either version 1.3
18 | %% of this license or (at your option) any later version.
19 | %% The latest version of this license is in
20 | %% http://www.latex-project.org/lppl.txt
21 | %% and version 1.3 or later is part of all distributions of LaTeX
22 | %% version 2005/12/01 or later.
23 | %%
24 | %% This work has the LPPL maintenance status `maintained'.
25 | %%
26 | %% The Current Maintainer of this work is Folkert van der Beek.
27 | %%
28 | %% Complaints, suggestions and comments may be sent to
29 | %%
30 | %% Folkert van der Beek
31 | %%
32 | %%-------------------------------------------------------------------
33 | %%
34 | %% This bibliography style file is intended for texts in ENGLISH
35 | %% This is a numerical citation style, and as such is standard LaTeX.
36 | %% It requires no extra package to interface to the main text.
37 | %% The form of the \bibitem entries is
38 | %% \bibitem{key}...
39 | %% Usage of \cite is as follows:
40 | %% \cite{key} ==>> [#]
41 | %% \cite[chap. 2]{key} ==>> [#, chap. 2]
42 | %% where # is a number determined by the ordering in the reference list.
43 | %% The order in the reference list is that by which the works were originally
44 | %% cited in the text, or that in the database.
45 | % 46 | %% To change the reference numbering system from [1] to 1, 47 | %% put the following code in the preamble: 48 | %% \makeatletter % Reference list option change 49 | %% \renewcommand\@biblabel[1]{#1} % from [1] to 1 50 | %% \makeatother % 51 | %% 52 | %%--------------------------------------------------------------------- 53 | 54 | %% List of all possible fields 55 | ENTRY 56 | { address 57 | assignee % for patents 58 | author 59 | booktitle % for articles in books 60 | chapter % for incollection, esp. internet documents 61 | cartographer % for maps 62 | day 63 | edition 64 | editor 65 | howpublished 66 | institution % for technical reports 67 | inventor % for patents 68 | journal 69 | key 70 | month 71 | note 72 | number 73 | organization 74 | pages 75 | part 76 | publisher 77 | school 78 | series 79 | title 80 | type 81 | volume 82 | word 83 | year 84 | eprint % urlbst 85 | doi % urlbst 86 | url % urlbst 87 | lastchecked % urlbst 88 | updated % urlbst 89 | } 90 | {} 91 | { label } 92 | %% Declaration of integer variables 93 | INTEGERS { output.state before.all mid.sentence after.sentence after.block } 94 | STRINGS { urlintro eprinturl eprintprefix doiprefix doiurl } % urlbst... 95 | INTEGERS { hrefform addeprints adddoiresolver } 96 | % Following constants may be adjusted by hand, if desired 97 | FUNCTION {init.config.constants} 98 | { 99 | "Available from: " 'urlintro := % prefix before URL 100 | "http://arxiv.org/abs/" 'eprinturl := % prefix to make URL from eprint ref 101 | "arXiv:" 'eprintprefix := % text prefix printed before eprint ref 102 | "http://dx.doi.org/" 'doiurl := % prefix to make URL from DOI 103 | "doi:" 'doiprefix := % text prefix printed before DOI ref 104 | #0 'addeprints := % 0=no eprints; 1=include eprints 105 | #0 'adddoiresolver := % 0=no DOI resolver; 1=include it 106 | #0 'hrefform := % 0=no crossrefs; 1=hypertex xrefs; 2=hyperref refs 107 | } 108 | INTEGERS { 109 | bracket.state 110 | outside.brackets 111 | open.brackets 112 | within.brackets 113 | close.brackets 114 | } 115 | % ...urlbst to here 116 | FUNCTION {init.state.consts} 117 | { #0 'outside.brackets := % urlbst 118 | #1 'open.brackets := 119 | #2 'within.brackets := 120 | #3 'close.brackets := 121 | 122 | #0 'before.all := 123 | #1 'mid.sentence := 124 | #2 'after.sentence := 125 | #3 'after.block := 126 | } 127 | %% Declaration of string variables 128 | STRINGS { s t} 129 | 130 | % urlbst 131 | FUNCTION {output.nonnull.original} 132 | { 's := 133 | output.state mid.sentence = 134 | { ". " * write$ } 135 | { output.state after.block = 136 | { add.period$ write$ 137 | newline$ 138 | "\newblock " write$ 139 | } 140 | { output.state before.all = 141 | 'write$ 142 | { add.period$ " " * write$ } 143 | if$ 144 | } 145 | if$ 146 | mid.sentence 'output.state := 147 | } 148 | if$ 149 | s 150 | } 151 | 152 | % urlbst... 153 | FUNCTION {output.nonnull} 154 | { % Save the thing we've been asked to output 155 | 's := 156 | % If the bracket-state is close.brackets, then add a close-bracket to 157 | % what is currently at the top of the stack, and set bracket.state 158 | % to outside.brackets 159 | bracket.state close.brackets = 160 | { "]" * 161 | outside.brackets 'bracket.state := 162 | } 163 | 'skip$ 164 | if$ 165 | bracket.state outside.brackets = 166 | { % We're outside all brackets -- this is the normal situation. 167 | % Write out what's currently at the top of the stack, using the 168 | % original output.nonnull function. 169 | s 170 | output.nonnull.original 171 | } 172 | { % Still in brackets. 
Add open-bracket or (continuation) comma, add the 173 | % new text (in s) to the top of the stack, and move to the close-brackets 174 | % state, ready for next time (unless inbrackets resets it). If we come 175 | % into this branch, then output.state is carefully undisturbed. 176 | bracket.state open.brackets = 177 | { " [" * } 178 | { ", " * } % bracket.state will be within.brackets 179 | if$ 180 | s * 181 | close.brackets 'bracket.state := 182 | } 183 | if$ 184 | } 185 | 186 | % Call this function just before adding something which should be presented in 187 | % brackets. bracket.state is handled specially within output.nonnull. 188 | FUNCTION {inbrackets} 189 | { bracket.state close.brackets = 190 | { within.brackets 'bracket.state := } % reset the state: not open nor closed 191 | { open.brackets 'bracket.state := } 192 | if$ 193 | } 194 | 195 | FUNCTION {format.lastchecked} 196 | { lastchecked empty$ 197 | { "" } 198 | { updated empty$ 199 | { inbrackets "cited " lastchecked * } 200 | { inbrackets "updated " updated * "; cited " * lastchecked * } 201 | if$ 202 | } 203 | if$ 204 | } 205 | % ...urlbst to here 206 | 207 | FUNCTION {output} 208 | { duplicate$ empty$ 209 | 'pop$ 210 | 'output.nonnull 211 | if$ 212 | } 213 | 214 | FUNCTION {output.check} 215 | { 't := 216 | duplicate$ empty$ 217 | { pop$ "empty " t * " in " * cite$ * warning$ } 218 | 'output.nonnull 219 | if$ 220 | } 221 | 222 | FUNCTION {fin.entry} 223 | { 224 | bracket.state close.brackets = % urlbst 225 | { "]" * } 226 | 'skip$ 227 | if$ 228 | add.period$ 229 | write$ 230 | newline$ 231 | } 232 | 233 | FUNCTION {new.block} 234 | { output.state before.all = 235 | 'skip$ 236 | { after.block 'output.state := } 237 | if$ 238 | } 239 | 240 | FUNCTION {new.sentence} 241 | { output.state after.block = 242 | 'skip$ 243 | { output.state before.all = 244 | 'skip$ 245 | { after.sentence 'output.state := } 246 | if$ 247 | } 248 | if$ 249 | } 250 | 251 | FUNCTION {add.blank} 252 | { " " * before.all 'output.state := 253 | } 254 | 255 | FUNCTION {no.blank.or.punct} 256 | { "" * before.all 'output.state := 257 | } 258 | 259 | FUNCTION {add.semicolon} 260 | { 261 | ";" * 262 | no.blank.or.punct 263 | } 264 | 265 | FUNCTION {date.block} 266 | { 267 | "." * 268 | no.blank.or.punct 269 | } 270 | 271 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 272 | % LOGICAL `NOT', `AND', AND `OR' % 273 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 274 | 275 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 276 | % Logical 'not': 277 | % If the first element on the stack is A then this function 278 | % does the following: 279 | % push { #0 } 280 | % push { #1 } 281 | % So now the first 3 elements of the stack are 282 | % { #1 } { #0 } A 283 | % The first 3 are popped and subjected to 'if': 284 | % If A > 0 then { #0 } is executed, else { #1 } is executed: 285 | % if A > 0 286 | % then 0 287 | % else 1 288 | % So consider integers as logicals, where 1 = true and 0 = false, 289 | % then this does 290 | % (if A then false else true) 291 | % which is a logical 'not'. 
292 | 293 | FUNCTION {not} 294 | { { #0 } 295 | { #1 } 296 | if$ 297 | } 298 | 299 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 300 | % Logical 'and': 301 | % If the first 2 elements on the stack are A B 302 | % then this function does the following: 303 | % push 'skip$ 304 | % push { pop$ #0 } 305 | % So now first 4 elements are 306 | % { pop$ #0 } 'skip$ A B 307 | % The first 3 are popped and subjected to 'if' (B is on top of 308 | % the stack): 309 | % If A > 0 then 'skip$ is executed, else { pop$ #0 } is executed: 310 | % if A > 0 311 | % then (B stays on top of stack) 312 | % else (B is popped and #0 is pushed) 313 | % So consider integers as logicals, where 1 = true and 0 = false, 314 | % then this does 315 | % (if A then B else false) 316 | % which is a logical 'and'. 317 | 318 | FUNCTION {and} 319 | { 'skip$ 320 | { pop$ #0 } 321 | if$ 322 | } 323 | 324 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 325 | % Logical 'or': 326 | % If the first 2 elements on the stack are A B 327 | % then this function does the following: 328 | % push { pop$ #1 } 329 | % push 'skip$ 330 | % So now first 4 elements are 331 | % 'skip$ { pop$ #1 } A B 332 | % The first 3 are popped and subjected to 'if' (B is on top of 333 | % the stack): 334 | % If A > 0 then { pop$ #1 } is executed, else 'skip$ is executed: 335 | % if A > 0 336 | % then (B is popped and #1 is pushed) 337 | % else (B stays on top of stack) 338 | % So consider integers as logicals, where 1 = true and 0 = false, 339 | % then this does 340 | % (if A then true else B) 341 | % which is a logical 'or'. 342 | 343 | FUNCTION {or} 344 | { { pop$ #1 } 345 | 'skip$ 346 | if$ 347 | } 348 | 349 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 350 | % GENERAL PURPOSE FUNCTIONS FOR FORMATTING % 351 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 352 | 353 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 354 | % issues warning if field is empty 355 | % call with 356 | % "field" field warning.if.empty 357 | % Note that the first field must be between quotes 358 | % because it is the fieldname for use in the warning message. 
359 | % 360 | 361 | FUNCTION {warning.if.empty} 362 | { empty$ 363 | { "No " swap$ * " in " * cite$ * warning$ } 364 | { pop$ } 365 | if$ 366 | } 367 | 368 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 369 | % 370 | % encloses string in pre- and postfix string 371 | % call with 372 | % prefix postfix S enclose.check 373 | % delivers empty string if S empty 374 | % 375 | FUNCTION {enclose.check} 376 | { duplicate$ empty$ 377 | { pop$ pop$ pop$ 378 | "" 379 | } 380 | { swap$ * * } 381 | if$ 382 | } 383 | 384 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 385 | % 386 | % emphasizes top of stack 387 | % call with 388 | % string" emphasize.check 389 | % 390 | 391 | FUNCTION {emphasize.check} 392 | { "\Bem{" swap$ 393 | "}" swap$ 394 | enclose.check 395 | } 396 | 397 | 398 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 399 | % 400 | % brackets top of stack 401 | % call with 402 | % "string" bracket.check 403 | % 404 | FUNCTION {bracket.check} 405 | { "[" swap$ 406 | "]" swap$ 407 | enclose.check 408 | } 409 | 410 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 411 | % 412 | % parenthesizes top of stack 413 | % call with 414 | % "string" parenthesize 415 | % 416 | FUNCTION {parenthesize.check} 417 | { "(" swap$ 418 | ")" swap$ 419 | enclose.check 420 | } 421 | 422 | STRINGS {z} 423 | 424 | FUNCTION {remove.dots} 425 | { 'z := % expects string on top of the stack, pops the string and assigns it to variable z 426 | "" % push empty string 427 | { z empty$ not } % returns 0 if variable z is empty 428 | { z #1 #1 substring$ % push the first character of variable z 429 | z #2 global.max$ substring$ 'z := % assigns the 2nd to last character of variable z to variable z 430 | duplicate$ "\" = % pushes 1 if the last character is "\", otherwise 0 431 | { * % concatenates the last 2 literals 432 | z #1 #1 substring$ % push the first character of variable z 433 | z #2 global.max$ substring$ 'z := % assigns the 2nd to last character of variable z to variable z 434 | * % concatenates the last 2 literals, i.e. every character, even a dot, following a "\" will be printed 435 | } 436 | { duplicate$ "." 
= % pushes 1 if the last character is ".", otherwise 0 437 | 'pop$ % pushes the pop$ function 438 | { * } % concatenates the last 2 literals 439 | if$ % pops the last character if it is a dot, otherwise concatenates it with the string on top of the stack 440 | } 441 | if$ 442 | } 443 | while$ 444 | } 445 | 446 | INTEGERS {l} 447 | FUNCTION{string.length} 448 | { 449 | #1 'l := 450 | { duplicate$ duplicate$ #1 l substring$ = not } 451 | { l #1 + 'l := } 452 | while$ 453 | pop$ l 454 | } 455 | 456 | STRINGS {replace find text} 457 | INTEGERS {find_length} 458 | FUNCTION {find.replace} 459 | { 460 | 'replace := 461 | 'find := 462 | 'text := 463 | find string.length 'find_length := 464 | "" 465 | { text empty$ not } 466 | { text #1 find_length substring$ find = 467 | { 468 | replace * 469 | text #1 find_length + global.max$ substring$ 'text := 470 | } 471 | { text #1 #1 substring$ * 472 | text #2 global.max$ substring$ 'text := 473 | } 474 | if$ 475 | } 476 | while$ 477 | } 478 | 479 | FUNCTION {new.block.checka} 480 | { empty$ 481 | 'skip$ 482 | 'new.block 483 | if$ 484 | } 485 | 486 | FUNCTION {new.block.checkb} 487 | { empty$ 488 | swap$ empty$ 489 | and 490 | 'skip$ 491 | 'new.block 492 | if$ 493 | } 494 | 495 | FUNCTION {new.sentence.checka} 496 | { empty$ 497 | 'skip$ 498 | 'new.sentence 499 | if$ 500 | } 501 | 502 | FUNCTION {new.sentence.checkb} 503 | { empty$ 504 | swap$ empty$ 505 | and 506 | 'skip$ 507 | 'new.sentence 508 | if$ 509 | } 510 | 511 | FUNCTION {field.or.null} 512 | { duplicate$ empty$ 513 | { pop$ "" } 514 | 'skip$ 515 | if$ 516 | } 517 | 518 | FUNCTION {emphasize} 519 | { skip$ } 520 | 521 | FUNCTION {tie.or.space.prefix} 522 | { duplicate$ text.length$ #3 < 523 | { "~" } 524 | { " " } 525 | if$ 526 | swap$ 527 | } 528 | 529 | FUNCTION {capitalize} 530 | { "u" change.case$ "t" change.case$ } 531 | 532 | FUNCTION {space.word} 533 | { " " swap$ * " " * } 534 | 535 | % Here are the language-specific definitions for explicit words. 536 | % Each function has a name bbl.xxx where xxx is the English word. 537 | % The language selected here is ENGLISH 538 | 539 | FUNCTION {bbl.and} 540 | { "and"} 541 | 542 | FUNCTION {bbl.etal} 543 | { "et~al." } 544 | 545 | FUNCTION {bbl.editors} 546 | { "editors" } 547 | 548 | FUNCTION {bbl.editor} 549 | { "editor" } 550 | 551 | FUNCTION {bbl.cartographers} 552 | { "cartographers" } 553 | 554 | FUNCTION {bbl.cartographer} 555 | { "cartographer" } 556 | 557 | FUNCTION {bbl.inventors} 558 | { "inventors" } 559 | 560 | FUNCTION {bbl.inventor} 561 | { "inventor" } 562 | 563 | FUNCTION {bbl.assignees} 564 | { "assignees" } 565 | 566 | FUNCTION {bbl.assignee} 567 | { "assignee" } 568 | 569 | FUNCTION {bbl.edby} 570 | { "edited by" } 571 | 572 | FUNCTION {bbl.edition} 573 | { "ed." } 574 | 575 | FUNCTION {bbl.volume} 576 | { "vol." } 577 | 578 | FUNCTION {bbl.of} 579 | { "of" } 580 | 581 | FUNCTION {bbl.number} 582 | { "no." } 583 | 584 | FUNCTION {bbl.nr} 585 | { "no." } 586 | 587 | FUNCTION {bbl.in} 588 | { "in" } 589 | 590 | FUNCTION {bbl.pages} 591 | { "p." } 592 | 593 | FUNCTION {bbl.page} 594 | { "p." } 595 | 596 | FUNCTION {bbl.chapter} 597 | { "chap." } 598 | 599 | FUNCTION {bbl.techrep} 600 | { "Tech. Rep." } 601 | 602 | FUNCTION {bbl.mthesis} 603 | { "Master's thesis" } 604 | 605 | FUNCTION {bbl.phdthesis} 606 | { "Ph.D. 
thesis" } 607 | 608 | FUNCTION {bbl.first} 609 | { "1st" } 610 | 611 | FUNCTION {bbl.second} 612 | { "2nd" } 613 | 614 | FUNCTION {bbl.third} 615 | { "3rd" } 616 | 617 | FUNCTION {bbl.fourth} 618 | { "4th" } 619 | 620 | FUNCTION {bbl.fifth} 621 | { "5th" } 622 | 623 | FUNCTION {bbl.st} 624 | { "st" } 625 | 626 | FUNCTION {bbl.nd} 627 | { "nd" } 628 | 629 | FUNCTION {bbl.rd} 630 | { "rd" } 631 | 632 | FUNCTION {bbl.th} 633 | { "th" } 634 | 635 | MACRO {jan} {"Jan."} 636 | 637 | MACRO {feb} {"Feb."} 638 | 639 | MACRO {mar} {"Mar."} 640 | 641 | MACRO {apr} {"Apr."} 642 | 643 | MACRO {may} {"May"} 644 | 645 | MACRO {jun} {"Jun."} 646 | 647 | MACRO {jul} {"Jul."} 648 | 649 | MACRO {aug} {"Aug."} 650 | 651 | MACRO {sep} {"Sep."} 652 | 653 | MACRO {oct} {"Oct."} 654 | 655 | MACRO {nov} {"Nov."} 656 | 657 | MACRO {dec} {"Dec."} 658 | 659 | FUNCTION {eng.ord} 660 | { duplicate$ "1" swap$ * 661 | #-2 #1 substring$ "1" = 662 | { bbl.th * } 663 | { duplicate$ #-1 #1 substring$ 664 | duplicate$ "1" = 665 | { pop$ bbl.st * } 666 | { duplicate$ "2" = 667 | { pop$ bbl.nd * } 668 | { "3" = 669 | { bbl.rd * } 670 | { bbl.th * } 671 | if$ 672 | } 673 | if$ 674 | } 675 | if$ 676 | } 677 | if$ 678 | } 679 | 680 | FUNCTION {bibinfo.check} 681 | { swap$ 682 | duplicate$ missing$ 683 | { 684 | pop$ pop$ 685 | "" 686 | } 687 | { duplicate$ empty$ 688 | { 689 | swap$ pop$ 690 | } 691 | { swap$ 692 | pop$ 693 | } 694 | if$ 695 | } 696 | if$ 697 | } 698 | 699 | FUNCTION {bibinfo.warn} 700 | { swap$ 701 | duplicate$ missing$ 702 | { 703 | swap$ "missing " swap$ * " in " * cite$ * warning$ pop$ 704 | "" 705 | } 706 | { duplicate$ empty$ 707 | { 708 | swap$ "empty " swap$ * " in " * cite$ * warning$ 709 | } 710 | { swap$ 711 | pop$ 712 | } 713 | if$ 714 | } 715 | if$ 716 | } 717 | 718 | STRINGS { bibinfo} 719 | INTEGERS { nameptr namesleft numnames } 720 | 721 | FUNCTION {format.names} 722 | { 'bibinfo := 723 | duplicate$ empty$ 'skip$ { 724 | "." ". 
" find.replace 's := 725 | "" 't := 726 | #1 'nameptr := 727 | s num.names$ 'numnames := 728 | numnames 'namesleft := 729 | { namesleft #0 > } 730 | { s nameptr 731 | "{vv~}{ll}{ f{}}{ jj}" 732 | format.name$ 733 | remove.dots 734 | bibinfo bibinfo.check 735 | 't := 736 | nameptr #1 > 737 | { 738 | nameptr #6 739 | #1 + = 740 | numnames #6 741 | > and 742 | { "others" 't := 743 | #1 'namesleft := } 744 | 'skip$ 745 | if$ 746 | namesleft #1 > 747 | { ", " * t * } 748 | { 749 | "," * 750 | s nameptr "{ll}" format.name$ duplicate$ "others" = 751 | { 't := } 752 | { pop$ } 753 | if$ 754 | t "others" = 755 | { 756 | " " * bbl.etal * 757 | } 758 | { " " * t * } 759 | if$ 760 | } 761 | if$ 762 | } 763 | 't 764 | if$ 765 | nameptr #1 + 'nameptr := 766 | namesleft #1 - 'namesleft := 767 | } 768 | while$ 769 | } if$ 770 | } 771 | 772 | FUNCTION {format.names.org} 773 | { 'bibinfo := 774 | duplicate$ empty$ 'skip$ { 775 | 's := 776 | "" 't := 777 | #1 'nameptr := 778 | s num.names$ 'numnames := 779 | numnames 'namesleft := 780 | { namesleft #0 > } 781 | { s nameptr 782 | "{ff~}{vv~}{ll}" 783 | format.name$ 784 | bibinfo bibinfo.check 785 | 't := 786 | nameptr #1 > 787 | { 788 | namesleft #1 > 789 | { "; " * t * } 790 | { 791 | ";" * 792 | s nameptr "{ll}" format.name$ duplicate$ "others" = 793 | { 't := } 794 | { pop$ } 795 | if$ 796 | t "others" = 797 | { 798 | " " * bbl.etal * 799 | } 800 | { " " * t * } 801 | if$ 802 | } 803 | if$ 804 | } 805 | 't 806 | if$ 807 | nameptr #1 + 'nameptr := 808 | namesleft #1 - 'namesleft := 809 | } 810 | while$ 811 | } if$ 812 | } 813 | 814 | FUNCTION {format.names.ed} 815 | { 816 | format.names 817 | } 818 | 819 | FUNCTION {format.authors} 820 | { 821 | author "author" format.names 822 | %%"." " " "author" find.replace format.names 823 | } 824 | 825 | FUNCTION {format.organizations} 826 | { organization "organization" format.names.org 827 | } 828 | 829 | FUNCTION {get.bbl.editor} 830 | { editor num.names$ #1 > 'bbl.editors 'bbl.editor if$ } 831 | 832 | FUNCTION {get.bbl.cartographer} 833 | { cartographer num.names$ #1 > 'bbl.cartographers 'bbl.cartographer if$ } 834 | 835 | FUNCTION {get.bbl.inventor} 836 | { inventor num.names$ #1 > 'bbl.inventors 'bbl.inventor if$ } 837 | 838 | FUNCTION {get.bbl.assignee} 839 | { assignee num.names$ #1 > 'bbl.assignees 'bbl.assignee if$ } 840 | 841 | FUNCTION {format.editors} 842 | { editor "editor" format.names duplicate$ empty$ 'skip$ 843 | { 844 | "," * 845 | " " * 846 | get.bbl.editor 847 | * 848 | } 849 | if$ 850 | } 851 | 852 | FUNCTION {format.assignees} 853 | { assignee "assignee" format.names.org duplicate$ empty$ 'skip$ 854 | { 855 | "," * 856 | " " * 857 | get.bbl.assignee 858 | * 859 | } 860 | if$ 861 | } 862 | 863 | FUNCTION {format.cartographers} 864 | { cartographer "cartographer" format.names duplicate$ empty$ 'skip$ 865 | { 866 | "," * 867 | " " * 868 | get.bbl.cartographer 869 | * 870 | } 871 | if$ 872 | } 873 | 874 | FUNCTION {format.inventors} 875 | { inventor "inventor" format.names duplicate$ empty$ 'skip$ 876 | { 877 | "," * 878 | " " * 879 | get.bbl.inventor 880 | * 881 | } 882 | if$ 883 | } 884 | 885 | FUNCTION {format.note} 886 | { 887 | note empty$ 888 | { "" } 889 | { note #1 #1 substring$ 890 | duplicate$ "{" = 891 | 'skip$ 892 | { output.state mid.sentence = 893 | { "l" } 894 | { "u" } 895 | if$ 896 | change.case$ 897 | } 898 | if$ 899 | note #2 global.max$ substring$ * "note" bibinfo.check 900 | } 901 | if$ 902 | } 903 | 904 | FUNCTION {format.title} 905 | { title 906 | %%duplicate$ empty$ 'skip$ 
907 | %% { "t" change.case$ } 908 | %%if$ 909 | "title" bibinfo.check 910 | } 911 | 912 | FUNCTION {format.type} 913 | { type empty$ 914 | 'skip$ 915 | { inbrackets type } 916 | %%{ add.blank "[" type * "]" * } 917 | if$ 918 | } 919 | 920 | FUNCTION {output.bibitem} 921 | { outside.brackets 'bracket.state := % urlbst 922 | newline$ 923 | "\bibitem{" write$ 924 | cite$ write$ 925 | "}" write$ 926 | newline$ 927 | "" 928 | before.all 'output.state := 929 | } 930 | 931 | FUNCTION {n.dashify} 932 | { 933 | 't := 934 | "" 935 | { t empty$ not } 936 | { t #1 #1 substring$ "-" = 937 | { t #1 #2 substring$ "--" = not 938 | { "--" * 939 | t #2 global.max$ substring$ 't := 940 | } 941 | { { t #1 #1 substring$ "-" = } 942 | { "-" * 943 | t #2 global.max$ substring$ 't := 944 | } 945 | while$ 946 | } 947 | if$ 948 | } 949 | { t #1 #1 substring$ * 950 | t #2 global.max$ substring$ 't := 951 | } 952 | if$ 953 | } 954 | while$ 955 | } 956 | 957 | FUNCTION {word.in} 958 | { bbl.in capitalize 959 | ":" * 960 | " " * } 961 | 962 | FUNCTION {format.journal.date} 963 | { 964 | month "month" bibinfo.check 965 | duplicate$ empty$ 966 | year "year" bibinfo.check duplicate$ empty$ 967 | { 968 | swap$ 'skip$ 969 | { "there's a month but no year in " cite$ * warning$ } 970 | if$ 971 | * 972 | } 973 | { swap$ 'skip$ 974 | { 975 | " " * swap$ 976 | } 977 | if$ 978 | * 979 | remove.dots 980 | } 981 | if$ 982 | duplicate$ empty$ 983 | 'skip$ 984 | { 985 | before.all 'output.state := 986 | after.sentence 'output.state := 987 | } 988 | if$ 989 | } 990 | 991 | FUNCTION {format.date} 992 | { 993 | no.blank.or.punct 994 | ";" 995 | duplicate$ empty$ 996 | year "year" bibinfo.check duplicate$ empty$ 997 | { swap$ 'skip$ 998 | { "there's a month but no year in " cite$ * warning$ } 999 | if$ 1000 | * 1001 | } 1002 | { swap$ 'skip$ 1003 | { 1004 | swap$ 1005 | " " * swap$ 1006 | } 1007 | if$ 1008 | * 1009 | } 1010 | if$ 1011 | } 1012 | 1013 | FUNCTION {format.btitle} 1014 | { title "title" bibinfo.check 1015 | duplicate$ empty$ 'skip$ 1016 | { 1017 | } 1018 | if$ 1019 | } 1020 | 1021 | FUNCTION {either.or.check} 1022 | { empty$ 1023 | 'pop$ 1024 | { "can't use both " swap$ * " fields in " * cite$ * warning$ } 1025 | if$ 1026 | } 1027 | 1028 | FUNCTION {format.bvolume} 1029 | { volume empty$ 1030 | { "" } 1031 | { bbl.volume volume tie.or.space.prefix 1032 | "volume" bibinfo.check * * 1033 | series "series" bibinfo.check 1034 | duplicate$ empty$ 'pop$ 1035 | { swap$ bbl.of space.word * swap$ 1036 | emphasize * } 1037 | if$ 1038 | "volume and number" number either.or.check 1039 | } 1040 | if$ 1041 | } 1042 | 1043 | FUNCTION {format.number.series} 1044 | { volume empty$ 1045 | { number empty$ 1046 | { series field.or.null } 1047 | { series empty$ 1048 | { number "number" bibinfo.check } 1049 | { output.state mid.sentence = 1050 | { bbl.number } 1051 | { bbl.number capitalize } 1052 | if$ 1053 | number tie.or.space.prefix "number" bibinfo.check * * 1054 | bbl.in space.word * 1055 | series "series" bibinfo.check * 1056 | } 1057 | if$ 1058 | } 1059 | if$ 1060 | } 1061 | { "" } 1062 | if$ 1063 | } 1064 | 1065 | FUNCTION {is.num} 1066 | { chr.to.int$ 1067 | duplicate$ "0" chr.to.int$ < not 1068 | swap$ "9" chr.to.int$ > not and 1069 | } 1070 | 1071 | FUNCTION {extract.num} 1072 | { duplicate$ 't := 1073 | "" 's := 1074 | { t empty$ not } 1075 | { t #1 #1 substring$ 1076 | t #2 global.max$ substring$ 't := 1077 | duplicate$ is.num 1078 | { s swap$ * 's := } 1079 | { pop$ "" 't := } 1080 | if$ 1081 | } 1082 | while$ 1083 | s empty$ 1084 
| 'skip$ 1085 | { pop$ s } 1086 | if$ 1087 | } 1088 | 1089 | FUNCTION {convert.edition} 1090 | { extract.num "l" change.case$ 's := 1091 | s "first" = s "1" = or 1092 | { bbl.first 't := } 1093 | { s "second" = s "2" = or 1094 | { bbl.second 't := } 1095 | { s "third" = s "3" = or 1096 | { bbl.third 't := } 1097 | { s "fourth" = s "4" = or 1098 | { bbl.fourth 't := } 1099 | { s "fifth" = s "5" = or 1100 | { bbl.fifth 't := } 1101 | { s #1 #1 substring$ is.num 1102 | { s eng.ord 't := } 1103 | { edition 't := } 1104 | if$ 1105 | } 1106 | if$ 1107 | } 1108 | if$ 1109 | } 1110 | if$ 1111 | } 1112 | if$ 1113 | } 1114 | if$ 1115 | t 1116 | } 1117 | 1118 | FUNCTION {format.edition} 1119 | { edition duplicate$ empty$ 'skip$ 1120 | { 1121 | convert.edition 1122 | output.state mid.sentence = 1123 | { "l" } 1124 | { "t" } 1125 | if$ change.case$ 1126 | "edition" bibinfo.check 1127 | " " * bbl.edition * 1128 | } 1129 | if$ 1130 | } 1131 | INTEGERS { multiresult } 1132 | FUNCTION {multi.page.check} 1133 | { 't := 1134 | #0 'multiresult := 1135 | { multiresult not 1136 | t empty$ not 1137 | and 1138 | } 1139 | { t #1 #1 substring$ 1140 | duplicate$ "-" = 1141 | swap$ duplicate$ "," = 1142 | swap$ "+" = 1143 | or or 1144 | { #1 'multiresult := } 1145 | { t #2 global.max$ substring$ 't := } 1146 | if$ 1147 | } 1148 | while$ 1149 | multiresult 1150 | } 1151 | 1152 | FUNCTION {format.pages} 1153 | { pages duplicate$ empty$ 'skip$ 1154 | { duplicate$ multi.page.check 1155 | { 1156 | bbl.pages swap$ 1157 | n.dashify 1158 | } 1159 | { 1160 | bbl.page swap$ 1161 | } 1162 | if$ 1163 | tie.or.space.prefix 1164 | "pages" bibinfo.check 1165 | * * 1166 | } 1167 | if$ 1168 | } 1169 | 1170 | FUNCTION {format.journal.pages} 1171 | { pages duplicate$ empty$ 'pop$ 1172 | { swap$ duplicate$ empty$ 1173 | { pop$ pop$ format.pages } 1174 | { 1175 | ":" * 1176 | swap$ 1177 | n.dashify 1178 | "pages" bibinfo.check 1179 | * 1180 | } 1181 | if$ 1182 | } 1183 | if$ 1184 | } 1185 | 1186 | FUNCTION {format.vol.num} 1187 | { volume field.or.null 1188 | duplicate$ empty$ 'skip$ 1189 | { 1190 | "volume" bibinfo.check 1191 | } 1192 | if$ 1193 | number "number" bibinfo.check duplicate$ empty$ 'skip$ 1194 | { 1195 | swap$ duplicate$ empty$ 1196 | { "there's a number but no volume in " cite$ * warning$ } 1197 | 'skip$ 1198 | if$ 1199 | swap$ 1200 | "(" swap$ * ")" * 1201 | } 1202 | if$ * 1203 | } 1204 | 1205 | FUNCTION {format.vol.num.pages} 1206 | { volume field.or.null 1207 | duplicate$ empty$ 'skip$ 1208 | { 1209 | "volume" bibinfo.check 1210 | } 1211 | if$ 1212 | number "number" bibinfo.check duplicate$ empty$ 'skip$ 1213 | { 1214 | swap$ duplicate$ empty$ 1215 | { "there's a number but no volume in " cite$ * warning$ } 1216 | 'skip$ 1217 | if$ 1218 | swap$ 1219 | "(" swap$ * ")" * 1220 | } 1221 | if$ * 1222 | format.journal.pages 1223 | } 1224 | 1225 | FUNCTION {format.chapter.pages} 1226 | { chapter empty$ 1227 | 'format.pages 1228 | { type empty$ 1229 | { bbl.chapter } 1230 | { type "l" change.case$ 1231 | "type" bibinfo.check 1232 | } 1233 | if$ 1234 | chapter tie.or.space.prefix 1235 | "chapter" bibinfo.check 1236 | * * 1237 | pages empty$ 1238 | 'skip$ 1239 | { ", " * format.pages * } 1240 | if$ 1241 | } 1242 | if$ 1243 | } 1244 | 1245 | FUNCTION {format.booktitle} 1246 | { 1247 | booktitle "booktitle" bibinfo.check 1248 | } 1249 | 1250 | FUNCTION {format.in.ed.booktitle} 1251 | { format.booktitle duplicate$ empty$ 'skip$ 1252 | { 1253 | editor "editor" format.names.ed duplicate$ empty$ 'pop$ 1254 | { 1255 | "," * 1256 | " " * 
1257 | get.bbl.editor
1258 | ". " *
1259 | * swap$
1260 | * }
1261 | if$
1262 | word.in swap$ *
1263 | }
1264 | if$
1265 | }
1266 | 
1267 | FUNCTION {format.in.ed.title}
1268 | { format.title duplicate$ empty$ 'skip$
1269 | {
1270 | editor "editor" format.names.ed duplicate$ empty$ 'pop$
1271 | {
1272 | "," *
1273 | " " *
1274 | get.bbl.editor
1275 | ". " *
1276 | * swap$
1277 | * }
1278 | if$
1279 | word.in swap$ *
1280 | }
1281 | if$
1282 | }
1283 | 
1284 | FUNCTION {empty.misc.check}
1285 | { author empty$ title empty$ howpublished empty$
1286 | month empty$ year empty$ note empty$
1287 | and and and and and
1288 | { "all relevant fields are empty in " cite$ * warning$ }
1289 | 'skip$
1290 | if$
1291 | }
1292 | FUNCTION {format.thesis.type}
1293 | { type duplicate$ empty$
1294 | 'pop$
1295 | { swap$ pop$
1296 | "t" change.case$ "type" bibinfo.check
1297 | }
1298 | if$
1299 | }
1300 | FUNCTION {format.tr.number}
1301 | {
1302 | number "number" bibinfo.check
1303 | %%type duplicate$ empty$
1304 | %%{ pop$ bbl.techrep }
1305 | %%'skip$
1306 | %%if$
1307 | %%"type" bibinfo.check
1308 | %%swap$ duplicate$ empty$
1309 | %%{ pop$ "t" change.case$ }
1310 | %%{ tie.or.space.prefix * * }
1311 | %%if$
1312 | }
1313 | 
1314 | FUNCTION {format.org.or.pub}
1315 | { 't :=
1316 | ""
1317 | address empty$ t empty$ and
1318 | 'skip$
1319 | {
1320 | address "address" bibinfo.check *
1321 | t empty$
1322 | 'skip$
1323 | { address empty$
1324 | 'skip$
1325 | { ": " * }
1326 | if$
1327 | t *
1328 | }
1329 | if$
1330 | }
1331 | if$
1332 | }
1333 | 
1334 | FUNCTION {format.publisher.address}
1335 | { publisher "publisher" bibinfo.warn format.org.or.pub
1336 | }
1337 | 
1338 | FUNCTION {format.organization.address}
1339 | { organization "organization" bibinfo.check format.org.or.pub
1340 | }
1341 | 
1342 | FUNCTION {format.institution.address}
1343 | { institution "institution" bibinfo.check format.org.or.pub
1344 | }
1345 | 
1346 | 
1347 | % urlbst...
1348 | % Functions for making hypertext links.
1349 | % In all cases, the stack has (link-text href-url)
1350 | %
1351 | % make 'null' specials
1352 | FUNCTION {make.href.null}
1353 | {
1354 | pop$
1355 | }
1356 | % make hypertex specials
1357 | FUNCTION {make.href.hypertex}
1358 | {
1359 | "\special {html:<a href=" quote$ *
1360 | swap$ * quote$ * "> }" * swap$ *
1361 | "\special {html:</a>}" *
1362 | }
1363 | % make hyperref specials
1364 | FUNCTION {make.href.hyperref}
1365 | {
1366 | "\href {" swap$ * "} {" * swap$ * "}" *
1367 | }
1368 | FUNCTION {make.href}
1369 | { hrefform #2 =
1370 | 'make.href.hyperref % hrefform = 2
1371 | { hrefform #1 =
1372 | 'make.href.hypertex % hrefform = 1
1373 | 'make.href.null % hrefform = 0 (or anything else)
1374 | if$
1375 | }
1376 | if$
1377 | }
1378 | 
1379 | FUNCTION {format.url}
1380 | { url empty$
1381 | { "" }
1382 | { hrefform #1 =
1383 | { % special case -- add HyperTeX specials
1384 | urlintro "\url{" url * "}" * url make.href.hypertex * }
1385 | { urlintro "\url{" * url * "}" * }
1386 | if$
1387 | }
1388 | if$
1389 | }
1390 | 
1391 | FUNCTION {format.eprint}
1392 | { eprint empty$
1393 | { "" }
1394 | { eprintprefix eprint * eprinturl eprint * make.href }
1395 | if$
1396 | }
1397 | 
1398 | FUNCTION {format.doi}
1399 | { doi empty$
1400 | { "" }
1401 | { doiprefix doi * doiurl doi * make.href }
1402 | if$
1403 | }
1404 | 
1405 | % Output a URL.
We can't use the more normal idiom (something like 1406 | % `format.url output'), because the `inbrackets' within 1407 | % format.lastchecked applies to everything between calls to `output', 1408 | % so that `format.url format.lastchecked * output' ends up with both 1409 | % the URL and the lastchecked in brackets. 1410 | FUNCTION {output.url} 1411 | { url empty$ 1412 | 'skip$ 1413 | { new.block 1414 | format.url output 1415 | format.lastchecked output 1416 | } 1417 | if$ 1418 | } 1419 | 1420 | FUNCTION {output.web.refs} 1421 | { 1422 | new.block 1423 | output.url 1424 | addeprints eprint empty$ not and 1425 | { format.eprint output.nonnull } 1426 | 'skip$ 1427 | if$ 1428 | adddoiresolver doi empty$ not and 1429 | { format.doi output.nonnull } 1430 | 'skip$ 1431 | if$ 1432 | % addeprints 1433 | % { eprint empty$ 1434 | % 'skip$ 1435 | % { format.eprint output.nonnull } 1436 | % if$ 1437 | % } 1438 | % 'skip$ 1439 | % if$ 1440 | } 1441 | 1442 | % Webpage entry type. 1443 | % Title and url fields required; 1444 | % author, note, year, month, and lastchecked fields optional 1445 | STRINGS {database} 1446 | FUNCTION {webpage} 1447 | { output.bibitem 1448 | author empty$ 1449 | { editor empty$ 1450 | 'skip$ % author and editor both optional 1451 | { format.editors output.nonnull } 1452 | if$ 1453 | } 1454 | { editor empty$ 1455 | { format.authors output.nonnull } 1456 | { "can't use both author and editor fields in " cite$ * warning$ } 1457 | if$ 1458 | } 1459 | if$ 1460 | % author empty$ 1461 | % 'skip$ 1462 | % { format.authors output.nonnull } 1463 | % if$ 1464 | new.block 1465 | format.title "title" output.check 1466 | journal empty$ 1467 | { 1468 | format.type "type" output.check 1469 | publisher empty$ 1470 | 'skip$ 1471 | { format.publisher.address output } 1472 | if$ 1473 | "database on the Internet" 'database := 1474 | type database = 1475 | { format.journal.date "year" output.check } 1476 | { format.date "year" output.check } 1477 | if$ 1478 | lastchecked empty$ 1479 | 'skip$ 1480 | { format.lastchecked output } 1481 | if$ 1482 | new.block 1483 | part empty$ 1484 | 'skip$ 1485 | { part output } 1486 | if$ 1487 | pages empty$ 1488 | 'skip$ 1489 | { pages bracket.check output } 1490 | if$ 1491 | } 1492 | { journal 1493 | remove.dots 1494 | "journal" bibinfo.check 1495 | "journal" output.check 1496 | format.type "type" output.check 1497 | format.journal.date "year" output.check 1498 | lastchecked empty$ 1499 | 'skip$ 1500 | { format.lastchecked output 1501 | ";" no.blank.or.punct output 1502 | } 1503 | if$ 1504 | no.blank.or.punct format.vol.num output 1505 | pages empty$ 1506 | 'skip$ 1507 | { ":" no.blank.or.punct output 1508 | no.blank.or.punct pages bracket.check output 1509 | } 1510 | if$ 1511 | new.block 1512 | } 1513 | if$ 1514 | format.url "url" output.check 1515 | new.block 1516 | note output 1517 | fin.entry 1518 | } 1519 | % ...urlbst to here 1520 | 1521 | FUNCTION {misc} 1522 | { output.bibitem 1523 | format.authors "author" output.check 1524 | format.editors "author and editor" output.check 1525 | format.title "title" output.check 1526 | type missing$ 1527 | { skip$ } 1528 | { format.type "type" output.check } 1529 | %%{ inbrackets type output } 1530 | if$ 1531 | new.block 1532 | format.publisher.address output 1533 | format.date "year" output.check 1534 | new.block 1535 | format.note output 1536 | new.block 1537 | howpublished new.block.checka 1538 | howpublished "howpublished" bibinfo.check output 1539 | output.web.refs % urlbst 1540 | fin.entry 1541 | empty.misc.check 
1542 | } 1543 | 1544 | FUNCTION {article} 1545 | { output.bibitem 1546 | format.authors "author" output.check 1547 | organization empty$ 1548 | 'skip$ 1549 | { author empty$ 1550 | { 1551 | format.organizations "organization" output.check 1552 | } 1553 | { 1554 | "; " * 1555 | no.blank.or.punct 1556 | format.organizations "organization" output.check 1557 | } 1558 | if$ 1559 | } 1560 | if$ 1561 | new.block 1562 | format.title "title" output.check 1563 | type missing$ 1564 | { skip$ } 1565 | { format.type "type" output.check } 1566 | if$ 1567 | new.block 1568 | journal 1569 | remove.dots 1570 | "journal" bibinfo.check 1571 | "journal" output.check 1572 | format.journal.date "year" output.check 1573 | add.semicolon 1574 | format.vol.num.pages output 1575 | new.block 1576 | format.note output 1577 | output.web.refs % urlbst 1578 | fin.entry 1579 | } 1580 | 1581 | FUNCTION {book} 1582 | { output.bibitem 1583 | author empty$ 1584 | { editor empty$ 1585 | { format.organizations "organization" output.check } 1586 | { format.editors "author and editor" output.check } 1587 | if$ 1588 | } 1589 | { format.authors output.nonnull 1590 | "author and editor" editor either.or.check 1591 | } 1592 | if$ 1593 | new.block 1594 | format.btitle "title" output.check 1595 | format.bvolume output 1596 | new.block 1597 | format.edition output 1598 | new.sentence 1599 | author empty$ not 1600 | editor empty$ not 1601 | and 1602 | { format.editors "author and editor" output.check } 1603 | 'skip$ 1604 | if$ 1605 | format.number.series output 1606 | format.publisher.address output 1607 | format.date "year" output.check 1608 | new.block 1609 | format.note output 1610 | output.web.refs % urlbst 1611 | fin.entry 1612 | } 1613 | 1614 | FUNCTION {booklet} 1615 | { misc } 1616 | 1617 | FUNCTION {dictionary} 1618 | { output.bibitem 1619 | format.booktitle "booktitle" output.check 1620 | format.bvolume output 1621 | new.block 1622 | format.edition output 1623 | new.sentence 1624 | format.publisher.address output 1625 | format.date "year" output.check 1626 | format.btitle "title" output.check 1627 | add.semicolon 1628 | add.blank 1629 | format.pages "pages" output.check 1630 | new.block 1631 | format.note output 1632 | output.web.refs % urlbst 1633 | fin.entry 1634 | } 1635 | 1636 | FUNCTION {inbook} 1637 | { output.bibitem 1638 | format.authors "author" output.check 1639 | new.block 1640 | chapter "chapter" output.check 1641 | new.block 1642 | format.in.ed.title "title" output.check 1643 | format.bvolume output 1644 | format.edition output 1645 | new.sentence 1646 | format.number.series output 1647 | format.publisher.address output 1648 | format.date "year" output.check 1649 | date.block 1650 | add.blank 1651 | format.pages "pages" output.check 1652 | new.block 1653 | format.note output 1654 | output.web.refs % urlbst 1655 | fin.entry 1656 | } 1657 | 1658 | FUNCTION {incollection} 1659 | { output.bibitem 1660 | format.authors "author" output.check 1661 | new.block 1662 | format.title "title" output.check 1663 | new.block 1664 | format.in.ed.booktitle "booktitle" output.check 1665 | format.bvolume output 1666 | format.edition output 1667 | new.sentence 1668 | format.number.series output 1669 | format.publisher.address output 1670 | format.date "year" output.check 1671 | date.block 1672 | add.blank 1673 | format.pages "pages" output.check 1674 | new.block 1675 | format.note output 1676 | output.web.refs % urlbst 1677 | fin.entry 1678 | } 1679 | 1680 | FUNCTION {inproceedings} 1681 | { output.bibitem 1682 | format.authors "author" 
output.check 1683 | new.block 1684 | format.title "title" output.check 1685 | new.block 1686 | format.in.ed.booktitle "booktitle" output.check 1687 | format.bvolume output 1688 | new.sentence 1689 | format.number.series output 1690 | publisher empty$ 1691 | { format.organization.address output } 1692 | { organization "organization" bibinfo.check output 1693 | format.publisher.address output 1694 | } 1695 | if$ 1696 | format.date "year" output.check 1697 | date.block 1698 | add.blank 1699 | format.pages "pages" output.check 1700 | new.block 1701 | format.note output 1702 | output.web.refs % urlbst 1703 | fin.entry 1704 | } 1705 | 1706 | FUNCTION {conference} 1707 | {inproceedings} 1708 | 1709 | FUNCTION {manual} 1710 | {misc} 1711 | 1712 | FUNCTION {phdthesis} 1713 | { output.bibitem 1714 | format.authors "author" output.check 1715 | new.block 1716 | format.btitle 1717 | "title" output.check 1718 | format.type "type" output.check 1719 | new.block 1720 | school "school" bibinfo.warn output 1721 | address "address" bibinfo.check output 1722 | format.date "year" output.check 1723 | new.block 1724 | format.note output 1725 | output.web.refs % urlbst 1726 | fin.entry 1727 | } 1728 | 1729 | FUNCTION {mastersthesis} 1730 | {phdthesis} 1731 | 1732 | FUNCTION {proceedings} 1733 | { output.bibitem 1734 | editor empty$ 1735 | { organization "organization" bibinfo.check output 1736 | } 1737 | { format.editors output.nonnull } 1738 | if$ 1739 | new.block 1740 | format.btitle "title" output.check 1741 | format.bvolume output 1742 | editor empty$ 1743 | { publisher empty$ 1744 | 'skip$ 1745 | { 1746 | new.sentence 1747 | format.number.series output 1748 | format.publisher.address output 1749 | } 1750 | if$ 1751 | } 1752 | { publisher empty$ 1753 | { 1754 | new.sentence 1755 | format.organization.address output } 1756 | { 1757 | new.sentence 1758 | organization "organization" bibinfo.check output 1759 | format.publisher.address output 1760 | } 1761 | if$ 1762 | } 1763 | if$ 1764 | format.date "year" output.check 1765 | new.block 1766 | format.note output 1767 | output.web.refs % urlbst 1768 | fin.entry 1769 | } 1770 | 1771 | FUNCTION {techreport} 1772 | { output.bibitem 1773 | format.authors "author" output.check 1774 | new.block 1775 | format.title 1776 | "title" output.check 1777 | new.block 1778 | format.institution.address output 1779 | format.date "year" output.check 1780 | format.tr.number output.nonnull 1781 | new.block 1782 | format.note output 1783 | output.web.refs % urlbst 1784 | fin.entry 1785 | } 1786 | 1787 | FUNCTION {map} 1788 | { output.bibitem 1789 | format.cartographers "cartographer" output.check 1790 | new.block 1791 | format.title 1792 | "title" output.check 1793 | format.type "type" output.check 1794 | new.block 1795 | format.publisher.address output 1796 | format.date "year" output.check 1797 | new.block 1798 | format.note output 1799 | output.web.refs % urlbst 1800 | fin.entry 1801 | } 1802 | 1803 | FUNCTION {patent} 1804 | { output.bibitem 1805 | format.inventors "inventor" output.check 1806 | "; " * 1807 | no.blank.or.punct 1808 | format.assignees "assignee" output.check 1809 | new.block 1810 | format.title 1811 | "title" output.check 1812 | new.block 1813 | format.tr.number output.nonnull 1814 | format.date "year" output.check 1815 | new.block 1816 | format.note output 1817 | output.web.refs % urlbst 1818 | fin.entry 1819 | } 1820 | 1821 | FUNCTION {unpublished} 1822 | { output.bibitem 1823 | format.authors "author" output.check 1824 | new.block 1825 | format.title "title" 
output.check 1826 | format.date output 1827 | new.block 1828 | format.note "note" output.check 1829 | output.web.refs % urlbst 1830 | fin.entry 1831 | } 1832 | 1833 | FUNCTION {default.type} { misc } 1834 | READ 1835 | STRINGS { longest.label } 1836 | INTEGERS { number.label longest.label.width } 1837 | FUNCTION {initialize.longest.label} 1838 | { "" 'longest.label := 1839 | #1 'number.label := 1840 | #0 'longest.label.width := 1841 | } 1842 | FUNCTION {longest.label.pass} 1843 | { number.label int.to.str$ 'label := 1844 | number.label #1 + 'number.label := 1845 | label width$ longest.label.width > 1846 | { label 'longest.label := 1847 | label width$ 'longest.label.width := 1848 | } 1849 | 'skip$ 1850 | if$ 1851 | } 1852 | EXECUTE {initialize.longest.label} 1853 | ITERATE {longest.label.pass} 1854 | FUNCTION {begin.bib} 1855 | { preamble$ empty$ 1856 | 'skip$ 1857 | { preamble$ write$ newline$ } 1858 | if$ 1859 | "\begin{thebibliography}{" longest.label * "}" * 1860 | write$ newline$ 1861 | } 1862 | EXECUTE {begin.bib} 1863 | EXECUTE {init.config.constants} 1864 | EXECUTE {init.state.consts} 1865 | ITERATE {call.type$} 1866 | FUNCTION {end.bib} 1867 | { newline$ 1868 | "\end{thebibliography}" write$ newline$ 1869 | } 1870 | EXECUTE {end.bib} 1871 | %% End of customized bst file 1872 | %% 1873 | %% End of file `vancouver.bst'. 1874 | -------------------------------------------------------------------------------- /category-codes.ssv: -------------------------------------------------------------------------------- 1 | first;last;description 2 | A00;B99;Certain infectious and parasitic diseases 3 | C00;D49;Neoplasms 4 | D50;D89;Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism 5 | E00;E89;Endocrine, nutritional and metabolic diseases 6 | F01;F99;Mental, Behavioral and Neurodevelopmental disorders 7 | G00;G99;Diseases of the nervous system 8 | H00;H59;Diseases of the eye and adnexa 9 | H60;H95;Diseases of the ear and mastoid process 10 | I00;I99;Diseases of the circulatory system 11 | J00;J99;Diseases of the respiratory system 12 | K00;K95;Diseases of the digestive system 13 | L00;L99;Diseases of the skin and subcutaneous tissue 14 | M00;M99;Diseases of the musculoskeletal system and connective tissue 15 | N00;N99;Diseases of the genitourinary system 16 | O00;O9A;Pregnancy, childbirth and the puerperium 17 | P00;P96;Certain conditions originating in the perinatal period 18 | Q00;Q99;Congenital malformations, deformations and chromosomal abnormalities 19 | R00;R99;Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified 20 | S00;T88;Injury, poisoning and certain other consequences of external causes 21 | U00;U85;Codes for special purposes 22 | V00;Y99;External causes of morbidity 23 | Z00;Z99;Factors influencing health status and contact with health services 24 | -------------------------------------------------------------------------------- /comparators/REAME.md: -------------------------------------------------------------------------------- 1 | This directory contains the code used to compare the performance of the BioGPT model with ClinicalBERT, MedBERT, and two versions of PubMedBERT (pubmedbert-fulltext and pubmedbert-ms-marco). 2 | 3 | The following files are in the current directory. 4 | - `setup` creates the conda environment to run each of the models. 5 | - `short-code.R` creates the training data used to estimate the ICD-10 category (leading letter).
6 | - `alpha-char-embedding-model.R` contains the `dataset` object and the supervised models. 7 | 8 | Each model directory (clinicalbert, medbert, pubmedbert-fulltext, pubmedbert-ms-marco) contains: 9 | - `ref.txt` contains the URL the model was downloaded from on Hugging Face. 10 | - `0-make-embedding.R` creates the embedding data set. 11 | - `1-benchmark.R` performs the benchmark. 12 | -------------------------------------------------------------------------------- /comparators/alpha-char-embedding-model.R: -------------------------------------------------------------------------------- 1 | library(torch) 2 | library(dplyr) 3 | library(foreach) 4 | 5 | AlphaCharEmbedding = dataset( 6 | name = "AlphaCharEmbedding", 7 | initialize = function(x) { 8 | self$x = x 9 | self$contr = contr.treatment(sort(unique(x$ll)), contrasts = FALSE) 10 | }, 11 | width = function() { 12 | self$x$embed[[1]] |> length() 13 | }, 14 | .getitem = function(i) { 15 | list( 16 | x = torch_tensor(self$x$embed[[i]]), 17 | y = torch_tensor(self$contr[self$x$ll[i],]) 18 | ) 19 | }, 20 | .length = function() { 21 | nrow(self$x) 22 | } 23 | ) 24 | 25 | AlphaCodeEstimator = nn_module( 26 | initialize = function(layers) { 27 | self$feature_net = nn_module_list( 28 | foreach(i = seq_along(layers)[-1]) %do% { 29 | nn_linear(layers[i-1], layers[i]) 30 | } 31 | ) 32 | }, 33 | forward = function(x) { 34 | x = x$squeeze() 35 | for (i in seq_along(self$feature_net)) { 36 | x = self$feature_net[[i]](x) 37 | } 38 | nnf_softmax(x, dim = 1) 39 | } 40 | ) 41 | -------------------------------------------------------------------------------- /comparators/clinicalbert/0-make-embedding.R: -------------------------------------------------------------------------------- 1 | library(reticulate) 2 | library(readr) 3 | library(purrr) 4 | library(foreach) 5 | library(itertools) 6 | 7 | use_condaenv("icd-10-huggingface", required = TRUE) 8 | 9 | source("../short-code.R") 10 | 11 | transformers = import("transformers") 12 | tokenizer = transformers$AutoTokenizer$from_pretrained("medicalai/ClinicalBERT") 13 | torch = import("torch") 14 | np = import("numpy") 15 | 16 | builtins = import_builtins() 17 | builtins$setattr(torch$distributed, "is_initialized", py_eval("lambda : False")) 18 | model = transformers$AutoModel$from_pretrained("medicalai/ClinicalBERT") 19 | 20 | mean_pooling = function(model_output, attention_mask) { 21 | token_embeddings = model_output[[1]] #First element of model_output contains all token embeddings 22 | input_mask_expanded = attention_mask$unsqueeze(-1L)$expand(token_embeddings$size())$float() 23 | torch$sum(token_embeddings * input_mask_expanded, 1L) / torch$clamp(input_mask_expanded$sum(1L), min=1e-9) 24 | } 25 | 26 | # A function to embed a set of strings.
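# -- Editor's aside (not part of the original script): mean_pooling() above is
# a masked average over token positions, so padded tokens contribute nothing
# to the sentence embedding. A minimal base-R sketch with toy values:
toy_tok = rbind(c(1, 2), c(3, 4), c(5, 6))  # token embeddings: 3 tokens x 2 dims
toy_mask = c(1, 1, 0)                       # attention mask: third token is padding
colSums(toy_tok * toy_mask) / sum(toy_mask) # masked mean over real tokens -> c(2, 3)
# -- end aside; embed() below applies the same idea to real model output.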
27 | embed = function(strings) { 28 | encoded_input = tokenizer( 29 | strings, 30 | padding = TRUE, 31 | truncation = TRUE, 32 | max_length = 256L, 33 | return_tensors = 'pt' 34 | ) 35 | model_output = model( 36 | input_ids = encoded_input$input_ids, 37 | attention_mask = encoded_input$attention_mask 38 | ) 39 | mean_pooling(model_output, 40 | encoded_input$attention_mask)$detach()$cpu()$numpy() |> 41 | (\(x) {rownames(x) = strings; x})() 42 | } 43 | 44 | x = read_fwf( 45 | "../../icd-10-cm-codes/icd10cm_codes_2019.txt", 46 | fwf_cols(code = 8, desc = 150) 47 | ) 48 | 49 | embs = foreach(it = isplitVector(seq_len(nrow(x)), chunkSize = 1000), 50 | .combine = c) %do% { 51 | ret = embed(x$desc[it]) 52 | ret = map(seq_len(nrow(ret)), ~ list(ret[.x,,drop = FALSE])) 53 | message(tail(it, 1), " of ", nrow(x)) 54 | gc() 55 | ret 56 | } 57 | 58 | x$embed = unlist(embs, recursive = FALSE) 59 | x$ll = get_short_code(x$code) 60 | saveRDS(x, "x-with-embedding.rds") 61 | -------------------------------------------------------------------------------- /comparators/clinicalbert/1-benchmark.R: -------------------------------------------------------------------------------- 1 | library(luz) 2 | library(foreach) 3 | library(yardstick) 4 | library(tidyr) 5 | 6 | source("../alpha-char-embedding-model.R") 7 | 8 | x = readRDS("x-with-embedding.rds") 9 | 10 | traini = sample.int(nrow(x), round(0.9 * nrow(x))) 11 | testi = setdiff(seq_len(nrow(x)), traini) 12 | 13 | train = AlphaCharEmbedding(x[traini, ]) 14 | test = AlphaCharEmbedding(x[testi, ]) 15 | 16 | layers = c(train$width(), 100, 100, 21) 17 | 18 | batch_size = c(64, 128, 256) 19 | epochs = 30 20 | num_workers = 6 21 | 22 | loss = function(input, target) { 23 | torch_mean(-torch_sum(target * torch_log(input$squeeze() + 1e-16), 2)) 24 | } 25 | 26 | mss = foreach (bs = batch_size, .combine = bind_rows) %do% { 27 | luz_model = AlphaCodeEstimator |> 28 | setup( 29 | loss = loss, #nn_cross_entropy_loss(26), 30 | optimizer = optim_adam 31 | ) |> 32 | set_hparams(layers = layers) |> 33 | fit( 34 | data = dataloader( 35 | train, 36 | batch_size = bs, 37 | shuffle = TRUE, 38 | num_workers = num_workers, 39 | worker_packages = c("torch", "dplyr") 40 | ), 41 | epochs = epochs, 42 | valid_data = dataloader( 43 | test, 44 | batch_size = bs, 45 | shuffle = FALSE, 46 | num_workers = num_workers, 47 | worker_packages = c("torch", "dplyr") 48 | ), 49 | callbacks = list( 50 | luz_callback_keep_best_model() 51 | ) 52 | ) 53 | 54 | preds = 55 | predict( 56 | luz_model, 57 | dataloader( 58 | test, 59 | batch_size = bs, 60 | num_workers = num_workers, 61 | worker_packages = c("torch", "dplyr") 62 | ) 63 | ) 64 | 65 | comp = tibble( 66 | obs = x[testi,]$ll |> 67 | factor(levels = 1:21), 68 | pred = preds |> 69 | torch_tensor(device = "cpu") |> 70 | as.matrix() |> 71 | apply(1, which.max) |> 72 | factor(levels = 1:21) 73 | ) 74 | bind_cols( 75 | metric_set(accuracy, bal_accuracy)(comp, truth = obs, estimate = pred), 76 | batch_size = bs 77 | ) |> 78 | select(-.estimator) |> 79 | pivot_wider(names_from = .metric, values_from = .estimate) 80 | } 81 | print(mss) 82 | saveRDS(mss, "ms.rds") 83 | -------------------------------------------------------------------------------- /comparators/clinicalbert/ref.txt: -------------------------------------------------------------------------------- 1 | https://huggingface.co/medicalai/ClinicalBERT 2 | -------------------------------------------------------------------------------- /comparators/medbert/0-make-embedding.R: 
-------------------------------------------------------------------------------- 1 | library(reticulate) 2 | library(readr) 3 | library(purrr) 4 | library(foreach) 5 | library(itertools) 6 | 7 | use_condaenv("icd-10-huggingface", required = TRUE) 8 | 9 | source("../short-code.R") 10 | 11 | transformers = import("transformers") 12 | tokenizer = transformers$AutoTokenizer$from_pretrained("Charangan/MedBERT") 13 | torch = import("torch") 14 | np = import("numpy") 15 | 16 | builtins = import_builtins() 17 | builtins$setattr(torch$distributed, "is_initialized", py_eval("lambda : False")) 18 | model = transformers$AutoModel$from_pretrained("Charangan/MedBERT") 19 | 20 | # A function to embed a set of strings. 21 | embed = function(strings) { 22 | encoded_input = tokenizer( 23 | strings, 24 | padding = TRUE, 25 | truncation = TRUE, 26 | max_length = 256L, 27 | return_tensors = 'pt' 28 | ) 29 | ret = model( 30 | input_ids = encoded_input$input_ids, 31 | attention_mask = encoded_input$attention_mask 32 | )$pooler_output 33 | ret$detach()$cpu()$numpy() 34 | } 35 | 36 | x = read_fwf( 37 | "../../icd-10-cm-codes/icd10cm_codes_2019.txt", 38 | fwf_cols(code = 8, desc = 150) 39 | ) 40 | 41 | embs = foreach(it = isplitVector(seq_len(nrow(x)), chunkSize = 1000), 42 | .combine = c) %do% { 43 | ret = embed(x$desc[it]) 44 | ret = map(seq_len(nrow(ret)), ~ list(ret[.x,,drop = FALSE])) 45 | message(tail(it, 1), " of ", nrow(x)) 46 | gc() 47 | ret 48 | } 49 | 50 | x$embed = unlist(embs, recursive = FALSE) 51 | x$ll = get_short_code(x$code) 52 | saveRDS(x, "x-with-embedding.rds") 53 | -------------------------------------------------------------------------------- /comparators/medbert/1-benchmark.R: -------------------------------------------------------------------------------- 1 | library(luz) 2 | library(foreach) 3 | library(yardstick) 4 | library(tidyr) 5 | 6 | source("../alpha-char-embedding-model.R") 7 | 8 | x = readRDS("x-with-embedding.rds") 9 | 10 | traini = sample.int(nrow(x), round(0.9 * nrow(x))) 11 | testi = setdiff(seq_len(nrow(x)), traini) 12 | 13 | train = AlphaCharEmbedding(x[traini, ]) 14 | test = AlphaCharEmbedding(x[testi, ]) 15 | 16 | layers = c(train$width(), 100, 100, 21) 17 | 18 | batch_size = c(64, 128, 256) 19 | epochs = 30 20 | num_workers = 6 21 | 22 | loss = function(input, target) { 23 | torch_mean(-torch_sum(target * torch_log(input$squeeze() + 1e-16), 2)) 24 | } 25 | 26 | mss = foreach (bs = batch_size, .combine = bind_rows) %do% { 27 | luz_model = AlphaCodeEstimator |> 28 | setup( 29 | loss = loss, #nn_cross_entropy_loss(26), 30 | optimizer = optim_adam 31 | ) |> 32 | set_hparams(layers = layers) |> 33 | fit( 34 | data = dataloader( 35 | train, 36 | batch_size = bs, 37 | shuffle = TRUE, 38 | num_workers = num_workers, 39 | worker_packages = c("torch", "dplyr") 40 | ), 41 | epochs = epochs, 42 | valid_data = dataloader( 43 | test, 44 | batch_size = bs, 45 | shuffle = FALSE, 46 | num_workers = num_workers, 47 | worker_packages = c("torch", "dplyr") 48 | ), 49 | callbacks = list( 50 | luz_callback_keep_best_model() 51 | ) 52 | ) 53 | 54 | preds = 55 | predict( 56 | luz_model, 57 | dataloader( 58 | test, 59 | batch_size = bs, 60 | num_workers = num_workers, 61 | worker_packages = c("torch", "dplyr") 62 | ) 63 | ) 64 | 65 | comp = tibble( 66 | obs = x[testi,]$ll |> 67 | factor(levels = 1:21), 68 | pred = preds |> 69 | torch_tensor(device = "cpu") |> 70 | as.matrix() |> 71 | apply(1, which.max) |> 72 | factor(levels = 1:21) 73 | ) 74 | bind_cols( 75 | metric_set(accuracy,
bal_accuracy)(comp, truth = obs, estimate = pred), 76 | batch_size = bs 77 | ) |> 78 | select(-.estimator) |> 79 | pivot_wider(names_from = .metric, values_from = .estimate) 80 | } 81 | print(mss) 82 | saveRDS(mss, "ms.rds") 83 | -------------------------------------------------------------------------------- /comparators/medbert/ref.txt: -------------------------------------------------------------------------------- 1 | https://huggingface.co/Charangan/MedBERT 2 | -------------------------------------------------------------------------------- /comparators/pubmedbert-fulltext/0-make-embedding.R: -------------------------------------------------------------------------------- 1 | library(reticulate) 2 | library(readr) 3 | library(purrr) 4 | library(foreach) 5 | library(itertools) 6 | 7 | use_condaenv("icd-10-huggingface", required = TRUE) 8 | 9 | source("../short-code.R") 10 | 11 | transformers = import("transformers") 12 | torch = import("torch") 13 | np = import("numpy") 14 | tqdm = import("tqdm") 15 | 16 | builtins = import_builtins() 17 | builtins$setattr(torch$distributed, "is_initialized", py_eval("lambda : False")) 18 | 19 | tokenizer = transformers$AutoTokenizer$from_pretrained( 20 | 'cambridgeltl/SapBERT-from-PubMedBERT-fulltext' 21 | ) 22 | model = transformers$AutoModel$from_pretrained( 23 | 'cambridgeltl/SapBERT-from-PubMedBERT-fulltext' 24 | ) 25 | 26 | mean_pooling = function(model_output, attention_mask) { 27 | token_embeddings = model_output[[1]] #First element of model_output contains all token embeddings 28 | input_mask_expanded = attention_mask$unsqueeze(-1L)$expand(token_embeddings$size())$float() 29 | torch$sum(token_embeddings * input_mask_expanded, 1L) / torch$clamp(input_mask_expanded$sum(1L), min=1e-9) 30 | } 31 | 32 | # A function to embed a set of strings.
33 | embed = function(strings) { 34 | encoded_input = tokenizer( 35 | strings, 36 | padding = TRUE, 37 | truncation = TRUE, 38 | max_length = 256L, 39 | return_tensors = 'pt' 40 | ) 41 | model_output = model( 42 | input_ids = encoded_input$input_ids, 43 | attention_mask = encoded_input$attention_mask 44 | ) 45 | mean_pooling(model_output, 46 | encoded_input$attention_mask)$detach()$cpu()$numpy() |> 47 | (\(x) {rownames(x) = strings; x})() 48 | } 49 | 50 | x = read_fwf( 51 | "../../icd-10-cm-codes/icd10cm_codes_2019.txt", 52 | fwf_cols(code = 8, desc = 150) 53 | ) 54 | 55 | embs = foreach(it = isplitVector(seq_len(nrow(x)), chunkSize = 1000), 56 | .combine = c) %do% { 57 | ret = embed(x$desc[it]) 58 | ret = map(seq_len(nrow(ret)), ~ list(ret[.x,,drop = FALSE])) 59 | message(tail(it, 1), " of ", nrow(x)) 60 | gc() 61 | ret 62 | } 63 | 64 | x$embed = unlist(embs, recursive = FALSE) 65 | x$ll = get_short_code(x$code) 66 | saveRDS(x, "x-with-embedding.rds") 67 | -------------------------------------------------------------------------------- /comparators/pubmedbert-fulltext/1-benchmark.R: -------------------------------------------------------------------------------- 1 | library(luz) 2 | library(foreach) 3 | library(yardstick) 4 | library(tidyr) 5 | 6 | source("../alpha-char-embedding-model.R") 7 | 8 | x = readRDS("x-with-embedding.rds") 9 | 10 | traini = sample.int(nrow(x), round(0.9 * nrow(x))) 11 | testi = setdiff(seq_len(nrow(x)), traini) 12 | 13 | train = AlphaCharEmbedding(x[traini, ]) 14 | test = AlphaCharEmbedding(x[testi, ]) 15 | 16 | layers = c(train$width(), 100, 100, 21) 17 | 18 | batch_size = c(64, 128, 256) 19 | epochs = 30 20 | num_workers = 6 21 | 22 | loss = function(input, target) { 23 | torch_mean(-torch_sum(target * torch_log(input$squeeze() + 1e-16), 2)) 24 | } 25 | 26 | mss = foreach (bs = batch_size, .combine = bind_rows) %do% { 27 | luz_model = AlphaCodeEstimator |> 28 | setup( 29 | loss = loss, #nn_cross_entropy_loss(26), 30 | optimizer = optim_adam 31 | ) |> 32 | set_hparams(layers = layers) |> 33 | fit( 34 | data = dataloader( 35 | train, 36 | batch_size = bs, 37 | shuffle = TRUE, 38 | num_workers = num_workers, 39 | worker_packages = c("torch", "dplyr") 40 | ), 41 | epochs = epochs, 42 | valid_data = dataloader( 43 | test, 44 | batch_size = bs, 45 | shuffle = FALSE, 46 | num_workers = num_workers, 47 | worker_packages = c("torch", "dplyr") 48 | ), 49 | callbacks = list( 50 | luz_callback_keep_best_model() 51 | ) 52 | ) 53 | 54 | preds = 55 | predict( 56 | luz_model, 57 | dataloader( 58 | test, 59 | batch_size = bs, 60 | num_workers = num_workers, 61 | worker_packages = c("torch", "dplyr") 62 | ) 63 | ) 64 | 65 | comp = tibble( 66 | obs = x[testi,]$ll |> 67 | factor(levels = 1:21), 68 | pred = preds |> 69 | torch_tensor(device = "cpu") |> 70 | as.matrix() |> 71 | apply(1, which.max) |> 72 | factor(levels = 1:21) 73 | ) 74 | bind_cols( 75 | metric_set(accuracy, bal_accuracy)(comp, truth = obs, estimate = pred), 76 | batch_size = bs 77 | ) |> 78 | select(-.estimator) |> 79 | pivot_wider(names_from = .metric, values_from = .estimate) 80 | } 81 | print(mss) 82 | saveRDS(mss, "ms.rds") 83 | -------------------------------------------------------------------------------- /comparators/pubmedbert-fulltext/ref.txt: -------------------------------------------------------------------------------- 1 | https://huggingface.co/cambridgeltl/SapBERT-from-PubMedBERT-fulltext 2 | -------------------------------------------------------------------------------- 
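The loss used in every 1-benchmark.R above is a hand-rolled categorical cross-entropy over the 21 ICD-10 chapter classes, with the 1e-16 term guarding against log(0). A minimal sketch of what it computes, using toy values rather than anything from the scripts (requires the torch R package):

library(torch)
# Two toy softmax outputs (rows sum to 1) and their one-hot targets.
input = torch_tensor(rbind(c(0.7, 0.2, 0.1), c(0.1, 0.8, 0.1)))
target = torch_tensor(rbind(c(1, 0, 0), c(0, 1, 0)))
# Row-wise -sum(target * log(input)), averaged over rows:
torch_mean(-torch_sum(target * torch_log(input + 1e-16), dim = 2))
# -> (-log(0.7) - log(0.8)) / 2, roughly 0.29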
/comparators/pubmedbert-ms-marco/0-make-embedding.R: -------------------------------------------------------------------------------- 1 | library(reticulate) 2 | library(readr) 3 | library(purrr) 4 | library(foreach) 5 | library(itertools) 6 | 7 | use_condaenv("icd-10-huggingface", required = TRUE) 8 | 9 | source("../short-code.R") 10 | 11 | transformers = import("transformers") 12 | torch = import("torch") 13 | np = import("numpy") 14 | builtins = import_builtins() 15 | builtins$setattr(torch$distributed, "is_initialized", py_eval("lambda : False")) 16 | 17 | tokenizer = transformers$AutoTokenizer$from_pretrained( 18 | 'pritamdeka/S-PubMedBert-MS-MARCO' 19 | ) 20 | model = transformers$AutoModel$from_pretrained( 21 | 'pritamdeka/S-PubMedBert-MS-MARCO' 22 | ) 23 | 24 | mean_pooling = function(model_output, attention_mask) { 25 | token_embeddings = model_output[[1]] #First element of model_output contains all token embeddings 26 | input_mask_expanded = attention_mask$unsqueeze(-1L)$expand(token_embeddings$size())$float() 27 | torch$sum(token_embeddings * input_mask_expanded, 1L) / torch$clamp(input_mask_expanded$sum(1L), min=1e-9) 28 | } 29 | 30 | 31 | # A function to embed a set of strings. 32 | embed = function(strings) { 33 | encoded_input = tokenizer( 34 | strings, 35 | padding = TRUE, 36 | truncation = TRUE, 37 | max_length = 256L, 38 | return_tensors = 'pt' 39 | ) 40 | model_output = model( 41 | input_ids = encoded_input$input_ids, 42 | attention_mask = encoded_input$attention_mask 43 | ) 44 | mean_pooling(model_output, 45 | encoded_input$attention_mask)$detach()$cpu()$numpy() |> 46 | (\(x) {rownames(x) = strings; x})() 47 | } 48 | 49 | x = read_fwf( 50 | "../../icd-10-cm-codes/icd10cm_codes_2019.txt", 51 | fwf_cols(code = 8, desc = 150) 52 | ) 53 | 54 | embs = foreach(it = isplitVector(seq_len(nrow(x)), chunkSize = 1000), 55 | .combine = c) %do% { 56 | ret = embed(x$desc[it]) 57 | ret = map(seq_len(nrow(ret)), ~ list(ret[.x,,drop = FALSE])) 58 | message(tail(it, 1), " of ", nrow(x)) 59 | gc() 60 | ret 61 | } 62 | 63 | x$embed = unlist(embs, recursive = FALSE) 64 | x$ll = get_short_code(x$code) 65 | saveRDS(x, "x-with-embedding.rds") 66 | -------------------------------------------------------------------------------- /comparators/pubmedbert-ms-marco/1-benchmark.R: -------------------------------------------------------------------------------- 1 | library(luz) 2 | library(foreach) 3 | library(yardstick) 4 | library(tidyr) 5 | 6 | source("../alpha-char-embedding-model.R") 7 | 8 | x = readRDS("x-with-embedding.rds") 9 | 10 | traini = sample.int(nrow(x), round(0.9 * nrow(x))) 11 | testi = setdiff(seq_len(nrow(x)), traini) 12 | 13 | train = AlphaCharEmbedding(x[traini, ]) 14 | test = AlphaCharEmbedding(x[testi, ]) 15 | 16 | layers = c(train$width(), 100, 100, 21) 17 | 18 | batch_size = c(64, 128, 256) 19 | epochs = 30 20 | num_workers = 6 21 | 22 | loss = function(input, target) { 23 | torch_mean(-torch_sum(target * torch_log(input$squeeze() + 1e-16), 2)) 24 | } 25 | 26 | mss = foreach (bs = batch_size, .combine = bind_rows) %do% { 27 | luz_model = AlphaCodeEstimator |> 28 | setup( 29 | loss = loss, #nn_cross_entropy_loss(26), 30 | optimizer = optim_adam 31 | ) |> 32 | set_hparams(layers = layers) |> 33 | fit( 34 | data = dataloader( 35 | train, 36 | batch_size = bs, 37 | shuffle = TRUE, 38 | num_workers = num_workers, 39 | worker_packages = c("torch", "dplyr") 40 | ), 41 | epochs = epochs, 42 | valid_data = dataloader( 43 | test, 44 | batch_size = bs, 45 | shuffle = FALSE, 46 |
num_workers = num_workers, 47 | worker_packages = c("torch", "dplyr") 48 | ), 49 | callbacks = list( 50 | luz_callback_keep_best_model() 51 | ) 52 | ) 53 | 54 | preds = 55 | predict( 56 | luz_model, 57 | dataloader( 58 | test, 59 | batch_size = bs, 60 | num_workers = num_workers, 61 | worker_packages = c("torch", "dplyr") 62 | ) 63 | ) 64 | 65 | comp = tibble( 66 | obs = x[testi,]$ll |> 67 | factor(levels = 1:21), 68 | pred = preds |> 69 | torch_tensor(device = "cpu") |> 70 | as.matrix() |> 71 | apply(1, which.max) |> 72 | factor(levels = 1:21) 73 | ) 74 | bind_cols( 75 | metric_set(accuracy, bal_accuracy)(comp, truth = obs, estimate = pred), 76 | batch_size = bs 77 | ) |> 78 | select(-.estimator) |> 79 | pivot_wider(names_from = .metric, values_from = .estimate) 80 | } 81 | print(mss) 82 | saveRDS(mss, "ms.rds") 83 | -------------------------------------------------------------------------------- /comparators/pubmedbert-ms-marco/ref.txt: -------------------------------------------------------------------------------- 1 | https://huggingface.co/pritamdeka/S-PubMedBert-MS-MARCO 2 | -------------------------------------------------------------------------------- /comparators/setup: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | conda create -n icd-10-huggingface -c huggingface transformers pytorch numpy tqdm 4 | 5 | -------------------------------------------------------------------------------- /comparators/short-code.R: -------------------------------------------------------------------------------- 1 | library(purrr) 2 | 3 | ccc = c( 4 | "^[AB].*", 5 | "(^C|^D[0-4]).*", 6 | "^D[5-8].*", 7 | "^E[0-8][0-9].*", 8 | "^F.*", 9 | "^G.*", 10 | "^H[0-5][0-9].*", 11 | "^H[6-9][0-9].*", 12 | "^I.*", 13 | "^J.*", 14 | "^K.*", 15 | "^L.*", 16 | "^M.*", 17 | "^N.*", 18 | "^O[0-9].*", 19 | "^P.*", 20 | "^Q.*", 21 | "^R.*", 22 | "^[ST].*", 23 | "^[UVWXY].*", 24 | "^[Z].*" 25 | ) 26 | 27 | get_short_code_impl = function(code) { 28 | which(map_lgl(ccc, ~ grepl(.x, code))) 29 | } 30 | 31 | get_short_code = function(code) { 32 | map_int(code, get_short_code_impl) 33 | } 34 | -------------------------------------------------------------------------------- /embedding-data/.gitattributes: -------------------------------------------------------------------------------- 1 | *.csv filter=lfs diff=lfs merge=lfs -text 2 | *.gz filter=lfs diff=lfs merge=lfs -text 3 | -------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2019-0010.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:453921d25f49450a6a3547f9cb814f59a166ad3a58f4dd58cfe9dc3c5a86baef 3 | size 6751427 4 | -------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2019-0050.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:4f8fdf1ceff951bae456b5ff9978506893e3200bc76c0c5e606dda2648f268fc 3 | size 30557163 4 | -------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2019-0100.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2f3faedc2c3018756945254a40910d9a5b5e04f5ae12cf4b10b2bbd9e1f5a113 3 | size 60770182 4 | 
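As a quick check of the chapter mapping in comparators/short-code.R above: each regular expression sends a code's leading characters to one of 21 chapter indices. A minimal usage sketch (editor's illustration; toy codes chosen for clarity):

source("comparators/short-code.R")
# "A000" falls in chapter 1 (infectious), "C50911" in 2 (neoplasms),
# "Z9989" in 21 (factors influencing health status).
get_short_code(c("A000", "C50911", "Z9989"))
# [1]  1  2 21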
-------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2019-1000.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a5d367ca6af8b405bc04d40dca3abc2cf22a79bdecbe880e9e5884b0299d8cd5 3 | size 621858266 4 | -------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2020-0010.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:69a49cfa852a2b8a7ca6fcfa27e0bd796061419888f48cdc0347408671eb50d9 3 | size 6775185 4 | -------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2020-0050.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:13917a88d5c9b24222b31e8d955645c72734fa97110f6226cf6c385709fb1431 3 | size 30663679 4 | -------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2020-0100.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d16e54c76594574d5f87352afb91b7ff6fbc06d7962ef0ec57ca082a98e5251a 3 | size 60981659 4 | -------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2020-1000.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:94eb814d7433ee9626de75f8a0fcc49eabfe9495db43a9132f2ab2fd7eb4c742 3 | size 624037283 4 | -------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2021-0010.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:5a17516e78b7755895743a4c5b63f60003a3c66ea1a9fbd111b92bfb205c2b99 3 | size 6815772 4 | -------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2021-0050.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b24d905f03d427c69efbb05784d464e4c1b158b22be66463021c420c3bdc20fc 3 | size 30848223 4 | -------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2021-0100.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a8de1a232bd2d5e5adbedcfcefacbe750763ebb9e4d9c45a8cb4d74ccc20c525 3 | size 61347342 4 | -------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2021-1000.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c7f88839cc0f327834626ca3320243800886376652c557426ca54d9e59109ebd 3 | size 627764959 4 | -------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2022-0010.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:079dbccd38141ccc2660239ffd49b442671cbdf6c5b8acd66b8a371fad7b3841 3 | size 6829052 4 | 
-------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2022-0050.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:576e3e6ef00a0722d9ad39c1e7e2a438a97c8a7b3bc8a64fd85e2236e8c246fe 3 | size 30905849 4 | -------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2022-0100.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ec467c16aa628e554286e901bb0f9a479d425d8ecce8365f3bb58043b45b9ee3 3 | size 61460417 4 | -------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2022-1000.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:68f6aac04c85f0de9c87c91c81872acfe8d5502f62edfda127d8de74c8c790b5 3 | size 628925623 4 | -------------------------------------------------------------------------------- /figure/unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaneplusplus/icd-10-cm-embedding/a63d21c7d8f90419515bcfc2b0fce4281a6f1e62/figure/unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /icd-10-cm-embedding.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /icd10_dl.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaneplusplus/icd-10-cm-embedding/a63d21c7d8f90419515bcfc2b0fce4281a6f1e62/icd10_dl.rds -------------------------------------------------------------------------------- /make-biogpt-conda-env: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | conda create --name biogpt -c huggingface pytorch transformers 3 | -------------------------------------------------------------------------------- /make-download-rds.R: -------------------------------------------------------------------------------- 1 | library(tibble) 2 | library(tidyr) 3 | 4 | icd10_dl = expand_grid( 5 | tibble(year = 2019:2022), 6 | tibble(emb_dim = c(10, 50, 100, 1000)) 7 | ) 8 | 9 | icd10_dl$url = 10 | sprintf( 11 | "https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-%d-%04d.csv.gz?raw=true", 12 | icd10_dl$year, 13 | icd10_dl$emb_dim 14 | ) 15 | 16 | saveRDS(icd10_dl, "icd10_dl.rds") 17 | -------------------------------------------------------------------------------- /model-performance.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaneplusplus/icd-10-cm-embedding/a63d21c7d8f90419515bcfc2b0fce4281a6f1e62/model-performance.rds -------------------------------------------------------------------------------- /sup-model-perf.rds: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/kaneplusplus/icd-10-cm-embedding/a63d21c7d8f90419515bcfc2b0fce4281a6f1e62/sup-model-perf.rds -------------------------------------------------------------------------------- /year-validation.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaneplusplus/icd-10-cm-embedding/a63d21c7d8f90419515bcfc2b0fce4281a6f1e62/year-validation.rds --------------------------------------------------------------------------------
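A minimal end-to-end download sketch (editor's illustration; it assumes the icd10_dl.rds table built by make-download-rds.R above and network access to GitHub):

icd10_dl = readRDS("icd10_dl.rds")
# Pick the 10-dimensional 2019 embeddings and fetch the LFS-backed file.
url = icd10_dl$url[icd10_dl$year == 2019 & icd10_dl$emb_dim == 10]
download.file(url, destfile = "icd-10-cm-2019-0010.csv.gz", mode = "wb")
emb = readr::read_csv("icd-10-cm-2019-0010.csv.gz")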