├── .gitignore ├── 0-make-embeddings.R ├── 1-compress-icd-10-embeddings.R ├── 2-validation.R ├── 3-create-datasets.R ├── 4-estimate-leading-char.R ├── README.Rmd ├── README.md ├── README_files ├── figure-gfm │ └── unnamed-chunk-2-1.png └── figure-markdown_strict │ └── unnamed-chunk-2-1.png ├── alpha-char-model.R ├── autoencoder.R ├── bmc-bioinformatics-paper ├── bmc_article.bib ├── bmc_article.tex ├── bmcart-biblio.sty ├── bmcart.cls ├── tsne-plot.png └── vancouver.bst ├── category-codes.ssv ├── comparators ├── REAME.md ├── alpha-char-embedding-model.R ├── clinicalbert │ ├── 0-make-embedding.R │ ├── 1-benchmark.R │ └── ref.txt ├── medbert │ ├── 0-make-embedding.R │ ├── 1-benchmark.R │ └── ref.txt ├── pubmedbert-fulltext │ ├── 0-make-embedding.R │ ├── 1-benchmark.R │ └── ref.txt ├── pubmedbert-ms-marco │ ├── 0-make-embedding.R │ ├── 1-benchmark.R │ └── ref.txt ├── setup └── short-code.R ├── embedding-data ├── .gitattributes ├── icd-10-cm-2019-0010.csv.gz ├── icd-10-cm-2019-0050.csv.gz ├── icd-10-cm-2019-0100.csv.gz ├── icd-10-cm-2019-1000.csv.gz ├── icd-10-cm-2020-0010.csv.gz ├── icd-10-cm-2020-0050.csv.gz ├── icd-10-cm-2020-0100.csv.gz ├── icd-10-cm-2020-1000.csv.gz ├── icd-10-cm-2021-0010.csv.gz ├── icd-10-cm-2021-0050.csv.gz ├── icd-10-cm-2021-0100.csv.gz ├── icd-10-cm-2021-1000.csv.gz ├── icd-10-cm-2022-0010.csv.gz ├── icd-10-cm-2022-0050.csv.gz ├── icd-10-cm-2022-0100.csv.gz └── icd-10-cm-2022-1000.csv.gz ├── figure └── unnamed-chunk-2-1.png ├── icd-10-cm-codes ├── icd10cm_codes_2019.txt ├── icd10cm_codes_2020.txt ├── icd10cm_codes_2021.txt └── icd10cm_codes_2022.txt ├── icd-10-cm-embedding.Rproj ├── icd10_dl.rds ├── make-biogpt-conda-env ├── make-download-rds.R ├── model-performance.rds ├── sup-model-perf.rds └── year-validation.rds /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | -------------------------------------------------------------------------------- /0-make-embeddings.R: -------------------------------------------------------------------------------- 1 | # Create a directory called icd-10-embeddings and add BioGPT embedding 2 | # values to it. 3 | 4 | library(reticulate) 5 | library(dplyr) 6 | library(tidyr) 7 | library(purrr) 8 | library(foreach) 9 | library(itertools) 10 | library(readr) 11 | library(tibble) 12 | library(doMC) 13 | library(iterators) 14 | registerDoMC(cores = 2) 15 | 16 | # Use the conda environment created from make-biogpt-conda-env 17 | use_condaenv("biogpt") 18 | 19 | # Import the needed libraries. 20 | torch = import("torch") 21 | BioGptTokenizer = import("transformers")$BioGptTokenizer 22 | BioGptForCausalLM = import("transformers")$BioGptForCausalLM 23 | 24 | # Get the BioGPT tokenizer and model from Huggingface. 25 | tokenizer = BioGptTokenizer$from_pretrained("microsoft/biogpt") 26 | model = BioGptForCausalLM$from_pretrained("microsoft/biogpt") 27 | 28 | # A function to calculate the embedding location. 
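# Mean pooling averages the token-level vectors into a single fixed-length
# vector per description: each token's vector is weighted by the attention
# mask (so padding contributes nothing), summed over the sequence, and
# divided by the number of unmasked tokens. Because the pooling operates
# on the model's output logits, the embedding length equals the BioGPT
# vocabulary size (42,384, per the README) rather than the hidden size.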
29 | mean_pooling = function(model_output, attention_mask) { 30 | #First element of model_output contains all token embeddings 31 | token_embeddings = model_output[1] 32 | input_mask_expanded = attention_mask$unsqueeze(-1L)$expand(token_embeddings$logits$size())$float() 33 | sum_embeddings = torch$sum(torch$multiply(token_embeddings$logits, input_mask_expanded), 1L) 34 | sum_mask = torch$clamp(input_mask_expanded$sum(1L), min=1e-9) 35 | ret = purrr::reduce(sum_embeddings$div(sum_mask)$tolist(), rbind) 36 | rownames(ret) = as.character(seq_len(nrow(ret))) 37 | ret 38 | } 39 | 40 | # A function to embed a set of string. 41 | embed = function(strings, max_len = 256) { 42 | encoded_input = tokenizer( 43 | strings, 44 | padding = TRUE, 45 | truncation = TRUE, 46 | max_length = max_len, 47 | return_tensors = 'pt' 48 | ) 49 | model_output = model( 50 | input_ids = encoded_input$input_ids, 51 | attention_mask = encoded_input$attention_mask 52 | ) 53 | mean_pooling(model_output, encoded_input$attention_mask) |> 54 | (\(x) {rownames(x) = strings; x})() 55 | } 56 | 57 | create_embeddings = function(icd10, dir_name) { 58 | dir.create(dir_name) 59 | 60 | foreach(it = isplitVector(seq_len(nrow(icd10)), chunkSize = 500)) %do% { 61 | icds = icd10[it,] 62 | emb = embed(icds$desc) 63 | icds$emb = map(seq_len(nrow(emb)), ~ emb[.x,]) 64 | walk( 65 | seq_len(nrow(icds)), 66 | ~ saveRDS(icds[.x,], sprintf("%s/%s.rds", dir_name, icds$code[.x])) 67 | ) 68 | print(it[length(it)]) 69 | NULL 70 | } |> unlist() |> invisible() 71 | } 72 | 73 | # The directory where the embeddings will go, by year. 74 | dir.create("icd-10-cm-embeddings") 75 | 76 | # Write the embeddings to their respective years. 77 | 78 | for (year in 2019:2022) { 79 | print(year) 80 | icd10 = sprintf("icd-10-cm-codes/icd10cm_codes_%s.txt", year) |> 81 | read_fwf(fwf_cols(code = 8, desc = 1000)) 82 | 83 | write_dir = file.path("icd-10-cm-embeddings", year) 84 | dir.create(write_dir) 85 | 86 | # Write the code, description, and embedding to a file with one file 87 | # per code. 88 | foreach(it = isplitVector(seq_len(nrow(icd10)), chunkSize = 200)) %do% { 89 | icds = icd10[it,] 90 | emb = embed(icds$desc) 91 | icds$emb = map(seq_len(nrow(emb)), ~ emb[.x,]) 92 | walk( 93 | seq_len(nrow(icds)), 94 | ~ saveRDS(icds[.x,], sprintf("%s/%s.rds", write_dir, icds$code[.x])) 95 | ) 96 | gc() 97 | NULL 98 | } 99 | } 100 | 101 | -------------------------------------------------------------------------------- /1-compress-icd-10-embeddings.R: -------------------------------------------------------------------------------- 1 | library(luz) 2 | library(tidyr) 3 | 4 | source("autoencoder.R") 5 | 6 | embedding_dir = file.path("icd-10-cm-embeddings", "2019") 7 | 8 | fn = file.path(embedding_dir, dir(embedding_dir)) 9 | set.seed(123) 10 | train = sample.int(length(fn), round(0.9 * length(fn))) 11 | test = setdiff(seq_len(length(fn)), train) 12 | 13 | icd10_emb_train = ICD10Embedding(fn[train]) 14 | icd10_emb_test = ICD10Embedding(fn[test]) 15 | 16 | emb_len = icd10_emb_train[1]$x$shape[1] 17 | 18 | # Create the set of parameters we will create autoencoders over. 
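# Four symmetric layer stacks (bottleneck widths of 1000, 100, 50, and
# 10) crossed with three batch sizes give the 12 candidate models fit
# below; the bottleneck width is the compressed embedding dimension.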
19 | 20 | params = expand_grid( 21 | model_layers = list( 22 | c(emb_len, 1000, emb_len), 23 | c(emb_len, 1000, 100, 1000, emb_len), 24 | c(emb_len, 1000, 100, 50, 100, 1000, emb_len), 25 | c(emb_len, 1000, 100, 50, 10, 50, 100, 1000, emb_len)), 26 | batch_size = c(64, 128, 256) 27 | ) 28 | 29 | params$model_name = as.character(seq_len(nrow(params))) 30 | 31 | make_model = function(model_layers, batch_size, model_name, epochs = 30) { 32 | 33 | ret_luz = ICD10AutoEncoder |> 34 | setup( 35 | loss = nn_mse_loss(), 36 | optimizer = optim_adam 37 | # metrics = list( 38 | # luz_metric_mse(), 39 | # luz_metric_data_variation() 40 | # ) 41 | ) |> 42 | set_hparams(layers = model_layers) |> 43 | fit( 44 | data = dataloader( 45 | icd10_emb_train, 46 | batch_size = batch_size, 47 | shuffle = TRUE, 48 | num_workers = 4, 49 | worker_packages = "torch" 50 | ), 51 | epochs = epochs, 52 | valid_data = dataloader( 53 | icd10_emb_test, 54 | batch_size = batch_size, 55 | shuffle = TRUE, 56 | num_workers = 4, 57 | worker_packages = "torch" 58 | ), 59 | callbacks = 60 | list( 61 | luz_callback_keep_best_model() 62 | ) 63 | ) 64 | ret_luz 65 | } 66 | 67 | # The parameters and the models 68 | md = params 69 | 70 | 71 | md$model = map( 72 | seq_len(nrow(params)), 73 | ~ make_model( 74 | params$model_layers[[.x]], 75 | params$batch_size[.x], 76 | params$model_name[.x] 77 | ) 78 | ) 79 | 80 | md$embedding_dim = rep(c(1000, 100, 50, 10), each = 3) 81 | 82 | # Best valid loss index 83 | bvli = map_int(md$model, ~ which.min(unlist(.x$records$metrics$valid))) 84 | 85 | md$best_valid_loss = 86 | map_dbl( 87 | seq_along(md$model), 88 | ~ unlist(md$model[[.x]]$records$metrics$valid[bvli[.x]]) 89 | ) 90 | 91 | md$best_train_loss = 92 | map_dbl( 93 | seq_along(md$model), 94 | ~ unlist(md$model[[.x]]$records$metrics$train[bvli[.x]]) 95 | ) 96 | 97 | 98 | # Save the luz_models 99 | luz_model_dir = "luz-models" 100 | dir.create(luz_model_dir) 101 | md$model_path = NA 102 | for (i in seq_len(nrow(md))) { 103 | model_path = file.path(luz_model_dir, sprintf("luz-model-%02d.luz", i)) 104 | luz_save( 105 | md$model[[i]], 106 | model_path 107 | ) 108 | md$model_path = model_path 109 | } 110 | 111 | md = md |> arrange(best_valid_loss, best_train_loss) 112 | 113 | mdo = md |> 114 | select(embedding_dim, batch_size, starts_with("best")) 115 | 116 | saveRDS(mdo, "model-performance.rds") 117 | 118 | dir.create("autoencoder-models") 119 | 120 | torch_save( 121 | (md |> filter(embedding_dim == 10))$model[[1]]$model, 122 | file.path("autoencoder-models", "icd10cm-0010.pt") 123 | ) 124 | 125 | torch_save( 126 | (md |> filter(embedding_dim == 50))$model[[1]]$model, 127 | file.path("autoencoder-models", "icd10cm-0050.pt") 128 | ) 129 | 130 | torch_save( 131 | (md |> filter(embedding_dim == 100))$model[[1]]$model, 132 | file.path("autoencoder-models", "icd10cm-0100.pt") 133 | ) 134 | 135 | torch_save( 136 | (md |> filter(embedding_dim == 1000))$model[[1]]$model, 137 | file.path("autoencoder-models", "icd10cm-1000.pt") 138 | ) 139 | 140 | -------------------------------------------------------------------------------- /2-validation.R: -------------------------------------------------------------------------------- 1 | # Validate the selected models using ICD 10 codes from other years. 
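# For each autoencoder/year pair, we compute the mean squared
# reconstruction error and the mean per-observation variance of the
# embedding values; downstream, their ratio ("cod") is reported -- the
# fraction of embedding variance the autoencoder fails to reproduce,
# so smaller values are better.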
2 | 3 | library(torch) 4 | library(purrr) 5 | library(stringr) 6 | library(readr) 7 | library(tibble) 8 | library(tidyr) 9 | library(progress) 10 | 11 | source("autoencoder.R") 12 | 13 | default_device = "cpu" 14 | if (backends_cudnn_is_available()) { 15 | default_device = "cuda" 16 | } else if (backends_mps_is_available()) { 17 | default_device = "mps" 18 | } else if (backends_mkldnn_is_available()) { 19 | default_device = "mkldnn" 20 | } else if (backends_openmp_is_available()) { 21 | default_device = "openmp" 22 | } else if (backends_mkl_is_available()) { 23 | default_device = "mkl" 24 | } 25 | 26 | ae_model_paths = "autoencoder-models" |> 27 | (\(x) file.path(x, dir(x)))() 28 | 29 | icd10_embedding_paths = file.path("icd-10-cm-embeddings", 2019:2022) |> 30 | map( ~ file.path(.x, dir(.x))) 31 | 32 | xs = tibble( 33 | embed = map(icd10_embedding_paths, ICD10Embedding), 34 | year = 2019:2022 35 | ) 36 | 37 | vds = tibble( 38 | model = map(ae_model_paths, torch_load), 39 | embedding_dim = str_extract(ae_model_paths, "\\d{4}") |> as.integer() 40 | ) 41 | 42 | pred_error = function(d, m, device = default_device) { 43 | m = m$to(device = device) 44 | ret = c() 45 | dl = dataloader(d, batch_size = 100, num_workers = 5) 46 | pb = progress_bar$new( 47 | format = "[:bar] :percent eta: :eta", 48 | total = length(dl) 49 | ) 50 | loop(for (b in dl) { 51 | xt = b$x$to(device = device) 52 | r = torch_mean((xt - m(xt))^2, 2)$to(device = "cpu") |> 53 | as.numeric() 54 | pb$tick() 55 | ret = c(ret, r) 56 | }) 57 | mean(ret) 58 | } 59 | 60 | variation = function(d, device = default_device) { 61 | ret = c() 62 | dl = dataloader(d, batch_size = 100, num_workers = 5) 63 | pb = progress_bar$new( 64 | format = "[:bar] :percent eta: :eta", 65 | total = length(dl) 66 | ) 67 | loop(for (b in dl) { 68 | xt = b$x$to(device = device) 69 | r = torch_var(xt, 2)$to(device = "cpu") |> as.numeric() 70 | pb$tick() 71 | ret = c(ret, r) 72 | }) 73 | mean(ret) 74 | } 75 | 76 | x = expand_grid(vds, xs) 77 | 78 | x$pred_error = 79 | map_dbl( 80 | seq_len(nrow(x)), 81 | ~ {print(.x); pred_error(x$embed[[.x]], x$model[[.x]])}) 82 | 83 | x$variation = 84 | map_dbl( 85 | seq_len(nrow(x)), 86 | ~ {print(.x); variation(x$embed[[.x]])}) 87 | 88 | saveRDS(x, "year-validation-raw.rds") 89 | 90 | x |> 91 | arrange(year) |> 92 | select(-model, -embed) |> 93 | mutate(cod = pred_error / variation) |> 94 | select(year, embedding_dim, pred_error, cod) |> 95 | saveRDS("year-validation.rds") 96 | -------------------------------------------------------------------------------- /3-create-datasets.R: -------------------------------------------------------------------------------- 1 | # Write the datasets. 
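# For every year and bottleneck size, each description's full BioGPT
# embedding is pushed through the compressing half of the corresponding
# autoencoder, and the code, description, and compressed values are
# written to a gzipped CSV in embedding-data/. A final loop also gathers
# the uncompressed embeddings into one CSV per year.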
2 | 3 | library(torch) 4 | library(purrr) 5 | library(stringr) 6 | library(readr) 7 | library(tibble) 8 | library(tidyr) 9 | library(progress) 10 | library(dplyr) 11 | library(itertools) 12 | library(foreach) 13 | #library(doMC) 14 | #registerDoMC(cores = 2) 15 | registerDoSEQ() 16 | 17 | source("autoencoder.R") 18 | 19 | default_device = "cpu" 20 | if (backends_cudnn_is_available()) { 21 | default_device = "cuda" 22 | } else if (backends_mps_is_available()) { 23 | default_device = "mps" 24 | } else if (backends_mkldnn_is_available()) { 25 | default_device = "mkldnn" 26 | } else if (backends_openmp_is_available()) { 27 | default_device = "openmp" 28 | } else if (backends_mkl_is_available()) { 29 | default_device = "mkl" 30 | } 31 | 32 | 33 | ae_model_paths = "autoencoder-models" |> 34 | (\(x) file.path(x, dir(x)))() 35 | 36 | icd10_embedding_paths = file.path("icd-10-cm-embeddings", 2019:2022) |> 37 | map( ~ file.path(.x, dir(.x))) 38 | 39 | icd10_code_paths = 2019:2022 |> 40 | map_chr(~file.path("icd-10-cm-codes", sprintf("icd10cm_codes_%s.txt", .x))) 41 | 42 | xs = tibble( 43 | embed = map(icd10_embedding_paths, ICD10Embedding), 44 | year = 2019:2022, 45 | codes = map(icd10_code_paths, ~read_fwf(.x, fwf_cols(code = 8, desc = 150))) 46 | ) 47 | 48 | vds = tibble( 49 | model = map(ae_model_paths, torch_load), 50 | embedding_dim = str_extract(ae_model_paths, "\\d{4}") |> as.integer() 51 | ) 52 | 53 | get_embedding = function(d, m, device = default_device) { 54 | m = m$to(device = device) 55 | ret = c() 56 | dl = dataloader(d, batch_size = 100, num_workers = 6) 57 | pb = progress_bar$new( format = "[:bar] :percent eta: :eta", 58 | total = length(dl) 59 | ) 60 | loop(for (b in dl) { 61 | xt = b$x$to(device = device) 62 | for (i in seq_len(length(m$decoder) / 2)) { 63 | x = xt 64 | xt = m$decoder[[i]](x) 65 | } 66 | pb$tick() 67 | gc() 68 | ret = rbind(ret, as.matrix(xt$to(device = "cpu"))) 69 | }) 70 | ret = as_tibble(as.data.frame(ret)) 71 | ret 72 | } 73 | 74 | xd = expand_grid(xs, vds) 75 | xd$embedding = map( 76 | seq_len(nrow(xd)), 77 | ~ {print(.x); get_embedding(xd$embed[[.x]], xd$model[[.x]])} 78 | ) 79 | 80 | dir.create("embedding-data") 81 | 82 | for (i in seq_len(nrow(xd))) { 83 | d = bind_cols(xd$codes[[i]], xd$embedding[[i]]) 84 | write_csv( 85 | d, 86 | file.path( 87 | "embedding-data", 88 | sprintf("icd-10-cm-%s-%04d.csv.gz", xd$year[i], xd$embedding_dim[i]) 89 | ) 90 | ) 91 | gc() 92 | } 93 | 94 | for (year in 2019:2022) { 95 | fns = file.path("icd-10-cm-embeddings", year) |> 96 | (\(x) file.path(x, dir(x)))() 97 | 98 | dfs = foreach(it = isplitVector(fns, chunkSize = 1000), 99 | .combine = bind_rows, 100 | .inorder = FALSE, .errorhandling = "remove", 101 | .multicombine = TRUE) %do% { 102 | print(tail(it, 1)) 103 | df = foreach(fn = it, .combine = bind_rows, 104 | .errorhandling = "remove", .multicombine = TRUE) %dopar% { 105 | ret = readRDS(fn) 106 | ret = 107 | bind_cols( 108 | ret[,1:2], 109 | ret$emb[[1]] |> t() |> as.data.frame() 110 | ) 111 | gc() 112 | ret 113 | } 114 | gc() 115 | print(nrow(df)) 116 | df 117 | } 118 | write_csv(dfs, sprintf("embedding-data/icd-10-cm-%s-full.csv", year)) 119 | } 120 | 121 | 122 | -------------------------------------------------------------------------------- /4-estimate-leading-char.R: -------------------------------------------------------------------------------- 1 | library(luz) 2 | library(yardstick) 3 | library(tibble) 4 | library(stringr) 5 | 6 | source("alpha-char-model.R") 7 | 8 | ccc = c( 9 | "^[AB].*", 10 | "(^C|^D[0-4]).*", 
11 | "^D[5-8].*", 12 | "^E[0-8][0-9].*", 13 | "^F.*", 14 | "^G.*", 15 | "^H[0-5][0-9].*", 16 | "^H[6-9][0-9].*", 17 | "^I.*", 18 | "^J.*", 19 | "^K.*", 20 | "^L.*", 21 | "^M.*", 22 | "^N.*", 23 | "^O[0-9].*", 24 | "^P.*", 25 | "^Q.*", 26 | "^R.*", 27 | "^[ST].*", 28 | "^[UVWXY].*", 29 | "^[Z].*" 30 | ) 31 | 32 | get_short_code_impl = function(code) { 33 | which(map_lgl(ccc, ~ grepl(.x, code))) 34 | } 35 | 36 | get_short_code = function(code) { 37 | map_int(code, get_short_code_impl) 38 | } 39 | 40 | emb_data_dir = "embedding-data" 41 | 42 | params = tibble( 43 | embedding_files = 44 | file.path(emb_data_dir, dir(emb_data_dir) |> str_subset("2019")), 45 | emb_dim = 46 | str_extract(embedding_files, "-\\d{4}\\.") |> str_extract("\\d{4}") 47 | ) 48 | 49 | dir.create("luz-supervised-models") 50 | 51 | ms = list() 52 | 53 | for (i in seq_len(nrow(params))) { 54 | 55 | aced = params$embedding_files[i]|> read_csv() 56 | 57 | traini = sample.int(nrow(aced), round(0.9 * nrow(aced))) 58 | testi = setdiff(seq_len(nrow(aced)), traini) 59 | 60 | aced$code = get_short_code(aced$code) 61 | 62 | train = AlphaCharEmbedding(aced[traini, ], sort(unique(aced$code))) 63 | test = AlphaCharEmbedding(aced[testi, ], sort(unique(aced$code))) 64 | 65 | layers = c(train$width(), 100, 100, 21) 66 | batch_size = 64 67 | epochs = 30 68 | 69 | # Cross entropy 70 | loss = function(input, target) { 71 | torch_mean(-torch_sum(target * torch_log(input + 1e-16), 2)) 72 | } 73 | 74 | luz_model = AlphaCodeEstimator |> 75 | setup( 76 | loss = loss, #nn_cross_entropy_loss(26), 77 | optimizer = optim_adam 78 | ) |> 79 | set_hparams(layers = layers) |> 80 | fit( 81 | data = dataloader( 82 | train, 83 | batch_size = batch_size, 84 | shuffle = TRUE, 85 | num_workers = 4, 86 | worker_packages = c("torch", "dplyr") 87 | ), 88 | epochs = epochs, 89 | valid_data = dataloader( 90 | test, 91 | batch_size = batch_size, 92 | shuffle = TRUE, 93 | num_workers = 4, 94 | worker_packages = c("torch", "dplyr") 95 | ), 96 | callbacks = list( 97 | luz_callback_keep_best_model() 98 | ) 99 | ) 100 | 101 | luz_save( 102 | luz_model, 103 | file.path("luz-supervised-models", 104 | sprintf("luz-model-%s.pt", params$emb_dim[i])) 105 | ) 106 | 107 | preds = 108 | predict( 109 | luz_model, 110 | dataloader( 111 | test, 112 | batch_size = batch_size, 113 | num_workers = 4, 114 | worker_packages = c("torch", "dplyr") 115 | ) 116 | ) 117 | 118 | comp = tibble( 119 | obs = aced[testi,]$code |> 120 | factor(levels = 1:21), 121 | pred = preds |> 122 | torch_tensor(device = "cpu") |> 123 | as.matrix() |> 124 | apply(1, which.max) |> 125 | factor(levels = 1:21) 126 | ) 127 | 128 | ms = c(ms, 129 | list( 130 | metric_set(accuracy, bal_accuracy)(comp, truth = obs, estimate = pred) 131 | ) 132 | ) 133 | print(ms) 134 | } 135 | 136 | params$accuracy = c(ms[[3]][1], ms[[6]][1], ms[[9]][1], ms[[12]][1]) 137 | params$bal_accuracy = c(ms[[3]][2], ms[[6]][2], ms[[9]][2], ms[[12]][2]) 138 | 139 | saveRDS(params |> select(-embedding_files), "sup-model-perf.rds") 140 | 141 | 142 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: md_document 3 | --- 4 | 5 | 6 | 7 | ```{r, include = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>" 11 | ) 12 | ``` 13 | 14 | # Compressed, Large-Language-Model Embedded Datasets of ICD-10-CM Descriptions 15 | 16 | ## Citing this work 17 | 18 | ``` 19 | @article{kane2023llm 20 | 
author = {Michael J. Kane and Casey King and Denise Esserman and Nancy K. Latham and Erich J. Greene and David A. Ganz},
21 |   title = {A Compressed Large Language Model Embedding Dataset of ICD 10 CM Descriptions},
22 |   elocation-id = {2023.04.24.23289046},
23 |   year = {2023},
24 |   doi = {10.1101/2023.04.24.23289046},
25 |   publisher = {Cold Spring Harbor Laboratory Press},
26 |   URL = {https://www.medrxiv.org/content/early/2023/05/15/2023.04.24.23289046.1},
27 |   eprint = {https://www.medrxiv.org/content/early/2023/05/15/2023.04.24.23289046.1.full.pdf},
28 |   journal = {medRxiv}
29 | }
30 | ```
31 | 
32 | ## License
33 | 
34 | The code in this repository is licensed under [GPL v2](https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html) and the data
35 | are licensed under [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/).
36 | 
37 | ## Funding
38 | 
39 | This work was supported by the National Institute on Aging of the National Institutes of Health (NIH) through a project grant to Yale University (1R01AG071528). The organizations funding this study had no role in the design or conduct of the study; in the collection, management, analysis, or interpretation of the data; or in the preparation, review, or approval of the manuscript. The content of this publication is solely the responsibility of the authors and does not necessarily represent the official views of the National Institutes of Health, the Department of Veterans Affairs, or the United States government.
40 | 
41 | ## ICD-10-CM Datasets
42 | 
43 | ### 2022
44 | 
45 | 1. [ICD-10-CM, 10-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2022-0010.csv.gz?raw=true)
46 | 1. [ICD-10-CM, 50-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2022-0050.csv.gz?raw=true)
47 | 1. [ICD-10-CM, 100-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2022-0100.csv.gz?raw=true)
48 | 1. [ICD-10-CM, 1000-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2022-1000.csv.gz?raw=true)
50 | 1. [ICD-10-CM, 42,384-dimensions (not compressed)](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2022-full.csv.gz?raw=true)
51 | 
52 | ### 2021
53 | 
54 | 1. [ICD-10-CM, 10-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2021-0010.csv.gz?raw=true)
55 | 1. [ICD-10-CM, 50-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2021-0050.csv.gz?raw=true)
56 | 1. [ICD-10-CM, 100-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2021-0100.csv.gz?raw=true)
57 | 1. [ICD-10-CM, 1000-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2021-1000.csv.gz?raw=true)
58 | 1. [ICD-10-CM, 42,384-dimensions (not compressed)](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2021-full.csv.gz?raw=true)
59 | 
60 | ### 2020
61 | 
62 | 1. [ICD-10-CM, 10-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2020-0010.csv.gz?raw=true)
63 | 1. [ICD-10-CM, 50-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2020-0050.csv.gz?raw=true)
64 | 1. [ICD-10-CM, 100-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2020-0100.csv.gz?raw=true)
65 | 1. [ICD-10-CM, 1000-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2020-1000.csv.gz?raw=true)
66 | 
67 | ### 2019
68 | 
69 | 1. [ICD-10-CM, 10-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2019-0010.csv.gz?raw=true)
70 | 1. [ICD-10-CM, 50-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2019-0050.csv.gz?raw=true)
71 | 1. [ICD-10-CM, 100-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2019-0100.csv.gz?raw=true)
72 | 1. [ICD-10-CM, 1000-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2019-1000.csv.gz?raw=true)
73 | 1. [ICD-10-CM, 42,384-dimensions (not compressed)](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2019-full.csv.gz?raw=true)
74 | 
75 | ## Overview
76 | 
77 | The International Classification of Diseases, 10th Revision, Clinical Modification ([ICD-10-CM](https://www.cdc.gov/nchs/icd/icd-10-cm.htm)) is a standardized classification system used for diagnosing diseases, disorders, and health conditions. It plays a crucial role in analyzing electronic medical records (EMRs) or electronic health records (EHRs). However, the high dimensionality of ICD-10-CM codes and their hierarchical structure make their incorporation into statistical and machine learning analyses challenging. Traditional contrast encoding methods like one-hot and treatment coding may not fully capture the hierarchical information of the codes. Large language models (LLMs) generate contextualized embeddings that capture the semantic relationships between codes more effectively. This repository provides data sets of ICD-10-CM codes mapped to embeddings generated using the [BioGPT Large Language Model](https://academic.oup.com/bib/article/23/6/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9&login=false). The embeddings provide informative input features for machine learning models, and dimension-reduced versions in 1,000, 100, 50, and 10 dimensions are provided. Validation for both the dimension reduction and the representation of the embeddings is shown below. The readily available datasets are anticipated to be highly valuable for researchers incorporating ICD-10-CM codes into their analyses, retaining contextual information, and enabling more advanced analyses in the field.
78 | 
79 | The data sets and the code used to generate them are available at https://github.com/kaneplusplus/icd-10-cm-embedding. The data are licensed under the [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode). The code is
80 | licensed under [GPL-v2](https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html).
81 | 
82 | ## Model description and performance
83 | 
84 | The data provided are generated by embedding the ICD-10-CM descriptions using the BioGPT-Large model, which tokenizes each textual phrase and maps the tokens to unique vocabulary IDs, resulting in a sequence of continuous embedding vectors.
The embeddings are then contextualized by passing them through the model's layers with an attention mask. The resulting embeddings lie in a 42,384-dimensional space and are compressed using an autoencoder with fully connected layers of decreasing and then increasing sizes up to the output layer. The autoencoder structure is the same for models with larger dimensions, with only the appropriate layers retained.
85 | 
86 | ### Validating the dimension reduction
87 | 
88 | ```{r model_perf, message = FALSE, warning = FALSE, echo = FALSE, fig.cap = "The autoencoder performance diagnostics ordered by increasing validation loss."}
89 | library(dplyr)
90 | x = readRDS("model-performance.rds")
91 | x = x |>
92 |   mutate(best_valid_loss = round(best_valid_loss, 3),
93 |          best_train_loss = round(best_train_loss, 3)) |>
94 |   select(embedding_dim, batch_size, best_train_loss, best_valid_loss)
95 | names(x) = c("Embedding Dimension", "Batch Size", "Training Loss", "Validation Loss")
96 | knitr::kable(x, caption = "The autoencoder parameters and performance ordered by increasing validation loss.")
97 | ```
98 | 
99 | The autoencoder compressing the LLM embedding was fit on the 2019 ICD-10-CM descriptions
100 | for 30 epochs with batch sizes of 64, 128, and 256; a mean-square error loss between
101 | the embedding and the autoencoder estimate; and a validation data set comprising
102 | a random subset of 10\% of the samples. The model performance is shown above.
103 | Based on these results, the models with the best validation loss were selected for distribution.
104 | 
105 | ```{r autoencoder_perf, echo = FALSE, fig.cap = "The autoencoder year-validation diagnostics ordered by year."}
106 | x = readRDS("year-validation.rds")
107 | x = x |>
108 |   mutate(pred_error = round(pred_error, 3),
109 |          cod = round(cod, 3))
110 | names(x) = c("Year", "Embedding Dimension", "MSE", "Coef. of Determination")
111 | knitr::kable(x, caption = "The autoencoder year validation performance ordered by year.")
112 | ```
113 | 
114 | In addition to the 2019 validation, the models selected for distribution were
115 | tested on the 2020-2022 data sets to ensure their performance is comparable
116 | across years. It should be noted that the ICD-10-CM codes do not vary much from
117 | one year to the next,
118 | so we should not expect large differences. As expected, the mean square error
119 | and coefficients of determination are similar to the 2019 data.
120 | 
121 | ### Validating the embedding representation
122 | 
123 | To validate the compressed embeddings, the hierarchical information in the ICD-10-CM codes was used to ensure that relevant relationships were preserved. The leading letter and first two numeric values categorize codes, allowing a supervised model to estimate the categories at a rate higher than chance. The training data used a one-hot encoding of the ICD-10-CM categories as the dependent variable and the compressed embedding values as the independent variables. The model consisted of two hidden layers with 100 nodes each, used categorical cross-entropy as the loss function, and was trained for 30 epochs; performance was evaluated in terms of accuracy and balanced accuracy. Lower-dimensional embeddings lose more predictive information, as is typical for this type of problem.
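For reference, a minimal sketch of this classifier in R `torch` is shown below. It mirrors `AlphaCodeEstimator` in `alpha-char-model.R` and the settings in `4-estimate-leading-char.R` (linear layers chained into a softmax over the 21 leading-character categories); the standalone module and the embedding dimension of 50 are illustrative, not the exact training code.

```r
library(torch)

# Linear layers chained into a softmax output, as in AlphaCodeEstimator.
net = nn_module(
  initialize = function(emb_dim, n_classes = 21) {
    self$fc1 = nn_linear(emb_dim, 100)
    self$fc2 = nn_linear(100, 100)
    self$out = nn_linear(100, n_classes)
  },
  forward = function(x) {
    x = self$fc1(x)
    x = self$fc2(x)
    x = self$out(x)
    nnf_softmax(x, dim = 2)
  }
)

# Categorical cross-entropy against the one-hot targets.
loss = function(input, target) {
  torch_mean(-torch_sum(target * torch_log(input + 1e-16), 2))
}

m = net(emb_dim = 50)
m(torch_randn(8, 50))  # an 8 x 21 tensor of class probabilities
```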
124 | 
125 | ```{r sm_perf, echo = FALSE, fig.cap = "The supervised model performance."}
126 | x = readRDS("sup-model-perf.rds")
127 | x = x |>
128 |   mutate(accuracy = round(accuracy, 3),
129 |          bal_accuracy = round(bal_accuracy, 3),
130 |          emb_dim = as.integer(emb_dim))
131 | names(x) = c("Embedding Dimension", "Accuracy", "Balanced Accuracy")
132 | knitr::kable(x, caption = "The supervised models' performance ordered by increasing embedding dimension.")
133 | ```
134 | 
135 | Of note, the goal in presenting these results is not necessarily to
136 | maximize the prediction accuracy. Rather, it is to show that the embedding retains the
137 | hierarchical information in the ICD-10-CM codes. Some of the codes correspond to
138 | conditions that could be classified in several ways, and as a result coding
139 | for at least some of the conditions might be considered non-systematic.
140 | 
141 | ## An example using the embedding data in R
142 | 
143 | To conclude, we present a simple example of how one might use the embedding
144 | information in the R programming environment. Suppose we would like to
145 | visualize the ICD-10-CM codes beginning with G (diseases of the nervous system),
146 | I (diseases of the circulatory system), J (diseases of the respiratory system),
147 | and K (diseases of the digestive system) to better understand the relationships
148 | between these categories or specific conditions in the 50-dimensional
149 | embedding. For convenience, the project's page includes an `.rds` file
150 | containing the available embeddings along with their URLs, which can be
151 | retrieved from the R console. The code categories can then be visualized
152 | by performing another dimension reduction (in this case with the
153 | Rtsne package) to 2 dimensions and presenting the result
154 | in a scatter plot as shown below.
155 | 
156 | 
157 | ```{r message = FALSE, eval = TRUE, warning = FALSE}
158 | library(dplyr)
159 | library(ggplot2)
160 | library(readr)
161 | library(Rtsne)
162 | library(stringr)
163 | 
164 | # Download the locations of the embeddings.
165 | tf = tempfile()
166 | download.file(
167 |   "https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/icd10_dl.rds?raw=true",
168 |   tf
169 | )
170 | dl = readRDS(tf)
171 | 
172 | # Download and read the 2019, 50-dimensional embeddings.
173 | tf = tempfile()
174 | download.file(
175 |   dl$url[dl$year == 2019 & dl$emb_dim == 50],
176 |   tf
177 | )
178 | 
179 | icd10s = read_csv(tf) |>
180 |   filter(str_detect(code, "^(G|I|J|K)")) |>
181 |   mutate(desc = tolower(desc)) |>
182 |   mutate(`Leading Letter` = str_sub(code, 1, 1))
183 | 
184 | # Fit tSNE to the embedding.
185 | tsne_fit = icd10s |>
186 |   select(starts_with("V")) |>
187 |   scale() |>
188 |   Rtsne(perplexity = 10)
189 | 
190 | # Bind the tSNE values to the data set.
191 | icd10p = bind_cols(
192 |   icd10s |>
193 |     select(-starts_with("V")),
194 |   tsne_fit$Y |>
195 |     as.data.frame() |>
196 |     rename(tSNE1="V1", tSNE2="V2") |>
197 |     as_tibble()
198 | )
199 | 
200 | # Visualize the results.
201 | ggplot(icd10p, aes(x = tSNE1, y = tSNE2, color = `Leading Letter`)) +
202 |   geom_point() +
203 |   theme_minimal()
204 | ```
205 | 
206 | The visualization shows that a subset of the circulatory diseases (I) and
207 | nervous system diseases (G) are well-differentiated from other conditions. It
208 | also shows overlap between other conditions related to K (digestive diseases),
209 | J (respiratory diseases), and I (circulatory).
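The embeddings also support simple similarity queries. As an illustrative sketch, reusing the `icd10s` tibble from above (the `nearest_codes()` helper below is ours, not part of the repository), codes can be ranked by the cosine similarity of their embeddings:

```r
# Cosine similarity between one code's embedding and all of the others.
emb = icd10s |>
  select(starts_with("V")) |>
  as.matrix()
rownames(emb) = icd10s$code

nearest_codes = function(target_code, k = 5) {
  v = emb[target_code, ]
  sims = as.numeric(emb %*% v) /
    (sqrt(rowSums(emb^2)) * sqrt(sum(v^2)))
  icd10s |>
    mutate(similarity = sims) |>
    filter(code != target_code) |>
    arrange(-similarity) |>
    select(code, desc, similarity) |>
    head(k)
}

# For example, the five codes closest to I10 (essential hypertension).
nearest_codes("I10")
```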
210 | 
211 | ## A SAS example
212 | 
213 | ```sas
214 | /* Options */
215 | %let dlyear=2019;    /* code year; can be 2019, 2020, 2021, 2022 */
216 | %let dldim=50;       /* encoding dimensions; can be 1000, 100, 50, 10 */
217 | %let tempdir=D:;     /* directory for temporary file */
218 | %let pathsep=\;      /* path separator; \ for Windows, / for *NIX */
219 | %let dsname=icd10cm; /* name for the final dataset */
220 | %let target=icd-10-cm-&dlyear-%sysfunc(putn(&dldim,z4.)).csv.gz;
221 | %let tempfile=&tempdir&pathsep&target;
222 | 
223 | /* Download gzipped file to a temp location */
224 | /* -- filename url and filename zip methods don't stack */
225 | filename rawdl "&tempfile";
226 | proc http
227 |   url="https://github.com/kaneplusplus/icd-10-cm-embedding/raw/main/embedding-data/&target"
228 |   out=rawdl;
229 | run;
230 | 
231 | /* Read the downloaded temp file into a dataset */
232 | filename codes ZIP "&tempfile" GZIP;
233 | %macro vlist;
234 |   %local i;
235 |   %do i=1 %to &dldim; V&i %end;
236 | %mend;
237 | data &dsname;
238 |   informat code $8. desc $256. %vlist best.;
239 |   infile codes delimiter=',' firstobs=2 dsd;
240 |   input code $ desc $ %vlist;
241 | run;
242 | ```
243 | 
244 | ## Reproducing these results
245 | 
246 | R version: >= 4.2
247 | 
248 | R package dependencies:
249 | 
250 | - `arrow`
251 | - `torch`
252 | - `reticulate`
253 | - `dplyr`
254 | - `tidyr`
255 | - `purrr`
256 | - `foreach`
257 | - `itertools`
258 | - `readr`
259 | - `luz`
261 | - `tibble`
262 | - `progress`
263 | - `stringr`
264 | - `yardstick`
265 | 
266 | Scripts
267 | 
268 | - `0-make-embeddings.R`
269 |   - Purpose - create the BioGPT-Large embeddings of the ICD-10-CM descriptions
270 |   - Dependencies
271 |     - A conda environment with the `torch` and `transformers` packages (see the `make-biogpt-conda-env` script)
272 |   - Inputs
273 |     - `icd-10-cm-codes/icd10cm_codes_2019.txt`
274 |     - `icd-10-cm-codes/icd10cm_codes_2020.txt`
275 |     - `icd-10-cm-codes/icd10cm_codes_2021.txt`
276 |     - `icd-10-cm-codes/icd10cm_codes_2022.txt`
277 |   - Outputs
278 |     - An `icd-10-cm-embeddings` directory with subdirectories corresponding to each year, and subsubdirectories with files whose names correspond to the ICD-10-CM code holding R .rds files with the code, description, and BioGPT embedding values stored as a `data.frame`.
279 | - `1-compress-icd-10-embeddings.R`
280 |   - Purpose - fit the autoencoders that compress the BioGPT-Large embeddings
281 |   - Dependencies
282 |     - R files: `autoencoder.R`
283 |   - Inputs
284 |     - Files in the `icd-10-cm-embeddings/2019` directory.
285 |   - Outputs
286 |     - `model-performance.rds` holding the model performance table, and the fitted models used to create the compressed embeddings in the `autoencoder-models` directory.
287 | - `2-validation.R`
288 |   - Purpose - validate the selected autoencoders using ICD-10-CM codes from other years
289 |   - Dependencies
290 |     - R files: `autoencoder.R`
291 |   - Inputs
292 |     - Files in the `autoencoder-models` directory.
293 |     - Files in the `icd-10-cm-embeddings` directory for all years (2019-2022).
294 |   - Outputs
295 |     - `year-validation.rds` holding a data frame of the autoencoder year-validation model performance.
296 | - `3-create-datasets.R`
297 |   - Purpose - create the compressed embedding data sets for distribution
298 |   - Dependencies
299 |     - R files: `autoencoder.R`
300 |   - Inputs
301 |     - Files in the `autoencoder-models` and `icd-10-cm-embeddings` (2019-2022) directories.
302 |   - Outputs
303 |     - Files in the `embedding-data` directory holding the embedding values as .csv files for all year-dimension combinations.
304 | - `4-estimate-leading-char.R`
305 |   - Purpose - estimate each code's leading-character category from the compressed embeddings
306 |   - Dependencies
307 |     - R files: `alpha-char-model.R`
308 |   - Inputs
309 |     - Files in the `embedding-data` directory.
310 |   - Outputs
311 |     - Files in the `luz-supervised-models` directory holding the `luz` package representation of the fitted models.
312 |     - The `sup-model-perf.rds` file containing a `data.frame` summarizing the supervised model performance.
313 | 
314 | © Michael J. Kane (kaneplusplus at proton mail dot com)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | output: md_document
3 | ---
4 | 
5 | 
6 | 
7 | 
8 | 
9 | # Compressed, Large-Language-Model Embedded Datasets of ICD-10-CM Descriptions
10 | 
11 | ## Citing this work
12 | 
13 | ```
14 | @article{kane2023llm,
15 |   author = {Michael J. Kane and Casey King and Denise Esserman and Nancy K. Latham and Erich J. Greene and David A. Ganz},
16 |   title = {A Compressed Large Language Model Embedding Dataset of ICD 10 CM Descriptions},
17 |   elocation-id = {2023.04.24.23289046},
18 |   year = {2023},
19 |   doi = {10.1101/2023.04.24.23289046},
20 |   publisher = {Cold Spring Harbor Laboratory Press},
21 |   URL = {https://www.medrxiv.org/content/early/2023/05/15/2023.04.24.23289046.1},
22 |   eprint = {https://www.medrxiv.org/content/early/2023/05/15/2023.04.24.23289046.1.full.pdf},
23 |   journal = {medRxiv}
24 | }
25 | ```
26 | 
27 | ## License
28 | 
29 | The code in this repository is licensed under [GPL v2](https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html) and the data
30 | are licensed under [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/).
31 | 
32 | ## Funding
33 | 
34 | This work was supported by the National Institute on Aging of the National Institutes of Health (NIH) through a project grant to Yale University (1R01AG071528). The organizations funding this study had no role in the design or conduct of the study; in the collection, management, analysis, or interpretation of the data; or in the preparation, review, or approval of the manuscript. The content of this publication is solely the responsibility of the authors and does not necessarily represent the official views of the National Institutes of Health, the Department of Veterans Affairs, or the United States government.
35 | 
36 | ## ICD-10-CM Datasets
37 | 
38 | ### 2022
39 | 
40 | 1. [ICD-10-CM, 10-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2022-0010.csv.gz?raw=true)
41 | 1. [ICD-10-CM, 50-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2022-0050.csv.gz?raw=true)
42 | 1. [ICD-10-CM, 100-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2022-0100.csv.gz?raw=true)
43 | 1. [ICD-10-CM, 1000-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2022-1000.csv.gz?raw=true)
45 | 1. [ICD-10-CM, 42,384-dimensions (not compressed)](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2022-full.csv.gz?raw=true)
46 | 
47 | ### 2021
48 | 
49 | 1. [ICD-10-CM, 10-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2021-0010.csv.gz?raw=true)
50 | 1. [ICD-10-CM, 50-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2021-0050.csv.gz?raw=true)
51 | 1. [ICD-10-CM, 100-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2021-0100.csv.gz?raw=true)
52 | 1. [ICD-10-CM, 1000-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2021-1000.csv.gz?raw=true)
53 | 1. [ICD-10-CM, 42,384-dimensions (not compressed)](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2021-full.csv.gz?raw=true)
54 | 
55 | ### 2020
56 | 
57 | 1. [ICD-10-CM, 10-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2020-0010.csv.gz?raw=true)
58 | 1. [ICD-10-CM, 50-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2020-0050.csv.gz?raw=true)
59 | 1. [ICD-10-CM, 100-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2020-0100.csv.gz?raw=true)
60 | 1. [ICD-10-CM, 1000-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2020-1000.csv.gz?raw=true)
61 | 
62 | ### 2019
63 | 
64 | 1. [ICD-10-CM, 10-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2019-0010.csv.gz?raw=true)
65 | 1. [ICD-10-CM, 50-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2019-0050.csv.gz?raw=true)
66 | 1. [ICD-10-CM, 100-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2019-0100.csv.gz?raw=true)
67 | 1. [ICD-10-CM, 1000-dimensions](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2019-1000.csv.gz?raw=true)
68 | 1. [ICD-10-CM, 42,384-dimensions (not compressed)](https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-2019-full.csv.gz?raw=true)
69 | 
70 | ## Overview
71 | 
72 | The International Classification of Diseases, 10th Revision, Clinical Modification ([ICD-10-CM](https://www.cdc.gov/nchs/icd/icd-10-cm.htm)) is a standardized classification system used for diagnosing diseases, disorders, and health conditions. It plays a crucial role in analyzing electronic medical records (EMRs) or electronic health records (EHRs). However, the high dimensionality of ICD-10-CM codes and their hierarchical structure make their incorporation into statistical and machine learning analyses challenging. Traditional contrast encoding methods like one-hot and treatment coding may not fully capture the hierarchical information of the codes. Large language models (LLMs) generate contextualized embeddings that capture the semantic relationships between codes more effectively. This repository provides data sets of ICD-10-CM codes mapped to embeddings generated using the [BioGPT Large Language Model](https://academic.oup.com/bib/article/23/6/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9&login=false). The embeddings provide informative input features for machine learning models, and dimension-reduced versions in 1,000, 100, 50, and 10 dimensions are provided. Validation for both the dimension reduction and the representation of the embeddings is shown below.
The readily available datasets are anticipated to be highly valuable for researchers incorporating ICD-10-CM codes into their analyses, retaining contextual information, and enabling more advanced analyses in the field.
73 | 
74 | The data sets and the code used to generate them are available at https://github.com/kaneplusplus/icd-10-cm-embedding. The data are licensed under the [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode). The code is
75 | licensed under [GPL-v2](https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html).
76 | 
77 | ## Model description and performance
78 | 
79 | The data provided are generated by embedding the ICD-10-CM descriptions using the BioGPT-Large model, which tokenizes each textual phrase and maps the tokens to unique vocabulary IDs, resulting in a sequence of continuous embedding vectors. The embeddings are then contextualized by passing them through the model's layers with an attention mask. The resulting embeddings lie in a 42,384-dimensional space and are compressed using an autoencoder with fully connected layers of decreasing and then increasing sizes up to the output layer. The autoencoder structure is the same for models with larger dimensions, with only the appropriate layers retained.
80 | 
81 | ### Validating the dimension reduction
82 | 
83 | 
84 | 
85 | Table: The autoencoder parameters and performance ordered by increasing validation loss.
86 | 
87 | | Embedding Dimension| Batch Size| Training Loss| Validation Loss|
88 | |-------------------:|----------:|-------------:|---------------:|
89 | | 50| 256| 0.390| 0.377|
90 | | 100| 64| 0.710| 0.435|
91 | | 50| 128| 0.470| 0.438|
92 | | 1000| 128| 1.106| 0.479|
93 | | 1000| 64| 1.299| 0.484|
94 | | 10| 128| 0.614| 0.605|
95 | | 10| 256| 0.610| 0.610|
96 | | 10| 64| 0.693| 0.634|
97 | | 100| 128| 11.647| 0.653|
98 | | 50| 64| 0.757| 0.658|
99 | | 100| 256| 1.418| 0.805|
100 | | 1000| 256| 0.863| 0.847|
101 | 
102 | 
103 | 
104 | The autoencoder compressing the LLM embedding was fit on the 2019 ICD-10-CM descriptions
105 | for 30 epochs with batch sizes of 64, 128, and 256; a mean-square error loss between
106 | the embedding and the autoencoder estimate; and a validation data set comprising
107 | a random subset of 10\% of the samples. The model performance is shown above.
108 | Based on these results, the models with the best validation loss were selected for distribution.
109 | 
110 | 
111 | 
112 | Table: The autoencoder year validation performance ordered by year.
113 | 
114 | | Year| Embedding Dimension|   MSE| Coef. of Determination|
115 | |----:|-------------------:|-----:|----------------------:|
116 | | 2019| 10| 0.600| 0.087|
117 | | 2019| 50| 0.372| 0.054|
118 | | 2019| 100| 0.431| 0.062|
119 | | 2019| 1000| 0.473| 0.068|
120 | | 2020| 10| 0.601| 0.087|
121 | | 2020| 50| 0.373| 0.054|
122 | | 2020| 100| 0.431| 0.062|
123 | | 2020| 1000| 0.474| 0.068|
124 | | 2021| 10| 0.602| 0.087|
125 | | 2021| 50| 0.374| 0.054|
126 | | 2021| 100| 0.432| 0.062|
127 | | 2021| 1000| 0.475| 0.069|
128 | | 2022| 10| 0.602| 0.087|
129 | | 2022| 50| 0.374| 0.054|
130 | | 2022| 100| 0.433| 0.063|
131 | | 2022| 1000| 0.475| 0.069|
132 | 
133 | 
134 | 
135 | In addition to the 2019 validation, the models selected for distribution were
136 | tested on the 2020-2022 data sets to ensure their performance is comparable
137 | across years.
It should be noted that the ICD-10-CM codes do not vary much from
138 | one year to the next,
139 | so we should not expect large differences. As expected, the mean square error
140 | and coefficients of determination are similar to the 2019 data.
141 | 
142 | ### Validating the embedding representation
143 | 
144 | To validate the compressed embeddings, the hierarchical information in the ICD-10-CM codes was used to ensure that relevant relationships were preserved. The leading letter and first two numeric values categorize codes, allowing a supervised model to estimate the categories at a rate higher than chance. The training data used a one-hot encoding of the ICD-10-CM categories as the dependent variable and the compressed embedding values as the independent variables. The model consisted of two hidden layers with 100 nodes each, used categorical cross-entropy as the loss function, and was trained for 30 epochs; performance was evaluated in terms of accuracy and balanced accuracy. Lower-dimensional embeddings lose more predictive information, as is typical for this type of problem.
145 | 
146 | 
147 | 
148 | Table: The supervised models' performance ordered by increasing embedding dimension.
149 | 
150 | | Embedding Dimension| Accuracy| Balanced Accuracy|
151 | |-------------------:|--------:|-----------------:|
152 | | 10| 0.815| 0.698|
153 | | 50| 0.925| 0.873|
154 | | 100| 0.935| 0.891|
155 | | 1000| 0.960| 0.927|
156 | 
157 | 
158 | 
159 | Of note, the goal in presenting these results is not necessarily to
160 | maximize the prediction accuracy. Rather, it is to show that the embedding retains the
161 | hierarchical information in the ICD-10-CM codes. Some of the codes correspond to
162 | conditions that could be classified in several ways, and as a result coding
163 | for at least some of the conditions might be considered non-systematic.
164 | 
165 | ## An example using the embedding data in R
166 | 
167 | To conclude, we present a simple example of how one might use the embedding
168 | information in the R programming environment. Suppose we would like to
169 | visualize the ICD-10-CM codes beginning with G (diseases of the nervous system),
170 | I (diseases of the circulatory system), J (diseases of the respiratory system),
171 | and K (diseases of the digestive system) to better understand the relationships
172 | between these categories or specific conditions in the 50-dimensional
173 | embedding. For convenience, the project's page includes an `.rds` file
174 | containing the available embeddings along with their URLs, which can be
175 | retrieved from the R console. The code categories can then be visualized
176 | by performing another dimension reduction (in this case with the
177 | Rtsne package) to 2 dimensions and presenting the result
178 | in a scatter plot as shown below.
179 | 
180 | 
181 | 
182 | ```r
183 | library(dplyr)
184 | library(ggplot2)
185 | library(readr)
186 | library(Rtsne)
187 | library(stringr)
188 | 
189 | # Download the locations of the embeddings.
190 | tf = tempfile()
191 | download.file(
192 |   "https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/icd10_dl.rds?raw=true",
193 |   tf
194 | )
195 | dl = readRDS(tf)
196 | 
197 | # Download and read the 2019, 50-dimensional embeddings.
198 | tf = tempfile()
199 | download.file(
200 |   dl$url[dl$year == 2019 & dl$emb_dim == 50],
201 |   tf
202 | )
203 | 
204 | icd10s = read_csv(tf) |>
205 |   filter(str_detect(code, "^(G|I|J|K)")) |>
206 |   mutate(desc = tolower(desc)) |>
207 |   mutate(`Leading Letter` = str_sub(code, 1, 1))
208 | 
209 | # Fit tSNE to the embedding.
210 | tsne_fit = icd10s |>
211 |   select(starts_with("V")) |>
212 |   scale() |>
213 |   Rtsne(perplexity = 10)
214 | 
215 | # Bind the tSNE values to the data set.
216 | icd10p = bind_cols(
217 |   icd10s |>
218 |     select(-starts_with("V")),
219 |   tsne_fit$Y |>
220 |     as.data.frame() |>
221 |     rename(tSNE1="V1", tSNE2="V2") |>
222 |     as_tibble()
223 | )
224 | 
225 | # Visualize the results.
226 | ggplot(icd10p, aes(x = tSNE1, y = tSNE2, color = `Leading Letter`)) +
227 |   geom_point() +
228 |   theme_minimal()
229 | ```
230 | 
231 | ![plot of chunk unnamed-chunk-2](figure/unnamed-chunk-2-1.png)
232 | 
233 | The visualization shows that a subset of the circulatory diseases (I) and
234 | nervous system diseases (G) are well-differentiated from other conditions. It
235 | also shows overlap between other conditions related to K (digestive diseases),
236 | J (respiratory diseases), and I (circulatory).
237 | 
238 | ## A SAS example
239 | 
240 | ```sas
241 | /* Options */
242 | %let dlyear=2019;    /* code year; can be 2019, 2020, 2021, 2022 */
243 | %let dldim=50;       /* encoding dimensions; can be 1000, 100, 50, 10 */
244 | %let tempdir=D:;     /* directory for temporary file */
245 | %let pathsep=\;      /* path separator; \ for Windows, / for *NIX */
246 | %let dsname=icd10cm; /* name for the final dataset */
247 | %let target=icd-10-cm-&dlyear-%sysfunc(putn(&dldim,z4.)).csv.gz;
248 | %let tempfile=&tempdir&pathsep&target;
249 | 
250 | /* Download gzipped file to a temp location */
251 | /* -- filename url and filename zip methods don't stack */
252 | filename rawdl "&tempfile";
253 | proc http
254 |   url="https://github.com/kaneplusplus/icd-10-cm-embedding/raw/main/embedding-data/&target"
255 |   out=rawdl;
256 | run;
257 | 
258 | /* Read the downloaded temp file into a dataset */
259 | filename codes ZIP "&tempfile" GZIP;
260 | %macro vlist;
261 |   %local i;
262 |   %do i=1 %to &dldim; V&i %end;
263 | %mend;
264 | data &dsname;
265 |   informat code $8. desc $256. %vlist best.;
266 |   infile codes delimiter=',' firstobs=2 dsd;
267 |   input code $ desc $ %vlist;
268 | run;
269 | ```
270 | 
271 | ## Reproducing these results
272 | 
273 | R version: >= 4.2
274 | 
275 | R package dependencies:
276 | 
277 | - `arrow`
278 | - `torch`
279 | - `reticulate`
280 | - `dplyr`
281 | - `tidyr`
282 | - `purrr`
283 | - `foreach`
284 | - `itertools`
285 | - `readr`
286 | - `luz`
288 | - `tibble`
289 | - `progress`
290 | - `stringr`
291 | - `yardstick`
292 | 
293 | Scripts
294 | 
295 | - `0-make-embeddings.R`
296 |   - Purpose - create the BioGPT-Large embeddings of the ICD-10-CM descriptions
297 |   - Dependencies
298 |     - A conda environment with the `torch` and `transformers` packages (see the `make-biogpt-conda-env` script)
299 |   - Inputs
300 |     - `icd-10-cm-codes/icd10cm_codes_2019.txt`
301 |     - `icd-10-cm-codes/icd10cm_codes_2020.txt`
302 |     - `icd-10-cm-codes/icd10cm_codes_2021.txt`
303 |     - `icd-10-cm-codes/icd10cm_codes_2022.txt`
304 |   - Outputs
305 |     - An `icd-10-cm-embeddings` directory with subdirectories corresponding to each year, and subsubdirectories with files whose names correspond to the ICD-10-CM code holding R .rds files with the code, description, and BioGPT embedding values stored as a `data.frame`.
306 | - `1-compress-icd-10-embeddings.R`
307 |   - Purpose - fit the autoencoders that compress the BioGPT-Large embeddings
308 |   - Dependencies
309 |     - R files: `autoencoder.R`
310 |   - Inputs
311 |     - Files in the `icd-10-cm-embeddings/2019` directory.
312 |   - Outputs
313 |     - `model-performance.rds` holding the model performance table, and the fitted models used to create the compressed embeddings in the `autoencoder-models` directory.
314 | - `2-validation.R`
315 |   - Purpose - validate the selected autoencoders using ICD-10-CM codes from other years
316 |   - Dependencies
317 |     - R files: `autoencoder.R`
318 |   - Inputs
319 |     - Files in the `autoencoder-models` directory.
320 |     - Files in the `icd-10-cm-embeddings` directory for all years (2019-2022).
321 |   - Outputs
322 |     - `year-validation.rds` holding a data frame of the autoencoder year-validation model performance.
323 | - `3-create-datasets.R`
324 |   - Purpose - create the compressed embedding data sets for distribution
325 |   - Dependencies
326 |     - R files: `autoencoder.R`
327 |   - Inputs
328 |     - Files in the `autoencoder-models` and `icd-10-cm-embeddings` (2019-2022) directories.
329 |   - Outputs
330 |     - Files in the `embedding-data` directory holding the embedding values as .csv files for all year-dimension combinations.
331 | - `4-estimate-leading-char.R`
332 |   - Purpose - estimate each code's leading-character category from the compressed embeddings
333 |   - Dependencies
334 |     - R files: `alpha-char-model.R`
335 |   - Inputs
336 |     - Files in the `embedding-data` directory.
337 |   - Outputs
338 |     - Files in the `luz-supervised-models` directory holding the `luz` package representation of the fitted models.
339 |     - The `sup-model-perf.rds` file containing a `data.frame` summarizing the supervised model performance.
340 | 
341 | © Michael J. Kane (kaneplusplus at proton mail dot com)
--------------------------------------------------------------------------------
/README_files/figure-gfm/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kaneplusplus/icd-10-cm-embedding/a63d21c7d8f90419515bcfc2b0fce4281a6f1e62/README_files/figure-gfm/unnamed-chunk-2-1.png
--------------------------------------------------------------------------------
/README_files/figure-markdown_strict/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kaneplusplus/icd-10-cm-embedding/a63d21c7d8f90419515bcfc2b0fce4281a6f1e62/README_files/figure-markdown_strict/unnamed-chunk-2-1.png
--------------------------------------------------------------------------------
/alpha-char-model.R:
--------------------------------------------------------------------------------
1 | library(torch)
2 | library(tibble)
3 | library(dplyr)
4 | library(purrr)
5 | library(foreach)
6 | library(readr)
7 | 
8 | # A torch dataset pairing each compressed embedding (x, the V* columns)
9 | # with a one-hot encoding of its leading-character category (y).
10 | AlphaCharEmbedding = dataset(
11 |   name = "AlphaCharEmbedding",
12 |   initialize = function(icd10_emb, output_levels) {
13 |     self$x = icd10_emb |>
14 |       collect()
15 |     lc = contr.treatment(output_levels, contrasts = FALSE)
16 |     adf = map_dfr(self$x$code, ~ lc[.x,])
17 |     names(adf) = paste0("alpha_", names(adf))
18 |     self$x = bind_cols(adf, icd10_emb)
19 |     self$v_width = ncol(self$x |> select(starts_with("V")))
20 |   },
21 |   width = function() {
22 |     self$v_width
23 |   },
24 |   .getitem = function(x) {
25 |     list(
26 |       x = torch_tensor(select(self$x[x,], starts_with("V")) |> unlist()),
27 |       y = torch_tensor(select(self$x[x,], starts_with("alpha")) |> unlist())
28 |     )
29 |   },
30 |   .length = function() {
31 |     nrow(self$x)
32 |   }
33 | )
34 | 
35 | # A feed-forward classifier: chained linear layers with a softmax output.
36 | AlphaCodeEstimator =
nn_module(
34 |   initialize = function(layers) {
35 |     self$feature_net = nn_module_list(
36 |       foreach(i = seq_along(layers)[-1]) %do% {
37 |         nn_linear(layers[i-1], layers[i])
38 |       }
39 |     )
40 |   },
41 |   forward = function(x) {
42 |     for (i in seq_along(self$feature_net)) {
43 |       x = self$feature_net[[i]](x)
44 |     }
45 |     nnf_softmax(x, dim = 2)
46 |   }
47 | )
48 | 
49 | 
--------------------------------------------------------------------------------
/autoencoder.R:
--------------------------------------------------------------------------------
1 | library(torch)
2 | library(tibble)
3 | library(dplyr)
4 | library(purrr)
5 | library(foreach)
6 | 
7 | ICD10Embedding = dataset(
8 |   name = "ICD10Embedding",
9 |   initialize = function(files, device = "mps") {
10 |     self$files = files
11 |     self$device = device
12 |   },
13 |   .getitem = function(i) {
14 |     ret = readRDS(self$files[i])
15 |     x = torch_tensor(ret$emb[[1]], device = self$device,
16 |                      dtype = torch_float())
17 |     list(x = x, y = x$clone())
18 |   },
19 |   .length = function() {
20 |     length(self$files)
21 |   }
22 | )
23 | 
24 | ICD10AutoEncoder = nn_module(
25 |   initialize = function(layers) {
26 |     if (length(layers) %% 2 != 1) {
27 |       stop("The number of layers must be odd.")
28 |     }
29 |     encoder_layers = layers[1:ceiling(length(layers) / 2)]
30 |     decoder_layers = layers[length(encoder_layers):length(layers)]
31 |     self$encoder = nn_module_list(
32 |       foreach(i = seq_along(encoder_layers)[-1]) %do% {
33 |         nn_linear(encoder_layers[i-1], encoder_layers[i])
34 |       }
35 |     )
36 |     self$decoder = nn_module_list(
37 |       foreach(i = seq_along(decoder_layers)[-1]) %do% {
38 |         nn_linear(decoder_layers[i-1], decoder_layers[i])
39 |       }
40 |     )
41 |   },
42 |   run_forward = function(x, m) {
43 |     for (i in seq_along(m)) {
44 |       x = m[[i]](x)
45 |     }
46 |     x
47 |   },
48 |   encode = function(x) {
49 |     self$run_forward(x, self$encoder)
50 |   },
51 |   decode = function(x) {
52 |     self$run_forward(x, self$decoder)
53 |   },
54 |   forward = function(x) {
55 |     x |>
56 |       self$encode() |>
57 |       self$decode()
58 |   }
59 | )
60 | 
61 | 
--------------------------------------------------------------------------------
/bmc-bioinformatics-paper/bmc_article.bib:
--------------------------------------------------------------------------------
1 | % bmc_article.bib
2 | %
3 | % An example of bibtex entries.
4 | % Entries taken from BMC instructions for authors page.
5 | 6 | % uncomment next line to make author-year bibliography 7 | % @settings{label, options="nameyear"} 8 | 9 | @article{mikolov2013, 10 | title={Efficient estimation of word representations in vector space}, 11 | author={Mikolov, Tomas and Chen, Kai and Corrado, Greg and Dean, Jeffrey}, 12 | journal={arXiv preprint arXiv:1301.3781}, 13 | year={2013} 14 | } 15 | 16 | @inproceedings{cui2vec, 17 | title={Clinical concept embeddings learned from massive sources of multimodal medical data}, 18 | author={Beam, Andrew L and Kompa, Benjamin and Schmaltz, Allen and Fried, Inbar and Weber, Griffin and Palmer, Nathan and Shi, Xu and Cai, Tianxi and Kohane, Isaac S}, 19 | booktitle={Pacific Symposium on Biocomputing 2020}, 20 | pages={295--306}, 21 | year={2019}, 22 | organization={World Scientific} 23 | } 24 | 25 | @article{vaswani2017, 26 | title={Attention is all you need}, 27 | author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia}, 28 | journal={Advances in neural information processing systems}, 29 | volume={30}, 30 | year={2017} 31 | } 32 | 33 | @article{church2017, 34 | title={Word2Vec}, 35 | author={Church, Kenneth Ward}, 36 | journal={Natural Language Engineering}, 37 | volume={23}, 38 | number={1}, 39 | pages={155--162}, 40 | year={2017}, 41 | publisher={Cambridge University Press} 42 | } 43 | 44 | @INPROCEEDINGS{medbert, 45 | author={Vasantharajan, Charangan and Tun, Kyaw Zin and Thi-Nga, Ho and Jain, Sparsh and Rong, Tong and Siong, Chng Eng}, 46 | booktitle={2022 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)}, 47 | title={MedBERT: A Pre-trained Language Model for Biomedical Named Entity Recognition}, 48 | year={2022}, 49 | volume={}, 50 | number={}, 51 | pages={1482-1488}, 52 | doi={10.23919/APSIPAASC55919.2022.9980157} 53 | } 54 | 55 | @inproceedings{balancedaccuracy, 56 | title={The balanced accuracy and its posterior distribution}, 57 | author={Brodersen, Kay Henning and Ong, Cheng Soon and Stephan, Klaas Enno and Buhmann, Joachim M}, 58 | booktitle={2010 20th international conference on pattern recognition}, 59 | pages={3121--3124}, 60 | year={2010}, 61 | organization={IEEE} 62 | } 63 | 64 | @article{msmarco, 65 | title={Ms marco: A human-generated machine reading comprehension dataset}, 66 | author={Nguyen, Tri and Rosenberg, Mir and Song, Xia and Gao, Jianfeng and Tiwary, Saurabh and Majumder, Rangan and Deng, Li}, 67 | year={2016} 68 | } 69 | 70 | @article{umls, 71 | title={The unified medical language system (UMLS): integrating biomedical terminology}, 72 | author={Bodenreider, Olivier}, 73 | journal={Nucleic acids research}, 74 | volume={32}, 75 | number={suppl\_1}, 76 | pages={D267--D270}, 77 | year={2004}, 78 | publisher={Oxford University Press} 79 | } 80 | 81 | @article{pubmedbertqa, 82 | title={Improved Methods To Aid Unsupervised Evidence-Based Fact Checking For Online Health News}, 83 | author={Deka, Pritam and Jurek-Loughrey, Anna and Deepak, P}, 84 | journal={Journal of Data Intelligence}, 85 | volume={3}, 86 | number={4}, 87 | pages={474--504}, 88 | year={2022} 89 | } 90 | 91 | @inproceedings{pubmedbertfull, 92 | title = "Self-Alignment Pretraining for Biomedical Entity Representations", 93 | author = "Liu, Fangyu and 94 | Shareghi, Ehsan and 95 | Meng, Zaiqiao and 96 | Basaldella, Marco and 97 | Collier, Nigel", 98 | booktitle = "Proceedings of the 2021 Conference of the North American Chapter of the 
Association for Computational Linguistics: Human Language Technologies",
99 |     month = jun,
100 |     year = "2021",
101 |     address = "Online",
102 |     publisher = "Association for Computational Linguistics",
103 |     url = "https://www.aclweb.org/anthology/2021.naacl-main.334",
104 |     pages = "4228--4238",
105 |     abstract = "Despite the widespread success of self-supervised learning via masked language models (MLM), accurately capturing fine-grained semantic relationships in the biomedical domain remains a challenge. This is of paramount importance for entity-level tasks such as entity linking where the ability to model entity relations (especially synonymy) is pivotal. To address this challenge, we propose SapBERT, a pretraining scheme that self-aligns the representation space of biomedical entities. We design a scalable metric learning framework that can leverage UMLS, a massive collection of biomedical ontologies with 4M+ concepts. In contrast with previous pipeline-based hybrid systems, SapBERT offers an elegant one-model-for-all solution to the problem of medical entity linking (MEL), achieving a new state-of-the-art (SOTA) on six MEL benchmarking datasets. In the scientific domain, we achieve SOTA even without task-specific supervision. With substantial improvement over various domain-specific pretrained MLMs such as BioBERT, SciBERT and PubMedBERT, our pretraining scheme proves to be both effective and robust.",
106 | }
107 | 
108 | 
109 | @misc{icd10cm,
110 |   title = {{ICD-10-CM}},
111 |   author = {{The Centers for Disease Control and Prevention (CDC)}},
112 |   url = {https://www.cdc.gov/nchs/icd/icd-10-cm.htm},
113 |   note = {Accessed: 2023-04-15}
114 | }
115 | 
116 | @article{rasmy2021,
117 |   title={Med-BERT: pretrained contextualized embeddings on large-scale structured electronic health records for disease prediction},
118 |   author={Rasmy, Laila and Xiang, Yang and Xie, Ziqian and Tao, Cui and Zhi, Degui},
119 |   journal={NPJ digital medicine},
120 |   volume={4},
121 |   number={1},
122 |   pages={86},
123 |   year={2021},
124 |   publisher={Nature Publishing Group UK London}
125 | }
126 | 
127 | @article{lee2020,
128 |   title={BioBERT: a pre-trained biomedical language representation model for biomedical text mining},
129 |   author={Lee, Jinhyuk and Yoon, Wonjin and Kim, Sungdong and Kim, Donghyeon and Kim, Sunkyu and So, Chan Ho and Kang, Jaewoo},
130 |   journal={Bioinformatics},
131 |   volume={36},
132 |   number={4},
133 |   pages={1234--1240},
134 |   year={2020},
135 |   publisher={Oxford University Press}
136 | }
137 | 
138 | @article{raffel2020,
139 |   title={Exploring the limits of transfer learning with a unified text-to-text transformer},
140 |   author={Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J},
141 |   journal={The Journal of Machine Learning Research},
142 |   volume={21},
143 |   number={1},
144 |   pages={5485--5551},
145 |   year={2020},
146 |   publisher={JMLR.org}
147 | }
148 | 
149 | @article{radford2018,
150 |   title={Improving language understanding with unsupervised learning},
151 |   author={Radford, Alec and Narasimhan, Karthik and Salimans, Tim and Sutskever, Ilya},
152 |   year={2018},
153 |   publisher={Technical report, OpenAI}
154 | }
155 | 
156 | @article{devlin2018,
157 |   title={Bert: Pre-training of deep bidirectional transformers for language understanding},
158 |   author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
159 |   journal={arXiv preprint 
arXiv:1810.04805}, 160 | year={2018} 161 | } 162 | 163 | @article{huang2019, 164 | title={Clinicalbert: Modeling clinical notes and predicting hospital readmission}, 165 | author={Huang, Kexin and Altosaar, Jaan and Ranganath, Rajesh}, 166 | journal={arXiv preprint arXiv:1904.05342}, 167 | year={2019} 168 | } 169 | 170 | @article{alsentzer2019, 171 | title={Publicly available clinical BERT embeddings}, 172 | author={Alsentzer, Emily and Murphy, John R and Boag, Willie and Weng, Wei-Hung and Jin, Di and Naumann, Tristan and McDermott, Matthew}, 173 | journal={arXiv preprint arXiv:1904.03323}, 174 | year={2019} 175 | } 176 | 177 | @inproceedings{med2vec, 178 | title={Multi-layer representation learning for medical concepts}, 179 | author={Choi, Edward and Bahadori, Mohammad Taha and Searles, Elizabeth and Coffey, Catherine and Thompson, Michael and Bost, James and Tejedor-Sojo, Javier and Sun, Jimeng}, 180 | booktitle={proceedings of the 22nd ACM SIGKDD international conference on knowledge discovery and data mining}, 181 | pages={1495--1504}, 182 | year={2016} 183 | } 184 | 185 | @article{ehr2vec, 186 | title={EHR2Vec: representation learning of medical concepts from temporal patterns of clinical notes based on self-attention mechanism}, 187 | author={Wang, Li and Wang, Qinghua and Bai, Heming and Liu, Cong and Liu, Wei and Zhang, Yuanpeng and Jiang, Lei and Xu, Huji and Wang, Kai and Zhou, Yunyun}, 188 | journal={Frontiers in Genetics}, 189 | volume={11}, 190 | pages={630}, 191 | year={2020}, 192 | publisher={Frontiers Media SA} 193 | } 194 | 195 | @inproceedings{inpatient2vec, 196 | title={Inpatient2vec: Medical representation learning for inpatients}, 197 | author={Wang, Ying and Xu, Xiao and Jin, Tao and Li, Xiang and Xie, Guotong and Wang, Jianmin}, 198 | booktitle={2019 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)}, 199 | pages={1113--1117}, 200 | year={2019}, 201 | organization={IEEE} 202 | } 203 | 204 | @Manual{rcore, 205 | title = {R: A Language and Environment for Statistical Computing}, 206 | author = {{R Core Team}}, 207 | organization = {R Foundation for Statistical Computing}, 208 | address = {Vienna, Austria}, 209 | year = {2023}, 210 | url = {https://www.R-project.org/}, 211 | } 212 | 213 | @Manual{dplyr, 214 | title = {dplyr: A Grammar of Data Manipulation}, 215 | author = {Hadley Wickham and Romain François and Lionel Henry and Kirill Müller and Davis Vaughan}, 216 | year = {2023}, 217 | note = {R package version 1.1.1}, 218 | url = {https://CRAN.R-project.org/package=dplyr}, 219 | } 220 | 221 | @Book{ggplot2, 222 | author = {Hadley Wickham}, 223 | title = {ggplot2: Elegant Graphics for Data Analysis}, 224 | publisher = {{Springer-Verlag}}, 225 | address = {New York}, 226 | year = {2016}, 227 | isbn = {978-3-319-24277-4}, 228 | url = {https://ggplot2.tidyverse.org} 229 | } 230 | 231 | @article{pubmed, 232 | title = {PubMed 2.0}, 233 | author = {White, Jacob}, 234 | journal = {Medical reference services quarterly}, 235 | volume = {39}, 236 | number = {4}, 237 | pages = {382--387}, 238 | year = {2020}, 239 | publisher = {Taylor \& Francis} 240 | } 241 | 242 | @article{pubmedcentral, 243 | title = {PubMed Central: The GenBank of the published literature}, 244 | author = {Roberts, Richard J}, 245 | journal = {Proceedings of the National Academy of Sciences}, 246 | volume = {98}, 247 | number = {2}, 248 | pages = {381--382}, 249 | year = {2001}, 250 | publisher = {National Acad Sciences} 251 | } 252 | 253 | @article{mimiciii, 254 | title = {MIMIC-III, 
a freely accessible critical care database}, 255 | author = {Johnson, Alistair EW and Pollard, Tom J and Shen, Lu and Lehman, Li-wei H and Feng, Mengling and Ghassemi, Mohammad and Moody, Benjamin and Szolovits, Peter and Anthony Celi, Leo and Mark, Roger G}, 256 | journal = {Scientific data}, 257 | volume = {3}, 258 | number = {1}, 259 | pages = {1--9}, 260 | year = {2016}, 261 | publisher = {Nature Publishing Group} 262 | } 263 | 264 | @Manual{readr, 265 | title = {readr: Read Rectangular Text Data}, 266 | author = {Hadley Wickham and Jim Hester and Jennifer Bryan}, 267 | year = {2023}, 268 | note = {R package version 2.1.4}, 269 | url = {https://CRAN.R-project.org/package=readr}, 270 | } 271 | 272 | @Manual{Rtsne, 273 | title = {{Rtsne}: T-Distributed Stochastic Neighbor Embedding using Barnes-Hut 274 | Implementation}, 275 | author = {Jesse H. Krijthe}, 276 | year = {2015}, 277 | note = {R package version 0.16}, 278 | url = {https://github.com/jkrijthe/Rtsne}, 279 | } 280 | 281 | @Manual{stringr, 282 | title = {stringr: Simple, Consistent Wrappers for Common String Operations}, 283 | author = {Hadley Wickham}, 284 | year = {2023}, 285 | note = {https://stringr.tidyverse.org, 286 | https://github.com/tidyverse/stringr}, 287 | } 288 | 289 | @article{luo2022, 290 | title={BioGPT: generative pre-trained transformer for biomedical text generation and mining}, 291 | author={Luo, Renqian and Sun, Liai and Xia, Yingce and Qin, Tao and Zhang, Sheng and Poon, Hoifung and Liu, Tie-Yan}, 292 | journal={Briefings in Bioinformatics}, 293 | volume={23}, 294 | number={6}, 295 | year={2022}, 296 | publisher={Oxford Academic} 297 | } 298 | 299 | @article{icd10, 300 | title={International classification of diseases 10th revision (ICD-10)}, 301 | author={DiSantostefano, Jan}, 302 | journal={The Journal for Nurse Practitioners}, 303 | volume={5}, 304 | number={1}, 305 | pages={56--57}, 306 | year={2009}, 307 | publisher={Elsevier} 308 | } 309 | 310 | @article{blank, 311 | author = {}, 312 | title = {}, 313 | journal = {}, 314 | year = {}, 315 | month = {}, 316 | volume = {}, 317 | number = {}, 318 | pages = {}, 319 | note = {} 320 | } 321 | 322 | % Article within a journal 323 | @article{koon, 324 | author = {Koonin, E V and Altschul, S F and P Bork}, 325 | title = {BRCA1 protein products: functional motifs}, 326 | journal = {Nat. 
Genet.}, 327 | year = {1996}, 328 | volume = {13}, 329 | pages = {266-267} 330 | } 331 | 332 | %%%%%%%% 333 | % Article within conference proceedings 334 | @inproceedings{xjon, 335 | author = {X Jones}, 336 | title = {Zeolites and synthetic mechanisms}, 337 | booktitle = {Proceedings of the First National Conference on 338 | Porous Sieves: 27-30 June 1996; Baltimore}, 339 | year = {1996}, 340 | editor = {Y Smith}, 341 | pages = {16-27}, 342 | } 343 | 344 | %%%%%%%% 345 | % Book chapter, or article within a book 346 | @incollection{schn, 347 | author = {E Schnepf}, 348 | title = {From prey via endosymbiont to plastids: 349 | comparative studies in dinoflagellates}, 350 | booktitle = {Origins of Plastids}, 351 | editor = {R A Lewin}, 352 | publisher = {Chapman and Hall}, 353 | pages = {53-76}, 354 | year = {1993}, 355 | address = {New York}, 356 | edition = {2nd} 357 | } 358 | 359 | %%%%%%%% 360 | % Complete book 361 | @book{marg, 362 | author = {L Margulis}, 363 | title = {Origin of Eukaryotic Cells}, 364 | publisher = {Yale University Press}, 365 | year = {1970}, 366 | address = {New Haven} 367 | } 368 | 369 | 370 | %%%%%%%% 371 | % PHD Thesis 372 | @phdthesis{koha, 373 | author = {R Kohavi}, 374 | title = {Wrappers for performance enhancement and 375 | obvious decision graphs}, 376 | school = {Stanford University, Computer Science Department}, 377 | year = {1995} 378 | } 379 | 380 | %%%%%%%% 381 | % Miscellaneous: webpage link/urL, etc/ 382 | @misc{issnic, 383 | author = {{ISSN International Centre}}, 384 | title = {The ISSN register}, 385 | url = {http://www.issn.org}, 386 | year = {2006}, 387 | urldate={Accessed 20 Feb 2007} 388 | } 389 | 390 | -------------------------------------------------------------------------------- /bmc-bioinformatics-paper/bmc_article.tex: -------------------------------------------------------------------------------- 1 | %% BioMed_Central_Tex_Template_v1.06 2 | %% % 3 | % bmc_article.tex ver: 1.06 % 4 | % % 5 | 6 | %%IMPORTANT: do not delete the first line of this template 7 | %%It must be present to enable the BMC Submission system to 8 | %%recognise this template!! 9 | 10 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 11 | %% %% 12 | %% LaTeX template for BioMed Central %% 13 | %% journal article submissions %% 14 | %% %% 15 | %% <8 June 2012> %% 16 | %% %% 17 | %% %% 18 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 19 | 20 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 21 | %% %% 22 | %% For instructions on how to fill out this Tex template %% 23 | %% document please refer to Readme.html and the instructions for %% 24 | %% authors page on the biomed central website %% 25 | %% https://www.biomedcentral.com/getpublished %% 26 | %% %% 27 | %% Please do not use \input{...} to include other tex files. %% 28 | %% Submit your LaTeX manuscript as one .tex document. %% 29 | %% %% 30 | %% All additional figures and files should be attached %% 31 | %% separately and not embedded in the \TeX\ document itself. %% 32 | %% %% 33 | %% BioMed Central currently use the MikTex distribution of %% 34 | %% TeX for Windows) of TeX and LaTeX. 
This is available from %% 35 | %% https://miktex.org/ %% 36 | %% %% 37 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 38 | 39 | %%% additional documentclass options: 40 | % [doublespacing] 41 | % [linenumbers] - put the line numbers on margins 42 | 43 | %%% loading packages, author definitions 44 | 45 | %\documentclass[twocolumn]{bmcart}% uncomment this for twocolumn layout and comment line below 46 | \documentclass{bmcart} 47 | 48 | %%% Load packages 49 | \usepackage{amsthm,amsmath} 50 | \usepackage{fancyvrb} 51 | \DefineVerbatimEnvironment{Code}{Verbatim}{} 52 | \DefineVerbatimEnvironment{CodeInput}{Verbatim}{fontshape=sl} 53 | \DefineVerbatimEnvironment{CodeOutput}{Verbatim}{} 54 | \newenvironment{CodeChunk}{}{} 55 | %\RequirePackage[numbers]{natbib} 56 | %\RequirePackage[authoryear]{natbib}% uncomment this for author-year bibliography 57 | %\RequirePackage{hyperref} 58 | \usepackage[utf8]{inputenc} %unicode support 59 | \usepackage{graphicx} 60 | %\usepackage[applemac]{inputenc} %applemac support if unicode package fails 61 | %\usepackage[latin1]{inputenc} %UNIX support if unicode package fails 62 | 63 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 64 | %% %% 65 | %% If you wish to display your graphics for %% 66 | %% your own use using includegraphic or %% 67 | %% includegraphics, then comment out the %% 68 | %% following two lines of code. %% 69 | %% NB: These line *must* be included when %% 70 | %% submitting to BMC. %% 71 | %% All figure files must be submitted as %% 72 | %% separate graphics through the BMC %% 73 | %% submission process, not included in the %% 74 | %% submitted article. %% 75 | %% %% 76 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 77 | 78 | %\def\includegraphic{} 79 | %\def\includegraphics{} 80 | 81 | %%% Put your definitions there: 82 | \startlocaldefs 83 | \endlocaldefs 84 | 85 | %%% Begin ... 86 | \begin{document} 87 | 88 | %%% Start of article front matter 89 | \begin{frontmatter} 90 | 91 | \begin{fmbox} 92 | \dochead{Databases} 93 | 94 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 95 | %% %% 96 | %% Enter the title of your article here %% 97 | %% %% 98 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 99 | 100 | \title{A Compressed Large Language Model Embedding Dataset of ICD 10 CM 101 | Descriptions} 102 | 103 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 104 | %% %% 105 | %% Enter the authors here %% 106 | %% %% 107 | %% Specify information, if available, %% 108 | %% in the form: %% 109 | %% ={,} %% 110 | %% = %% 111 | %% Comment or delete the keys which are %% 112 | %% not used. Repeat \author command as much %% 113 | %% as required. %% 114 | %% %% 115 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 116 | 117 | \author[ 118 | addressref={aff1}, % id's of addresses, e.g. 
{aff1,aff2} 119 | corref={aff1}, % id of corresponding address, if any 120 | % noteref={n1}, % id's of article notes, if any 121 | email={michael.kane@yale.edu} % email address 122 | ]{\inits{M.J.}\fnm{Michael J.} \snm{Kane}} 123 | \author[ 124 | addressref={aff2,aff3}, 125 | email={casey.king@yale.edu} 126 | ]{\inits{C.}\fnm{Casey} \snm{King}} 127 | \author[ 128 | addressref={aff1}, 129 | email={denise.esserman@yale.edu} 130 | ]{\inits{D.}\fnm{Denise} \snm{Esserman}} 131 | \author[ 132 | addressref={aff4}, 133 | email={nklatham@bwh.harvard.edu} 134 | ]{\inits{N.K.}\fnm{Nancy K.} \snm{Latham}} 135 | \author[ 136 | addressref={aff1}, 137 | email={erich.greene@yale.edu} 138 | ]{\inits{E.}\fnm{Erich J.} \snm{Greene}} 139 | \author[ 140 | addressref={aff5}, 141 | email={dganz@mednet.ucla.edu} 142 | ]{\inits{D.A.}\fnm{David A.} \snm{Ganz}} 143 | 144 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 145 | %% %% 146 | %% Enter the authors' addresses here %% 147 | %% %% 148 | %% Repeat \address commands as much as %% 149 | %% required. %% 150 | %% %% 151 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 152 | 153 | \address[id=aff1]{% % unique id 154 | \orgdiv{Department of Biostatistics}, % department, if any 155 | \orgname{School of Public Health}, % university, etc 156 | \street{Yale University}, 157 | \city{New Haven}, % city 158 | \cny{USA} % country 159 | } 160 | \address[id=aff2]{% 161 | \orgdiv{The Jackson School of Global Affairs} 162 | \orgname{Yale University}, 163 | %\street{}, 164 | %\postcode{} 165 | \city{New Haven}, 166 | \cny{USA} 167 | } 168 | \address[id=aff3]{% 169 | \orgdiv{US Healthcare and Life Sciences} 170 | \orgname{Microsoft}, 171 | %\street{}, 172 | %\postcode{} 173 | \city{Redmond}, 174 | \cny{USA} 175 | } 176 | \address[id=aff4]{% 177 | \orgdiv{Research Program in Men’s Health: Aging and Metabolism, Boston Claude D. Pepper Older Americans Independence Center for Function Promoting Therapies} 178 | \orgname{Brigham and Women’s Hospital}, 179 | %\street{}, 180 | %\postcode{} 181 | \city{Boston}, 182 | \cny{USA} 183 | } 184 | \address[id=aff5]{% 185 | \orgdiv{Department of Medicine} 186 | \orgname{VA Greater Los Angeles/UCLA}, 187 | %\street{}, 188 | %\postcode{} 189 | \city{Los Angeles}, 190 | \cny{USA} 191 | } 192 | 193 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 194 | %% %% 195 | %% Enter short notes here %% 196 | %% %% 197 | %% Short notes will be after addresses %% 198 | %% on first page. %% 199 | %% %% 200 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 201 | 202 | %\begin{artnotes} 203 | %%\note{Sample of title note} % note to the article 204 | %\note[id=n1]{Equal contributor} % note, connected to author 205 | %\end{artnotes} 206 | 207 | \end{fmbox}% comment this for two column layout 208 | 209 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 210 | %% %% 211 | %% The Abstract begins here %% 212 | %% %% 213 | %% Please refer to the Instructions for %% 214 | %% authors on https://www.biomedcentral.com/ %% 215 | %% and include the section headings %% 216 | %% accordingly for your article type. %% 217 | %% %% 218 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 219 | 220 | \begin{abstractbox} 221 | 222 | \begin{abstract} % abstract 223 | 224 | This paper presents novel datasets providing numerical representations of 225 | ICD-10-CM codes by generating description embeddings using a large language 226 | model followed by a dimension reduction via autoencoder. 
The embeddings
227 | serve as informative input features for machine learning models by capturing
228 | relationships among categories and preserving inherent context information.
229 | The model
230 | generating the data was validated in two ways. First, the dimension
231 | reduction was validated using an autoencoder, and second, a supervised model
232 | was created to estimate the ICD-10-CM hierarchical categories. Results show
233 | that the dimension of the data can be reduced to as few as 10 dimensions
234 | while maintaining the ability to reproduce the original embeddings, with
235 | fidelity decreasing as the dimension of the reduced representation decreases.
236 | Multiple compression levels are provided, allowing users to choose the level
237 | that suits their requirements and to download and use the data without any further setup.
238 | The readily available datasets of ICD-10-CM codes are
239 | anticipated to be highly valuable for researchers in biomedical informatics,
240 | enabling more advanced analyses in the field. This approach has the potential
241 | to significantly improve the utility of ICD-10-CM codes in the biomedical
242 | domain.
243 | 
244 | %\parttitle{First part title} %if any
245 | %Text for this section.
246 | 
247 | %\parttitle{Second part title} %if any
248 | %Text for this section.
249 | \end{abstract}
250 | 
251 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
252 | %%                                          %%
253 | %% The keywords begin here                  %%
254 | %%                                          %%
255 | %% Put each keyword in separate \kwd{}.     %%
256 | %%                                          %%
257 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
258 | 
259 | \begin{keyword}
260 | \kwd{large language model}
261 | \kwd{autoencoder}
262 | \kwd{ICD-10-CM}
263 | \kwd{electronic health records}
264 | \kwd{EHR}
265 | \kwd{NLP}
266 | \end{keyword}
267 | 
268 | % MSC classifications codes, if any
269 | %\begin{keyword}[class=AMS]
270 | %\kwd[Primary ]{}
271 | %\kwd{}
272 | %\kwd[; secondary ]{}
273 | %\end{keyword}
274 | 
275 | \end{abstractbox}
276 | %
277 | %\end{fmbox}% uncomment this for two column layout
278 | 
279 | \end{frontmatter}
280 | 
281 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
282 | %%                                            %%
283 | %% The Main Body begins here                  %%
284 | %%                                            %%
285 | %% Please refer to the instructions for       %%
286 | %% authors on:                                %%
287 | %% https://www.biomedcentral.com/getpublished %%
288 | %% and include the section headings           %%
289 | %% accordingly for your article type.         %%
290 | %%                                            %%
291 | %% See the Results and Discussion section     %%
292 | %% for details on how to create sub-sections  %%
293 | %%                                            %%
294 | %% use \cite{...} to cite references          %%
295 | %%  \cite{koon} and                           %%
296 | %%  \cite{oreg,khar,zvai,xjon,schn,pond}      %%
297 | %%                                            %%
298 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
299 | 
300 | %%%%%%%%%%%%%%%%%%%%%%%%% start of article main body
301 | %
302 | 
303 | %%%%%%%%%%%%%%%%
304 | %% Background %%
305 | %%
306 | 
307 | \section*{Background}
308 | 
309 | The International Classification of Diseases, 10th Revision,
310 | Clinical Modification (ICD-10-CM) \cite{icd10} is a
311 | standardized classification system
312 | for categorizing diseases, disorders, and health conditions. ICD-10 was
313 | developed by the World Health Organization (WHO) and adapted for use in the
314 | United States as ICD-10-CM by the National Center for Health
315 | Statistics (NCHS) \cite{icd10cm}. 
The standard plays a crucial role in 316 | the analysis of 317 | electronic medical records (EMRs) or electronic health records (EHRs) for 318 | several reasons: 319 | \begin{enumerate} 320 | \item{Consistency and Standardization: The ICD-10-CM allows for a consistent 321 | and standardized method of coding and documenting medical conditions across 322 | healthcare providers and facilities. This helps to ensure accurate and 323 | uniform data exchange, analysis, and comparison.} 324 | \item{Data Analysis and Research: The ICD-10-CM codes can be used to analyze 325 | patient data for clinical research, epidemiological studies, and public health 326 | surveillance. It helps to identify trends and patterns in diseases, monitor 327 | the effectiveness of treatments, and develop better prevention and management 328 | strategies.} 329 | \item{Quality Measurement and Improvement: ICD-10-CM codes can be used to 330 | evaluate the quality of care provided by healthcare facilities, monitor 331 | patient outcomes and identify areas for improvement. This information can 332 | be used to enhance the overall healthcare delivery system.} 333 | \item{Reimbursement and Billing: ICD-10-CM codes play a vital role in 334 | healthcare reimbursement by providing a standardized method to classify and 335 | report medical conditions. Insurance companies and other payers use these 336 | codes to determine appropriate payments for medical services rendered.} 337 | \item{Health Policy and Planning: ICD-10-CM codes help health authorities and 338 | policymakers to identify population health needs, allocate resources, and 339 | develop targeted healthcare policies and interventions.} 340 | \end{enumerate} 341 | 342 | While ICD-10-CM codes do provide a consistent and comprehensive set of 343 | categories, their incorporation into statistical and machine learning analyses 344 | can be challenging for several reasons. First, in the 2019 version of the 345 | standard, there were 71,932 categories, increasing to 72,184 categories in 346 | 2020; 72,616 categories in 2021; and 72,750 categories in 2022. As a result, 347 | analyses using these codes, where the set of codes is not restricted to a smaller 348 | set, 349 | must take into account their high dimensionality or will require a large 350 | number of training samples in order to fit consistent models. Second, 351 | categorical variables are usually incorporated into analyses with a contrast 352 | encoding such as treatment, helmert, etc. Contrast numeric 353 | representations are orthogonal or, under appropriate statistical assumptions, 354 | independent with respect to their categories. However, ICD-10-CM codes 355 | represent a hierarchical structure, 356 | where codes are organized into chapters, blocks, and categories based on the 357 | type and anatomical location of the diseases or conditions. Applying 358 | traditional contrast encoding methods may 359 | not fully capture this hierarchical information, potentially resulting in a 360 | loss of valuable context and relationships between codes. 361 | 362 | Researchers have considered alternative encoding methods or feature extraction 363 | techniques that can better represent the hierarchical structure of ICD-10-CM 364 | codes. However, incorporating both hierarchical structure and other contextual 365 | information in a general way can be difficult. 
The previous generation of word
366 | embeddings, which provide vector encodings of words, was shown to be effective for
367 | these types of tasks, with models like \texttt{med2vec} \cite{med2vec}
368 | providing improved abilities to predict patient mortality;
369 | \texttt{inpatient2vec} \cite{inpatient2vec} to predict clinical events;
370 | \texttt{EHR2Vec} \cite{ehr2vec} to help analyze sequences of patient
371 | visits; and \texttt{cui2vec} \cite{cui2vec} to learn medical concepts based on multimodal
372 | clinical data. These models have been foundational in advancing the capabilities of machine learning models in understanding and generating human language.
373 | These models are shallow, two-layer neural networks that are trained to reconstruct linguistic contexts of words. Word embeddings produced by Word2Vec \cite{church2017} and
374 | the previously mentioned variants provide vector representations of words in a continuous vector space where semantically similar words are mapped to nearby points.
375 | Within this class of models there are two main training algorithms: the Continuous Bag of Words and Skip-Gram models \cite{mikolov2013}. The former predicts target words
376 | (e.g., 'apple') from source context words ('the fruit'). The latter performs
377 | the inverse and predicts source context words from the target words; it
378 | tends to perform better on larger datasets and produces higher-quality embeddings for less frequent words.
379 | 
380 | Despite their advantages, word embeddings also have certain
381 | limitations. First, word embeddings are typically generated at the word or
382 | code level, and while word embeddings can capture semantic similarities, they
383 | often struggle to capture hierarchical structure like that found in
384 | ICD-10-CM codes. Second, traditional word embeddings generate a single vector
385 | for each word regardless of context. This means that
386 | the same code can have different meanings depending on where and when it is
387 | used, something these models do not capture. Third, word embeddings
388 | can have difficulty handling rare codes. Word embeddings typically require
389 | a sufficient number of training samples to learn meaningful representations.
390 | For rarely used ICD-10-CM codes, the learned embedding might not be reliable.
391 | Fourth, traditional word embeddings provide static representations and do not change over time. However, in healthcare, the meaning and usage of certain codes can evolve, and these models cannot capture dynamic changes.
392 | Finally, the quality and representativeness of
393 | the word embeddings depend on the training data used to generate them. If the
394 | training data does not adequately cover the entire spectrum of medical
395 | conditions or encounters, the embeddings may not capture all relevant
396 | relationships or information.
397 | 
398 | The Transformer model \cite{vaswani2017} is a more recent architecture primarily designed for handling sequences, and it has become the foundation for many recent models in natural language processing, including the Bidirectional Encoder
399 | Representations from Transformers (BERT) \cite{devlin2018}, the Generative Pre-Trained Transformer (GPT) \cite{radford2018}, and the Text-to-Text-Transfer-Transformer T5 \cite{raffel2020}. The Transformer model's main innovation is its self-attention mechanism, which weighs input elements dynamically based on their content and relationships. 
This allows the model to focus on different parts of the input for different tasks or even different parts of the same task.
400 | 
401 | These models fall under the category of large language models (LLMs) and
402 | address some of the shortcomings of traditional
403 | word embeddings through a combination of advanced techniques and
404 | architectures. Unlike traditional word embeddings that generate static
405 | representations, LLMs generate contextualized embeddings.
406 | These embeddings take into account the surrounding words or tokens, allowing
407 | for a more nuanced representation of words and codes in different contexts.
408 | This helps in capturing the semantic relationships between codes more
409 | effectively. These models are pre-trained on vast amounts of text data,
410 | allowing them to learn general language representations before being
411 | fine-tuned for specific tasks. This pre-training enables the models to
412 | leverage existing knowledge and adapt more effectively to new tasks, even
413 | with limited task-specific data. LLMs can be incrementally
414 | updated or fine-tuned with new data, allowing them to adapt to evolving
415 | medical knowledge and practices more effectively than static word embeddings.
416 | And, while not explicitly designed for hierarchical data like ICD-10-CM codes,
417 | LLMs can implicitly capture aspects of structured hierarchical relationships through
418 | their deep architectures and the context in which codes appear. This can help
419 | capture different levels of granularity and relationships between codes more
420 | effectively than traditional word embeddings.
421 | 
422 | Vector embeddings attempt to optimize the conditional probability of observing
423 | the actual output word given an input word (or vice versa, depending on the
424 | variant used). For instance, in the skip-gram variant, given a word
425 | $w_i$ and a context word $w_j$, the model is trained to maximize the
426 | following
427 | \begin{equation*}
428 | P(w_j \mid w_i) = \frac{e^{{v'_{w_j}}^T v_{w_i}}}{\sum_k e^{{v'_{w_k}}^T v_{w_i}}}
429 | \end{equation*}
430 | where $v_w$ and $v'_w$ represent the ``input'' and ``output'' vector
431 | representations of a word $w$, and the summation in the denominator is over
432 | all words in the vocabulary. The vectors $v_w$ and $v'_w$ are the word
433 | embeddings learned by the model.
434 | 
435 | LLMs also start by converting each word into an
436 | initial word embedding using an embedding matrix. However, these initial
437 | embeddings are then updated based on the context of the word. This is done by
438 | passing the embeddings through several layers of a transformer model, which
439 | uses self-attention mechanisms. The output of the transformer is a contextual
440 | embedding for each word. Mathematically, the self-attention mechanism can be
441 | represented as
442 | \begin{equation*}
443 | \text{Attention}(Q, K, V) = \text{softmax}\left(QK^T/\sqrt{d}\right) V
444 | \end{equation*}
445 | where $Q$, $K$, and $V$ represent the query, key, and value matrices, which
446 | are derived from the input embeddings. The softmax function ensures that the
447 | weights of different words sum to 1, and the $\sqrt{d}$ in the denominator is
448 | a scaling factor that improves the stability of the gradients during training.
449 | The resulting matrix product is a weighted sum of the value vectors, where the
450 | weights depend on the similarity between the query and key vectors. 
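As a concrete illustration of this computation, the following is a minimal
sketch of scaled dot-product attention written in R on toy matrices. The names
\texttt{Q}, \texttt{K}, \texttt{V}, and \texttt{d} follow the equation above;
the inputs are arbitrary illustrative values, not BioGPT internals.

\begin{CodeChunk}
\begin{CodeInput}
# Softmax applied to each row of a matrix; each row sums to 1.
softmax_rows = function(m) exp(m) / rowSums(exp(m))

d = 4                               # embedding dimension
Q = matrix(rnorm(3 * d), nrow = 3)  # 3 query vectors
K = matrix(rnorm(3 * d), nrow = 3)  # 3 key vectors
V = matrix(rnorm(3 * d), nrow = 3)  # 3 value vectors

A = softmax_rows(Q %*% t(K) / sqrt(d))  # attention weights
H = A %*% V  # each output row is a weighted sum of the value vectors
\end{CodeInput}
\end{CodeChunk}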
451 | 
452 | To generate an embedding for a sentence or description, one common approach is to take the average of the contextual embeddings of the words in the sentence:
453 | \begin{equation*}
454 | E(D) = \frac{1}{n} \sum_{i=1}^{n} E(w_i)
455 | \end{equation*}
456 | Here, $E(D)$ is the embedding for the description, $E(w_i)$ is the contextual
457 | embedding for word $w_i$, and the sum is over all $n$ words in the description.
458 | 
459 | The key difference between the two methods is that vector embeddings generate
460 | a single, static embedding for each word, while LLMs generate a dynamic,
461 | context-dependent embedding. This allows an LLM to capture nuances in meaning
462 | that cannot be represented with static embeddings.
463 | 
464 | There are several
465 | BERT or similar transformer-based biomedical models that can be used
466 | to generate embeddings for medical corpora, including ClinicalBERT
467 | \cite{huang2019,alsentzer2019}, BioBERT \cite{lee2020}, and
468 | Med-BERT \cite{rasmy2021}, but to our knowledge none of the current
469 | literature includes the
470 | applications of these models specifically for the purpose of generating
471 | embeddings for ICD-10-CM codes that can be consumed as readily available
472 | data sets. These data sets represent a valuable resource for practitioners
473 | who are interested in an information-rich representation of those codes,
474 | without needing to acquire models, embed data, and process them.
475 | 
476 | %LLM generated embeddings address many of these limitations. They take into
477 | %account the surrounding words or tokens, allowing for a more nuanced
478 | %representation of words and codes in different contexts. This helps in
479 | %capturing the semantic relationships between codes more effectively. These
480 | %models are pre-trained on vast amounts of text data, allowing them to learn
481 | %general language representations before being fine-tuned for specific tasks.
482 | %This pre-training enables the models to leverage existing knowledge and
483 | %adapt more effectively to new tasks, even with limited task-specific data.
484 | %LLMs can be incrementally updated or fine-tuned with new data, allowing them
485 | %to adapt to evolving medical knowledge and practices more effectively than
486 | %static word embeddings. And, while not explicitly designed for hierarchical
487 | %data like ICD-10-CM codes, LLMs can implicitly learn hierarchical
488 | %relationships through their deep architectures and the context in which codes
489 | %appear. This can help capture different levels of granularity and relationships between codes more %effectively than traditional word embeddings.
490 | 
491 | This paper describes data sets provided as \texttt{.csv} files, which are
492 | available online in the form of a crosswalk from ICD-10-CM codes
493 | to embeddings (a numeric vector of values), based on their descriptions. A sample
494 | of five descriptions and their embeddings is provided in the Supplementary Materials. The
495 | embeddings were generated using the BioGPT LLM \cite{luo2022}, which was trained on the biomedical literature including PubMed \cite{pubmed},
496 | PubMed Central \cite{pubmedcentral}, and clinical notes from MIMIC-III
497 | \cite{mimiciii}. This model was shown to be superior to competing models
498 | in the medical domain at encoding context and relational information. Since the dimension of the LLM embedding is relatively
499 | high (42384), we provide dimension-reduced versions in 1000, 100, 50,
500 | and 10 dimensions. 
The model generating the data was validated in two ways.
502 | The first way validates the dimension reduction. The embedding data were
503 | compressed using an autoencoder. The out-of-sample accuracy on a validation
504 | set is examined, as well as the performance of the model for other versions
505 | (by year) of the ICD-10-CM specification. Our results show that we can reduce
506 | the dimension of the data down to as few as 10 dimensions while maintaining
507 | the ability to reproduce the original embeddings, with fidelity decreasing
508 | as the dimension of the reduced representation decreases. The second way validates
509 | the conceptual representation by creating a supervised model to estimate the
510 | ICD-10-CM hierarchical categories. Again, we see that as the dimension of the
511 | compressed representation decreases, the model accuracy decreases. Since
512 | multiple compression levels are provided, users are free to choose whichever
513 | suits their needs, allowing them to trade off accuracy for dimensionality.
514 | 
515 | The paper proceeds as follows. The next section provides a high-level
516 | description of BioGPT and the embedding along with the construction of
517 | the autoencoder used to reduce the dimension of the embedding representation.
518 | That section then provides validation for both the dimension reduction as well
519 | as the representation. The third section provides an example of how to use the
520 | dataset to cluster ICD-10-CM codes using the R programming environment
521 | \cite{rcore}. The final section provides a broader look at the
522 | incorporation of LLM approaches to these types of data.
523 | 
524 | The data sets and code to generate them are available in a public
525 | repository on GitHub
526 | \footnote{https://github.com/kaneplusplus/icd-10-cm-embedding}.
527 | The data are licensed under the Creative Commons Attribution NonCommercial
528 | ShareAlike 4.0 International License
529 | \footnote{https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode}.
530 | The code is licensed under GPL-v2
531 | \footnote{https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html}.
532 | 
533 | \section*{Construction and content}
534 | 
535 | The provided data are generated by embedding ICD-10-CM descriptions using the
536 | BioGPT-Large model, which comprises 1.5 billion parameters and is accessible
537 | via the Hugging Face model repository,\footnote{https://huggingface.co} and
538 | then performing a dimension reduction using an autoencoder.
539 | The embedding process involves tokenizing
540 | textual phrases into tokens (words, subwords, or characters) and mapping them
541 | to unique vocabulary IDs. Token IDs are passed through an embedding layer,
542 | resulting in a sequence of continuous embedding vectors. Positional encodings
543 | are added elementwise to these vectors, enabling the model to capture token
544 | order and relative positions. The embeddings are then contextualized by
545 | passing them through the model's layers. An attention mask selectively
546 | controls information flow in the attention mechanism, allowing the model to
547 | weigh the importance of input tokens when generating contextualized embeddings
548 | in a 42384-dimension space.
549 | 
550 | The embedding is then compressed using an autoencoder. 
The autoencoder used
551 | here is
552 | a series of fully connected layers where the number of hidden nodes is
553 | approximately one order of magnitude smaller than the previous layer and then
554 | an order of magnitude larger until the output layer. For example, the
555 | autoencoder compressing to 10 dimensions has layers of size 42384, 1000,
556 | 100, 50, 10, 50, 100, 1000, 42384. Models compressing to a larger dimension
557 | use the same structure, retaining only the layers down to the target dimension and back up.
558 | A practitioner who would like to make use of these embeddings for their
559 | own modeling task can download
560 | these data, substituting the embedding values for the ICD-10 representation.
561 | The values are information-rich and will be useful in a variety of
562 | supervised and unsupervised tasks involving medical research.
563 | 
564 | \subsection*{Validating the dimension reduction}
565 | 
566 | The autoencoder compressing the LLM embedding was fit on the 2019 ICD-10-CM
567 | descriptions for 20 epochs, with batch sizes 64, 128, and 256. The loss was the
568 | mean-square error between the embedding and the autoencoder estimate, and a
569 | validation data set comprising a random subset of 10\% of the samples was held out. The model
570 | performance is shown in Table \ref{tab:autoencoder_perf}.
571 | Based on these results, the models with the best
572 | validation loss for each of the compressed embedding dimensions were selected
573 | for further validation and eventual distribution. In addition,
574 | benchmarking the validation loss serves two purposes. First, it establishes
575 | a relative measure of performance quantifying the compression loss and allowing
576 | us to pick the best set of model parameters to generate the embedding
577 | data. Second, the validation loss in particular quantifies how much loss
578 | is incurred by new ICD-10-CM codes, showing that the loss is comparable to,
579 | and often less than, the error in the training data.
580 | 
581 | In addition to the 2019 validation, the models selected for distribution were
582 | tested on the 2020-2022 data sets to ensure their performance is comparable
583 | over years. The results are shown in Table \ref{tab:autoencoder_year}.
584 | It should be noted that the ICD-10-CM codes do not vary much from
585 | one year to the next, so we should not expect large differences. As
586 | expected, the mean-square error and coefficients of determination are similar
587 | to the 2019 data. For a given embedding dimension it can be seen that neither
588 | the coefficient of determination nor the mean-square error changes significantly
589 | over years, indicating that the same autoencoder
590 | could likely be used in subsequent years, while incurring similar loss. This
591 | also implies that an incremental approach could be taken in subsequent
592 | years when regenerating the
593 | embeddings, where only new codes would need to be processed.
594 | 
595 | \subsection*{Validating the embedding representation}
596 | 
597 | As a final step in the validation process, we use the fact that, in addition to
598 | the description, the ICD-10-CM codes themselves carry hierarchical information,
599 | which can be used to ensure that conceptual relationships are preserved
600 | in the compressed embeddings. In particular, the leading letter and two
601 | numeric values categorize codes. For example, codes A00-B99 correspond to
602 | infectious and parasitic diseases, C00-D49 correspond to neoplasms, etc.
603 | There are a total of 22 such categories. 
The full table of categories is provided
604 | in the Supplementary Materials. We
605 | can therefore ensure that at least some of the relevant relationships are
606 | preserved in the compressed embedding representation by confirming that
607 | the categories can be estimated at a rate higher than chance using a
608 | supervised model. Furthermore, we can quantify how much relevant predictive
609 | information is lost in lower-dimensional representations.
610 | 
611 | The training data consists of a one-hot encoding of the ICD-10-CM
612 | categories as the dependent variable and the compressed embedding values as
613 | the independent variables. The model consists of two hidden layers with 100
614 | nodes each. The loss function selected was categorical cross-entropy. The
615 | model was trained for 30 epochs with a validation data set comprising 10\% of
616 | the samples, chosen at random.
617 | 
618 | To contextualize the results, we fit the same model to four
619 | BERT embeddings that
620 | have also been trained on biomedical corpora. The first, MedBERT \cite{medbert},
621 | was trained with 57.46M tokens collected from biomedical-related data sources
622 | and biomedical-related articles from Wikipedia. The second,
623 | PubMedBERT-MS-MARCO \cite{pubmedbertqa}, was first trained on PubMed abstracts
624 | and full texts and then fine-tuned using the MS-MARCO data set \cite{msmarco}
625 | to be optimized for information retrieval tasks in the medical/health text
626 | domain. The third, SapBERT-PubMedBERT, was first trained on PubMed abstracts
627 | and text, and then fine-tuned on semantic relationships between relevant
628 | medical entities using UMLS \cite{umls} biomedical ontologies. The fourth,
629 | ClinicalBERT \cite{huang2019}, was initialized from BERT. Training then followed the masked language modeling approach, in which, given a piece of text, some tokens are randomly replaced by special masking tokens (MASKs) and the model is required to predict the original tokens from the surrounding context.
630 | 
631 | The performance in terms of both the out-of-sample accuracy and
632 | the out-of-sample balanced accuracy \cite{balancedaccuracy} is shown in
633 | Table \ref{tab:sup_perf}. The goal
634 | in presenting these results is not necessarily to
635 | maximize the prediction accuracy. Rather, it is to show that the embedding
636 | retains the
637 | hierarchical information in the ICD-10-CM codes. Some of the codes correspond to
638 | conditions that could be classified in several ways, and as a result coding
639 | for at least some of the conditions might be considered non-systematic.
640 | Based on this criterion, we can conclude the embedding does retain much of the
641 | structural and conceptual information denoted in the descriptions, at least in
642 | terms of mapping to key categories of diseases and conditions.
643 | 
644 | The table provides two main results. First, the models using the compressed BioGPT
645 | representation significantly outperform the models based on the BERT
646 | embeddings, with the former outperforming the latter even
647 | after the BioGPT embedding is compressed to 10 dimensions. Second, for the BioGPT compressed
648 | embeddings, greater compression of the data corresponds to a decrease in
649 | the predictive information in the data, as measured by the accuracy. 
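For readers who wish to reproduce a classifier of this shape, the following is
a minimal sketch using the R \texttt{torch} package. The two hidden layers of
100 nodes and the 22 output categories follow the description above; the input
dimension, the activation function, and the loss construction are illustrative
assumptions rather than the exact fitted configuration.

\begin{CodeChunk}
\begin{CodeInput}
library(torch)

# Map a compressed embedding vector to logits over the 22
# ICD-10-CM categories through two hidden layers of 100 nodes.
CategoryNet = nn_module(
  initialize = function(in_dim, n_cat = 22) {
    self$fc1 = nn_linear(in_dim, 100)
    self$fc2 = nn_linear(100, 100)
    self$out = nn_linear(100, n_cat)
  },
  forward = function(x) {
    x |> self$fc1() |> nnf_relu() |>
      self$fc2() |> nnf_relu() |>
      self$out()
  }
)

model = CategoryNet(in_dim = 50)   # e.g., the 50-dimensional embeddings
loss_fn = nn_cross_entropy_loss()  # categorical cross-entropy on the logits
\end{CodeInput}
\end{CodeChunk}

Folding the softmax into the cross-entropy loss, as done here, is
mathematically equivalent to taking a softmax at the output followed by the
negative log-likelihood, and is the numerically stabler formulation.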
650 | 
651 | Since the ICD-10-CM codes are themselves hierarchical, with the category
652 | codes being the broadest level, it is worth pointing out that these results
653 | imply that some aspect of the code hierarchy is preserved in the embedding.
654 | However, the extent to which this hierarchy can be fully recovered remains an
655 | area of limited understanding. A potential avenue for future work could entail
656 | exploring the feasibility of mapping the embedding space to established
657 | ontologies, such as the UMLS.
658 | 
659 | \section*{Conclusions}
660 | 
661 | This paper presents novel datasets offering numerical representations of
662 | ICD-10-CM codes by generating description embeddings using a large language
663 | model and applying autoencoders for dimensionality reduction. The approach is
664 | versatile, capable of handling categorical variables with numerous categories
665 | across various domains. By capturing relationships among categories and
666 | preserving inherent information, the embeddings serve as informative input
667 | features for machine learning models. The readily available datasets are
668 | anticipated to be highly valuable for researchers incorporating ICD-10-CM
669 | codes into their analyses, retaining contextual information. This approach
670 | has the potential to significantly improve the utility of ICD-10-CM codes in
671 | biomedical informatics and enable more advanced analyses in the field.
672 | Data analysts can easily incorporate them into their own analyses by
673 | substituting the embedding values for other, lower-information representations,
674 | including the categorical ones described above, to derive the benefits
675 | of the conceptual information encoded in the embedding.
676 | Future work will address some of the challenges of capturing hierarchical
677 | structure in ICD-10-CM coding systems, experimenting with ontology-based
678 | methods, hierarchical clustering, hierarchical autoencoding, graph neural
679 | networks, and incorporating hierarchical information in training.
680 | 
681 | While this approach is
682 | effective, there are some challenges of which we should be aware. While not
683 | insurmountable, they are as follows:
684 | 
685 | \begin{enumerate}
686 | \item Interpretability: A significant challenge in machine learning, particularly with complex models like large language models and autoencoders, is interpretability. In healthcare, the ability to understand and explain why a model makes a particular prediction is crucial. This could impact patient trust, clinician adoption, and even legal and regulatory compliance. Techniques like LIME (Local Interpretable Model-Agnostic Explanations) or SHAP (SHapley Additive exPlanations) can be used to improve interpretability, but they do not provide perfect solutions and can be computationally expensive.
687 | \item Overfitting: Overfitting is a common issue in machine learning where a model learns the training data too well and performs poorly on unseen data. This can be particularly problematic in healthcare, where the stakes are high. Techniques such as cross-validation, regularization, or dropout layers can be used to prevent overfitting.
688 | \item Data Privacy: Patient data is highly sensitive, and its usage is strictly regulated (e.g., by laws like HIPAA in the US). Even if the data used to generate the embeddings is anonymized, the model must be carefully designed and used to avoid potential privacy leaks. 
689 | \item Generalizability: A model trained on one dataset may not perform well on another due to differences in population characteristics, data collection methods, etc. Ensuring that models generalize well across different settings is a significant challenge.
690 | \item Quality of Input Data: The quality of the embeddings depends heavily on the quality of the input data. If the descriptions associated with the ICD-10-CM codes are inaccurate or not comprehensive, the resulting embeddings may also be flawed. This is a fundamental issue in any data-driven approach: "garbage in, garbage out."
691 | \item Capturing Hierarchical Structure: The ICD-10-CM coding system has a hierarchical structure where certain codes are nested within broader categories. While embeddings generated from code descriptions may capture semantic meaning, they might not adhere to an explicit hierarchical structure imposed by an ontology like UMLS.
692 | \end{enumerate}
693 | 
694 | \section{Example Use of the ICD-10-CM Embedding Data}
695 | 
696 | To illustrate the utility of the data, we present a simple example of how one
697 | might use the embedding information in the R programming environment,
698 | making use of the \texttt{dplyr} \cite{dplyr}, \texttt{ggplot2} \cite{ggplot2},
699 | \texttt{readr} \cite{readr}, \texttt{Rtsne} \cite{Rtsne}, and
700 | \texttt{stringr} \cite{stringr} packages. Suppose
701 | we would like to
702 | visualize the ICD-10-CM codes beginning with
703 | G (diseases of the nervous system),
704 | I (diseases of the circulatory system), J (diseases of the respiratory system),
705 | and K (diseases of the digestive system) to better understand the
706 | contextual relationships
707 | between these categories or specific conditions in the 50-dimensional
708 | embedding. For convenience, the project's page includes an \texttt{.rds} file
709 | containing the available embeddings along with their URLs, which can be
710 | retrieved from the R console. The code categories can then be visualized
711 | by performing another dimension reduction (in this case using the
712 | \texttt{Rtsne} package) to 2 dimensions, which can be presented as a scatter plot.
713 | 
714 | \vspace{2mm}
715 | 
716 | \begin{CodeChunk}
717 | \begin{CodeInput}
718 | library(dplyr)
719 | library(ggplot2)
720 | library(readr)
721 | library(Rtsne)
722 | library(stringr)
723 | 
724 | # Download the locations of the embeddings.
725 | tf = tempfile()
726 | download.file(
727 |   paste0("https://github.com/kaneplusplus/",
728 |          "icd-10-cm-embedding/blob/main/",
729 |          "icd10_dl.rds?raw=true"),
730 |   tf
731 | )
732 | dl = readRDS(tf)
733 | 
734 | # Read in the 2019, 50-dimensional embeddings.
735 | tf = tempfile()
736 | download.file(
737 |   dl$url[dl$year == 2019 & dl$emb_dim == 50],
738 |   tf
739 | )
740 | 
741 | icd10s = read_csv(tf) |>
742 |   filter(str_detect(code, "^(G|I|J|K)")) |>
743 |   mutate(desc = tolower(desc)) |>
744 |   mutate(`Leading Letter` = str_sub(code, 1, 1))
745 | 
746 | # Fit tSNE to the embedding.
747 | tsne_fit = icd10s |>
748 |   select(starts_with("V")) |>
749 |   scale() |>
750 |   Rtsne(perplexity = 10)
751 | 
752 | # Bind the tSNE values to the data set.
753 | icd10p = bind_cols(
754 |   icd10s |>
755 |     select(-starts_with("V")),
756 |   tsne_fit$Y |>
757 |     as.data.frame() |>
758 |     rename(tSNE1="V1", tSNE2="V2") |>
759 |     as_tibble()
760 | )
761 | 
762 | # Visualize the results.
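# (Note: the tSNE axes themselves are arbitrary; only the relative
# distances between points are interpretable.)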
763 | ggplot(icd10p, aes(x = tSNE1, y = tSNE2, color = `Leading Letter`)) +
764 |   geom_point() +
765 |   theme_minimal()
766 | \end{CodeInput}
767 | \end{CodeChunk}
768 | 
769 | \vspace{2mm}
770 | 
771 | The output visualization is presented in Figure 1 %\ref{fig:tsne}
772 | and shows that
773 | a subset of the circulatory diseases (I) and
774 | nervous system diseases (G) are well-differentiated from other conditions. It
775 | also shows overlap between the remaining conditions related to K (digestive
776 | diseases), J (respiratory diseases), and I (circulatory diseases).
777 | 
778 | 
779 | \begin{backmatter}
780 | 
781 | %\section*{Acknowledgements}%% if any
782 | %Text for this section\ldots
783 | 
784 | \section*{Declarations}
785 | 
786 | \subsection*{Funding}%% if any
787 | 
788 | This work was supported by the National Institute on Aging of the
789 | National Institutes of Health (NIH) through a grant to Yale
790 | University (1R01AG071528). The organizations funding this study had no role
791 | in the design or conduct of the study; in the collection, management,
792 | analysis, or interpretation of the data; or in the preparation, review, or
793 | approval of the manuscript. The content of this publication is solely the
794 | responsibility of the authors and does not necessarily represent the official
795 | views of the National Institutes of Health, the Department of Veterans
796 | Affairs, or the United States government.
797 | 
798 | This work was also partially supported by the Yale Clinical and
799 | Translational Science award (UL1 TR001863) and the Yale Claude D. Pepper
800 | Center (P30AG021342).
801 | 
802 | \subsection*{Competing interests}
803 | 
804 | The authors declare that they have no competing interests.
805 | 
806 | \subsection*{Ethics approval}
807 | 
808 | Not applicable.
809 | 
810 | \subsection*{Consent to participate}
811 | 
812 | Not applicable.
813 | 
814 | \subsection*{Consent for publication}
815 | 
816 | Not applicable.
817 | 
818 | %\section*{Abbreviations}%% if any
819 | %Text for this section\ldots
820 | 
821 | \subsection*{Availability of data and materials}%% if any
822 | 
823 | All data presented here, along with documentation for
824 | reproducing the presented materials, are available at
825 | https://github.com/kaneplusplus/icd-10-cm-embedding.
826 | 
827 | \subsection*{Code availability}%% if any
828 | 
829 | All code presented here, along with documentation for
830 | reproducing the presented materials, is available at
831 | https://github.com/kaneplusplus/icd-10-cm-embedding.
832 | 
833 | \subsection*{Authors' contributions}
834 | 
835 | Kane proposed the approach, implemented it, created the datasets, and wrote
836 | the article. Ganz provided direction for the research and manually validated
837 | results. King assessed the model and provided a detailed analysis of the
838 | limitations of vector-based and BERT approaches, a discussion of LLM
839 | limitations, and feedback.
840 | Esserman, Latham, and Greene provided feedback and made suggestions
841 | throughout the entire process.
842 | 
843 | \subsection*{Acknowledgements}
844 | 
845 | Not applicable.
846 | 
847 | %\section*{Ethics approval and consent to participate}%% if any
848 | %Text for this section\ldots
849 | 
850 | %\section*{Consent for publication}%% if any
851 | %Text for this section\ldots
852 | 
853 | %\section*{Authors' contributions}
854 | %Text for this section \ldots
855 | 
856 | %\section*{Authors' information}%% if any
857 | %Text for this section\ldots
858 | 
859 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
860 | %% The Bibliography %%
861 | %% %%
862 | %% Bmc_mathpys.bst will be used to %%
863 | %% create a .BBL file for submission. %%
864 | %% After submission of the .TEX file, %%
865 | %% you will be prompted to submit your .BBL file. %%
866 | %% %%
867 | %% %%
868 | %% Note that the displayed Bibliography will not %%
869 | %% necessarily be rendered by Latex exactly as specified %%
870 | %% in the online Instructions for Authors. %%
871 | %% %%
872 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
873 | 
874 | % if your bibliography is in bibtex format, use those commands:
875 | \bibliographystyle{bmc-mathphys} % Style BST file (bmc-mathphys, vancouver, spbasic).
876 | \bibliography{bmc_article} % Bibliography file (usually '*.bib' )
877 | % for author-year bibliography (bmc-mathphys or spbasic)
878 | % a) write to bib file (bmc-mathphys only)
879 | % @settings{label, options="nameyear"}
880 | % b) uncomment next line
881 | %\nocite{label}
882 | 
883 | % or include bibliography directly:
884 | % \begin{thebibliography}
885 | % \bibitem{b1}
886 | % \end{thebibliography}
887 | 
888 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
889 | %% %%
890 | %% Figures %%
891 | %% %%
892 | %% NB: this is for captions and %%
893 | %% Titles. All graphics must be %%
894 | %% submitted separately and NOT %%
895 | %% included in the Tex document %%
896 | %% %%
897 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
898 | 
899 | %%
900 | %% Do not use \listoffigures as most will be included as separate files
901 | 
902 | \pagebreak
903 | 
904 | \section*{Figures}
905 | 
906 | \begin{figure}[ht!]
907 | \includegraphics[width=\linewidth]{tsne-plot.png}
908 | \caption{A tSNE projection of the 50-dimensional 2019 embeddings for the ICD-10-CM codes beginning with G, I, J, and K, colored by leading letter.}
909 | \label{fig:tsne}
910 | \end{figure}
911 | 
912 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
913 | %% %%
914 | %% Tables %%
915 | %% %%
916 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
917 | 
918 | \pagebreak
919 | %% Use of \listoftables is discouraged.
920 | %%
921 | \section*{Tables}
922 | 
923 | \begin{table}[ht!]
924 | \caption{The autoencoder parameters and performance ordered by increasing validation loss.}
925 | \label{tab:autoencoder_perf}
926 | \begin{tabular}{|r|r|r|r|}
927 | \hline
928 | Embedding Dimension & Batch Size & Training Loss & Validation Loss\\
929 | \hline
930 | 100 & 64 & 0.534 & 0.339\\
931 | \hline
932 | 100 & 128 & 0.487 & 0.381\\
933 | \hline
934 | 50 & 256 & 0.403 & 0.392\\
935 | \hline
936 | 1000 & 64 & 0.542 & 0.402\\
937 | \hline
938 | 100 & 256 & 0.556 & 0.444\\
939 | \hline
940 | 1000 & 128 & 1.073 & 0.486\\
941 | \hline
942 | 10 & 256 & 0.599 & 0.594\\
943 | \hline
944 | 10 & 128 & 0.628 & 0.609\\
945 | \hline
946 | 10 & 64 & 0.679 & 0.641\\
947 | \hline
948 | 50 & 64 & 1.134 & 0.699\\
949 | \hline
950 | 1000 & 256 & 30.435 & 0.803\\
951 | \hline
952 | 50 & 128 & 1.053 & 0.894\\
953 | \hline
954 | \end{tabular}
955 | \end{table}
956 | 
957 | 
958 | \begin{table}[ht!]
959 | \caption{The autoencoder validation performance ordered by year.}
960 | \label{tab:autoencoder_year}
961 | \begin{tabular}{|r|r|r|r|}
962 | \hline
963 | ICD-10-CM Publication Year & Embedding Dimension & Mean Squared Error & Coef. of Determination\\
964 | \hline
965 | 2019 & 10 & 0.593 & 0.086\\
966 | \hline
967 | 2019 & 50 & 0.388 & 0.056\\
968 | \hline
969 | 2019 & 100 & 0.336 & 0.049\\
970 | \hline
971 | 2019 & 1000 & 0.400 & 0.058\\
972 | \hline
973 | 2020 & 10 & 0.593 & 0.086\\
974 | \hline
975 | 2020 & 50 & 0.388 & 0.056\\
976 | \hline
977 | 2020 & 100 & 0.336 & 0.049\\
978 | \hline
979 | 2020 & 1000 & 0.400 & 0.058\\
980 | \hline
981 | 2021 & 10 & 0.594 & 0.086\\
982 | \hline
983 | 2021 & 50 & 0.389 & 0.056\\
984 | \hline
985 | 2021 & 100 & 0.337 & 0.049\\
986 | \hline
987 | 2021 & 1000 & 0.401 & 0.058\\
988 | \hline
989 | 2022 & 10 & 0.595 & 0.086\\
990 | \hline
991 | 2022 & 50 & 0.390 & 0.056\\
992 | \hline
993 | 2022 & 100 & 0.338 & 0.049\\
994 | \hline
995 | 2022 & 1000 & 0.402 & 0.058\\
996 | \hline
997 | \end{tabular}
998 | \end{table}
999 | 
1000 | \begin{table}[ht!]
1001 | \caption{The supervised models' performance ordered by decreasing balanced accuracy.}
1002 | \label{tab:sup_perf}
1003 | \begin{tabular}{|r|r|r|r|}
1004 | \hline
1005 | Model & Embedding Dimension & Accuracy & Balanced Accuracy\\
1006 | \hline
1007 | BioGPT Compressed & 1000 & 0.960 & 0.927\\
1008 | \hline
1009 | BioGPT Compressed & 100 & 0.935 & 0.891\\
1010 | \hline
1011 | BioGPT Compressed & 50 & 0.925 & 0.873\\
1012 | \hline
1013 | BioGPT Compressed & 10 & 0.815 & 0.698\\
1014 | \hline
1015 | ClinicalBERT & 768 & 0.200 & 0.634\\
1016 | \hline
1017 | PubMedBERT-MS-MARCO & 768 & 0.158 & 0.629\\
1018 | \hline
1019 | SapBERT-PubMedBERT & 768 & 0.159 & 0.616\\
1020 | \hline
1021 | MedBERT & 768 & 0.171 & 0.613\\
1022 | \hline
1023 | \end{tabular}
1024 | \end{table}
1025 | 
1026 | 
1027 | \pagebreak
1028 | %% Use of \listoftables is discouraged.
1029 | %%
1030 | 
1031 | \section*{Supplementary Materials}
1032 | 
1033 | \subsection*{Example ICD-10-CM Codes and Descriptions}
1034 | 
1035 | \begin{table}[h]
1036 | %\caption{Example ICD-10-CM codes and descriptions.}
1037 | \begin{tabular}{|l|l|}
1038 | \hline
1039 | Code & Description\\
1040 | \hline
1041 | S3559XS & Injury of other iliac blood vessels, sequela\\
1042 | \hline
1043 | M12262 & Villonodular synovitis (pigmented), left knee\\
1044 | \hline
1045 | S40011S & Contusion of right shoulder, sequela\\
1046 | \hline
1047 | K284 & Chronic or unspecified gastrojejunal ulcer with hemorrhage\\
1048 | \hline
1049 | M90632 & Osteitis deformans in neoplastic diseases, left forearm\\
1050 | \hline
1051 | \end{tabular}
1052 | \end{table}
1053 | %% Use of \listoftables is discouraged.
1054 | %%
1055 | 
1056 | \subsection*{ICD-10-CM Category Codes}
1057 | 
1058 | \begin{table}[ht!]
1059 | %\caption{ICD-10-CM Category Codes.} 1060 | \begin{tabular}{|l|l|l|} 1061 | \hline 1062 | First Code & Last Code & Code Description\\ 1063 | \hline 1064 | A00 & B99 & Certain infectious and parasitic diseases\\ 1065 | \hline 1066 | C00 & D49 & Neoplasms\\ 1067 | \hline 1068 | D50 & D89 & Diseases of the blood and blood-forming organs and certain disorders \\ 1069 | & & involving the immune mechanism\\ 1070 | \hline 1071 | E00 & E89 & Endocrine, nutritional and metabolic diseases\\ 1072 | \hline 1073 | F01 & F99 & Mental, Behavioral and Neurodevelopmental disorders\\ 1074 | \hline 1075 | G00 & G99 & Diseases of the nervous system\\ 1076 | \hline 1077 | H00 & H59 & Diseases of the eye and adnexa\\ 1078 | \hline 1079 | H60 & H95 & Diseases of the ear and mastoid process\\ 1080 | \hline 1081 | I00 & I99 & Diseases of the circulatory system\\ 1082 | \hline 1083 | J00 & J99 & Diseases of the respiratory system\\ 1084 | \hline 1085 | K00 & K95 & Diseases of the digestive system\\ 1086 | \hline 1087 | L00 & L99 & Diseases of the skin and subcutaneous tissue\\ 1088 | \hline 1089 | M00 & M99 & Diseases of the musculoskeletal system and connective tissue\\ 1090 | \hline 1091 | N00 & N99 & Diseases of the genitourinary system\\ 1092 | \hline 1093 | O00 & O9A & Pregnancy, childbirth and the puerperium\\ 1094 | \hline 1095 | P00 & P96 & Certain conditions originating in the perinatal period\\ 1096 | \hline 1097 | Q00 & Q99 & Congenital malformations, deformations and chromosomal abnormalities\\ 1098 | \hline 1099 | R00 & R99 & Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified\\ 1100 | \hline 1101 | S00 & T88 & Injury, poisoning and certain other consequences of external causes\\ 1102 | \hline 1103 | U00 & U85 & Codes for special purposes\\ 1104 | \hline 1105 | V00 & Y99 & External causes of morbidity\\ 1106 | \hline 1107 | Z00 & Z99 & Factors influencing health status and contact with health services\\ 1108 | \hline 1109 | \end{tabular} 1110 | \end{table} 1111 | 1112 | 1113 | 1114 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1115 | %% %% 1116 | %% Additional Files %% 1117 | %% %% 1118 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1119 | 1120 | %\section*{Additional Files} 1121 | % \subsection*{Additional file 1 --- Sample additional file title} 1122 | % Additional file descriptions text (including details of how to 1123 | % view the file, if it is in a non-standard format or the file extension). This might 1124 | % refer to a multi-page table or a figure. 1125 | % 1126 | % \subsection*{Additional file 2 --- Sample additional file title} 1127 | % Additional file descriptions text. 
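
\subsection*{Substituting the Embeddings for Categorical Codes}

As noted in the Conclusions, the embedding values can be substituted for a
categorical code variable in an existing analysis. The following minimal
sketch assumes the objects created in the example above (in particular the
downloaded 2019, 50-dimensional embedding table); the analysis table
\texttt{patient\_data} and its \texttt{icd10} column are hypothetical
stand-ins for an analyst's own data.

\vspace{2mm}

\begin{CodeChunk}
\begin{CodeInput}
library(dplyr)
library(readr)

# A hypothetical analysis data set with an ICD-10-CM code column.
patient_data = tibble(
  id = 1:3,
  icd10 = c("I10", "J449", "K219")
)

# Re-read the embedding table (2019, embedding dimension 50)
# retrieved in the example above.
emb = read_csv(tf)

# Substitute the categorical code with its embedding coordinates
# (columns V1 through V50), dropping the text description.
patient_features = patient_data |>
  left_join(emb, by = c("icd10" = "code")) |>
  select(-desc)
\end{CodeInput}
\end{CodeChunk}

\vspace{2mm}

The columns \texttt{V1} through \texttt{V50} of \texttt{patient\_features}
can then be supplied as model features in place of the original categorical
code variable. Codes absent from a given year's table produce \texttt{NA}
rows in the join, which should be checked before modeling.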
1128 | 
1129 | \end{backmatter}
1130 | \end{document}
1131 | 
--------------------------------------------------------------------------------
/bmc-bioinformatics-paper/bmcart-biblio.sty:
--------------------------------------------------------------------------------
1 | %%
2 | %% LaTeX 2e package for the processing of LaTeX2e files
3 | %% for the BioMed Central
4 | %% Additional commands for the processing of structured reference list
5 | %%
6 | %% Macros written by Vytas Statulevicius, VTeX, Lithuania
7 | %% for the BioMed Central
8 | %% Please submit bugs or your comments to latex-support@vtex.lt
9 | %%
10 | %% The original distribution is located at:
11 | %% http://support.e-publications.org/bmc
12 | %%
13 | %% This class file loads standard "article.cls" with appropriate
14 | %% settings and then redefines layout according to BMC style
15 | %% A lot of effort has gone into making it possible to extract
16 | %% information from the LaTeX file
17 | %%
18 | %% You are free to use this style class as you see fit, provided
19 | %% that you do not make changes to the file.
20 | %% If you DO make changes, you are required to rename this file.
21 | %%
22 | %% It may be distributed under the terms of the LaTeX Project Public
23 | %% License, as described in lppl.txt in the base LaTeX distribution.
24 | %% Either version 1.0 or, at your option, any later version.
25 | %%
26 | %% \CharacterTable
27 | %% {Upper-case \A\B\C\D\E\F\G\H\I\J\K\L\M\N\O\P\Q\R\S\T\U\V\W\X\Y\Z
28 | %% Lower-case \a\b\c\d\e\f\g\h\i\j\k\l\m\n\o\p\q\r\s\t\u\v\w\x\y\z
29 | %% Digits \0\1\2\3\4\5\6\7\8\9
30 | %% Exclamation \! Double quote \" Hash (number) \#
31 | %% Dollar \$ Percent \% Ampersand \&
32 | %% Acute accent \' Left paren \( Right paren \)
33 | %% Asterisk \* Plus \+ Comma \,
34 | %% Minus \- Point \. Solidus \/
35 | %% Colon \: Semicolon \; Less than \<
36 | %% Equals \= Greater than \> Question mark \?
37 | %% Commercial at \@ Left bracket \[ Backslash \\
38 | %% Right bracket \] Circumflex \^ Underscore \_
39 | %% Grave accent \` Left brace \{ Vertical bar \|
40 | %% Right brace \} Tilde \~}
41 | %%
42 | %%
43 | %% Bug fixes and changes:
44 | %% at end of file
45 | 
46 | 
47 | \def\bmc@common@bibl@date{2012/03/06}
48 | 
49 | \NeedsTeXFormat{LaTeX2e}
50 | \ProvidesPackage{bmcart-biblio}[\bmc@common@bibl@date
51 | additional macros for the bibliography tagging A++ XML DTD (VS)]
52 | 
53 | % Default bibliography style:
54 | \def\bibliography@style{10}
55 | \def\bibliography@style@name{BMC Reference Style}
56 | \def\bibliography@key{bmc}
57 | 
58 | % only the first option is executed (i.e., this allows "overriding" the
59 | % option via \documentclass[foo,..])
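% Example (assuming the standard \ProcessOptions* behavior of executing
% global class options before local package options): with
% \documentclass[vancouver]{bmcart} and \usepackage[bmc]{bmcart-biblio},
% the `vancouver' option is executed first, and \only@first below turns
% the later `bmc' option into a no-op, so the Vancouver reference style
% wins.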
60 | 61 | \def\only@first#1{\@ifundefined{o@f@}{\def\o@f@{}#1}{}} 62 | 63 | \DeclareOption{undef} {\only@first{\def\bibliography@style{0}\def\bibliography@key{undef} 64 | \def\bibliography@style@name{undefined}}} 65 | \DeclareOption{basic} {\only@first{\def\bibliography@style{1}\def\bibliography@key{basic} 66 | \def\bibliography@style@name{Basic Springer}}} 67 | \DeclareOption{chemistry}{\only@first{\def\bibliography@style{2}\def\bibliography@key{chemistry} 68 | \def\bibliography@style@name{Chemistry}}} 69 | \DeclareOption{mathphys} {\only@first{\def\bibliography@style{3}\def\bibliography@key{mathphys} 70 | \def\bibliography@style@name{Math and Physical Sciences}}} 71 | \DeclareOption{aps} {\only@first{\def\bibliography@style{4}\def\bibliography@key{aps} 72 | \def\bibliography@style@name{American Physical Society}}} 73 | \DeclareOption{cs} {\only@first{\def\bibliography@style{5}\def\bibliography@key{cs} 74 | \def\bibliography@style@name{Computer Science}}} 75 | \DeclareOption{vancouver}{\only@first{\def\bibliography@style{6}\def\bibliography@key{vancouver} 76 | \def\bibliography@style@name{Vancouver}}} 77 | \DeclareOption{apa} {\only@first{\def\bibliography@style{7}\def\bibliography@key{apa} 78 | \def\bibliography@style@name{APA}}} 79 | \DeclareOption{chicago} {\only@first{\def\bibliography@style{8}\def\bibliography@key{chicago} 80 | \def\bibliography@style@name{Chicago}}} 81 | \DeclareOption{ams} {\only@first{\def\bibliography@style{9}\def\bibliography@key{ams} 82 | \def\bibliography@style@name{AMS}}} 83 | \DeclareOption{bmc} {\only@first{\def\bibliography@style{10}\def\bibliography@key{bmc} 84 | \def\bibliography@style@name{BMC Reference Style}}} 85 | 86 | \ProcessOptions* 87 | 88 | 89 | %%% Common macros: 90 | \def\xml@attr{\@ifnextchar[{\get@attr}{\get@attr[]}} 91 | \def\get@attr[#1]#2{#2} 92 | 93 | \let\betal@style\relax 94 | 95 | \def\common@pub@types{% 96 | \let\binstitute\@firstofone% 97 | \def\betal{{\betal@style et al.}}% 98 | \let\byear\@firstofone% 99 | \let\bfpage\@firstofone% 100 | \let\blpage\@firstofone% 101 | \let\binterref\xml@attr% 102 | \def\burl##1{{\csname burl@style\endcsname\surl{##1}}} 103 | \let\bcomment\@firstofone% 104 | \let\oauthor\@firstofone% 105 | \csname common@pub@types@hook\endcsname% 106 | } 107 | 108 | % Setting a "style" for a command: 109 | % \set@bibl@cmd{bvolume} == \def\bvolume#1{{\bvolume@style #1}} 110 | 111 | \def\set@bibl@cmd#1{\expandafter\def\csname #1\endcsname##1{{\csname #1@style\endcsname##1}}} 112 | 113 | 114 | \AtBeginDocument{\let\doiurlchapter\doiurl} 115 | 116 | %%% BibEditorName 117 | 118 | \def\xml@beditor#1{% 119 | \let\bprefix\@firstofone% 120 | \let\binits\@firstofone% 121 | \let\bsnm\@firstofone% 122 | \let\bparticle\@firstofone% 123 | \let\bsuffix\@firstofone% 124 | \let\bdegs\@firstofone% 125 | #1} 126 | 127 | %%% BibAuthorName 128 | 129 | \def\xml@bauthor#1{% 130 | \let\bprefix\@firstofone% 131 | \let\binits\@firstofone% 132 | % \let\bsnm\@firstofone% 133 | \def\bsnm##1{{\csname bsnm@style\endcsname##1}}% 134 | \let\bparticle\@firstofone% 135 | \let\bsuffix\@firstofone% 136 | \let\bdegs\@firstofone% 137 | #1} 138 | 139 | 140 | %%% bsertitle 141 | 142 | \def\xml@bsertitle{\@ifnextchar[{\@xml@bsertitle}{\@xml@bsertitle[]}} 143 | 144 | \def\@xml@bsertitle[#1]#2{{\csname bsertitle@style\endcsname #2}}% 145 | 146 | \def\xml@batitle{\@ifnextchar[{\@xml@batitle}{\@xml@batitle[]}} 147 | \def\@xml@batitle[#1]#2{{\csname batitle@style\endcsname #2}}% 148 | 149 | \def\xml@bctitle{\@ifnextchar[{\@xml@bctitle}{\@xml@bctitle[]}} 150 | 
\def\@xml@bctitle[#1]#2{{\csname bctitle@style\endcsname #2}}% 151 | 152 | \def\xml@bbtitle{\@ifnextchar[{\@xml@bbtitle}{\@xml@bbtitle[]}} 153 | \def\@xml@bbtitle[#1]#2{{\csname bbtitle@style\endcsname #2}}% 154 | 155 | 156 | 157 | %%% BibArticle 158 | % \begin{barticle} 159 | % ... 160 | % \end{barticle} 161 | 162 | \def\barticle{% 163 | \common@pub@types% 164 | \let\bauthor\xml@bauthor% 165 | \let\batitle\xml@attr% 166 | \set@bibl@cmd{bjtitle}% 167 | \set@bibl@cmd{bvolume}% 168 | \set@bibl@cmd{bissue}% 169 | \let\bnumber\@firstofone% 170 | } 171 | \def\endbarticle{} 172 | 173 | 174 | %%% BibIssue 175 | % \begin{bissue} 176 | % ... 177 | % \end{bissue} 178 | 179 | \def\bissue{% 180 | \common@pub@types% 181 | \let\bauthor\xml@bauthor% 182 | \set@bibl@cmd{bjtitle}% 183 | \set@bibl@cmd{bvolume}% 184 | \set@bibl@cmd{bissue}% 185 | \set@bibl@cmd{bmonth}% 186 | } 187 | \def\endbissue{} 188 | 189 | 190 | %%% BibChapter 191 | % \begin{bchapter} 192 | % ... 193 | % \end{bchapter} 194 | 195 | \def\bchapter{% 196 | \common@pub@types% 197 | \let\bauthor\xml@bauthor% 198 | \let\bctitle\xml@attr% 199 | \let\beditor\xml@beditor% 200 | \let\binstitutionaled\@firstofone% 201 | \def\beds{eds.}% 202 | \let\bbtitle\xml@bbtitle 203 | \let\bedition\xml@attr% 204 | \set@bibl@cmd{bconfname}% 205 | \set@bibl@cmd{bconflocation}% 206 | \set@bibl@cmd{bconfdate}% 207 | \let\bsertitle\xml@bsertitle% 208 | \set@bibl@cmd{bseriesno}% 209 | \let\bpublisher\@firstofone% 210 | \let\blocation\@firstofone% 211 | \let\bisbn\@firstofone% 212 | } 213 | 214 | \def\endbchapter{} 215 | 216 | %%% BibUnstructured 217 | % \begin{botherref} 218 | % ... 219 | % \end{botherref} 220 | 221 | \def\botherref{\let\binterref\xml@attr\let\oauthor\xml@bauthor\let\bauthor\xml@bauthor} 222 | \def\endbotherref{} 223 | 224 | 225 | %%% BibBook 226 | % \begin{bbook} 227 | % ... 228 | % \end{bbook} 229 | 230 | \def\bbook{% 231 | \common@pub@types% 232 | \let\bauthor\xml@bauthor% 233 | \let\beditor\xml@beditor% 234 | \let\binstitutionaled\@firstofone% 235 | \def\beds{eds.}% 236 | \let\bbtitle\xml@bbtitle 237 | \let\bedition\xml@attr% 238 | \let\bconfname\@firstofone% 239 | \let\bconflocation\@firstofone% 240 | \let\bconfdate\@firstofone% 241 | \let\bsertitle\xml@bsertitle% 242 | \set@bibl@cmd{bseriesno}% 243 | \let\bpublisher\@firstofone% 244 | \let\blocation\@firstofone% 245 | \let\bisbn\@firstofone% 246 | }% 247 | 248 | 249 | \def\endbbook{} 250 | 251 | %%% 252 | \let\endbibitem\relax 253 | 254 | %%%%%% vancouver puslapi nr. 
formavimas (i.e., formatting of Vancouver-style page numbers)
255 | %% 17-19 -> 17-9
256 | %% 17-21 -> 17-21
257 | %% 1234-1345 -> 345
258 | 
259 | \def\vcr@bfpage#1{%
260 | #1\nobreak%
261 | \test@if@number{#1}%
262 | \let\blpage\@firstofone%
263 | \if@page@isnumber\ifnum#1>9\def\@vcr@bfpage{#1}\let\blpage\fmt@blpage\fi\fi}
264 | 
265 | 
266 | \def\fmt@blpage#1{%
267 | \test@if@number{#1}%
268 | \if@page@isnumber%
269 | \ifnum\@vcr@bfpage<#1%
270 | \vcr@blpage{#1}%
271 | \else%
272 | #1\@latex@error{sprbibl: [\cur@bibitem] \string\bfpage\space > \string\blpage!}{??}%
273 | \fi%
274 | \else%
275 | #1%
276 | \fi}
277 | 
278 | % algorithm
279 | % a) divide bfpage and blpage by 10
280 | % b) if bfpage=blpage goto d
281 | % c) if bfpage<10 end else goto a
282 | % d) x:= blpage - int(blpage/10^i)*10^i
283 | 
284 | \def\vcr@blpage#1{%
285 | \@tempcnta=\@vcr@bfpage%
286 | \@tempcntb=#1%
287 | \@tempcntc=1%
288 | \def\control@cnt{1}%
289 | \@whilenum\control@cnt>0\do{%
290 | \divide\@tempcnta by10%
291 | \divide\@tempcntb by10%
292 | \multiply\@tempcntc by10%
293 | \ifnum\@tempcnta=\@tempcntb\def\control@cnt{0}\@tempswatrue%
294 | \else%
295 | \ifnum\@tempcnta<10\relax\def\control@cnt{0}\@tempswafalse\fi%
296 | \fi}%
297 | \if@tempswa%
298 | \@tempcnta=#1%
299 | \@tempcntb=#1%
300 | \divide\@tempcntb by\@tempcntc%
301 | \multiply\@tempcntb by\@tempcntc%
302 | \advance\@tempcnta by-\@tempcntb%
303 | \the\@tempcnta%
304 | \else%
305 | #1%
306 | \fi}%
307 | 
308 | \newif\if@page@isnumber
309 | 
310 | \def\test@if@number#1{%
311 | \@page@isnumberfalse%
312 | \setbox\@tempboxa=\hbox{\@tempcnta=0#1\relax\ignorespaces}%
313 | \ifdim\wd\@tempboxa>\z@\else\@page@isnumbertrue\fi}
314 | 
315 | %%%%%% Set bibliography styles:
316 | 
317 | \ifcase\bibliography@style\relax
318 | \message{^^J Reference style: \bibliography@style@name^^J}%
319 | \or % basic
320 | \message{^^J Reference style: \bibliography@style@name^^J}%
321 | \def\common@pub@types@hook{\def\betal{{\betal@style et al}}}%
322 | \or % chemistry
323 | \message{^^J Reference style: \bibliography@style@name^^J}%
324 | \or % mathphys
325 | \message{^^J Reference style: \bibliography@style@name^^J}%
326 | \setattribute{bvolume}{style}{\bfseries}
327 | \or % aps
328 | \message{^^J Reference style: \bibliography@style@name^^J}%
329 | \setattribute{bvolume}{style}{\bfseries}
330 | \setattribute{bbtitle}{style}{\itshape}
331 | \or % cs
332 | \message{^^J Reference style: \bibliography@style@name^^J}%
333 | \setattribute{bvolume}{style}{\bfseries}
334 | \or % vancouver
335 | \message{^^J Reference style: \bibliography@style@name^^J}%
336 | \def\common@pub@types@hook{\let\bfpage\vcr@bfpage}
337 | \or % apa
338 | \message{^^J Reference style: \bibliography@style@name^^J}%
339 | \setattribute{bjtitle}{style}{\itshape}
340 | \setattribute{bvolume}{style}{\itshape}
341 | \setattribute{bbtitle}{style}{\itshape}
342 | \setattribute{bsertitle}{style}{\itshape}
343 | \setattribute{bseriesno}{style}{\itshape}
344 | \or % chicago
345 | \message{^^J Reference style: \bibliography@style@name^^J}%
346 | \setattribute{bbtitle} {style}{\itshape}
347 | \setattribute{bjtitle} {style}{\itshape}
348 | \setattribute{bsertitle}{style}{\itshape}
349 | \or % ams
350 | \message{^^J Reference style: \bibliography@style@name^^J}%
351 | \setattribute{batitle}{style}{\itshape}%
352 | \setattribute{bctitle}{style}{\itshape}%
353 | \g@addto@macro\barticle{\let\batitle\xml@batitle}%
354 | \setattribute{bvolume}{style}{\bfseries}%
355 | \setattribute{bseriesno}{style}{\bfseries}%
356 | 
\g@addto@macro\bbook{\setattribute{bbtitle}{style}{\itshape}}%
357 | \g@addto@macro\bchapter{\let\bctitle\xml@bctitle%
358 | \setattribute{bbtitle}{style}{\upshape}%
359 | }%
360 | \or % bmc
361 | \message{^^J Reference style: \bibliography@style@name^^J}%
362 | \setattribute{bvolume}{style}{\bfseries}%
363 | \setattribute{batitle}{style}{\bfseries\mathversion{bold}}%
364 | \g@addto@macro\barticle{\let\batitle\xml@batitle}%
365 | \setattribute{bjtitle}{style}{\itshape}%
366 | \setattribute{bbtitle}{style}{\itshape}%
367 | \setattribute{bsertitle}{style}{\itshape}
368 | \setattribute{bctitle}{style}{\bfseries\mathversion{bold}}%
369 | \g@addto@macro\bchapter{\let\bctitle\xml@bctitle}%
370 | \fi
371 | 
372 | \endinput
373 | 
--------------------------------------------------------------------------------
/bmc-bioinformatics-paper/tsne-plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kaneplusplus/icd-10-cm-embedding/a63d21c7d8f90419515bcfc2b0fce4281a6f1e62/bmc-bioinformatics-paper/tsne-plot.png
--------------------------------------------------------------------------------
/bmc-bioinformatics-paper/vancouver.bst:
--------------------------------------------------------------------------------
1 | %%
2 | %% This `vancouver.bst' bibliographic style file (for LaTeX/BibTeX) is
3 | %% generated with the docstrip utility and modified manually to meet the
4 | %% ``Uniform Requirements for Manuscripts Submitted to Biomedical Journals''
5 | %% as published in N Engl J Med 1997;336:309-315.
6 | %% (also known as the Vancouver style)
7 | %% This specification may be found on the web page of the
8 | %% International Committee of Medical Journal Editors:
9 | %%
10 | %% http://www.icmje.org
11 | %%
12 | %%-------------------------------------------------------------------
13 | %%
14 | %% Copyright 2004 Folkert van der Beek
15 | %%
16 | %% This work may be distributed and/or modified under the
17 | %% conditions of the LaTeX Project Public License, either version 1.3
18 | %% of this license or (at your option) any later version.
19 | %% The latest version of this license is in
20 | %% http://www.latex-project.org/lppl.txt
21 | %% and version 1.3 or later is part of all distributions of LaTeX
22 | %% version 2005/12/01 or later.
23 | %%
24 | %% This work has the LPPL maintenance status `maintained'.
25 | %%
26 | %% The Current Maintainer of this work is Folkert van der Beek.
27 | %%
28 | %% Complaints, suggestions and comments may be sent to
29 | %%
30 | %% Folkert van der Beek
31 | %%
32 | %%-------------------------------------------------------------------
33 | %%
34 | %% This bibliography style file is intended for texts in ENGLISH
35 | %% This is a numerical citation style, and as such is standard LaTeX.
36 | %% It requires no extra package to interface to the main text.
37 | %% The form of the \bibitem entries is
38 | %% \bibitem{key}...
39 | %% Usage of \cite is as follows:
40 | %% \cite{key} ==>> [#]
41 | %% \cite[chap. 2]{key} ==>> [#, chap. 2]
42 | %% where # is a number determined by the ordering in the reference list.
43 | %% The order in the reference list is that by which the works were originally
44 | %% cited in the text, or that in the database.
45 | % 46 | %% To change the reference numbering system from [1] to 1, 47 | %% put the following code in the preamble: 48 | %% \makeatletter % Reference list option change 49 | %% \renewcommand\@biblabel[1]{#1} % from [1] to 1 50 | %% \makeatother % 51 | %% 52 | %%--------------------------------------------------------------------- 53 | 54 | %% List of all possible fields 55 | ENTRY 56 | { address 57 | assignee % for patents 58 | author 59 | booktitle % for articles in books 60 | chapter % for incollection, esp. internet documents 61 | cartographer % for maps 62 | day 63 | edition 64 | editor 65 | howpublished 66 | institution % for technical reports 67 | inventor % for patents 68 | journal 69 | key 70 | month 71 | note 72 | number 73 | organization 74 | pages 75 | part 76 | publisher 77 | school 78 | series 79 | title 80 | type 81 | volume 82 | word 83 | year 84 | eprint % urlbst 85 | doi % urlbst 86 | url % urlbst 87 | lastchecked % urlbst 88 | updated % urlbst 89 | } 90 | {} 91 | { label } 92 | %% Declaration of integer variables 93 | INTEGERS { output.state before.all mid.sentence after.sentence after.block } 94 | STRINGS { urlintro eprinturl eprintprefix doiprefix doiurl } % urlbst... 95 | INTEGERS { hrefform addeprints adddoiresolver } 96 | % Following constants may be adjusted by hand, if desired 97 | FUNCTION {init.config.constants} 98 | { 99 | "Available from: " 'urlintro := % prefix before URL 100 | "http://arxiv.org/abs/" 'eprinturl := % prefix to make URL from eprint ref 101 | "arXiv:" 'eprintprefix := % text prefix printed before eprint ref 102 | "http://dx.doi.org/" 'doiurl := % prefix to make URL from DOI 103 | "doi:" 'doiprefix := % text prefix printed before DOI ref 104 | #0 'addeprints := % 0=no eprints; 1=include eprints 105 | #0 'adddoiresolver := % 0=no DOI resolver; 1=include it 106 | #0 'hrefform := % 0=no crossrefs; 1=hypertex xrefs; 2=hyperref refs 107 | } 108 | INTEGERS { 109 | bracket.state 110 | outside.brackets 111 | open.brackets 112 | within.brackets 113 | close.brackets 114 | } 115 | % ...urlbst to here 116 | FUNCTION {init.state.consts} 117 | { #0 'outside.brackets := % urlbst 118 | #1 'open.brackets := 119 | #2 'within.brackets := 120 | #3 'close.brackets := 121 | 122 | #0 'before.all := 123 | #1 'mid.sentence := 124 | #2 'after.sentence := 125 | #3 'after.block := 126 | } 127 | %% Declaration of string variables 128 | STRINGS { s t} 129 | 130 | % urlbst 131 | FUNCTION {output.nonnull.original} 132 | { 's := 133 | output.state mid.sentence = 134 | { ". " * write$ } 135 | { output.state after.block = 136 | { add.period$ write$ 137 | newline$ 138 | "\newblock " write$ 139 | } 140 | { output.state before.all = 141 | 'write$ 142 | { add.period$ " " * write$ } 143 | if$ 144 | } 145 | if$ 146 | mid.sentence 'output.state := 147 | } 148 | if$ 149 | s 150 | } 151 | 152 | % urlbst... 153 | FUNCTION {output.nonnull} 154 | { % Save the thing we've been asked to output 155 | 's := 156 | % If the bracket-state is close.brackets, then add a close-bracket to 157 | % what is currently at the top of the stack, and set bracket.state 158 | % to outside.brackets 159 | bracket.state close.brackets = 160 | { "]" * 161 | outside.brackets 'bracket.state := 162 | } 163 | 'skip$ 164 | if$ 165 | bracket.state outside.brackets = 166 | { % We're outside all brackets -- this is the normal situation. 167 | % Write out what's currently at the top of the stack, using the 168 | % original output.nonnull function. 169 | s 170 | output.nonnull.original 171 | } 172 | { % Still in brackets. 
Add open-bracket or (continuation) comma, add the 173 | % new text (in s) to the top of the stack, and move to the close-brackets 174 | % state, ready for next time (unless inbrackets resets it). If we come 175 | % into this branch, then output.state is carefully undisturbed. 176 | bracket.state open.brackets = 177 | { " [" * } 178 | { ", " * } % bracket.state will be within.brackets 179 | if$ 180 | s * 181 | close.brackets 'bracket.state := 182 | } 183 | if$ 184 | } 185 | 186 | % Call this function just before adding something which should be presented in 187 | % brackets. bracket.state is handled specially within output.nonnull. 188 | FUNCTION {inbrackets} 189 | { bracket.state close.brackets = 190 | { within.brackets 'bracket.state := } % reset the state: not open nor closed 191 | { open.brackets 'bracket.state := } 192 | if$ 193 | } 194 | 195 | FUNCTION {format.lastchecked} 196 | { lastchecked empty$ 197 | { "" } 198 | { updated empty$ 199 | { inbrackets "cited " lastchecked * } 200 | { inbrackets "updated " updated * "; cited " * lastchecked * } 201 | if$ 202 | } 203 | if$ 204 | } 205 | % ...urlbst to here 206 | 207 | FUNCTION {output} 208 | { duplicate$ empty$ 209 | 'pop$ 210 | 'output.nonnull 211 | if$ 212 | } 213 | 214 | FUNCTION {output.check} 215 | { 't := 216 | duplicate$ empty$ 217 | { pop$ "empty " t * " in " * cite$ * warning$ } 218 | 'output.nonnull 219 | if$ 220 | } 221 | 222 | FUNCTION {fin.entry} 223 | { 224 | bracket.state close.brackets = % urlbst 225 | { "]" * } 226 | 'skip$ 227 | if$ 228 | add.period$ 229 | write$ 230 | newline$ 231 | } 232 | 233 | FUNCTION {new.block} 234 | { output.state before.all = 235 | 'skip$ 236 | { after.block 'output.state := } 237 | if$ 238 | } 239 | 240 | FUNCTION {new.sentence} 241 | { output.state after.block = 242 | 'skip$ 243 | { output.state before.all = 244 | 'skip$ 245 | { after.sentence 'output.state := } 246 | if$ 247 | } 248 | if$ 249 | } 250 | 251 | FUNCTION {add.blank} 252 | { " " * before.all 'output.state := 253 | } 254 | 255 | FUNCTION {no.blank.or.punct} 256 | { "" * before.all 'output.state := 257 | } 258 | 259 | FUNCTION {add.semicolon} 260 | { 261 | ";" * 262 | no.blank.or.punct 263 | } 264 | 265 | FUNCTION {date.block} 266 | { 267 | "." * 268 | no.blank.or.punct 269 | } 270 | 271 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 272 | % LOGICAL `NOT', `AND', AND `OR' % 273 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 274 | 275 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 276 | % Logical 'not': 277 | % If the first element on the stack is A then this function 278 | % does the following: 279 | % push { #0 } 280 | % push { #1 } 281 | % So now the first 3 elements of the stack are 282 | % { #1 } { #0 } A 283 | % The first 3 are popped and subjected to 'if': 284 | % If A > 0 then { #0 } is executed, else { #1 } is executed: 285 | % if A > 0 286 | % then 0 287 | % else 1 288 | % So consider integers as logicals, where 1 = true and 0 = false, 289 | % then this does 290 | % (if A then false else true) 291 | % which is a logical 'not'. 
292 | 293 | FUNCTION {not} 294 | { { #0 } 295 | { #1 } 296 | if$ 297 | } 298 | 299 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 300 | % Logical 'and': 301 | % If the first 2 elements on the stack are A B 302 | % then this function does the following: 303 | % push 'skip$ 304 | % push { pop$ #0 } 305 | % So now first 4 elements are 306 | % { pop$ #0 } 'skip$ A B 307 | % The first 3 are popped and subjected to 'if' (B is on top of 308 | % the stack): 309 | % If A > 0 then 'skip$ is executed, else { pop$ #0 } is executed: 310 | % if A > 0 311 | % then (B stays on top of stack) 312 | % else (B is popped and #0 is pushed) 313 | % So consider integers as logicals, where 1 = true and 0 = false, 314 | % then this does 315 | % (if A then B else false) 316 | % which is a logical 'and'. 317 | 318 | FUNCTION {and} 319 | { 'skip$ 320 | { pop$ #0 } 321 | if$ 322 | } 323 | 324 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 325 | % Logical 'or': 326 | % If the first 2 elements on the stack are A B 327 | % then this function does the following: 328 | % push { pop$ #1 } 329 | % push 'skip$ 330 | % So now first 4 elements are 331 | % 'skip$ { pop$ #1 } A B 332 | % The first 3 are popped and subjected to 'if' (B is on top of 333 | % the stack): 334 | % If A > 0 then { pop$ #1 } is executed, else 'skip$ is executed: 335 | % if A > 0 336 | % then (B is popped and #1 is pushed) 337 | % else (B stays on top of stack) 338 | % So consider integers as logicals, where 1 = true and 0 = false, 339 | % then this does 340 | % (if A then true else B) 341 | % which is a logical 'or'. 342 | 343 | FUNCTION {or} 344 | { { pop$ #1 } 345 | 'skip$ 346 | if$ 347 | } 348 | 349 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 350 | % GENERAL PURPOSE FUNCTIONS FOR FORMATTING % 351 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 352 | 353 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 354 | % issues warning if field is empty 355 | % call with 356 | % "field" field warning.if.empty 357 | % Note that the first field must be between quotes 358 | % because it is the fieldname for use in the warning message. 
359 | % 360 | 361 | FUNCTION {warning.if.empty} 362 | { empty$ 363 | { "No " swap$ * " in " * cite$ * warning$ } 364 | { pop$ } 365 | if$ 366 | } 367 | 368 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 369 | % 370 | % encloses string in pre- and postfix string 371 | % call with 372 | % prefix postfix S enclose.check 373 | % delivers empty string if S empty 374 | % 375 | FUNCTION {enclose.check} 376 | { duplicate$ empty$ 377 | { pop$ pop$ pop$ 378 | "" 379 | } 380 | { swap$ * * } 381 | if$ 382 | } 383 | 384 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 385 | % 386 | % emphasizes top of stack 387 | % call with 388 | % string" emphasize.check 389 | % 390 | 391 | FUNCTION {emphasize.check} 392 | { "\Bem{" swap$ 393 | "}" swap$ 394 | enclose.check 395 | } 396 | 397 | 398 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 399 | % 400 | % brackets top of stack 401 | % call with 402 | % "string" bracket.check 403 | % 404 | FUNCTION {bracket.check} 405 | { "[" swap$ 406 | "]" swap$ 407 | enclose.check 408 | } 409 | 410 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 411 | % 412 | % parenthesizes top of stack 413 | % call with 414 | % "string" parenthesize 415 | % 416 | FUNCTION {parenthesize.check} 417 | { "(" swap$ 418 | ")" swap$ 419 | enclose.check 420 | } 421 | 422 | STRINGS {z} 423 | 424 | FUNCTION {remove.dots} 425 | { 'z := % expects string on top of the stack, pops the string and assigns it to variable z 426 | "" % push empty string 427 | { z empty$ not } % returns 0 if variable z is empty 428 | { z #1 #1 substring$ % push the first character of variable z 429 | z #2 global.max$ substring$ 'z := % assigns the 2nd to last character of variable z to variable z 430 | duplicate$ "\" = % pushes 1 if the last character is "\", otherwise 0 431 | { * % concatenates the last 2 literals 432 | z #1 #1 substring$ % push the first character of variable z 433 | z #2 global.max$ substring$ 'z := % assigns the 2nd to last character of variable z to variable z 434 | * % concatenates the last 2 literals, i.e. every character, even a dot, following a "\" will be printed 435 | } 436 | { duplicate$ "." 
= % pushes 1 if the last character is ".", otherwise 0 437 | 'pop$ % pushes the pop$ function 438 | { * } % concatenates the last 2 literals 439 | if$ % pops the last character if it is a dot, otherwise concatenates it with the string on top of the stack 440 | } 441 | if$ 442 | } 443 | while$ 444 | } 445 | 446 | INTEGERS {l} 447 | FUNCTION{string.length} 448 | { 449 | #1 'l := 450 | { duplicate$ duplicate$ #1 l substring$ = not } 451 | { l #1 + 'l := } 452 | while$ 453 | pop$ l 454 | } 455 | 456 | STRINGS {replace find text} 457 | INTEGERS {find_length} 458 | FUNCTION {find.replace} 459 | { 460 | 'replace := 461 | 'find := 462 | 'text := 463 | find string.length 'find_length := 464 | "" 465 | { text empty$ not } 466 | { text #1 find_length substring$ find = 467 | { 468 | replace * 469 | text #1 find_length + global.max$ substring$ 'text := 470 | } 471 | { text #1 #1 substring$ * 472 | text #2 global.max$ substring$ 'text := 473 | } 474 | if$ 475 | } 476 | while$ 477 | } 478 | 479 | FUNCTION {new.block.checka} 480 | { empty$ 481 | 'skip$ 482 | 'new.block 483 | if$ 484 | } 485 | 486 | FUNCTION {new.block.checkb} 487 | { empty$ 488 | swap$ empty$ 489 | and 490 | 'skip$ 491 | 'new.block 492 | if$ 493 | } 494 | 495 | FUNCTION {new.sentence.checka} 496 | { empty$ 497 | 'skip$ 498 | 'new.sentence 499 | if$ 500 | } 501 | 502 | FUNCTION {new.sentence.checkb} 503 | { empty$ 504 | swap$ empty$ 505 | and 506 | 'skip$ 507 | 'new.sentence 508 | if$ 509 | } 510 | 511 | FUNCTION {field.or.null} 512 | { duplicate$ empty$ 513 | { pop$ "" } 514 | 'skip$ 515 | if$ 516 | } 517 | 518 | FUNCTION {emphasize} 519 | { skip$ } 520 | 521 | FUNCTION {tie.or.space.prefix} 522 | { duplicate$ text.length$ #3 < 523 | { "~" } 524 | { " " } 525 | if$ 526 | swap$ 527 | } 528 | 529 | FUNCTION {capitalize} 530 | { "u" change.case$ "t" change.case$ } 531 | 532 | FUNCTION {space.word} 533 | { " " swap$ * " " * } 534 | 535 | % Here are the language-specific definitions for explicit words. 536 | % Each function has a name bbl.xxx where xxx is the English word. 537 | % The language selected here is ENGLISH 538 | 539 | FUNCTION {bbl.and} 540 | { "and"} 541 | 542 | FUNCTION {bbl.etal} 543 | { "et~al." } 544 | 545 | FUNCTION {bbl.editors} 546 | { "editors" } 547 | 548 | FUNCTION {bbl.editor} 549 | { "editor" } 550 | 551 | FUNCTION {bbl.cartographers} 552 | { "cartographers" } 553 | 554 | FUNCTION {bbl.cartographer} 555 | { "cartographer" } 556 | 557 | FUNCTION {bbl.inventors} 558 | { "inventors" } 559 | 560 | FUNCTION {bbl.inventor} 561 | { "inventor" } 562 | 563 | FUNCTION {bbl.assignees} 564 | { "assignees" } 565 | 566 | FUNCTION {bbl.assignee} 567 | { "assignee" } 568 | 569 | FUNCTION {bbl.edby} 570 | { "edited by" } 571 | 572 | FUNCTION {bbl.edition} 573 | { "ed." } 574 | 575 | FUNCTION {bbl.volume} 576 | { "vol." } 577 | 578 | FUNCTION {bbl.of} 579 | { "of" } 580 | 581 | FUNCTION {bbl.number} 582 | { "no." } 583 | 584 | FUNCTION {bbl.nr} 585 | { "no." } 586 | 587 | FUNCTION {bbl.in} 588 | { "in" } 589 | 590 | FUNCTION {bbl.pages} 591 | { "p." } 592 | 593 | FUNCTION {bbl.page} 594 | { "p." } 595 | 596 | FUNCTION {bbl.chapter} 597 | { "chap." } 598 | 599 | FUNCTION {bbl.techrep} 600 | { "Tech. Rep." } 601 | 602 | FUNCTION {bbl.mthesis} 603 | { "Master's thesis" } 604 | 605 | FUNCTION {bbl.phdthesis} 606 | { "Ph.D. 
thesis" } 607 | 608 | FUNCTION {bbl.first} 609 | { "1st" } 610 | 611 | FUNCTION {bbl.second} 612 | { "2nd" } 613 | 614 | FUNCTION {bbl.third} 615 | { "3rd" } 616 | 617 | FUNCTION {bbl.fourth} 618 | { "4th" } 619 | 620 | FUNCTION {bbl.fifth} 621 | { "5th" } 622 | 623 | FUNCTION {bbl.st} 624 | { "st" } 625 | 626 | FUNCTION {bbl.nd} 627 | { "nd" } 628 | 629 | FUNCTION {bbl.rd} 630 | { "rd" } 631 | 632 | FUNCTION {bbl.th} 633 | { "th" } 634 | 635 | MACRO {jan} {"Jan."} 636 | 637 | MACRO {feb} {"Feb."} 638 | 639 | MACRO {mar} {"Mar."} 640 | 641 | MACRO {apr} {"Apr."} 642 | 643 | MACRO {may} {"May"} 644 | 645 | MACRO {jun} {"Jun."} 646 | 647 | MACRO {jul} {"Jul."} 648 | 649 | MACRO {aug} {"Aug."} 650 | 651 | MACRO {sep} {"Sep."} 652 | 653 | MACRO {oct} {"Oct."} 654 | 655 | MACRO {nov} {"Nov."} 656 | 657 | MACRO {dec} {"Dec."} 658 | 659 | FUNCTION {eng.ord} 660 | { duplicate$ "1" swap$ * 661 | #-2 #1 substring$ "1" = 662 | { bbl.th * } 663 | { duplicate$ #-1 #1 substring$ 664 | duplicate$ "1" = 665 | { pop$ bbl.st * } 666 | { duplicate$ "2" = 667 | { pop$ bbl.nd * } 668 | { "3" = 669 | { bbl.rd * } 670 | { bbl.th * } 671 | if$ 672 | } 673 | if$ 674 | } 675 | if$ 676 | } 677 | if$ 678 | } 679 | 680 | FUNCTION {bibinfo.check} 681 | { swap$ 682 | duplicate$ missing$ 683 | { 684 | pop$ pop$ 685 | "" 686 | } 687 | { duplicate$ empty$ 688 | { 689 | swap$ pop$ 690 | } 691 | { swap$ 692 | pop$ 693 | } 694 | if$ 695 | } 696 | if$ 697 | } 698 | 699 | FUNCTION {bibinfo.warn} 700 | { swap$ 701 | duplicate$ missing$ 702 | { 703 | swap$ "missing " swap$ * " in " * cite$ * warning$ pop$ 704 | "" 705 | } 706 | { duplicate$ empty$ 707 | { 708 | swap$ "empty " swap$ * " in " * cite$ * warning$ 709 | } 710 | { swap$ 711 | pop$ 712 | } 713 | if$ 714 | } 715 | if$ 716 | } 717 | 718 | STRINGS { bibinfo} 719 | INTEGERS { nameptr namesleft numnames } 720 | 721 | FUNCTION {format.names} 722 | { 'bibinfo := 723 | duplicate$ empty$ 'skip$ { 724 | "." ". 
" find.replace 's := 725 | "" 't := 726 | #1 'nameptr := 727 | s num.names$ 'numnames := 728 | numnames 'namesleft := 729 | { namesleft #0 > } 730 | { s nameptr 731 | "{vv~}{ll}{ f{}}{ jj}" 732 | format.name$ 733 | remove.dots 734 | bibinfo bibinfo.check 735 | 't := 736 | nameptr #1 > 737 | { 738 | nameptr #6 739 | #1 + = 740 | numnames #6 741 | > and 742 | { "others" 't := 743 | #1 'namesleft := } 744 | 'skip$ 745 | if$ 746 | namesleft #1 > 747 | { ", " * t * } 748 | { 749 | "," * 750 | s nameptr "{ll}" format.name$ duplicate$ "others" = 751 | { 't := } 752 | { pop$ } 753 | if$ 754 | t "others" = 755 | { 756 | " " * bbl.etal * 757 | } 758 | { " " * t * } 759 | if$ 760 | } 761 | if$ 762 | } 763 | 't 764 | if$ 765 | nameptr #1 + 'nameptr := 766 | namesleft #1 - 'namesleft := 767 | } 768 | while$ 769 | } if$ 770 | } 771 | 772 | FUNCTION {format.names.org} 773 | { 'bibinfo := 774 | duplicate$ empty$ 'skip$ { 775 | 's := 776 | "" 't := 777 | #1 'nameptr := 778 | s num.names$ 'numnames := 779 | numnames 'namesleft := 780 | { namesleft #0 > } 781 | { s nameptr 782 | "{ff~}{vv~}{ll}" 783 | format.name$ 784 | bibinfo bibinfo.check 785 | 't := 786 | nameptr #1 > 787 | { 788 | namesleft #1 > 789 | { "; " * t * } 790 | { 791 | ";" * 792 | s nameptr "{ll}" format.name$ duplicate$ "others" = 793 | { 't := } 794 | { pop$ } 795 | if$ 796 | t "others" = 797 | { 798 | " " * bbl.etal * 799 | } 800 | { " " * t * } 801 | if$ 802 | } 803 | if$ 804 | } 805 | 't 806 | if$ 807 | nameptr #1 + 'nameptr := 808 | namesleft #1 - 'namesleft := 809 | } 810 | while$ 811 | } if$ 812 | } 813 | 814 | FUNCTION {format.names.ed} 815 | { 816 | format.names 817 | } 818 | 819 | FUNCTION {format.authors} 820 | { 821 | author "author" format.names 822 | %%"." " " "author" find.replace format.names 823 | } 824 | 825 | FUNCTION {format.organizations} 826 | { organization "organization" format.names.org 827 | } 828 | 829 | FUNCTION {get.bbl.editor} 830 | { editor num.names$ #1 > 'bbl.editors 'bbl.editor if$ } 831 | 832 | FUNCTION {get.bbl.cartographer} 833 | { cartographer num.names$ #1 > 'bbl.cartographers 'bbl.cartographer if$ } 834 | 835 | FUNCTION {get.bbl.inventor} 836 | { inventor num.names$ #1 > 'bbl.inventors 'bbl.inventor if$ } 837 | 838 | FUNCTION {get.bbl.assignee} 839 | { assignee num.names$ #1 > 'bbl.assignees 'bbl.assignee if$ } 840 | 841 | FUNCTION {format.editors} 842 | { editor "editor" format.names duplicate$ empty$ 'skip$ 843 | { 844 | "," * 845 | " " * 846 | get.bbl.editor 847 | * 848 | } 849 | if$ 850 | } 851 | 852 | FUNCTION {format.assignees} 853 | { assignee "assignee" format.names.org duplicate$ empty$ 'skip$ 854 | { 855 | "," * 856 | " " * 857 | get.bbl.assignee 858 | * 859 | } 860 | if$ 861 | } 862 | 863 | FUNCTION {format.cartographers} 864 | { cartographer "cartographer" format.names duplicate$ empty$ 'skip$ 865 | { 866 | "," * 867 | " " * 868 | get.bbl.cartographer 869 | * 870 | } 871 | if$ 872 | } 873 | 874 | FUNCTION {format.inventors} 875 | { inventor "inventor" format.names duplicate$ empty$ 'skip$ 876 | { 877 | "," * 878 | " " * 879 | get.bbl.inventor 880 | * 881 | } 882 | if$ 883 | } 884 | 885 | FUNCTION {format.note} 886 | { 887 | note empty$ 888 | { "" } 889 | { note #1 #1 substring$ 890 | duplicate$ "{" = 891 | 'skip$ 892 | { output.state mid.sentence = 893 | { "l" } 894 | { "u" } 895 | if$ 896 | change.case$ 897 | } 898 | if$ 899 | note #2 global.max$ substring$ * "note" bibinfo.check 900 | } 901 | if$ 902 | } 903 | 904 | FUNCTION {format.title} 905 | { title 906 | %%duplicate$ empty$ 'skip$ 
907 | %% { "t" change.case$ } 908 | %%if$ 909 | "title" bibinfo.check 910 | } 911 | 912 | FUNCTION {format.type} 913 | { type empty$ 914 | 'skip$ 915 | { inbrackets type } 916 | %%{ add.blank "[" type * "]" * } 917 | if$ 918 | } 919 | 920 | FUNCTION {output.bibitem} 921 | { outside.brackets 'bracket.state := % urlbst 922 | newline$ 923 | "\bibitem{" write$ 924 | cite$ write$ 925 | "}" write$ 926 | newline$ 927 | "" 928 | before.all 'output.state := 929 | } 930 | 931 | FUNCTION {n.dashify} 932 | { 933 | 't := 934 | "" 935 | { t empty$ not } 936 | { t #1 #1 substring$ "-" = 937 | { t #1 #2 substring$ "--" = not 938 | { "--" * 939 | t #2 global.max$ substring$ 't := 940 | } 941 | { { t #1 #1 substring$ "-" = } 942 | { "-" * 943 | t #2 global.max$ substring$ 't := 944 | } 945 | while$ 946 | } 947 | if$ 948 | } 949 | { t #1 #1 substring$ * 950 | t #2 global.max$ substring$ 't := 951 | } 952 | if$ 953 | } 954 | while$ 955 | } 956 | 957 | FUNCTION {word.in} 958 | { bbl.in capitalize 959 | ":" * 960 | " " * } 961 | 962 | FUNCTION {format.journal.date} 963 | { 964 | month "month" bibinfo.check 965 | duplicate$ empty$ 966 | year "year" bibinfo.check duplicate$ empty$ 967 | { 968 | swap$ 'skip$ 969 | { "there's a month but no year in " cite$ * warning$ } 970 | if$ 971 | * 972 | } 973 | { swap$ 'skip$ 974 | { 975 | " " * swap$ 976 | } 977 | if$ 978 | * 979 | remove.dots 980 | } 981 | if$ 982 | duplicate$ empty$ 983 | 'skip$ 984 | { 985 | before.all 'output.state := 986 | after.sentence 'output.state := 987 | } 988 | if$ 989 | } 990 | 991 | FUNCTION {format.date} 992 | { 993 | no.blank.or.punct 994 | ";" 995 | duplicate$ empty$ 996 | year "year" bibinfo.check duplicate$ empty$ 997 | { swap$ 'skip$ 998 | { "there's a month but no year in " cite$ * warning$ } 999 | if$ 1000 | * 1001 | } 1002 | { swap$ 'skip$ 1003 | { 1004 | swap$ 1005 | " " * swap$ 1006 | } 1007 | if$ 1008 | * 1009 | } 1010 | if$ 1011 | } 1012 | 1013 | FUNCTION {format.btitle} 1014 | { title "title" bibinfo.check 1015 | duplicate$ empty$ 'skip$ 1016 | { 1017 | } 1018 | if$ 1019 | } 1020 | 1021 | FUNCTION {either.or.check} 1022 | { empty$ 1023 | 'pop$ 1024 | { "can't use both " swap$ * " fields in " * cite$ * warning$ } 1025 | if$ 1026 | } 1027 | 1028 | FUNCTION {format.bvolume} 1029 | { volume empty$ 1030 | { "" } 1031 | { bbl.volume volume tie.or.space.prefix 1032 | "volume" bibinfo.check * * 1033 | series "series" bibinfo.check 1034 | duplicate$ empty$ 'pop$ 1035 | { swap$ bbl.of space.word * swap$ 1036 | emphasize * } 1037 | if$ 1038 | "volume and number" number either.or.check 1039 | } 1040 | if$ 1041 | } 1042 | 1043 | FUNCTION {format.number.series} 1044 | { volume empty$ 1045 | { number empty$ 1046 | { series field.or.null } 1047 | { series empty$ 1048 | { number "number" bibinfo.check } 1049 | { output.state mid.sentence = 1050 | { bbl.number } 1051 | { bbl.number capitalize } 1052 | if$ 1053 | number tie.or.space.prefix "number" bibinfo.check * * 1054 | bbl.in space.word * 1055 | series "series" bibinfo.check * 1056 | } 1057 | if$ 1058 | } 1059 | if$ 1060 | } 1061 | { "" } 1062 | if$ 1063 | } 1064 | 1065 | FUNCTION {is.num} 1066 | { chr.to.int$ 1067 | duplicate$ "0" chr.to.int$ < not 1068 | swap$ "9" chr.to.int$ > not and 1069 | } 1070 | 1071 | FUNCTION {extract.num} 1072 | { duplicate$ 't := 1073 | "" 's := 1074 | { t empty$ not } 1075 | { t #1 #1 substring$ 1076 | t #2 global.max$ substring$ 't := 1077 | duplicate$ is.num 1078 | { s swap$ * 's := } 1079 | { pop$ "" 't := } 1080 | if$ 1081 | } 1082 | while$ 1083 | s empty$ 1084 
| 'skip$ 1085 | { pop$ s } 1086 | if$ 1087 | } 1088 | 1089 | FUNCTION {convert.edition} 1090 | { extract.num "l" change.case$ 's := 1091 | s "first" = s "1" = or 1092 | { bbl.first 't := } 1093 | { s "second" = s "2" = or 1094 | { bbl.second 't := } 1095 | { s "third" = s "3" = or 1096 | { bbl.third 't := } 1097 | { s "fourth" = s "4" = or 1098 | { bbl.fourth 't := } 1099 | { s "fifth" = s "5" = or 1100 | { bbl.fifth 't := } 1101 | { s #1 #1 substring$ is.num 1102 | { s eng.ord 't := } 1103 | { edition 't := } 1104 | if$ 1105 | } 1106 | if$ 1107 | } 1108 | if$ 1109 | } 1110 | if$ 1111 | } 1112 | if$ 1113 | } 1114 | if$ 1115 | t 1116 | } 1117 | 1118 | FUNCTION {format.edition} 1119 | { edition duplicate$ empty$ 'skip$ 1120 | { 1121 | convert.edition 1122 | output.state mid.sentence = 1123 | { "l" } 1124 | { "t" } 1125 | if$ change.case$ 1126 | "edition" bibinfo.check 1127 | " " * bbl.edition * 1128 | } 1129 | if$ 1130 | } 1131 | INTEGERS { multiresult } 1132 | FUNCTION {multi.page.check} 1133 | { 't := 1134 | #0 'multiresult := 1135 | { multiresult not 1136 | t empty$ not 1137 | and 1138 | } 1139 | { t #1 #1 substring$ 1140 | duplicate$ "-" = 1141 | swap$ duplicate$ "," = 1142 | swap$ "+" = 1143 | or or 1144 | { #1 'multiresult := } 1145 | { t #2 global.max$ substring$ 't := } 1146 | if$ 1147 | } 1148 | while$ 1149 | multiresult 1150 | } 1151 | 1152 | FUNCTION {format.pages} 1153 | { pages duplicate$ empty$ 'skip$ 1154 | { duplicate$ multi.page.check 1155 | { 1156 | bbl.pages swap$ 1157 | n.dashify 1158 | } 1159 | { 1160 | bbl.page swap$ 1161 | } 1162 | if$ 1163 | tie.or.space.prefix 1164 | "pages" bibinfo.check 1165 | * * 1166 | } 1167 | if$ 1168 | } 1169 | 1170 | FUNCTION {format.journal.pages} 1171 | { pages duplicate$ empty$ 'pop$ 1172 | { swap$ duplicate$ empty$ 1173 | { pop$ pop$ format.pages } 1174 | { 1175 | ":" * 1176 | swap$ 1177 | n.dashify 1178 | "pages" bibinfo.check 1179 | * 1180 | } 1181 | if$ 1182 | } 1183 | if$ 1184 | } 1185 | 1186 | FUNCTION {format.vol.num} 1187 | { volume field.or.null 1188 | duplicate$ empty$ 'skip$ 1189 | { 1190 | "volume" bibinfo.check 1191 | } 1192 | if$ 1193 | number "number" bibinfo.check duplicate$ empty$ 'skip$ 1194 | { 1195 | swap$ duplicate$ empty$ 1196 | { "there's a number but no volume in " cite$ * warning$ } 1197 | 'skip$ 1198 | if$ 1199 | swap$ 1200 | "(" swap$ * ")" * 1201 | } 1202 | if$ * 1203 | } 1204 | 1205 | FUNCTION {format.vol.num.pages} 1206 | { volume field.or.null 1207 | duplicate$ empty$ 'skip$ 1208 | { 1209 | "volume" bibinfo.check 1210 | } 1211 | if$ 1212 | number "number" bibinfo.check duplicate$ empty$ 'skip$ 1213 | { 1214 | swap$ duplicate$ empty$ 1215 | { "there's a number but no volume in " cite$ * warning$ } 1216 | 'skip$ 1217 | if$ 1218 | swap$ 1219 | "(" swap$ * ")" * 1220 | } 1221 | if$ * 1222 | format.journal.pages 1223 | } 1224 | 1225 | FUNCTION {format.chapter.pages} 1226 | { chapter empty$ 1227 | 'format.pages 1228 | { type empty$ 1229 | { bbl.chapter } 1230 | { type "l" change.case$ 1231 | "type" bibinfo.check 1232 | } 1233 | if$ 1234 | chapter tie.or.space.prefix 1235 | "chapter" bibinfo.check 1236 | * * 1237 | pages empty$ 1238 | 'skip$ 1239 | { ", " * format.pages * } 1240 | if$ 1241 | } 1242 | if$ 1243 | } 1244 | 1245 | FUNCTION {format.booktitle} 1246 | { 1247 | booktitle "booktitle" bibinfo.check 1248 | } 1249 | 1250 | FUNCTION {format.in.ed.booktitle} 1251 | { format.booktitle duplicate$ empty$ 'skip$ 1252 | { 1253 | editor "editor" format.names.ed duplicate$ empty$ 'pop$ 1254 | { 1255 | "," * 1256 | " " * 
1257 | get.bbl.editor
1258 | ". " *
1259 | * swap$
1260 | * }
1261 | if$
1262 | word.in swap$ *
1263 | }
1264 | if$
1265 | }
1266 | 
1267 | FUNCTION {format.in.ed.title}
1268 | { format.title duplicate$ empty$ 'skip$
1269 | {
1270 | editor "editor" format.names.ed duplicate$ empty$ 'pop$
1271 | {
1272 | "," *
1273 | " " *
1274 | get.bbl.editor
1275 | ". " *
1276 | * swap$
1277 | * }
1278 | if$
1279 | word.in swap$ *
1280 | }
1281 | if$
1282 | }
1283 | 
1284 | FUNCTION {empty.misc.check}
1285 | { author empty$ title empty$ howpublished empty$
1286 | month empty$ year empty$ note empty$
1287 | and and and and and
1288 | { "all relevant fields are empty in " cite$ * warning$ }
1289 | 'skip$
1290 | if$
1291 | }
1292 | FUNCTION {format.thesis.type}
1293 | { type duplicate$ empty$
1294 | 'pop$
1295 | { swap$ pop$
1296 | "t" change.case$ "type" bibinfo.check
1297 | }
1298 | if$
1299 | }
1300 | FUNCTION {format.tr.number}
1301 | {
1302 | number "number" bibinfo.check
1303 | %%type duplicate$ empty$
1304 | %%{ pop$ bbl.techrep }
1305 | %%'skip$
1306 | %%if$
1307 | %%"type" bibinfo.check
1308 | %%swap$ duplicate$ empty$
1309 | %%{ pop$ "t" change.case$ }
1310 | %%{ tie.or.space.prefix * * }
1311 | %%if$
1312 | }
1313 | 
1314 | FUNCTION {format.org.or.pub}
1315 | { 't :=
1316 | ""
1317 | address empty$ t empty$ and
1318 | 'skip$
1319 | {
1320 | address "address" bibinfo.check *
1321 | t empty$
1322 | 'skip$
1323 | { address empty$
1324 | 'skip$
1325 | { ": " * }
1326 | if$
1327 | t *
1328 | }
1329 | if$
1330 | }
1331 | if$
1332 | }
1333 | 
1334 | FUNCTION {format.publisher.address}
1335 | { publisher "publisher" bibinfo.warn format.org.or.pub
1336 | }
1337 | 
1338 | FUNCTION {format.organization.address}
1339 | { organization "organization" bibinfo.check format.org.or.pub
1340 | }
1341 | 
1342 | FUNCTION {format.institution.address}
1343 | { institution "institution" bibinfo.check format.org.or.pub
1344 | }
1345 | 
1346 | 
1347 | % urlbst...
1348 | % Functions for making hypertext links.
1349 | % In all cases, the stack has (link-text href-url)
1350 | %
1351 | % make 'null' specials
1352 | FUNCTION {make.href.null}
1353 | {
1354 | pop$
1355 | }
1356 | % make hypertex specials
1357 | FUNCTION {make.href.hypertex}
1358 | {
1359 | "\special {html:<a href=" quote$ *
1360 | swap$ * quote$ * "> }" * swap$ *
1361 | "\special {html:</a>}" *
1362 | }
1363 | % make hyperref specials
1364 | FUNCTION {make.href.hyperref}
1365 | {
1366 | "\href {" swap$ * "} {" * swap$ * "}" *
1367 | }
1368 | FUNCTION {make.href}
1369 | { hrefform #2 =
1370 | 'make.href.hyperref % hrefform = 2
1371 | { hrefform #1 =
1372 | 'make.href.hypertex % hrefform = 1
1373 | 'make.href.null % hrefform = 0 (or anything else)
1374 | if$
1375 | }
1376 | if$
1377 | }
1378 | 
1379 | FUNCTION {format.url}
1380 | { url empty$
1381 | { "" }
1382 | { hrefform #1 =
1383 | { % special case -- add HyperTeX specials
1384 | urlintro "\url{" url * "}" * url make.href.hypertex * }
1385 | { urlintro "\url{" * url * "}" * }
1386 | if$
1387 | }
1388 | if$
1389 | }
1390 | 
1391 | FUNCTION {format.eprint}
1392 | { eprint empty$
1393 | { "" }
1394 | { eprintprefix eprint * eprinturl eprint * make.href }
1395 | if$
1396 | }
1397 | 
1398 | FUNCTION {format.doi}
1399 | { doi empty$
1400 | { "" }
1401 | { doiprefix doi * doiurl doi * make.href }
1402 | if$
1403 | }
1404 | 
1405 | % Output a URL.
We can't use the more normal idiom (something like 1406 | % `format.url output'), because the `inbrackets' within 1407 | % format.lastchecked applies to everything between calls to `output', 1408 | % so that `format.url format.lastchecked * output' ends up with both 1409 | % the URL and the lastchecked in brackets. 1410 | FUNCTION {output.url} 1411 | { url empty$ 1412 | 'skip$ 1413 | { new.block 1414 | format.url output 1415 | format.lastchecked output 1416 | } 1417 | if$ 1418 | } 1419 | 1420 | FUNCTION {output.web.refs} 1421 | { 1422 | new.block 1423 | output.url 1424 | addeprints eprint empty$ not and 1425 | { format.eprint output.nonnull } 1426 | 'skip$ 1427 | if$ 1428 | adddoiresolver doi empty$ not and 1429 | { format.doi output.nonnull } 1430 | 'skip$ 1431 | if$ 1432 | % addeprints 1433 | % { eprint empty$ 1434 | % 'skip$ 1435 | % { format.eprint output.nonnull } 1436 | % if$ 1437 | % } 1438 | % 'skip$ 1439 | % if$ 1440 | } 1441 | 1442 | % Webpage entry type. 1443 | % Title and url fields required; 1444 | % author, note, year, month, and lastchecked fields optional 1445 | STRINGS {database} 1446 | FUNCTION {webpage} 1447 | { output.bibitem 1448 | author empty$ 1449 | { editor empty$ 1450 | 'skip$ % author and editor both optional 1451 | { format.editors output.nonnull } 1452 | if$ 1453 | } 1454 | { editor empty$ 1455 | { format.authors output.nonnull } 1456 | { "can't use both author and editor fields in " cite$ * warning$ } 1457 | if$ 1458 | } 1459 | if$ 1460 | % author empty$ 1461 | % 'skip$ 1462 | % { format.authors output.nonnull } 1463 | % if$ 1464 | new.block 1465 | format.title "title" output.check 1466 | journal empty$ 1467 | { 1468 | format.type "type" output.check 1469 | publisher empty$ 1470 | 'skip$ 1471 | { format.publisher.address output } 1472 | if$ 1473 | "database on the Internet" 'database := 1474 | type database = 1475 | { format.journal.date "year" output.check } 1476 | { format.date "year" output.check } 1477 | if$ 1478 | lastchecked empty$ 1479 | 'skip$ 1480 | { format.lastchecked output } 1481 | if$ 1482 | new.block 1483 | part empty$ 1484 | 'skip$ 1485 | { part output } 1486 | if$ 1487 | pages empty$ 1488 | 'skip$ 1489 | { pages bracket.check output } 1490 | if$ 1491 | } 1492 | { journal 1493 | remove.dots 1494 | "journal" bibinfo.check 1495 | "journal" output.check 1496 | format.type "type" output.check 1497 | format.journal.date "year" output.check 1498 | lastchecked empty$ 1499 | 'skip$ 1500 | { format.lastchecked output 1501 | ";" no.blank.or.punct output 1502 | } 1503 | if$ 1504 | no.blank.or.punct format.vol.num output 1505 | pages empty$ 1506 | 'skip$ 1507 | { ":" no.blank.or.punct output 1508 | no.blank.or.punct pages bracket.check output 1509 | } 1510 | if$ 1511 | new.block 1512 | } 1513 | if$ 1514 | format.url "url" output.check 1515 | new.block 1516 | note output 1517 | fin.entry 1518 | } 1519 | % ...urlbst to here 1520 | 1521 | FUNCTION {misc} 1522 | { output.bibitem 1523 | format.authors "author" output.check 1524 | format.editors "author and editor" output.check 1525 | format.title "title" output.check 1526 | type missing$ 1527 | { skip$ } 1528 | { format.type "type" output.check } 1529 | %%{ inbrackets type output } 1530 | if$ 1531 | new.block 1532 | format.publisher.address output 1533 | format.date "year" output.check 1534 | new.block 1535 | format.note output 1536 | new.block 1537 | howpublished new.block.checka 1538 | howpublished "howpublished" bibinfo.check output 1539 | output.web.refs % urlbst 1540 | fin.entry 1541 | empty.misc.check 
1542 | } 1543 | 1544 | FUNCTION {article} 1545 | { output.bibitem 1546 | format.authors "author" output.check 1547 | organization empty$ 1548 | 'skip$ 1549 | { author empty$ 1550 | { 1551 | format.organizations "organization" output.check 1552 | } 1553 | { 1554 | "; " * 1555 | no.blank.or.punct 1556 | format.organizations "organization" output.check 1557 | } 1558 | if$ 1559 | } 1560 | if$ 1561 | new.block 1562 | format.title "title" output.check 1563 | type missing$ 1564 | { skip$ } 1565 | { format.type "type" output.check } 1566 | if$ 1567 | new.block 1568 | journal 1569 | remove.dots 1570 | "journal" bibinfo.check 1571 | "journal" output.check 1572 | format.journal.date "year" output.check 1573 | add.semicolon 1574 | format.vol.num.pages output 1575 | new.block 1576 | format.note output 1577 | output.web.refs % urlbst 1578 | fin.entry 1579 | } 1580 | 1581 | FUNCTION {book} 1582 | { output.bibitem 1583 | author empty$ 1584 | { editor empty$ 1585 | { format.organizations "organization" output.check } 1586 | { format.editors "author and editor" output.check } 1587 | if$ 1588 | } 1589 | { format.authors output.nonnull 1590 | "author and editor" editor either.or.check 1591 | } 1592 | if$ 1593 | new.block 1594 | format.btitle "title" output.check 1595 | format.bvolume output 1596 | new.block 1597 | format.edition output 1598 | new.sentence 1599 | author empty$ not 1600 | editor empty$ not 1601 | and 1602 | { format.editors "author and editor" output.check } 1603 | 'skip$ 1604 | if$ 1605 | format.number.series output 1606 | format.publisher.address output 1607 | format.date "year" output.check 1608 | new.block 1609 | format.note output 1610 | output.web.refs % urlbst 1611 | fin.entry 1612 | } 1613 | 1614 | FUNCTION {booklet} 1615 | { misc } 1616 | 1617 | FUNCTION {dictionary} 1618 | { output.bibitem 1619 | format.booktitle "booktitle" output.check 1620 | format.bvolume output 1621 | new.block 1622 | format.edition output 1623 | new.sentence 1624 | format.publisher.address output 1625 | format.date "year" output.check 1626 | format.btitle "title" output.check 1627 | add.semicolon 1628 | add.blank 1629 | format.pages "pages" output.check 1630 | new.block 1631 | format.note output 1632 | output.web.refs % urlbst 1633 | fin.entry 1634 | } 1635 | 1636 | FUNCTION {inbook} 1637 | { output.bibitem 1638 | format.authors "author" output.check 1639 | new.block 1640 | chapter "chapter" output.check 1641 | new.block 1642 | format.in.ed.title "title" output.check 1643 | format.bvolume output 1644 | format.edition output 1645 | new.sentence 1646 | format.number.series output 1647 | format.publisher.address output 1648 | format.date "year" output.check 1649 | date.block 1650 | add.blank 1651 | format.pages "pages" output.check 1652 | new.block 1653 | format.note output 1654 | output.web.refs % urlbst 1655 | fin.entry 1656 | } 1657 | 1658 | FUNCTION {incollection} 1659 | { output.bibitem 1660 | format.authors "author" output.check 1661 | new.block 1662 | format.title "title" output.check 1663 | new.block 1664 | format.in.ed.booktitle "booktitle" output.check 1665 | format.bvolume output 1666 | format.edition output 1667 | new.sentence 1668 | format.number.series output 1669 | format.publisher.address output 1670 | format.date "year" output.check 1671 | date.block 1672 | add.blank 1673 | format.pages "pages" output.check 1674 | new.block 1675 | format.note output 1676 | output.web.refs % urlbst 1677 | fin.entry 1678 | } 1679 | 1680 | FUNCTION {inproceedings} 1681 | { output.bibitem 1682 | format.authors "author" 
output.check 1683 | new.block 1684 | format.title "title" output.check 1685 | new.block 1686 | format.in.ed.booktitle "booktitle" output.check 1687 | format.bvolume output 1688 | new.sentence 1689 | format.number.series output 1690 | publisher empty$ 1691 | { format.organization.address output } 1692 | { organization "organization" bibinfo.check output 1693 | format.publisher.address output 1694 | } 1695 | if$ 1696 | format.date "year" output.check 1697 | date.block 1698 | add.blank 1699 | format.pages "pages" output.check 1700 | new.block 1701 | format.note output 1702 | output.web.refs % urlbst 1703 | fin.entry 1704 | } 1705 | 1706 | FUNCTION {conference} 1707 | {inproceedings} 1708 | 1709 | FUNCTION {manual} 1710 | {misc} 1711 | 1712 | FUNCTION {phdthesis} 1713 | { output.bibitem 1714 | format.authors "author" output.check 1715 | new.block 1716 | format.btitle 1717 | "title" output.check 1718 | format.type "type" output.check 1719 | new.block 1720 | school "school" bibinfo.warn output 1721 | address "address" bibinfo.check output 1722 | format.date "year" output.check 1723 | new.block 1724 | format.note output 1725 | output.web.refs % urlbst 1726 | fin.entry 1727 | } 1728 | 1729 | FUNCTION {mastersthesis} 1730 | {phdthesis} 1731 | 1732 | FUNCTION {proceedings} 1733 | { output.bibitem 1734 | editor empty$ 1735 | { organization "organization" bibinfo.check output 1736 | } 1737 | { format.editors output.nonnull } 1738 | if$ 1739 | new.block 1740 | format.btitle "title" output.check 1741 | format.bvolume output 1742 | editor empty$ 1743 | { publisher empty$ 1744 | 'skip$ 1745 | { 1746 | new.sentence 1747 | format.number.series output 1748 | format.publisher.address output 1749 | } 1750 | if$ 1751 | } 1752 | { publisher empty$ 1753 | { 1754 | new.sentence 1755 | format.organization.address output } 1756 | { 1757 | new.sentence 1758 | organization "organization" bibinfo.check output 1759 | format.publisher.address output 1760 | } 1761 | if$ 1762 | } 1763 | if$ 1764 | format.date "year" output.check 1765 | new.block 1766 | format.note output 1767 | output.web.refs % urlbst 1768 | fin.entry 1769 | } 1770 | 1771 | FUNCTION {techreport} 1772 | { output.bibitem 1773 | format.authors "author" output.check 1774 | new.block 1775 | format.title 1776 | "title" output.check 1777 | new.block 1778 | format.institution.address output 1779 | format.date "year" output.check 1780 | format.tr.number output.nonnull 1781 | new.block 1782 | format.note output 1783 | output.web.refs % urlbst 1784 | fin.entry 1785 | } 1786 | 1787 | FUNCTION {map} 1788 | { output.bibitem 1789 | format.cartographers "cartographer" output.check 1790 | new.block 1791 | format.title 1792 | "title" output.check 1793 | format.type "type" output.check 1794 | new.block 1795 | format.publisher.address output 1796 | format.date "year" output.check 1797 | new.block 1798 | format.note output 1799 | output.web.refs % urlbst 1800 | fin.entry 1801 | } 1802 | 1803 | FUNCTION {patent} 1804 | { output.bibitem 1805 | format.inventors "inventor" output.check 1806 | "; " * 1807 | no.blank.or.punct 1808 | format.assignees "assignee" output.check 1809 | new.block 1810 | format.title 1811 | "title" output.check 1812 | new.block 1813 | format.tr.number output.nonnull 1814 | format.date "year" output.check 1815 | new.block 1816 | format.note output 1817 | output.web.refs % urlbst 1818 | fin.entry 1819 | } 1820 | 1821 | FUNCTION {unpublished} 1822 | { output.bibitem 1823 | format.authors "author" output.check 1824 | new.block 1825 | format.title "title" 
output.check 1826 | format.date output 1827 | new.block 1828 | format.note "note" output.check 1829 | output.web.refs % urlbst 1830 | fin.entry 1831 | } 1832 | 1833 | FUNCTION {default.type} { misc } 1834 | READ 1835 | STRINGS { longest.label } 1836 | INTEGERS { number.label longest.label.width } 1837 | FUNCTION {initialize.longest.label} 1838 | { "" 'longest.label := 1839 | #1 'number.label := 1840 | #0 'longest.label.width := 1841 | } 1842 | FUNCTION {longest.label.pass} 1843 | { number.label int.to.str$ 'label := 1844 | number.label #1 + 'number.label := 1845 | label width$ longest.label.width > 1846 | { label 'longest.label := 1847 | label width$ 'longest.label.width := 1848 | } 1849 | 'skip$ 1850 | if$ 1851 | } 1852 | EXECUTE {initialize.longest.label} 1853 | ITERATE {longest.label.pass} 1854 | FUNCTION {begin.bib} 1855 | { preamble$ empty$ 1856 | 'skip$ 1857 | { preamble$ write$ newline$ } 1858 | if$ 1859 | "\begin{thebibliography}{" longest.label * "}" * 1860 | write$ newline$ 1861 | } 1862 | EXECUTE {begin.bib} 1863 | EXECUTE {init.config.constants} 1864 | EXECUTE {init.state.consts} 1865 | ITERATE {call.type$} 1866 | FUNCTION {end.bib} 1867 | { newline$ 1868 | "\end{thebibliography}" write$ newline$ 1869 | } 1870 | EXECUTE {end.bib} 1871 | %% End of customized bst file 1872 | %% 1873 | %% End of file `vancouver.bst'. 1874 | -------------------------------------------------------------------------------- /category-codes.ssv: -------------------------------------------------------------------------------- 1 | first;last;description 2 | A00;B99;Certain infectious and parasitic diseases 3 | C00;D49;Neoplasms 4 | D50;D89;Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism 5 | E00;E89;Endocrine, nutritional and metabolic diseases 6 | F01;F99;Mental, Behavioral and Neurodevelopmental disorders 7 | G00;G99;Diseases of the nervous system 8 | H00;H59;Diseases of the eye and adnexa 9 | H60;H95;Diseases of the ear and mastoid process 10 | I00;I99;Diseases of the circulatory system 11 | J00;J99;Diseases of the respiratory system 12 | K00;K95;Diseases of the digestive system 13 | L00;L99;Diseases of the skin and subcutaneous tissue 14 | M00;M99;Diseases of the musculoskeletal system and connective tissue 15 | N00;N99;Diseases of the genitourinary system 16 | O00;O9A;Pregnancy, childbirth and the puerperium 17 | P00;P96;Certain conditions originating in the perinatal period 18 | Q00;Q99;Congenital malformations, deformations and chromosomal abnormalities 19 | R00;R99;Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified 20 | S00;T88;Injury, poisoning and certain other consequences of external causes 21 | U00;U85;Codes for special purposes 22 | V00;Y99;External causes of morbidity 23 | Z00;Z99;Factors influencing health status and contact with health services 24 | -------------------------------------------------------------------------------- /comparators/REAME.md: -------------------------------------------------------------------------------- 1 | This directory contains the code used to compare the performance of the BioGPT model with ClinicalBERT, MedBERT, and two versions of PubMedBERT (pubmedbert-fulltext and pubmedbert-ms-marco). 2 | 3 | The following files are in the current directory. 4 | - `setup` creates the conda environment to run each of the models. 5 | - `short-code.R` creates the training data used to estimate the ICD-10 category (leading letter).
6 | - `alpha-char-embedding-model.R` contains the `dataset` object and the supervised models. 7 | 8 | Each model directory (clinicalbert, medbert, pubmedbert-fulltext, pubmedbert-ms-marco) contains: 9 | - `ref.txt` contains the URL the model was downloaded from on Hugging Face. 10 | - `0-make-embedding.R` creates the embedding data set. 11 | - `1-benchmark.R` performs the benchmark. 12 | -------------------------------------------------------------------------------- /comparators/alpha-char-embedding-model.R: -------------------------------------------------------------------------------- 1 | library(torch) 2 | library(dplyr) 3 | library(foreach) 4 | 5 | AlphaCharEmbedding = dataset( 6 | name = "AlphaCharEmbedding", 7 | initialize = function(x) { 8 | self$x = x 9 | self$contr = contr.treatment(sort(unique(x$ll)), contrasts = FALSE) 10 | }, 11 | width = function() { 12 | self$x$embed[[1]] |> length() 13 | }, 14 | .getitem = function(i) { 15 | list( 16 | x = torch_tensor(self$x$embed[[i]]), 17 | y = torch_tensor(self$contr[self$x$ll[i],]) 18 | ) 19 | }, 20 | .length = function() { 21 | nrow(self$x) 22 | } 23 | ) 24 | 25 | AlphaCodeEstimator = nn_module( 26 | initialize = function(layers) { 27 | self$feature_net = nn_module_list( 28 | foreach(i = seq_along(layers)[-1]) %do% { 29 | nn_linear(layers[i-1], layers[i]) 30 | } 31 | ) 32 | }, 33 | forward = function(x) { 34 | x = x$squeeze() 35 | for (i in seq_along(self$feature_net)) { 36 | x = self$feature_net[[i]](x) 37 | } 38 | nnf_softmax(x, dim = 1) 39 | } 40 | ) 41 | -------------------------------------------------------------------------------- /comparators/clinicalbert/0-make-embedding.R: -------------------------------------------------------------------------------- 1 | library(reticulate) 2 | library(readr) 3 | library(purrr) 4 | library(foreach) 5 | library(itertools) 6 | 7 | use_condaenv("icd-10-huggingface", required = TRUE) 8 | 9 | source("../short-code.R") 10 | 11 | transformers = import("transformers") 12 | tokenizer = transformers$AutoTokenizer$from_pretrained("medicalai/ClinicalBERT") 13 | torch = import("torch") 14 | np = import("numpy") 15 | 16 | builtins = import_builtins() 17 | builtins$setattr(torch$distributed, "is_initialized", py_eval("lambda : False")) 18 | model = transformers$AutoModel$from_pretrained("medicalai/ClinicalBERT") 19 | 20 | mean_pooling = function(model_output, attention_mask) { 21 | token_embeddings = model_output[[1]] #First element of model_output contains all token embeddings 22 | input_mask_expanded = attention_mask$unsqueeze(-1L)$expand(token_embeddings$size())$float() 23 | torch$sum(token_embeddings * input_mask_expanded, 1L) / torch$clamp(input_mask_expanded$sum(1L), min=1e-9) 24 | } 25 | 26 | # A function to embed a set of strings.
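# -- Editor's aside (not part of the original script): mean_pooling() above is
# a masked average over token positions, so padded tokens contribute nothing
# to the sentence embedding. A minimal base-R sketch with toy values:
toy_tok = rbind(c(1, 2), c(3, 4), c(5, 6))  # token embeddings: 3 tokens x 2 dims
toy_mask = c(1, 1, 0)                       # attention mask: third token is padding
colSums(toy_tok * toy_mask) / sum(toy_mask) # masked mean over real tokens -> c(2, 3)
# -- end aside; embed() below applies the same idea to real model output.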
27 | embed = function(strings) { 28 | encoded_input = tokenizer( 29 | strings, 30 | padding = TRUE, 31 | truncation = TRUE, 32 | max_length = 256L, 33 | return_tensors = 'pt' 34 | ) 35 | model_output = model( 36 | input_ids = encoded_input$input_ids, 37 | attention_mask = encoded_input$attention_mask 38 | ) 39 | mean_pooling(model_output, 40 | encoded_input$attention_mask)$detach()$cpu()$numpy() |> 41 | (\(x) {rownames(x) = strings; x})() 42 | } 43 | 44 | x = read_fwf( 45 | "../../icd-10-cm-codes/icd10cm_codes_2019.txt", 46 | fwf_cols(code = 8, desc = 150) 47 | ) 48 | 49 | embs = foreach(it = isplitVector(seq_len(nrow(x)), chunkSize = 1000), 50 | .combine = c) %do% { 51 | ret = embed(x$desc[it]) 52 | ret = map(seq_len(nrow(ret)), ~ list(ret[.x,,drop = FALSE])) 53 | message(tail(it, 1), " of ", nrow(x)) 54 | gc() 55 | ret 56 | } 57 | 58 | x$embed = unlist(embs, recursive = FALSE) 59 | x$ll = get_short_code(x$code) 60 | saveRDS(x, "x-with-embedding.rds") 61 | -------------------------------------------------------------------------------- /comparators/clinicalbert/1-benchmark.R: -------------------------------------------------------------------------------- 1 | library(luz) 2 | library(foreach) 3 | library(yardstick) 4 | library(tidyr) 5 | 6 | source("../alpha-char-embedding-model.R") 7 | 8 | x = readRDS("x-with-embedding.rds") 9 | 10 | traini = sample.int(nrow(x), round(0.9 * nrow(x))) 11 | testi = setdiff(seq_len(nrow(x)), traini) 12 | 13 | train = AlphaCharEmbedding(x[traini, ]) 14 | test = AlphaCharEmbedding(x[testi, ]) 15 | 16 | layers = c(train$width(), 100, 100, 21) 17 | 18 | batch_size = c(64, 128, 256) 19 | epochs = 30 20 | num_workers = 6 21 | 22 | loss = function(input, target) { 23 | torch_mean(-torch_sum(target * torch_log(input$squeeze() + 1e-16), 2)) 24 | } 25 | 26 | mss = foreach (bs = batch_size, .combine = bind_rows) %do% { 27 | luz_model = AlphaCodeEstimator |> 28 | setup( 29 | loss = loss, #nn_cross_entropy_loss(26), 30 | optimizer = optim_adam 31 | ) |> 32 | set_hparams(layers = layers) |> 33 | fit( 34 | data = dataloader( 35 | train, 36 | batch_size = bs, 37 | shuffle = TRUE, 38 | num_workers = num_workers, 39 | worker_packages = c("torch", "dplyr") 40 | ), 41 | epochs = epochs, 42 | valid_data = dataloader( 43 | test, 44 | batch_size = bs, 45 | shuffle = FALSE, 46 | num_workers = num_workers, 47 | worker_packages = c("torch", "dplyr") 48 | ), 49 | callbacks = list( 50 | luz_callback_keep_best_model() 51 | ) 52 | ) 53 | 54 | preds = 55 | predict( 56 | luz_model, 57 | dataloader( 58 | test, 59 | batch_size = bs, 60 | num_workers = num_workers, 61 | worker_packages = c("torch", "dplyr") 62 | ) 63 | ) 64 | 65 | comp = tibble( 66 | obs = x[testi,]$ll |> 67 | factor(levels = 1:21), 68 | pred = preds |> 69 | torch_tensor(device = "cpu") |> 70 | as.matrix() |> 71 | apply(1, which.max) |> 72 | factor(levels = 1:21) 73 | ) 74 | bind_cols( 75 | metric_set(accuracy, bal_accuracy)(comp, truth = obs, estimate = pred), 76 | batch_size = bs 77 | ) |> 78 | select(-.estimator) |> 79 | pivot_wider(names_from = .metric, values_from = .estimate) 80 | } 81 | print(mss) 82 | saveRDS(mss, "ms.rds") 83 | -------------------------------------------------------------------------------- /comparators/clinicalbert/ref.txt: -------------------------------------------------------------------------------- 1 | https://huggingface.co/medicalai/ClinicalBERT 2 | -------------------------------------------------------------------------------- /comparators/medbert/0-make-embedding.R: 
-------------------------------------------------------------------------------- 1 | library(reticulate) 2 | library(readr) 3 | library(purrr) 4 | library(foreach) 5 | library(itertools) 6 | 7 | use_condaenv("icd-10-huggingface", required = TRUE) 8 | 9 | source("../short-code.R") 10 | 11 | transformers = import("transformers") 12 | tokenizer = transformers$AutoTokenizer$from_pretrained("Charangan/MedBERT") 13 | torch = import("torch") 14 | np = import("numpy") 15 | 16 | builtins = import_builtins() 17 | builtins$setattr(torch$distributed, "is_initialized", py_eval("lambda : False")) 18 | model = transformers$AutoModel$from_pretrained("Charangan/MedBERT") 19 | 20 | # A function to embed a set of strings. 21 | embed = function(strings) { 22 | encoded_input = tokenizer( 23 | strings, 24 | padding = TRUE, 25 | truncation = TRUE, 26 | max_length = 256L, 27 | return_tensors = 'pt' 28 | ) 29 | ret = model( 30 | input_ids = encoded_input$input_ids, 31 | attention_mask = encoded_input$attention_mask 32 | )$pooler_output 33 | ret$detach()$cpu()$numpy() 34 | } 35 | 36 | x = read_fwf( 37 | "../../icd-10-cm-codes/icd10cm_codes_2019.txt", 38 | fwf_cols(code = 8, desc = 150) 39 | ) 40 | 41 | embs = foreach(it = isplitVector(seq_len(nrow(x)), chunkSize = 1000), 42 | .combine = c) %do% { 43 | ret = embed(x$desc[it]) 44 | ret = map(seq_len(nrow(ret)), ~ list(ret[.x,,drop = FALSE])) 45 | message(tail(it, 1), " of ", nrow(x)) 46 | gc() 47 | ret 48 | } 49 | 50 | x$embed = unlist(embs, recursive = FALSE) 51 | x$ll = get_short_code(x$code) 52 | saveRDS(x, "x-with-embedding.rds") 53 | -------------------------------------------------------------------------------- /comparators/medbert/1-benchmark.R: -------------------------------------------------------------------------------- 1 | library(luz) 2 | library(foreach) 3 | library(yardstick) 4 | library(tidyr) 5 | 6 | source("../alpha-char-embedding-model.R") 7 | 8 | x = readRDS("x-with-embedding.rds") 9 | 10 | traini = sample.int(nrow(x), round(0.9 * nrow(x))) 11 | testi = setdiff(seq_len(nrow(x)), traini) 12 | 13 | train = AlphaCharEmbedding(x[traini, ]) 14 | test = AlphaCharEmbedding(x[testi, ]) 15 | 16 | layers = c(train$width(), 100, 100, 21) 17 | 18 | batch_size = c(64, 128, 256) 19 | epochs = 30 20 | num_workers = 6 21 | 22 | loss = function(input, target) { 23 | torch_mean(-torch_sum(target * torch_log(input$squeeze() + 1e-16), 2)) 24 | } 25 | 26 | mss = foreach (bs = batch_size, .combine = bind_rows) %do% { 27 | luz_model = AlphaCodeEstimator |> 28 | setup( 29 | loss = loss, #nn_cross_entropy_loss(26), 30 | optimizer = optim_adam 31 | ) |> 32 | set_hparams(layers = layers) |> 33 | fit( 34 | data = dataloader( 35 | train, 36 | batch_size = bs, 37 | shuffle = TRUE, 38 | num_workers = num_workers, 39 | worker_packages = c("torch", "dplyr") 40 | ), 41 | epochs = epochs, 42 | valid_data = dataloader( 43 | test, 44 | batch_size = bs, 45 | shuffle = FALSE, 46 | num_workers = num_workers, 47 | worker_packages = c("torch", "dplyr") 48 | ), 49 | callbacks = list( 50 | luz_callback_keep_best_model() 51 | ) 52 | ) 53 | 54 | preds = 55 | predict( 56 | luz_model, 57 | dataloader( 58 | test, 59 | batch_size = bs, 60 | num_workers = num_workers, 61 | worker_packages = c("torch", "dplyr") 62 | ) 63 | ) 64 | 65 | comp = tibble( 66 | obs = x[testi,]$ll |> 67 | factor(levels = 1:21), 68 | pred = preds |> 69 | torch_tensor(device = "cpu") |> 70 | as.matrix() |> 71 | apply(1, which.max) |> 72 | factor(levels = 1:21) 73 | ) 74 | bind_cols( 75 | metric_set(accuracy,
bal_accuracy)(comp, truth = obs, estimate = pred), 76 | batch_size = bs 77 | ) |> 78 | select(-.estimator) |> 79 | pivot_wider(names_from = .metric, values_from = .estimate) 80 | } 81 | print(mss) 82 | saveRDS(mss, "ms.rds") 83 | -------------------------------------------------------------------------------- /comparators/medbert/ref.txt: -------------------------------------------------------------------------------- 1 | https://huggingface.co/Charangan/MedBERT 2 | -------------------------------------------------------------------------------- /comparators/pubmedbert-fulltext/0-make-embedding.R: -------------------------------------------------------------------------------- 1 | library(reticulate) 2 | library(readr) 3 | library(purrr) 4 | library(foreach) 5 | library(itertools) 6 | 7 | use_condaenv("icd-10-huggingface", required = TRUE) 8 | 9 | source("../short-code.R") 10 | 11 | transformers = import("transformers") 12 | torch = import("torch") 13 | np = import("numpy") 14 | tqdm = import("tqdm") 15 | 16 | builtins = import_builtins() 17 | builtins$setattr(torch$distributed, "is_initialized", py_eval("lambda : False")) 18 | 19 | tokenizer = transformers$AutoTokenizer$from_pretrained( 20 | 'cambridgeltl/SapBERT-from-PubMedBERT-fulltext' 21 | ) 22 | model = transformers$AutoModel$from_pretrained( 23 | 'cambridgeltl/SapBERT-from-PubMedBERT-fulltext' 24 | ) 25 | 26 | mean_pooling = function(model_output, attention_mask) { 27 | token_embeddings = model_output[[1]] #First element of model_output contains all token embeddings 28 | input_mask_expanded = attention_mask$unsqueeze(-1L)$expand(token_embeddings$size())$float() 29 | torch$sum(token_embeddings * input_mask_expanded, 1L) / torch$clamp(input_mask_expanded$sum(1L), min=1e-9) 30 | } 31 | 32 | # A function to embed a set of strings.
33 | embed = function(strings) { 34 | encoded_input = tokenizer( 35 | strings, 36 | padding = TRUE, 37 | truncation = TRUE, 38 | max_length = 256L, 39 | return_tensors = 'pt' 40 | ) 41 | model_output = model( 42 | input_ids = encoded_input$input_ids, 43 | attention_mask = encoded_input$attention_mask 44 | ) 45 | mean_pooling(model_output, 46 | encoded_input$attention_mask)$detach()$cpu()$numpy() |> 47 | (\(x) {rownames(x) = strings; x})() 48 | } 49 | 50 | x = read_fwf( 51 | "../../icd-10-cm-codes/icd10cm_codes_2019.txt", 52 | fwf_cols(code = 8, desc = 150) 53 | ) 54 | 55 | embs = foreach(it = isplitVector(seq_len(nrow(x)), chunkSize = 1000), 56 | .combine = c) %do% { 57 | ret = embed(x$desc[it]) 58 | ret = map(seq_len(nrow(ret)), ~ list(ret[.x,,drop = FALSE])) 59 | message(tail(it, 1), " of ", nrow(x)) 60 | gc() 61 | ret 62 | } 63 | 64 | x$embed = unlist(embs, recursive = FALSE) 65 | x$ll = get_short_code(x$code) 66 | saveRDS(x, "x-with-embedding.rds") 67 | -------------------------------------------------------------------------------- /comparators/pubmedbert-fulltext/1-benchmark.R: -------------------------------------------------------------------------------- 1 | library(luz) 2 | library(foreach) 3 | library(yardstick) 4 | library(tidyr) 5 | 6 | source("../alpha-char-embedding-model.R") 7 | 8 | x = readRDS("x-with-embedding.rds") 9 | 10 | traini = sample.int(nrow(x), round(0.9 * nrow(x))) 11 | testi = setdiff(seq_len(nrow(x)), traini) 12 | 13 | train = AlphaCharEmbedding(x[traini, ]) 14 | test = AlphaCharEmbedding(x[testi, ]) 15 | 16 | layers = c(train$width(), 100, 100, 21) 17 | 18 | batch_size = c(64, 128, 256) 19 | epochs = 30 20 | num_workers = 6 21 | 22 | loss = function(input, target) { 23 | torch_mean(-torch_sum(target * torch_log(input$squeeze() + 1e-16), 2)) 24 | } 25 | 26 | mss = foreach (bs = batch_size, .combine = bind_rows) %do% { 27 | luz_model = AlphaCodeEstimator |> 28 | setup( 29 | loss = loss, #nn_cross_entropy_loss(26), 30 | optimizer = optim_adam 31 | ) |> 32 | set_hparams(layers = layers) |> 33 | fit( 34 | data = dataloader( 35 | train, 36 | batch_size = bs, 37 | shuffle = TRUE, 38 | num_workers = num_workers, 39 | worker_packages = c("torch", "dplyr") 40 | ), 41 | epochs = epochs, 42 | valid_data = dataloader( 43 | test, 44 | batch_size = bs, 45 | shuffle = FALSE, 46 | num_workers = num_workers, 47 | worker_packages = c("torch", "dplyr") 48 | ), 49 | callbacks = list( 50 | luz_callback_keep_best_model() 51 | ) 52 | ) 53 | 54 | preds = 55 | predict( 56 | luz_model, 57 | dataloader( 58 | test, 59 | batch_size = bs, 60 | num_workers = num_workers, 61 | worker_packages = c("torch", "dplyr") 62 | ) 63 | ) 64 | 65 | comp = tibble( 66 | obs = x[testi,]$ll |> 67 | factor(levels = 1:21), 68 | pred = preds |> 69 | torch_tensor(device = "cpu") |> 70 | as.matrix() |> 71 | apply(1, which.max) |> 72 | factor(levels = 1:21) 73 | ) 74 | bind_cols( 75 | metric_set(accuracy, bal_accuracy)(comp, truth = obs, estimate = pred), 76 | batch_size = bs 77 | ) |> 78 | select(-.estimator) |> 79 | pivot_wider(names_from = .metric, values_from = .estimate) 80 | } 81 | print(mss) 82 | saveRDS(mss, "ms.rds") 83 | -------------------------------------------------------------------------------- /comparators/pubmedbert-fulltext/ref.txt: -------------------------------------------------------------------------------- 1 | https://huggingface.co/cambridgeltl/SapBERT-from-PubMedBERT-fulltext 2 | -------------------------------------------------------------------------------- 
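The loss used in every 1-benchmark.R above is a hand-rolled categorical cross-entropy over the 21 ICD-10 chapter classes, with the 1e-16 term guarding against log(0). A minimal sketch of what it computes, using toy values rather than anything from the scripts (requires the torch R package):

library(torch)
# Two toy softmax outputs (rows sum to 1) and their one-hot targets.
input = torch_tensor(rbind(c(0.7, 0.2, 0.1), c(0.1, 0.8, 0.1)))
target = torch_tensor(rbind(c(1, 0, 0), c(0, 1, 0)))
# Row-wise -sum(target * log(input)), averaged over rows:
torch_mean(-torch_sum(target * torch_log(input + 1e-16), dim = 2))
# -> (-log(0.7) - log(0.8)) / 2, roughly 0.29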
/comparators/pubmedbert-ms-marco/0-make-embedding.R: -------------------------------------------------------------------------------- 1 | library(reticulate) 2 | library(readr) 3 | library(purrr) 4 | library(foreach) 5 | library(itertools) 6 | 7 | use_condaenv("icd-10-huggingface", required = TRUE) 8 | 9 | source("../short-code.R") 10 | 11 | transformers = import("transformers") 12 | torch = import("torch") 13 | np = import("numpy") 14 | builtins = import_builtins() 15 | builtins$setattr(torch$distributed, "is_initialized", py_eval("lambda : False")) 16 | 17 | tokenizer = transformers$AutoTokenizer$from_pretrained( 18 | 'pritamdeka/S-PubMedBert-MS-MARCO' 19 | ) 20 | model = transformers$AutoModel$from_pretrained( 21 | 'pritamdeka/S-PubMedBert-MS-MARCO' 22 | ) 23 | 24 | mean_pooling = function(model_output, attention_mask) { 25 | token_embeddings = model_output[[1]] #First element of model_output contains all token embeddings 26 | input_mask_expanded = attention_mask$unsqueeze(-1L)$expand(token_embeddings$size())$float() 27 | torch$sum(token_embeddings * input_mask_expanded, 1L) / torch$clamp(input_mask_expanded$sum(1L), min=1e-9) 28 | } 29 | 30 | 31 | # A function to embed a set of strings. 32 | embed = function(strings) { 33 | encoded_input = tokenizer( 34 | strings, 35 | padding = TRUE, 36 | truncation = TRUE, 37 | max_length = 256L, 38 | return_tensors = 'pt' 39 | ) 40 | model_output = model( 41 | input_ids = encoded_input$input_ids, 42 | attention_mask = encoded_input$attention_mask 43 | ) 44 | mean_pooling(model_output, 45 | encoded_input$attention_mask)$detach()$cpu()$numpy() |> 46 | (\(x) {rownames(x) = strings; x})() 47 | } 48 | 49 | x = read_fwf( 50 | "../../icd-10-cm-codes/icd10cm_codes_2019.txt", 51 | fwf_cols(code = 8, desc = 150) 52 | ) 53 | 54 | embs = foreach(it = isplitVector(seq_len(nrow(x)), chunkSize = 1000), 55 | .combine = c) %do% { 56 | ret = embed(x$desc[it]) 57 | ret = map(seq_len(nrow(ret)), ~ list(ret[.x,,drop = FALSE])) 58 | message(tail(it, 1), " of ", nrow(x)) 59 | gc() 60 | ret 61 | } 62 | 63 | x$embed = unlist(embs, recursive = FALSE) 64 | x$ll = get_short_code(x$code) 65 | saveRDS(x, "x-with-embedding.rds") 66 | -------------------------------------------------------------------------------- /comparators/pubmedbert-ms-marco/1-benchmark.R: -------------------------------------------------------------------------------- 1 | library(luz) 2 | library(foreach) 3 | library(yardstick) 4 | library(tidyr) 5 | 6 | source("../alpha-char-embedding-model.R") 7 | 8 | x = readRDS("x-with-embedding.rds") 9 | 10 | traini = sample.int(nrow(x), round(0.9 * nrow(x))) 11 | testi = setdiff(seq_len(nrow(x)), traini) 12 | 13 | train = AlphaCharEmbedding(x[traini, ]) 14 | test = AlphaCharEmbedding(x[testi, ]) 15 | 16 | layers = c(train$width(), 100, 100, 21) 17 | 18 | batch_size = c(64, 128, 256) 19 | epochs = 30 20 | num_workers = 6 21 | 22 | loss = function(input, target) { 23 | torch_mean(-torch_sum(target * torch_log(input$squeeze() + 1e-16), 2)) 24 | } 25 | 26 | mss = foreach (bs = batch_size, .combine = bind_rows) %do% { 27 | luz_model = AlphaCodeEstimator |> 28 | setup( 29 | loss = loss, #nn_cross_entropy_loss(26), 30 | optimizer = optim_adam 31 | ) |> 32 | set_hparams(layers = layers) |> 33 | fit( 34 | data = dataloader( 35 | train, 36 | batch_size = bs, 37 | shuffle = TRUE, 38 | num_workers = num_workers, 39 | worker_packages = c("torch", "dplyr") 40 | ), 41 | epochs = epochs, 42 | valid_data = dataloader( 43 | test, 44 | batch_size = bs, 45 | shuffle = FALSE, 46 |
num_workers = num_workers, 47 | worker_packages = c("torch", "dplyr") 48 | ), 49 | callbacks = list( 50 | luz_callback_keep_best_model() 51 | ) 52 | ) 53 | 54 | preds = 55 | predict( 56 | luz_model, 57 | dataloader( 58 | test, 59 | batch_size = bs, 60 | num_workers = num_workers, 61 | worker_packages = c("torch", "dplyr") 62 | ) 63 | ) 64 | 65 | comp = tibble( 66 | obs = x[testi,]$ll |> 67 | factor(levels = 1:21), 68 | pred = preds |> 69 | torch_tensor(device = "cpu") |> 70 | as.matrix() |> 71 | apply(1, which.max) |> 72 | factor(levels = 1:21) 73 | ) 74 | bind_cols( 75 | metric_set(accuracy, bal_accuracy)(comp, truth = obs, estimate = pred), 76 | batch_size = bs 77 | ) |> 78 | select(-.estimator) |> 79 | pivot_wider(names_from = .metric, values_from = .estimate) 80 | } 81 | print(mss) 82 | saveRDS(mss, "ms.rds") 83 | -------------------------------------------------------------------------------- /comparators/pubmedbert-ms-marco/ref.txt: -------------------------------------------------------------------------------- 1 | https://huggingface.co/pritamdeka/S-PubMedBert-MS-MARCO 2 | -------------------------------------------------------------------------------- /comparators/setup: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | conda create -n icd-10-huggingface -c huggingface transformers pytorch numpy tqdm 4 | 5 | -------------------------------------------------------------------------------- /comparators/short-code.R: -------------------------------------------------------------------------------- 1 | library(purrr) 2 | 3 | ccc = c( 4 | "^[AB].*", 5 | "(^C|^D[0-4]).*", 6 | "^D[5-8].*", 7 | "^E[0-8][0-9].*", 8 | "^F.*", 9 | "^G.*", 10 | "^H[0-5][0-9].*", 11 | "^H[6-9][0-9].*", 12 | "^I.*", 13 | "^J.*", 14 | "^K.*", 15 | "^L.*", 16 | "^M.*", 17 | "^N.*", 18 | "^O[0-9].*", 19 | "^P.*", 20 | "^Q.*", 21 | "^R.*", 22 | "^[ST].*", 23 | "^[UVWXY].*", 24 | "^[Z].*" 25 | ) 26 | 27 | get_short_code_impl = function(code) { 28 | which(map_lgl(ccc, ~ grepl(.x, code))) 29 | } 30 | 31 | get_short_code = function(code) { 32 | map_int(code, get_short_code_impl) 33 | } 34 | -------------------------------------------------------------------------------- /embedding-data/.gitattributes: -------------------------------------------------------------------------------- 1 | *.csv filter=lfs diff=lfs merge=lfs -text 2 | *.gz filter=lfs diff=lfs merge=lfs -text 3 | -------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2019-0010.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:453921d25f49450a6a3547f9cb814f59a166ad3a58f4dd58cfe9dc3c5a86baef 3 | size 6751427 4 | -------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2019-0050.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:4f8fdf1ceff951bae456b5ff9978506893e3200bc76c0c5e606dda2648f268fc 3 | size 30557163 4 | -------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2019-0100.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2f3faedc2c3018756945254a40910d9a5b5e04f5ae12cf4b10b2bbd9e1f5a113 3 | size 60770182 4 | 
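As a quick check of the chapter mapping in comparators/short-code.R above: each regular expression sends a code's leading characters to one of 21 chapter indices. A minimal usage sketch (editor's illustration; toy codes chosen for clarity):

source("comparators/short-code.R")
# "A000" falls in chapter 1 (infectious), "C50911" in 2 (neoplasms),
# "Z9989" in 21 (factors influencing health status).
get_short_code(c("A000", "C50911", "Z9989"))
# [1]  1  2 21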
-------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2019-1000.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a5d367ca6af8b405bc04d40dca3abc2cf22a79bdecbe880e9e5884b0299d8cd5 3 | size 621858266 4 | -------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2020-0010.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:69a49cfa852a2b8a7ca6fcfa27e0bd796061419888f48cdc0347408671eb50d9 3 | size 6775185 4 | -------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2020-0050.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:13917a88d5c9b24222b31e8d955645c72734fa97110f6226cf6c385709fb1431 3 | size 30663679 4 | -------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2020-0100.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d16e54c76594574d5f87352afb91b7ff6fbc06d7962ef0ec57ca082a98e5251a 3 | size 60981659 4 | -------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2020-1000.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:94eb814d7433ee9626de75f8a0fcc49eabfe9495db43a9132f2ab2fd7eb4c742 3 | size 624037283 4 | -------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2021-0010.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:5a17516e78b7755895743a4c5b63f60003a3c66ea1a9fbd111b92bfb205c2b99 3 | size 6815772 4 | -------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2021-0050.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b24d905f03d427c69efbb05784d464e4c1b158b22be66463021c420c3bdc20fc 3 | size 30848223 4 | -------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2021-0100.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a8de1a232bd2d5e5adbedcfcefacbe750763ebb9e4d9c45a8cb4d74ccc20c525 3 | size 61347342 4 | -------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2021-1000.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c7f88839cc0f327834626ca3320243800886376652c557426ca54d9e59109ebd 3 | size 627764959 4 | -------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2022-0010.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:079dbccd38141ccc2660239ffd49b442671cbdf6c5b8acd66b8a371fad7b3841 3 | size 6829052 4 | 
-------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2022-0050.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:576e3e6ef00a0722d9ad39c1e7e2a438a97c8a7b3bc8a64fd85e2236e8c246fe 3 | size 30905849 4 | -------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2022-0100.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ec467c16aa628e554286e901bb0f9a479d425d8ecce8365f3bb58043b45b9ee3 3 | size 61460417 4 | -------------------------------------------------------------------------------- /embedding-data/icd-10-cm-2022-1000.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:68f6aac04c85f0de9c87c91c81872acfe8d5502f62edfda127d8de74c8c790b5 3 | size 628925623 4 | -------------------------------------------------------------------------------- /figure/unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaneplusplus/icd-10-cm-embedding/a63d21c7d8f90419515bcfc2b0fce4281a6f1e62/figure/unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /icd-10-cm-embedding.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /icd10_dl.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaneplusplus/icd-10-cm-embedding/a63d21c7d8f90419515bcfc2b0fce4281a6f1e62/icd10_dl.rds -------------------------------------------------------------------------------- /make-biogpt-conda-env: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | conda create --name biogpt -c huggingface pytorch transformers 3 | -------------------------------------------------------------------------------- /make-download-rds.R: -------------------------------------------------------------------------------- 1 | library(tibble) 2 | library(tidyr) 3 | 4 | icd10_dl = expand_grid( 5 | tibble(year = 2019:2022), 6 | tibble(emb_dim = c(10, 50, 100, 1000)) 7 | ) 8 | 9 | icd10_dl$url = 10 | sprintf( 11 | "https://github.com/kaneplusplus/icd-10-cm-embedding/blob/main/embedding-data/icd-10-cm-%d-%04d.csv.gz?raw=true", 12 | icd10_dl$year, 13 | icd10_dl$emb_dim 14 | ) 15 | 16 | saveRDS(icd10_dl, "icd10_dl.rds") 17 | -------------------------------------------------------------------------------- /model-performance.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaneplusplus/icd-10-cm-embedding/a63d21c7d8f90419515bcfc2b0fce4281a6f1e62/model-performance.rds -------------------------------------------------------------------------------- /sup-model-perf.rds: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/kaneplusplus/icd-10-cm-embedding/a63d21c7d8f90419515bcfc2b0fce4281a6f1e62/sup-model-perf.rds -------------------------------------------------------------------------------- /year-validation.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaneplusplus/icd-10-cm-embedding/a63d21c7d8f90419515bcfc2b0fce4281a6f1e62/year-validation.rds --------------------------------------------------------------------------------
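A minimal end-to-end download sketch (editor's illustration; it assumes the icd10_dl.rds table built by make-download-rds.R above and network access to GitHub):

icd10_dl = readRDS("icd10_dl.rds")
# Pick the 10-dimensional 2019 embeddings and fetch the LFS-backed file.
url = icd10_dl$url[icd10_dl$year == 2019 & icd10_dl$emb_dim == 10]
download.file(url, destfile = "icd-10-cm-2019-0010.csv.gz", mode = "wb")
emb = readr::read_csv("icd-10-cm-2019-0010.csv.gz")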