├── .github ├── .gitignore └── workflows │ └── deploy_bookdown.yml ├── LICENSE ├── README.md ├── metadata.xlsx ├── paper ├── Figure1.Rmd ├── Figure3.Rmd ├── Figure4.Rmd └── figures.Rmd └── supplement ├── 01-intro.Rmd ├── 02-prequel.Rmd ├── 03-prequel_analysis.Rmd ├── 04-current.Rmd ├── 05-current-techs.Rmd ├── 06-text-mining.Rmd ├── 07-current_analysis.Rmd ├── 08-future.Rmd ├── 09-references.Rmd ├── DESCRIPTION ├── LCM.png ├── LICENSE ├── LICENSE.md ├── _bookdown.yml ├── _output.yml ├── bdna.png ├── cpal.png ├── current_analysis.png ├── current_curve.png ├── current_hist.png ├── current_tech.png ├── dsp.png ├── exm.png ├── fig13a.png ├── fig13b.png ├── fig1A.png ├── fisseq.png ├── hcr.png ├── historical_barcoding.png ├── index.Rmd ├── langs_doc.png ├── lcm_topics.png ├── lcm_words.png ├── merfish.png ├── more_analyses ├── analysis.Rmd ├── array.Rmd ├── comparison.Rmd ├── iss.Rmd ├── microdissection.Rmd ├── prequel2.Rmd ├── prequel_analysis.Rmd └── smfish.Rmd ├── museumst.bib ├── niche.png ├── note.png ├── preamble.tex ├── prequel_techs.png ├── rca.png ├── regular.png ├── sedal.png ├── seqfish-plus.png ├── seqfish.png ├── smfish1998.png ├── smfish_cells.png ├── smfish_cells_part.png ├── smfish_gene.png ├── smfish_gene_part.png ├── solid.png ├── split-fish.png ├── style.css ├── tip.png ├── tomo.png └── voxelation.png /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/deploy_bookdown.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | schedule: 6 | - cron: "15 15 15 */1 *" 7 | 8 | name: renderbook 9 | 10 | jobs: 11 | bookdown: 12 | name: Render-Book 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v3 16 | - uses: r-lib/actions/setup-r@v2 17 | - uses: r-lib/actions/setup-pandoc@v2 18 | - uses: r-lib/actions/setup-tinytex@v2 19 | - uses: r-lib/actions/setup-r-dependencies@v2 20 | with: 21 | extra-packages: any::bookdown 22 | working-directory: supplement 23 | - name: Render Book 24 | run: Rscript -e 'xfun::in_dir("supplement", bookdown::render_book("index.Rmd", output_format = "bookdown::bs4_book"))' 25 | - uses: actions/upload-artifact@v4 26 | with: 27 | name: _book 28 | path: supplement/_book/ 29 | 30 | # Need to first create an empty gh-pages branch 31 | # see https://pkgdown.r-lib.org/reference/deploy_site_github.html 32 | # and also add secrets for a GH_PAT and EMAIL to the repository 33 | # gh-action from Cecilapp/GitHub-Pages-deploy 34 | checkout-and-deploy: 35 | runs-on: ubuntu-latest 36 | needs: bookdown 37 | steps: 38 | - name: Checkout 39 | uses: actions/checkout@main 40 | - name: Download artifact 41 | uses: actions/download-artifact@v4 42 | with: 43 | # Artifact name 44 | name: _book # optional 45 | # Destination path 46 | path: _book # optional 47 | - name: Deploy to GitHub Pages 48 | uses: Cecilapp/GitHub-Pages-deploy@v3 49 | env: 50 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 51 | with: 52 | email: ${{ secrets.EMAIL }} 53 | build_dir: _book # optional 54 | jekyll: no # optional 55 | 56 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2021, Pachter Lab 4 | All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LP_2021 2 | 3 | [![DOI](https://zenodo.org/badge/329703669.svg)](https://zenodo.org/badge/latestdoi/329703669) 4 | 5 | This repository contains the code used to generate figures for [the Museum of Spatial Transcriptomics paper](https://www.nature.com/articles/s41592-022-01409-2) and the code used to generate the supplement. The code for figures are in the `paper` directory, and the code for the supplement is in the `supplement` directory. The code can be run on [RStudio Cloud](https://rstudio.cloud/project/2492054). See `supplement/index.Rmd` for instructions to install dependencies to run code in the book; all dependencies have already been installed on RStudio Cloud. The rendered supplement can be seen [here](https://pachterlab.github.io/LP_2021). To submit new items to [the database](https://docs.google.com/spreadsheets/d/1sJDb9B7AtYmfKv4-m8XR7uc3XXw_k4kGSout8cqZ8bY/edit#gid=588531469), use [this Google Form](https://forms.gle/HjQD9x6AMjR7C62SA). 
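To render the supplement locally, the following minimal sketch mirrors the GitHub Actions workflow in `.github/workflows/deploy_bookdown.yml` (it assumes R, Pandoc, and the dependencies listed in `supplement/index.Rmd` are installed):

```r
# Build the bookdown supplement into supplement/_book/, as the CI workflow does
xfun::in_dir("supplement", bookdown::render_book("index.Rmd", output_format = "bookdown::bs4_book"))
```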
6 | -------------------------------------------------------------------------------- /metadata.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/metadata.xlsx -------------------------------------------------------------------------------- /paper/Figure1.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Figure 1" 3 | author: "Lambda Moses" 4 | date: "11/9/2020" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | Code to make Figure 1 13 | ```{r} 14 | library(museumst) 15 | library(patchwork) 16 | library(tidyverse) 17 | ``` 18 | 19 | ```{r} 20 | events <- read_major_events(update = TRUE) 21 | ``` 22 | 23 | ```{r, fig.width=8, fig.height=3} 24 | (p1a <- events %>% 25 | filter(category == "technique") %>% 26 | plot_timeline(c(1.5, -1.5, 1, -1.5, 1, -1.5, 2, -2.5, 1.2), 27 | expand_x = c(0.1, 0.1), expand_y = c(0,0)) + 28 | ggtitle("Major events in evolution of prequel techniques")) 29 | ``` 30 | 31 | ```{r, fig.width=8, fig.height=4} 32 | (p1b <- events %>% 33 | filter(category == "ISH atlas") %>% 34 | plot_timeline(c(0.5, -0.5, -0.85, 0.7, -0.25, 1, -0.5, 0.3, -0.8, 0.5, -1, 0.7, -0.25, 0.5, 35 | -0.87, 1, -0.6, 0.3, -0.5, 0.3), description_width = 25, 36 | expand_x = c(0.1, 0.1), expand_y = c(0,0)) + 37 | ggtitle("Major prequel (WM)ISH atlases")) 38 | ``` 39 | 40 | ```{r, fig.width=8, fig.height=4} 41 | (p1c <- events %>% 42 | filter(category == "technique2") %>% 43 | plot_timeline(c(-0.5, 0.5, -0.5, -1, 0.5, 0.7, -0.8, -0.3, -0.6, 0.5, -0.5, 44 | 0.6, 1, -0.6, 0.3, -1, -0.3), 45 | expand_y = c(0,0), expand_x = c(0.1,0.1)) + 46 | ggtitle("Major events in evolution of current era techniques")) 47 | ``` 48 | 49 | ```{r, fig.width=8, fig.height=12} 50 | (fig1 <- p1a + p1b + p1c + 51 | plot_layout(ncol = 1, guides = "collect", heights = c(1, 1.35, 1)) + 52 | plot_annotation(tag_levels = "A") & 53 | theme(legend.position = "bottom", legend.title = element_blank(), 54 | legend.spacing = unit(0, "lines"))) 55 | ``` 56 | 57 | ```{r} 58 | ggsave("fig1.pdf", fig1, width = 12, height = 13) 59 | ``` 60 | 61 | ```{r} 62 | sessionInfo() 63 | ``` 64 | 65 | -------------------------------------------------------------------------------- /paper/Figure3.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Figure 3" 3 | author: "Lambda Moses" 4 | date: "10/01/2021" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ```{r} 13 | library(museumst) 14 | library(tidyverse) 15 | library(patchwork) 16 | library(gganatogram) 17 | library(ggrepel) 18 | library(scales) 19 | theme_set(theme_bw()) 20 | ``` 21 | 22 | ```{r} 23 | nms <- c("Prequel", "ROI selection", "smFISH", "NGS barcoding", "ISS", "De novo") 24 | data_sheets <- read_metadata(nms, update = TRUE) 25 | ``` 26 | 27 | ```{r} 28 | data_sheets <- data_sheets %>% 29 | mutate(era = case_when(sheet == "Prequel" ~ "prequel", 30 | TRUE ~ "current")) 31 | ``` 32 | 33 | ```{r} 34 | publications <- get_pubs_df(data_sheets, "era") 35 | ``` 36 | 37 | ```{r} 38 | methods_current <- data_sheets %>% 39 | select(title, date_published, method, sheet, era, country:short_name) %>% 40 | filter(era == "current", !is.na(method)) %>% 41 | distinct() 42 | ``` 43 | 44 | ```{r} 45 | (p3a <- 
methods_current %>% 46 | select(institution, method) %>% 47 | distinct() %>% 48 | mutate(method = fct_lump_min(method, 3)) %>% 49 | filter(!method %in% c("LCM", "Other")) %>% 50 | pubs_per_cat(method) + 51 | labs(y = "Number of institutions", x = "Technology")) 52 | ``` 53 | 54 | ```{r} 55 | smfish <- read_metadata("smFISH", update = TRUE) 56 | ``` 57 | 58 | ```{r} 59 | (p3c <- ggplot(smfish, aes(date_published, n_genes)) + 60 | geom_point(aes(color = method), alpha = 0.5) + 61 | geom_text_repel(aes(label = method, color = method), segment.alpha = 0.5, 62 | max.overlaps = 20) + 63 | geom_smooth(method = "lm") + 64 | labs(x = "Date published", title = "Number of genes per dataset", 65 | y = NULL) + 66 | theme(legend.position = "none") + 67 | scale_y_log10(expand = expansion(mult = c(0.1, 0.2))) + 68 | annotation_logticks(sides = "l")) 69 | ``` 70 | 71 | ```{r} 72 | sum_cells <- smfish %>% 73 | group_by(date_published, title, method) %>% 74 | summarize(n_cells = sum(`n_cells/bins/spots`, na.rm = TRUE)) %>% 75 | filter(n_cells > 0) 76 | ``` 77 | 78 | ```{r} 79 | (p3d <- ggplot(sum_cells, aes(date_published, n_cells)) + 80 | geom_point(aes(color = method)) + 81 | geom_text_repel(aes(label = method, color = method), segment.alpha = 0.5) + 82 | geom_smooth(method = "lm") + 83 | labs(x = "Date published", title = "Total number of cells per study", y = NULL) + 84 | theme(legend.position = "none") + 85 | scale_y_log10() + 86 | annotation_logticks()) 87 | ``` 88 | 89 | ```{r} 90 | species <- data_sheets %>% 91 | filter(era == "current", !is.na(species)) %>% 92 | unnest_cat(species) 93 | ``` 94 | 95 | ```{r} 96 | data("species_cols") 97 | ``` 98 | 99 | ```{r} 100 | species_pie <- species %>% 101 | mutate(species = fct_lump(species, 4)) %>% 102 | count(species) %>% 103 | mutate(species = fct_reorder(species, n, .desc = TRUE) %>% 104 | fct_relevel("Other", after = Inf)) %>% 105 | arrange(desc(species)) %>% 106 | mutate(label = paste("italic('", species, "')", "~(", n, ")")) 107 | ``` 108 | 109 | ```{r} 110 | (p3e <- ggplot(species_pie, aes(x = "", y = n, fill = species)) + 111 | geom_col(position = "stack", color = "gray50", size = 0.3, show.legend = FALSE, 112 | width = 1) + 113 | geom_text_repel(aes(x = 1.5, label = label), parse = TRUE, segment.size = 0, 114 | position = position_stack(vjust = 0.5)) + 115 | coord_polar(theta = "y") + 116 | scale_fill_manual(values = species_cols) + 117 | scale_y_continuous(breaks = species_pie$y_label, labels = species_pie$label) + 118 | scale_x_discrete(expand = expansion(0.1)) + 119 | theme_void() 120 | ) 121 | ``` 122 | 123 | ```{r} 124 | data("hgFemale_key") 125 | data("mmFemale_key") 126 | data("hgMale_key") 127 | ``` 128 | 129 | ```{r} 130 | organs <- data_sheets %>% 131 | filter(era == "current") %>% 132 | select(date_published:journal, country:year, species, sheet, organ, pathological) %>% 133 | distinct() 134 | organs <- organs %>% 135 | filter(species %in% c("Mus musculus", "Homo sapiens")) %>% 136 | unnest_cat(organ, other_cols = c("species", "pathological")) %>% 137 | filter(!is.na(organ)) 138 | ``` 139 | 140 | ```{r} 141 | organs_hg_f <- organs %>% 142 | filter(species == "Homo sapiens") %>% 143 | inner_join(hgFemale_key[, c("organ", "type")], by = "organ") %>% 144 | count(organ, type, pathological, name = "value") 145 | ``` 146 | 147 | ```{r} 148 | organs_mm_f <- organs %>% 149 | filter(species == "Mus musculus") %>% 150 | inner_join(mmFemale_key[, c("organ", "type")], by = "organ") %>% 151 | count(organ, type, pathological, name = "value") 152 | ``` 
153 | 154 | ```{r} 155 | organs_hg_m <- organs %>% 156 | filter(species == "Homo sapiens") %>% 157 | inner_join(hgMale_key[, c("organ", "type")], by = "organ") %>% 158 | count(organ, type, pathological, name = "value") 159 | ``` 160 | 161 | Use the same color scale across humans and mice 162 | ```{r} 163 | limits_use <- range(c(organs_hg_f$value[!organs_hg_f$pathological], 164 | organs_mm_f$value[!organs_mm_f$pathological])) 165 | ``` 166 | 167 | ```{r} 168 | (p3f <- organs_hg_m %>% 169 | filter(!pathological) %>% 170 | gganatogram(organism = "human", sex = "male", fill = "value") + 171 | coord_equal() + 172 | theme_void() + 173 | scale_fill_distiller(palette = "Blues", direction = 1, name = "# publications\n(healthy)", 174 | limits = limits_use)) 175 | ``` 176 | 177 | ```{r} 178 | limits_use2 <- range(c(organs_hg_f$value[organs_hg_f$pathological], 179 | organs_mm_f$value[organs_mm_f$pathological])) 180 | ``` 181 | 182 | ```{r} 183 | (p3g <- organs_hg_f %>% 184 | filter(pathological) %>% 185 | gganatogram(organism = "human", sex = "female", fill = "value") + 186 | coord_equal() + 187 | theme_void() + 188 | scale_fill_distiller(palette = "Reds", direction = 1, name = "# publications\n(pathological)", 189 | limits = limits_use2)) 190 | ``` 191 | 192 | ```{r} 193 | (p3h <- organs_mm_f %>% 194 | filter(!pathological) %>% 195 | gganatogram(organism = "mouse", sex = "female", fill = "value") + 196 | coord_equal() + 197 | theme_void() + 198 | scale_fill_distiller(palette = "Blues", direction = 1, name = "# publications\n(healthy)", 199 | limits = limits_use)) 200 | ``` 201 | 202 | ```{r} 203 | (p3i <- organs_mm_f %>% 204 | filter(pathological) %>% 205 | gganatogram(organism = "mouse", sex = "female", fill = "value") + 206 | coord_equal() + 207 | theme_void() + 208 | scale_fill_distiller(palette = "Reds", direction = 1, name = "# publications\n(pathological)", 209 | limits = limits_use2)) 210 | ``` 211 | 212 | ```{r} 213 | # prequel sheet doesn't have programming language information anyway though it has that columnn 214 | langs <- unnest_cat(data_sheets, language) 215 | ``` 216 | 217 | ```{r} 218 | (p3j <- pubs_per_cat(langs, language, n_top = 5, isotype = TRUE, img_unit = 50) + 219 | labs(title = "Users (downstream analysis)")) 220 | ``` 221 | ```{r} 222 | analysis <- read_metadata("Analysis", update = TRUE) 223 | ``` 224 | 225 | ```{r} 226 | lang_counts <- langs %>% 227 | count(language) %>% 228 | arrange(desc(n)) 229 | ``` 230 | 231 | ```{r} 232 | analysis_langs <- unnest_cat(analysis, language, 233 | other_cols = c("documented", "CRAN/Bioc/pip/conda")) 234 | max_x <- max(lang_counts$n) 235 | ``` 236 | 237 | ```{r} 238 | (p3k <- pubs_per_cat(analysis_langs, language, n_top = 5, isotype = TRUE, img_unit = 50) + 239 | scale_y_continuous(limits = c(0, max_x), 240 | breaks = breaks_width(50), 241 | expand = expansion(mult = c(0, 0.05))) + 242 | labs(title = "Package developers")) 243 | ``` 244 | 245 | ```{r, fig.width=8, fig.height=8} 246 | fig3_top <- wrap_elements(full = p3a) + p3f + p3g + 247 | p3e + p3h + p3i + 248 | plot_layout(ncol = 3, guides = "collect", widths = c(1.8, 1, 1), heights = c(1.3, 1)) 249 | fig3_bottom <- p3c + p3d + 250 | p3j + p3k + 251 | plot_layout(ncol = 2) 252 | (fig3 <- fig3_top / fig3_bottom + 253 | plot_layout(heights = c(1.3, 1)) + 254 | plot_annotation(tag_levels = "a")) 255 | ``` 256 | 257 | ```{r} 258 | ggsave("fig3.pdf", fig3, width = 12, height = 14) 259 | ``` 260 | 261 | -------------------------------------------------------------------------------- 
/paper/Figure4.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Figure 4" 3 | author: "Lambda Moses" 4 | date: "10/01/2021" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ```{r} 13 | library(museumst) 14 | library(tidyverse) 15 | library(patchwork) 16 | library(lubridate) 17 | theme_set(theme_bw()) 18 | ``` 19 | 20 | ```{r} 21 | analysis <- read_metadata("Analysis") 22 | analysis$sheet <- "Analysis" 23 | ``` 24 | 25 | ```{r} 26 | nms <- c("ROI selection", "smFISH", "NGS barcoding", "ISS", "De novo") 27 | data_sheets <- read_metadata(nms, update = TRUE) 28 | ``` 29 | 30 | ```{r} 31 | data_sheets <- data_sheets %>% 32 | mutate(Era = case_when(sheet == "Prequel" ~ "prequel", 33 | TRUE ~ "current")) 34 | ``` 35 | 36 | ```{r} 37 | current <- data_sheets %>% 38 | filter(Era == "current") %>% 39 | select(date_published:journal, country:year, sheet) %>% 40 | distinct() 41 | ``` 42 | 43 | ```{r} 44 | all_current <- rbind(current, analysis[,names(current)]) 45 | ``` 46 | 47 | ```{r} 48 | all_current <- all_current %>% 49 | mutate(Type = case_when(sheet == "Analysis" ~ "analysis", 50 | TRUE ~ "data")) %>% 51 | distinct() 52 | ``` 53 | 54 | ```{r} 55 | (p4a <- era_freqpoly(all_current, Type, preprints = TRUE, binwidth = 120) + 56 | scale_y_continuous(expand = expansion(c(0, 0.05))) + 57 | scale_x_date(breaks = seq(ymd("2001-01-01"), ymd("2022-01-01"), by = "2 years"), 58 | expand = expansion(c(0,0.05)), 59 | date_labels = "%Y")) 60 | ``` 61 | 62 | ```{r} 63 | pubs <- get_pubs_df(data_sheets, other_cols = "sheet") 64 | ``` 65 | 66 | ```{r} 67 | data("sheet_fill") 68 | sheet_fill2 <- sheet_fill[nms] 69 | ``` 70 | 71 | ```{r} 72 | (p4b <- pubs %>% 73 | mutate(sheet = fct_infreq(sheet) %>% fct_rev()) %>% 74 | ggplot() + 75 | geom_histogram(aes(x = date_published, y = after_stat(count), fill = sheet), 76 | stat = "bin", color = "gray", size = 0.5, binwidth = 120, 77 | position = "stack") + 78 | scale_fill_manual(values = sheet_fill2, name = "Category") + 79 | scale_y_continuous(expand = expansion(c(0, 0.05))) + 80 | scale_x_date(breaks = seq(ymd("2001-01-01"), ymd("2022-01-01"), by = "2 years"), 81 | expand = expansion(c(0,0.05)), 82 | date_labels = "%Y") + 83 | labs(x = "Date published", y = "Number of publications")) 84 | ``` 85 | 86 | ```{r, fig.width=6, fig.height=8} 87 | (fig4 <- p4a / p4b + 88 | plot_annotation(tag_levels = "a")) 89 | ``` 90 | 91 | ```{r} 92 | ggsave("fig4.pdf", fig4, width = 8, height = 8) 93 | ``` 94 | 95 | ```{r} 96 | sessionInfo() 97 | ``` 98 | 99 | -------------------------------------------------------------------------------- /supplement/01-intro.Rmd: -------------------------------------------------------------------------------- 1 | # Introduction {#intro} 2 | 3 | The spatial organization of the components of biological systems is crucial for their proper function. For instance, morphogen gradients in embryos are tightly regulated to ensure that the right cell types differentiate at the right place. In adults, spatial organization of cells in tissues is important to proper functions of organs. For instance, the liver lobule is divided in labor according to distance from the portal triad as such distance affects suitability of different tasks. 
Both oxygen level and morphogen gradient regulate zonation of metabolism [@Gebhardt2014]; there is more oxidative phosphorylation and gluconeogenesis in the more oxygenated periportal region and more glycolysis in the more deoxygenated pericentral region. How cell types and cellular functions vary in space can be measured by quantifying gene expression in space. Conversely, the expression of an unknown gene in space can give clues to its function. Gene expression is usually quantified by measuring the proteins or transcripts encoded by the gene, and high-throughput spatial methods exist for both proteins and transcripts. In other words, cellular function exemplifies the maxim that "the whole is greater than the sum of its parts", and in large part this follows from "location, location, location". 4 | 5 | Here we focus on spatial transcriptomics (the field of spatial proteomics is covered elsewhere [@Lundberg2019; @Baharlou2019; @Buchberger2018]). Even spatial transcriptomics is a vast field, and it is useful to begin by considering the scope of what it contains. Naïvely, one may say that spatial transcriptomics means quantifying the complete set of RNAs encoded by the genome in space. Usually the "in space" is at some microscopic resolution rather than geospatial, as often assumed in the term "spatial statistics"; the resolution is usually cellular, though sometimes subcellular. The "spatial" is in contrast to other transcriptomics methods that, by the nature of their assays, lose information about tissue structure in space. That is the case for microarray technology on bulk tissue, bulk RNA-seq, and single cell RNA-seq (scRNA-seq) based on dissociation of tissue -- here the "spatial" usually means tissue structure in space. More broadly, the "spatial" can mean knowing the spatial context of samples even though the spatial context is only a label and the coordinates are not collected or not used, such as in some laser capture microdissection (LCM) literature [@Aguila2021; @Baccin2020; @Nichterwitz2016], Niche-seq [@Medaglia2017], and APEX-seq [@Fazal2019a]. The "spatial" can also mean preserving the spatial coordinates of samples within tissue, though the coordinates may or may not be explicitly used in data analysis, such as in the various single molecular fluorescent in situ hybridization (smFISH) based technologies such as seqFISH [@Lubeck2014] and MERFISH [@Chen2015] and array based technologies such as Spatial Transcriptomics (ST) [@Stahl2016a]. 6 | 7 | There is more complexity in defining "transcriptomics". While some technologies usually called "spatial transcriptomics" are indeed transcriptome-wide, such as ST, Visium, and LCM followed by RNA-seq, many technologies that only profile a panel of usually a few hundred genes are nevertheless considered part of "spatial transcriptomics". Here "transcriptomics" actually means high-throughput quantification of gene expression, preferably highly multiplexed, quantifying numerous genes within the same piece of tissue at the same time. However, what counts as "high-throughput"? Is there a minimum number of genes required? Should 50 genes be enough? Or a hundred genes? The threshold number of genes required to be considered "high-throughput" is difficult to define; here, by "high-throughput", we mean the intent to quantify the expression of more genes than is normally done with fluorescent in situ hybridization (FISH) or immunofluorescence when only color distinguishes between genes, which can mean more than about 5 genes.
There is also some complication regarding whether "highly multiplexed" should be required. Some fairly recent studies that intended to perform high-throughput gene expression profiling in space did not profile most genes at the same time (e.g. multiple rounds of smFISH hybridization, each round for a different set of genes) [@Lignell2017; @Wang2021], or even profiled different genes in different tissue sections [@Bayraktar2020; @Battich2013]; these papers nevertheless claimed to be spatial transcriptomic or something similar. 8 | 9 | When terms are to be defined by how they are used, then we rely on a generic and inclusive definition of "spatial transcriptomics", which can be summarized as: Quantifying transcripts while keeping spatial context of samples within tissue or cell, with intent to quantify transcripts of more genes than normally done with one round of FISH or immunofluorescence when color is the only way to distinguish between genes. This is the criterion we used in considering what methods to include in our review. 10 | 11 | ## Database 12 | 13 | The field of spatial transcriptomics has grown drastically in the past 5 years, during which several reviews have already been written. These survey existing technologies [@Crosetto2015; @Moor2017; @Strell2019; @Liao2020; @Waylen2020] or discuss how the technologies apply to specific biological systems such as tumors [@Smith2019a], brain [@Lein2017], and liver [@Saviano2020]. Unlike the review papers, we aim to be more systematic and detailed in our review of spatial transcriptomics technology. In addition, we review existing data analysis methods in this field, a crucial aspect of spatial transcriptomics which has not yet been comprehensively reviewed in depth. Moreover, we present a curated database of spatial transcriptomics literature and analyses of the literature metadata to show trends in different aspects of spatial transcriptomics. This database is publicly available [here](https://docs.google.com/spreadsheets/d/1sJDb9B7AtYmfKv4-m8XR7uc3XXw_k4kGSout8cqZ8bY/edit#gid=1693202466). Similar databases have been curated for scRNA-seq literature [@Svensson2020], and for scRNA-seq data analysis tools [@Zappia2018], which have been analyzed to show trends in the field, although the metadata in our database and the analyses are much more extensive. 14 | 15 | Curation of the database was performed by searching terms "spatial transcriptomics", "visium", "merfish", "seqfish", and "geomx dsp" on PubMed and in addition, the term "ISS" on bioRxiv as searching "ISS" on PubMed does not yield many relevant results. Then the search results are manually screened and publications that fit the definition of "spatial transcriptomics" as stated above are added to the database. In addition, publications citing well-known publications that are commonly recognized as "spatial transcriptomics" (e.g. the original paper for MERFISH) are screened. Such searches can find publications for spatial transcriptomics data analysis as well. Additional criteria of inclusion for data analysis publications are discussed in Chapter \@ref(current-analysis). If a method fitting the definition of "spatial transcriptomics" is mentioned anywhere outside the search results, such as a review paper, the publication of that method is also added to the database. For historical methods (i.e. 
prequel) that loosely fit our definition of "spatial transcriptomics" and share objectives with more recent spatial transcriptomics but are not highly multiplexed and do not involve cDNA microarrays or next generation sequencing (NGS), search terms such as "gene trap screen" and "in situ hybridization atlas" were used. Review papers and protocols are excluded. 16 | 17 | Metadata of the collected publications include date published (or posted on bioRxiv for preprints), title, journal, PMID if applicable, DOI URL, species and tissue the data comes from (or the data analysis method is designed for), whether the tissue is pathological (mouse and human only), and city and institution of the first author. Such metadata allow for analyses of trends in spatial transcriptomics through time, as well as how and where spatial transcriptomics technologies are used. In addition, for historical databases such as for *in situ* hybridization atlases, a metadata column indicates whether the database is still available. Metadata for data and code availability are also recorded. For cDNA microarray and NGS data, accessions in Gene Expression Omnibus (GEO), Sequence Read Archive (SRA), the database of Genotypes and Phenotypes (dbGaP), European Nucleotide Archive (ENA), DNA Data Bank of Japan (DDBJ), the National Omics Data Encyclopedia (China), and BIG Sub (China) are recorded when available. For both downstream analysis and package development, the programming languages used and the code repository are recorded when available. Other metadata specific to certain types of publications are collected as well, such as whether the method was used to target specific histologically defined regions of interest (ROI) or to analyze the tissue in a regular grid for microdissection based methods, and whether the implementation of a data analysis method is packaged and reasonably well-documented for data analysis publications. 18 | 19 | There are some caveats to our review and database. First, while we narrate a history of the evolution of techniques and in some cases explain how one technique influenced another, we do not present aspects of the history that are not apparent from the publications. Studying those aspects of the history of the field may require interviewing the people who developed the techniques, as well as exploration of additional unpublished material. Second, our database was originally only meant for papers, so relevant materials that are not presented in that format are underrepresented. Examples of such materials include databases and software not presented as papers (e.g. the XDB3 database [@XDB3]). This means that the metadata analyses in this book might not be representative of all material that exists in spatial transcriptomics. Third, as the curation was done manually and the search engines are imperfect, the database might not include some relevant literature unknown to us. Please contact us or open an issue in the GitHub repo of this book if you wish to suggest new entries to the database. 20 | 21 | The database is manually updated daily by screening RSS feeds from the search terms in PubMed and bioRxiv mentioned above. New entries and the associated metadata can also be submitted via the [Google Form](https://forms.gle/HjQD9x6AMjR7C62SA). 22 | 23 | ## Organization of the database and this book 24 | 25 | The database is organized as several different sheets for different types of publications.
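As a minimal sketch of how the database can be queried (assuming the `museumst` package used throughout this book is installed; see `index.Rmd` for installation instructions), one sheet can be read and summarized as follows, where `update = TRUE` fetches the current version of the Google Sheet:

```{r, eval=FALSE}
library(museumst)
library(dplyr)
# Read the smFISH sheet of the database
smfish <- read_metadata(sheet_use = "smFISH", update = TRUE)
# Entries listing multiple species are split into one row per species before counting
smfish_species <- unnest_cat(smfish, species)
count(smfish_species, species, sort = TRUE)
```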
Many technologies can be classified in several different ways, some more useful in some contexts than others, and spatial transcriptomics is no exception. Furthermore, the line between different categories can at times be difficult to draw and there are gray areas. 26 | 27 | Our database starts with articles published in the 1980s to provide historical context for what is now commonly known as spatial transcriptomics; this literature is summarized in Chapter \@ref(prequel), and historical methods of data analysis are reviewed in Chapter \@ref(prequel-analysis). 28 | 29 | The literature is broken down into the following categories, corresponding to sheets in the database, to be defined and elaborated on in the subsequent chapters. Technologies to collect data (Chapter \@ref(current)) can be broadly classified by the mechanisms by which the spatial contexts of samples are obtained: ROI selection (Section \@ref(microdissection)), next generation sequencing with spatial barcodes (abbreviated as NGS barcoding, Section \@ref(array)), single molecular FISH (smFISH) (Section \@ref(smfish)), *in situ* sequencing (ISS) (Section \@ref(iss)), and methods that do not require *a priori* spatial information (Section \@ref(no-priori)). Within some of the categories, especially microdissection and NGS barcoding, there is a large variety of mechanisms as well as gray areas. Methods that fall into the gray areas and do not fit nicely into any category are placed in the "Other" sheet. 30 | 31 | These technologies can be classified in other ways, such as whether transcripts can be traced back to individual cells, and whether the spatial context takes the form of manually selected ROIs, a regular grid, both, or neither. These other categories can cut across different mechanisms of acquiring spatial contexts. In addition, studies using these technologies can be classified by purpose: demonstration of new data collection techniques, reference atlases intended to more comprehensively characterize the system of interest, characterization of tissues without intending to build reference atlases, and demonstration of data analysis methods. As the purpose of this database and book is to systematically document data collection and analysis methods in spatial transcriptomics, the mechanisms of acquiring spatial contexts are used to structure the database and text; the other ways of categorization are mentioned in the text to give some perspective to potential users of data collection techniques or users of existing datasets. 32 | 33 | Data analysis methods (Chapter \@ref(current-analysis)) are placed under the following categories: preprocessing (Section \@ref(preprocessing)), exploratory data analysis (EDA) (Section \@ref(eda)), spatial reconstruction of single cell RNA-seq (scRNA-seq) data (Section \@ref(reconstruction)), spatially variable genes (Section \@ref(variable)), archetypal gene expression patterns (Section \@ref(pattern)), using the transcriptome to identify spatially coherent regions in tissue (Section \@ref(region)), cell type deconvolution of non-single-cell resolution spatial data (Section \@ref(deconvolution)), cell-cell interaction (Section \@ref(cell-interaction)), and other types of analyses. These data analysis methods can also be placed on an upstream-to-downstream spectrum. Upstream methods prepare the data to be more amenable to downstream analyses, and downstream methods aim to give biologically relevant information and hypotheses. By this criterion, preprocessing, including cell segmentation in highly multiplexed smFISH images and obtaining a gene count matrix from fastq files, would be upstream.
Quality control of the gene count matrix and EDA would be downstream from that, followed by cell type deconvolution, mapping cells to locations, and then spatially variable genes and cell-cell interactions. The types of data analysis methods are introduced roughly in the order from upstream to downstream. 34 | 35 | In each of the following chapters, besides introducing the relevant technologies, the literature metadata is analyzed to show relevant sociological trends such as who is using each technology, usage trends of technologies, and the programming languages used. The metadata analyses can be run interactively in [RStudio Cloud](https://rstudio.cloud/project/1981124). 36 | -------------------------------------------------------------------------------- /supplement/02-prequel.Rmd: -------------------------------------------------------------------------------- 1 | # (PART) Prequel era {.unnumbered} 2 | 3 | # Prequel era {#prequel} 4 | 5 | ```{r include=FALSE} 6 | knitr::opts_chunk$set(echo = FALSE, fig.keep = "all", 7 | message = FALSE, warning = FALSE, 8 | fig.align = "center") 9 | library(museumst) 10 | library(tidyverse) 11 | library(sf) 12 | library(lubridate) 13 | library(gganatogram) 14 | library(patchwork) 15 | library(scales) 16 | library(here) 17 | theme_set(theme_bw()) 18 | ``` 19 | 20 | ```{r} 21 | prequel <- read_metadata(sheet_use = "Prequel", update = TRUE) 22 | ``` 23 | 24 | Some previous reviews on spatial transcriptomics start the history of spatial transcriptomics with laser capture microdissection (LCM) followed by microarray or RNA-seq and single molecular fluorescent *in situ* hybridization (smFISH) in the late 1990s [@Lein2017; @Liao2020; @Crosetto2015]. We will discuss these later, but note that by 1999 and the early 2000s, when the earliest LCM microarray studies were published [@Luo1999; @Sgroi1999; @Ohyama2000; @Kitahara2001], the quest to profile the transcriptome in space had already begun, with enhancer and gene trap screens, *in situ* reporter screens, and (whole mount) *in situ* hybridization ((WM)ISH) atlases. Although this early literature, dating from the late 1980s, generally does not refer to itself as "spatial transcriptomics", it fits into the definition of spatial transcriptomics as stated in Chapter \@ref(intro). 25 | 26 | We call this body of literature "prequel", because first, its origin predates LCM microarray. Second, unlike most technologies covered by existing spatial transcriptomics reviews, the techniques used were not multiplexed and were less quantitative, and as a result, they have fallen out of favor. In contrast, what comes after "prequel" will be called "current", although the prequel and current eras chronologically overlap. Given what current era spatial transcriptomics is commonly perceived to be, here "prequel" is broadly defined as methods that fulfill the more relaxed definition of "spatial transcriptomics" in this book, but do not involve cDNA microarray, next generation sequencing (NGS), or single molecular imaging. 27 | 28 | There are `r nrow(prequel)` prequel papers in our database. Prequel literature is included in the database and covered here for the following reasons. First, the legacy of the prequel era has influenced more recent spatial transcriptomic research; the present and future are shaped by the past. For example, spatial reconstruction of scRNA-seq data in Seurat v1 [@Satija2015], the Achim et al. *Platynereis* study [@Achim2015], `DistMap` [@Karaiskos2017], and the Zeisel et al. 
Mouse Brain Atlas [@Zeisel2018] used (WM)ISH atlases as spatial references. Recent Spatial Transcriptomics^TM^ (ST) mouse brain data are still compared to the ISH atlas of the Allen Brain Atlas (ABA) [@Ortiz2020; @Chen2020]. A study on spatial reconstruction of scATAC-seq data compared the *in silico* reconstruction to the FlyLight *Drosophila* enhancer atlas [@Jenett2012; @BravoGonzalez-Blas2020]. Hence prequel resources can still be useful in the current era. We expand on this in Chapter \@ref(current). Second, some features of the prequel era may benefit future spatial transcriptomics studies; this will be discussed after more recent technologies are reviewed. Third, the various quests of the current era had already begun in the prequel era, and this history can show how the coming together of new technologies made us better at achieving the previous generation's dreams. 29 | 30 | Fourth, as shown later in this book, existing current era spatial transcriptomics data are by and large from humans and mice, and especially the brain (Figure \@ref(fig:species-pie), Figure \@ref(fig:anat1)). For other model and non-model organisms (e.g. *Xenopus laevis* [@Bowes2009; @XDB3], *Ciona intestinalis* [@Satou2001], *Danio rerio* [@Sprague2003; @Belmamoune2008], *Oryzias latipes* [@Henrich2003], *Gallus gallus* [@Bell2004], *Taeniopygia guttata* [@Lovell2020], and to some extent, even *Drosophila melanogaster* [@Tomancak2002; @LuengoHendriks2006a]), some tissues other than the brain (e.g. lung (prior to the increased interest following the COVID-19 pandemic) [@Ardini-Poleske2017], retina [@Blackshaw2004], genitourinary tract [@Harding2011]), and miRNAs [@Ahmed2015; @Karali2010; @Diez-Roux2011; @Aboobaker2005; @Wienholds2005; @Darnell2006], the most comprehensive spatial transcriptomic resources, if any are available at all, are still (WM)ISH atlases. For plants, the most comprehensive resources can still be enhancer and gene trap screens [@Johnson2005; @Nakayama2005]. Hence, while current era technologies may produce more quantitative and highly multiplexed data, they have not completely superseded (WM)ISH atlases. This may be likened to the Jet Age in the history of aviation: while massive jet airliners made aviation available to the masses, so that when most people fly they fly on jets, jet airliners have not completely superseded airplanes with reciprocating engines and propellers, which are still very common in general aviation. Finally, the historical literature is curated for the same reason that museums and libraries keep historical maps and scientific works that have been superseded by more recent work; it is part of our heritage. 31 | 32 | An overall timeline for prequel techniques is shown in Figure \@ref(fig:tl1), which will be discussed in more detail in the rest of this chapter. 33 | 34 | ```{r} 35 | events <- read_major_events(update = TRUE) 36 | ``` 37 | 38 | ```{r tl1, fig.width=8, fig.height=4, fig.cap="Timeline of prequel techniques.", out.width="100%"} 39 | events %>% 40 | filter(category == "technique") %>% 41 | plot_timeline(c(1.5, -1.5, 1, -1.5, 1, -1.5, 2, -2.5, 1.2), 42 | expand_x = c(0.1, 0.1), expand_y = c(0,0), 43 | include_refs = FALSE) + 44 | theme(legend.position = "none") 45 | ``` 46 | 47 | ## Enhancer and gene traps {#traps} 48 | 49 | Long before the advent of reference genomes for common model organisms, the quest to characterize genes based on expression pattern in space had already begun.
The earliest high-throughput efforts to identify and characterize such genes were enhancer traps. To the best of our knowledge, the first use of a reporter to visualize gene expression in space was reported in 1983. It used lacZ fused to sequences upstream of the hsp70 gene encoding a heat shock protein in *Drosophila melanogaster* and inserted into the genome with a P element to characterize the puffs formed in polytene chromosomes and the tissue distribution of hsp70 in response to heat shock [@Lis1983]. 50 | 51 | The first enhancer trap screen in *Drosophila melanogaster* was published in 1987 [@OKane1987]. The P element is a transposable element found in *Drosophila*. In an enhancer trap vector, a reporter gene (such as lacZ, here with the polyadenylation site of the hsp70 gene) and a marker gene with its own promoter, used to identify individuals and their offspring with the vector integrated into the germline (such as rosy, which identifies such individuals in *Drosophila* by eye color), are flanked by the 5' and 3' ends of the P element necessary for transposition (Figure \@ref(fig:trap1)). The vector is injected into *Drosophila* embryos before the formation of pole cells [@Spradling1982]. As a transposon, the construct is randomly inserted into the genome, and since the P element promoter is so weak that an enhancer is required for the promoter to drive transcription of the reporter gene, the location of reporter gene expression marks where the enhancer is active. As the transposon is inserted into different locations of the genome in different individuals, each individual that has the vector integrated into the germline forms a transformant line. In *Drosophila*, in many cases, expression patterns of $\beta$-galactosidase do reflect the expression pattern of a nearby gene [@Bellen1989; @Wilson1989]. 52 | 53 | (ref:trap) Illustrations of enhancer trap as described in [@OKane1987] and gene trap as described in [@Gossler1989] (Created with BioRender.com). 54 | 55 | ```{r trap1, fig.cap='(ref:trap)', out.width="100%"} 56 | knitr::include_graphics("fig1A.png") 57 | ``` 58 | 59 | Since then, different vectors have been developed for better efficiency and flexibility [@Stanford2001], and enhancer traps have been applied at increasing scale. The 1987 study recovered 39 lines [@OKane1987], possibly characterizing 39 genes, but already in 1989, over 3000 lines were possible in one study [@Bier1989]. Enhancer trapping was also adapted to other species, such as mouse [@Gossler1989; @Allen1988] and *Arabidopsis thaliana* [@Sundaresan1995]. 60 | 61 | Enhancer traps were not intended to be mutagenic [@OKane1987], nor are they highly mutagenic [@Stanford2001]. Gene traps and promoter traps were introduced not only to screen for genes with restricted expression patterns, but also to enable functional analysis of the gene from homozygous mutant phenotypes [@Friedrich1991]. Like the typical enhancer trap vector, gene trap and promoter trap vectors contain a reporter gene, such as lacZ ($\beta$-gal), to visualize gene expression, and sometimes also a marker to screen for integration, such as the neomycin resistance gene (neo). Often, though, lacZ itself, or a fusion with neo ($\beta$-geo), was used as the marker when screening mouse embryonic stem (ES) cells (Figure \@ref(fig:trap1)). 62 | 63 | Unlike the enhancer trap vector, gene trap and promoter trap vectors do not have a promoter for the reporter, though the marker, if present, can have its own promoter.
In a promoter trap, the construct needs to be inserted in frame and in the correct orientation into an exon of a gene to be expressed, making it very inefficient [@Friedrich1991; @Stanford2001]. 64 | 65 | In contrast, in gene traps, a splice acceptor site is added to the 5' end of the reporter, so the construct can be expressed when inserted into an intron in the right orientation; this is over 50 times more efficient than a promoter trap because introns tend to be much longer than exons and the construct does not have to be in frame with an exon [@Friedrich1991; @Stanford2001]. Gene traps and promoter traps are mutagenic as the reporter has a stop codon, thus truncating the endogenous protein. 66 | 67 | While enhancer traps are more commonly used in *Drosophila*, gene traps are more commonly used in mice. In mice, the enhancer trap vector was initially introduced in 1988 by injection into the male pronucleus of the fertilized egg [@Allen1988]. The throughput of the screen was increased by inserting the construct into the genomes of ES cells by electroporation or retroviral infection [@Stanford2001], screening for ES cells expressing lacZ or the marker, and injecting these ES cells into blastocysts to generate chimeric mice in which to characterize gene expression patterns; chimeras are especially useful for characterizing dominant and lethal mutations [@Friedrich1991; @Gossler1989]. 68 | 69 | The first gene trap screen in mouse ES cells was reported in 1989 [@Gossler1989], recovering 14 lines. Again, variants of the vector emerged and gene trap screens increased in scale. In 1995, nearly 300 mouse gene trap lines were recovered from one study [@Wurst1995a]. Later, smaller gene trap studies specific to particular types of genes, made possible by additional steps to screen ES cell colonies, were performed, targeting for example genes encoding membrane and secreted proteins [@Skarnes1995], genes responding to retinoic acid [@Forrester1996], and genes expressed in hematopoietic and endothelial lineages [@Stanford1998]. In 2001, gene trapping was used to examine not only the expression patterns of genes in cell bodies of neurons in the mouse brain, but also axon guidance [@Leighton2001]. By 2001, a number of gene trap consortia had been established as resources of gene trap vectors and transformant mouse ES cell lines, hoping to create at least one line for each gene in the mouse genome [@Stanford2001]. 70 | 71 | In the 1980s and 1990s, with the increasing throughput of Sanger sequencing and the advent of shotgun sequencing, the amount of sequencing data in GenBank exploded [@Giani2020]. With 5' or 3' rapid amplification of cDNA ends (RACE) PCR, the fusion transcript of the reporter and an endogenous gene could be cloned [@Frohman1988], sequenced, and potentially aligned to existing sequences to identify the gene of interest [@Stanford1998]. However, the golden age of gene trapping was soon to pass, with the rise of ISH atlases in the late 1990s and the advent of reference genomes for *Drosophila melanogaster* [@Myers2000], mouse [@Waterston2002], and human [@Lander2001; @Venter2001] in the early 2000s, which made it easier to design ISH probes from the reference genome to target annotated genes, as is done today. Nevertheless, enhancer and gene traps were not rendered obsolete by these developments.
They have been used in plants and zebrafish through the 2000s and 2010s, as resources of gene expression patterns [@Johnson2005; @Nakayama2005; @Perez-Martin2017; @Hiwatashi2001; @Kawakami2010; @Marquart2015] (Figure \@ref(fig:hist1)). 72 | 73 | (ref:hist1c) Number of publications over time in the prequel era, broken down by technique and colored by species. The gray histogram in the background is the histogram for all prequel publications over time. The bin width of this histogram is 365 days. Here WMISH and ISH exclude fluorescent ISH (FISH). 74 | 75 | ```{r hist1, fig.width=8, fig.height=8, fig.cap='(ref:hist1c)', out.width="100%"} 76 | method <- unnest_cat(prequel, method, other_cols = "species") 77 | pubs_per_year(method, facet_by = "method", fill_by = "species", n_top_fill = 7) + 78 | labs(title = "Number of publications over time", y = NULL) + 79 | theme(legend.position = "top") 80 | ``` 81 | 82 | ## In situ reporter {#in-situ-reporter} 83 | 84 | In enhancer, gene, or promoter trap screens, the reporter is randomly inserted into the genome, not targeting predetermined genes. In contrast, in what we call *in situ* reporter screens, the reporter is fused to predefined regulatory sequences of a gene of interest, with the hope that expression pattern of the reporter would recapitulate that of the gene of interest. Chronologically, this is the second type of high throughput method to profile gene expression patterns (Figure \@ref(fig:hist1)). 85 | 86 | A precursor to this type of method was used in 1991, where random genomic fragments were fused to a lacZ reporter lacking a transcription start signal and injected as plasmids, screening for fragments driving lacZ expression and characterizing the expression patterns in *C. elegans* [@Hope1991]. To the best of our knowledge, the first time *in situ* reporter with predefined regulatory sequences was used to screen for gene expression patterns in a multicellular organism, was in 1995, in *C. elegans* [@Lynch1995]. At that time, the *C. elegans* genome sequencing project was already in progress [@Lynch1995; @Sulston1992], and the genome sequence was declared "essentially complete" in 1998 [@Consortium1998]. Computationally predicted upstream regulatory sequences of 35 putative genes were fused to a promoterless lacZ as a reporter, cloned into plasmid vectors, and microinjected into *C. elegans* gonads to create transformed lines then stained with X-gal [@Lynch1995]. 87 | 88 | A reliable *in situ* reporter was first reported in mice in 1997. It used a recombinant bacterial artificial chromosome (BAC) with part of the full RU49 gene in the BAC replaced by a lacZ construct and showed that the construct is heritable [@Yang1997]. In 2003, a similar strategy, replacing coding sequences of genes in BACs with EGFP reporter gene, was used to create a mouse brain gene expression atlas [GENSAT](http://www.gensat.org/index.html) with BAC transgenic mouse lines [@Gong2003]. The GENSAT lines were used again in 2009 to create a gene expression atlas for retina [@Siegert2009]. Again, GENSAT benefited from the reference genome, which greatly helped with identifying BACs that include sequences flanking a gene that may contain regulatory elements that make the reporter better recapitulate expression pattern of the endogenous gene [@Gong2003]. 89 | 90 | Through the 2000s and 2010s, *in situ* reporters have been used as a targeted alternative to enhancer and gene trap screens informed by the reference genomes. 
To address limitations of gene traps, such as inability to precisely define the allele and favoring genes expressed in ES cells when screening for transformant colonies, high-throughput mouse knock out resources with knock out alleles computationally designed according to a reference genome and annotations have been established [@Skarnes2011; @AMouseForAllReasons]. As these alleles contain a lacZ reporter, these resources have been used to characterize gene expression in over 40 tissues in mutant mice with lacZ staining [@White2013; @West2015; @Tuck2015]. However, for some tissues, only low resolution whole mount staining was performed. Similarly, in both mouse [@Visel2013] and *Drosophila* [@Jenett2012; @Kvon2014], transgenic lines with genomic fragments containing putative enhancers driving expression of reporter genes were established as alternatives to enhancer traps. The enhancer candidates can be selected from sequence homology and ChIP-seq predictions [@Visel2013], or from tiles of sequences flanking genes thought to have restricted expression patterns or within introns of such genes [@Jenett2012]. 91 | 92 | *In situ* reporter atlases exceeded the scale of enhancer and gene trap screens. The largest such atlas in *C. elegans*, WormAtlas, profiled 1886 genes [@Hunt-Newbury2007]; we are unaware of enhancer and gene trap screens in *C. elegans* because *C. elegans* genome sequencing was already underway by 1992 [@Sulston1992], making *in situ* reporter screening feasible before it was so in mice and *Drosophila*. The largest such study in *Drosophila* profiled 7705 enhancer candidates [@Kvon2014], which far exceeded the 3768 enhancer trap lines in 1989 [@Bier1989]. *In situ* reporters were used in mice to profile up to 536 genes[@Siegert2009] and 329 enhancer candidates [@Visel2013], while the large scale gene trap screen in 1995 only reached 279 lines [@Wurst1995a] and later mouse gene trap screens did not typically exceed 100 lines. However, where comparable, *in situ* reporter atlases never reached the scale of (WM)ISH atlases, perhaps because of the large number of transgenic lines required. Allen Brain Atlas ([ABA](https://portal.brain-map.org)) profiled over 20,000 genes in the mouse brain, and as of April 2021, the Berkeley Drosophila Genome Project ([BDGP](https://insitu.fruitfly.org/cgi-bin/ex/insitu.pl)) WMISH atlas already has 8533 genes. However, *in situ* reporters might still be a good way to profile enhancer usage in space. 93 | 94 | ## ISH and WMISH atlases {#wmish} 95 | 96 | *In situ* hybridization was first used in 1969 to visualize ribosomal RNA (rRNA) [@Gall1969] and ribosomal DNA (rDNA) [@John1969] in *Xenopus laevis* oocytes with probes labeled with radioisotope ^3^H (Figure \@ref(fig:tl1)). To the best of our knowledge, the earliest use of ISH to visualize what was thought to be a specific transcript was done in 1973, to visualize globin mRNAs in various cultured erythroid and non-erythoid cell types by hybridization of radiolabeled cDNA to the mRNA [@Harrison1973]. As radioactive ISH requires long exposure time (several weeks), has low spatial resolution and high background, and requires handling hazardous radioactive material, alternatives emerged in the mid 1970s and early 1980s. Among the alternatives were variants of FISH and labeled probes detected by primary and enzyme or fluorophore labeled secondary antibodies [@Huber2018; @Langer-Safer1982]; the latter, immunological method is commonly used in ISH and WMISH atlases. 
To the best of our knowledge, the first report of using immunological fluorescent and peroxidase ISH to visualize the expression of a specific gene was published in 1982, the same year the technique itself was published [@Langer-Safer1982]; it visualized actin transcripts in chicken muscle tissue culture, and the authors reported puncta of cytoplasmic fluorescence which might have been clumps of mRNAs or artefacts, but could possibly have been individual transcripts [@Singer1982]. 97 | 98 | Non-radioactive ISH not only has a shorter exposure time and higher resolution than radioactive ISH, but also made WMISH possible. WMISH was first reported in *Drosophila* embryos in 1989 [@Tautz1989], and was adapted to other model organisms such as mice, *Xenopus laevis*, and *Paracentrotus lividus* (purple sea urchin) in the early 1990s [@Rosen1993]. Advantages of WMISH compared to section ISH are preservation of the 3D structure of the tissue, ease of interpretation in blastoderm stage embryos, and ease of performing ISH on larger numbers of embryos [@Rosen1993; @Tautz1989]. 99 | 100 | Just like genome sequencing in multi-cellular organisms and *in situ* reporter screens, WMISH atlases got a head start in *C. elegans*. The first WMISH screen with higher throughput than typically used on select marker genes was reported in 1994, of 21 genes in *C. elegans* [@Seydoux1994]. Early (WM)ISH atlases in the late 1990s typically made probes from cDNA clones from poly-A selected RNAs in the tissue or developmental stage of interest without pre-selecting genes to stain for [@Tomancak2002; @Stapleton2002; @Gawantka1998; @Bettenhausen1995]. Some early atlases were intended to be improvements on enhancer and gene trapping and *in situ* reporter screens, as a simpler and more direct alternative [@Bettenhausen1995] or as a way to better capture the endogenous and dynamic spatial distribution of transcripts [@Gawantka1998]. Since 1998, (WM)ISH has been automated, enabling staining for thousands of probes [@Gawantka1998; @Carson2002]. 101 | 102 | The genes from which the clones came were often unknown, so early (WM)ISH atlases referred to the entities stained for as "clones" (Figure \@ref(fig:items)), though the genes, their homology, and their putative functions could be identified by aligning sequences of the cDNA clones to existing sequences in databases [@Bettenhausen1995; @Gawantka1998; @Kopczynski1998]. However, again, the first WMISH screen with probes made by cloning PCR-amplified pre-defined genomic sequences was performed in *C. elegans* in 1995 [@Birchall1995]. By the turn of the century, the entities stained for were sometimes referred to as "clusters", especially in the GHOST atlas for *Ciona intestinalis* [@Satou2001] (Figure \@ref(fig:items)); the sequences of the probes were clustered by alignment, and probes in a cluster might have come from the same gene. 103 | 104 | ```{r} 105 | ish <- prequel %>% 106 | filter(str_detect(method, "ISH")) %>% 107 | unnest_cat(species, other_cols = c("organ", "n_items", "still_available")) 108 | ``` 109 | 110 | ```{r} 111 | item_annot <- prequel %>% 112 | select(facets = item_type) %>% 113 | mutate(facets = fct_lump_n(facets, 5)) %>% 114 | distinct() %>% 115 | mutate(label = case_when(facets == "line" ~ "Draft mouse\ngenome", 116 | TRUE ~ "")) 117 | ``` 118 | 119 | (ref:itemc) Number of prequel publications over time, broken down by what the entities stained for were called and colored by species. Bin width is 365 days.
The vertical line marks the date when the draft mouse reference genome was published [@Waterston2002], as context for the transition from "clone" and "line" to "gene". 120 | 121 | ```{r items, fig.width=8, fig.height=8, fig.cap='(ref:itemc)', out.width="100%"} 122 | pubs_per_year(prequel, facet_by = "item_type", n_top = 5, 123 | fill_by = "species") + 124 | geom_vline(xintercept = ymd("2002-12-05"), color = "gray70") + 125 | geom_text(data = item_annot, aes(label = label), x = ymd("2001-08-01"), y = 12) + 126 | labs(title = "Number of publications over time", y = "") + 127 | theme(legend.position = "bottom") 128 | ``` 129 | 130 | The rise of (WM)ISH atlases started before the completion of genome projects in humans and common model organisms, although their later growth was transformed by the reference genome. In the 2000s, with the availability of sequenced cDNA collections covering an increasing proportion of predicted genes and the consequent rise of transcriptome-wide microarrays [@Stapleton2002; @Carter2003], genes to be stained for in (WM)ISH atlases could be pre-screened based on microarray data of the tissue of interest, with probes made from cDNA clones readily available from such collections [@Yoshikawa2006; @Lein2004]. In addition, probes could be computationally designed based on reference genome sequences [@Lein2007]. Perhaps because of these developments, since the turn of the century, entities stained for have been predominantly referred to as "genes" (Figure \@ref(fig:items)). Notably, while radioactive ISH had been mostly replaced by non-radioactive ISH by the 2000s, a mouse hippocampus ISH atlas published in 2004 used radioactive ISH to profile all of its 104 genes [@Lein2004]. 131 | 132 | Also, with the rise of the cDNA microarray in the late 1990s and early 2000s, some (WM)ISH atlases were made as improvements over bulk tissue microarrays, profiling the transcriptome not only at cellular resolution, but also preserving spatial and sometimes temporal context [@Lein2007; @Bell2004], analogous to how scRNA-seq and various later forms of spatial transcriptomics were developed in response to bulk RNA-seq. 133 | 134 | Since the 2000s, (WM)ISH atlases have been made for specific types of genes and a number of mouse tissues. In 2004, locked nucleic acid (LNA) modified oligonucleotide probes were introduced, greatly improving the sensitivity of miRNA northern blots [@Valoczi2004] and making (WM)ISH atlases for miRNAs possible. The first miRNA WMISH atlas, which profiled 115 miRNAs in zebrafish embryos, was published in 2005 [@Wienholds2005]. Since then, miRNA atlases have been created for mice [@Karali2010; @Diez-Roux2011; @Kloosterman2006], *Drosophila* [@Aboobaker2005], chicken [@Darnell2006], and *Xenopus laevis* [@Ahmed2015]. 135 | 136 | ```{r species, fig.width=6, fig.height=4, fig.cap="Number of (WM)ISH publications per species."} 137 | pubs_per_cat(ish, species) + 138 | labs(y = "Number of (WM)ISH publications", x = "Species") + 139 | theme(axis.text.y = element_text(face = "italic")) 140 | ``` 141 | 142 | While (WM)ISH atlases are available for several species, the mouse is by far the favored model organism (Figure \@ref(fig:species)). A timeline of the first (WM)ISH atlas for each of the species and some notable atlases are shown in Figure \@ref(fig:tl2).
Especially for mice, atlases for other specific types of genes were published in the 2000s and the 2010s, such as genes coding for RNA binding proteins [@McKee2005], fibroblast growth factors and their receptors [@Yaylaoglu2005], proteins with catalytic activities [@Cankaya2007], transcription factors and cofactors [@Yokoyama2009], metabolic enzymes and soluble carriers [@Geffers2013], cholesterol biosynthetic enzymes [@Sisecioglu2015], and ion channels (in rats) [@Shcherbatyy2014]. Among the mouse atlases, while the brain receives disproportionately strong interest, with the influential [ABA](https://portal.brain-map.org) [@Lein2007] and [GenePaint](https://gp3.mpg.de) [@Carson2002], ISH atlases exist for the eye [@Thut2001; @Blackshaw2004], genitourinary tract ([GenitoUrinary Development Molecular Anatomy Project (GUDMAP)](https://www.gudmap.org)) [@Harding2011], and lung ([LungMAP](https://lungmap.net)) [@Ardini-Poleske2017] (Figure \@ref(fig:tl2), Figure \@ref(fig:anat1)). 143 | 144 | ```{r tl2, fig.width=10, fig.height=6, fig.cap="Timeline of the first (WM)ISH databases for each species for which such databases are available, as well as some notable databases.", out.width="100%"} 145 | events %>% 146 | filter(category == "ISH atlas") %>% 147 | plot_timeline(c(0.5, -0.5, -0.85, 0.7, -0.25, 1, -0.5, 0.3, -0.8, 0.5, -1, 148 | 0.7, -0.25, 0.5, 149 | -0.87, 1, -0.6, 0.3, -0.5, 0.3), description_width = 25, 150 | expand_x = c(0.1, 0.1), expand_y = c(0,0), 151 | include_refs = FALSE) + 152 | theme(legend.position = "bottom") 153 | ``` 154 | 155 | While the vast majority of (WM)ISH atlases used bright field imaging, a few used FISH (Figure \@ref(fig:hist1)), for the advantages conferred by FISH discussed below. A notable FISH atlas is the Berkeley *Drosophila* Transcription Network Project ([BDTNP](http://www.cb.uu.se/~cris/BDTNP_Imaging.html)) from 2006 to 2008, which profiled expression patterns of 95 genes in the *Drosophila* embryo across 6 developmental stages up to the beginning of gastrulation [@Fowlkes2008; @LuengoHendriks2006a]. Two genes were imaged in each embryo, and the images of 1822 embryos were registered across both space and time to construct 3D virtual embryos on which patterns of different genes can be quantitatively compared [@Fowlkes2008]; the 3D imaging and penetration into the opaque yolk were made possible by two photon microscopy, in which only the fluorophores in the region of focus are excited [@LuengoHendriks2006a]. Another notable FISH atlas is [Fly-FISH](http://fly-fish.ccbr.utoronto.ca) from 2007, which profiled subcellular localization of transcripts of 3370 genes in *Drosophila* embryos [@Lecuyer2007]. While subcellular localization of transcripts can sometimes be discerned in bright field WMISH [@Tomancak2002], Fly-FISH shows higher subcellular resolution thanks to a FISH protocol using tyramide signal amplification. To the best of our knowledge, this is the first transcriptomic atlas of a multi-cellular organism to profile subcellular transcript localization. While more recent smFISH based methods record subcellular information, such information is typically not used in downstream analyses.
156 | 157 | ```{r} 158 | data("mmMale_key") 159 | ``` 160 | 161 | ```{r} 162 | mm_ish_tissues <- ish %>% 163 | filter(str_detect(species, "Mus musculus"), !is.na(organ)) %>% 164 | unnest_cat(organ, other_cols = "n_items") 165 | mm_ish_tissues <- mm_ish_tissues %>% 166 | left_join(mmMale_key[, c("organ", "type")], by = "organ") 167 | ``` 168 | 169 | ```{r} 170 | organs_npubs <- mm_ish_tissues %>% 171 | count(organ, type, name = "value") %>% 172 | gganatogram(organism = "mouse", fill = "value") + 173 | coord_equal() + 174 | theme_void() + 175 | scale_fill_distiller(palette = "Blues", direction = 1, 176 | name = "# publications", breaks = scales::breaks_width(4)) 177 | ``` 178 | 179 | ```{r} 180 | organs_ngenes <- mm_ish_tissues %>% 181 | group_by(organ, type) %>% 182 | summarise(value = max(n_items, na.rm = TRUE)) %>% 183 | gganatogram(organism = "mouse", fill = "value") + 184 | coord_equal() + 185 | theme_void() + 186 | scale_fill_distiller(palette = "PuRd", direction = 1, trans = "log10", 187 | name = "# genes\n(log scale)") 188 | ``` 189 | 190 | ```{r anat1, fig.width=6, fig.height=4, fig.cap="A) Number of mouse publications per organ for (WM)ISH atlases (including FISH). B) Maximum number of genes in atlases for each organ, as of publication of the paper about the atlases. The color is in log scale to improve dynamic range."} 191 | organs_npubs + organs_ngenes + 192 | plot_layout(nrow = 1) + 193 | plot_annotation(tag_levels = "A") 194 | ``` 195 | 196 | WMISH was the most commonly used technique in the prequel era, followed by ISH (Figure \@ref(fig:tech1)). In summary, advances of non-radioactive ISH and WMISH from radioactive ISH, limitations of enhancer and gene trap and *in situ* reporter screens, cDNA collections that cover most of predicted genes, limitations of bulk microarray, reference genomes that allow for computational probe design, and ISH robots may have been responsible for the rise of (WM)ISH atlases. Another important factor may be the rise of digital photography and the internet in the 1990s, as developing thousands of analogue photos is an arduous task. Moreover, online digital atlases have been much more accessible to the wider community. Assuming that the number of publications in a field reflects interest in that field during a period of time, and if our collection is representative of the actual body of literature, then the golden age of the prequel era was the 2000s and WMISH was responsible for that peak, while section ISH and "collection", i.e. databases of gene expression patterns curated from publications and some (WM)ISH atlases, account for much of the interest after 2010 (Figure \@ref(fig:hist1)). The websites of many of the older (WM)ISH atlases are no longer accessible. However, some of the atlases from that period of time still live on in extant curated databases, which will be discussed in the next section. 197 | 198 | ```{r tech1, fig.width=6, fig.height=3, fig.cap="Number of prequel publications per technique."} 199 | pubs_per_cat(method, method) + 200 | labs(x = "Technique") 201 | ``` 202 | 203 | The golden age declined before the rise of current era spatial transcriptomics, which started around 2014 \@ref(fig:current-vs-prequel). What contributed to the decline of the golden age? 
Perhaps with the proliferation of such atlases, curated databases exceeding 10,000 genes, and especially with over 20,000 genes in ABA mapped to a high quality 3D mouse brain model, there are already enough gene expression pattern resources for the most commonly studied genes, tissues (especially the brain), and developmental stages in the most common model organisms, thus making new atlases in those systems unnecessary. Moreover, in the last decade, the under-utilization of gene expression atlases [@DeBoer2009] may have reduced motivation to build new atlases. Or perhaps, more importantly, inherent limitations of non-multiplexed (WM)ISH contributed to the decline in interest in such methods. In these atlases, typically only one gene is stained for in each individual embryo or tissue section. Gene expression patterns of different genes can only be meaningfully compared and classified in tissues with a stereotypical structure, such as wild type embryos and the brain, but not tumors and pathological tissues, even though there is intense interest in spatial transcriptomics in tumors as evidenced by the LCM and ST literature (Figure \@ref(fig:topics)). A large number of embryos or sections are required for such atlases, thus increasing cost and making human atlases extremely difficult and costly, if ethical at all. Furthermore, since the chromogenic reaction in bright field ISH can be prolonged to increase staining intensity, the patterns are not quantitative; consequently, analyses of such patterns typically involve binarization, and quantitative expression levels of genes cannot be compared. Even with a stereotypical structure, image registration can be challenging because of biological differences between individuals [@Fowlkes2008]. 204 | 205 | ## Databases of the prequel era {#database_prequel} 206 | 207 | Many of the (WM)ISH atlases discussed above, such as BDGP [@Tomancak2002], [Gallus *In Situ* Hybridization Atlas (GEISHA)](http://geisha.arizona.edu/geisha/) [@Bell2004], ABA [@Lein2007], BDTNP [@Fowlkes2008], GUDMAP [@Harding2011], and LungMAP [@Ardini-Poleske2017] are stored in databases that can be queried online, typically by gene symbol or by controlled anatomical or developmental vocabulary (i.e. ontology, reviewed in depth in [@Clarkson2016]). There are additional gene expression databases for images curated from publications, some containing non-spatial data as well and some specifically for spatial data. 208 | 209 | The rise of the curated databases started in the 1990s. Already in 1992, the challenges of managing the increasing amount of gene expression data in developmental biology emerged, and a spatiotemporal database of mouse gene expression that would later become the Edinburgh Mouse Atlas of Gene Expression ([EMAGE](http://www.emouseatlas.org/emage/home.php)) was discussed [@Baldock1992]. In 1994, the Jackson Laboratory proposed the Gene Expression Database ([GXD](http://www.informatics.jax.org/menus/expression_menu.shtml)) [@Ringwald1994], in collaboration with EMAGE, to build the most comprehensive mouse gene expression database. In 1997, work was already in progress to produce (WM)ISH atlases and construct the database infrastructure for mouse [@Ringwald1997] (GXD and EMAGE), *Drosophila melanogaster* [@Janning1997], *C. elegans* [@Martinelli1997], and zebrafish [@Westerfield1997].
Curated databases for mice (GXD and EMAGE), zebrafish ([Zebrafish Information Network (ZFIN)](http://zfin.org) [@Howe2017]), and *Xenopus laevis* ([Xenbase](http://www.xenbase.org/entry/) [@Bowes2009]) were released in the 2000s, within a tide of (WM)ISH atlases for new species (Figure \@ref(fig:tl2)). Some of these databases are regularly updated, and the updates are responsible for many of the "collection" publications after 2010 (Figure \@ref(fig:hist1), Figure \@ref(fig:tech1)); our historical literature collection has not only the original publications for the databases, but also publications for later updates that involve new spatial gene expression images. Examples of other extant curated databases include [FlyExpress](http://www.flyexpress.net) [@Kumar2017] for *Drosophila melanogaster*, [XenMARK](http://genomics.crick.ac.uk/cgi-bin/search.exe) [@Gilchrist2009] for *Xenopus laevis*, and the [Ascidian Network for *In Situ* Expression and Embryological Data (ANISEED)](https://www.aniseed.cnrs.fr) [@Tassy2010] for ascidians. Databases, curated or not, are available for several species; mice, *Drosophila*, and zebrafish have the most extant databases (Figure \@ref(fig:dbs)). 210 | 211 | ```{r} 212 | dbs <- prequel %>% 213 | filter(!is.na(name), still_available) %>% 214 | unnest_cat(species, other_cols = "name") %>% 215 | select(species, name) %>% 216 | distinct() 217 | ``` 218 | 219 | ```{r dbs, fig.width=6, fig.height=4, fig.cap="Number of extant spatial gene expression databases per species."} 220 | dbs %>% 221 | mutate(species = fct_infreq(species) %>% fct_rev()) %>% 222 | ggplot(aes(y = species)) + 223 | geom_bar() + 224 | scale_x_continuous(breaks = breaks_pretty(5), 225 | expand = expansion(mult = c(0, 0.05))) + 226 | theme(panel.grid.minor = element_blank(), panel.grid.major.y = element_blank(), 227 | axis.text.y = element_text(face = "italic")) + 228 | labs(x = "Number of extant databases", y = "Species") 229 | ``` 230 | 231 | Data can be exchanged between databases. For example, among mouse databases, GenePaint [@Carson2002] and EMAGE now contain data from [Eurexpress](http://www.eurexpress.org/ee/) [@Diez-Roux2011; @DeBoer2009], and EMAGE uses data from GXD for the 3D gene expression models [@Ringwald1999]. ANISEED contains data from the WMISH atlases [GHOST](http://ghost.zool.kyoto-u.ac.jp) for *Ciona intestinalis* [@Satou2001] and MAboya Gene Expression patterns and Sequence Tags (MAGEST) for *Halocynthia roretzi* [@Kawashima2000]. FlyExpress contains data from *Drosophila* atlases such as BDGP and Fly-FISH. Data in databases that ceased to operate may still be available in extant databases. For instance, the AXelDb WMISH atlas and database for *Xenopus laevis* [@Gawantka1998] has been subsumed into Xenbase, while AXelDb's own website has long been defunct. Likewise, as of April 2021, the MAGEST website is defunct, but the data lives on in ANISEED. 232 | 233 | Some of the databases go beyond collecting data from other databases. Databases such as EMAGE, ANISEED, and ABA registered multiple 2D section images to map gene expression patterns onto 3D anatomical models for better comparison between different genes. FlyExpress also standardized the images from the atlases and enables searching for coexpressed genes by expression pattern [@Kumar2017]. There have also been efforts to integrate databases from multiple model organisms.
In 2007, COMPARE [@Salgado2008] and 4DXpress [@Haudry2007] were developed to make gene expression patterns and developmental stages in zebrafish, mouse, and *Drosophila* (also medaka in 4DXpress) comparable. While COMPARE and 4DXpress are no longer available, interest in integrating the databases continues, so in 2016, the Alliance of Genome Resources was founded, producing a unified user interface to genome and gene expression databases for *Saccharomyces cerevisiae*, *C. elegans*, *Drosophila melanogaster*, mouse, rat, and zebrafish [@Agapite2020], although spatial patterns are not its focus. 234 | 235 | ## Geography of the prequel era {#geo_prequel} 236 | 237 | ```{r} 238 | # For maps later in this notebook 239 | city_gc <- geocode_inst_city(prequel, cache_location = here()) 240 | pubs_on_map2 <- partial(pubs_on_map, city_gc = city_gc) 241 | ``` 242 | 243 | Where was prequel era research conducted? Our database includes the affiliation of the first author as of publication for all papers, and the affiliations have been geocoded to plot on maps. Around the world, most prequel studies were performed in the coastal US and Western Europe, but some studies were performed in Asia and Oceania, especially Japan (Figure \@ref(fig:prequel-world)). Not all of the top contributing institutions are readily recognizable "elite" institutions. Top contributors include BDGP from UC Berkeley, ZFIN from University of Oregon (UO), ABA from Allen Brain Institute (Allen), GEISHA from University of Arizona (UofA), GXD from Jackson Laboratory (JAX), EMAGE from Western General Hospital (WGH), [MEPD](https://www.embl-heidelberg.de/mepd/) (for *Oryzias latipes*) from European Molecular Biology Laboratory (EMBL), GHOST from Kyoto University (Kyodai), and mouse gene trap lines from Mount Sinai. 244 | 245 | ```{r prequel-world, fig.width=10, fig.height=5, fig.cap="Number of prequel publications per city around the world, with top contributing institutions labeled.", out.width="100%"} 246 | pubs_on_map2(prequel) 247 | ``` 248 | 249 | ```{r prequel-us, fig.width=9, fig.height=6, fig.cap="Number of prequel publications in the US and Canada, with top contributing institutions labeled.", out.width="100%"} 250 | pubs_on_map2(prequel, zoom = "usa") 251 | ``` 252 | 253 | ```{r prequel-europe, fig.width=6, fig.height=6, fig.cap="Number of prequel publications in western Europe, with top contributing institutions labeled."} 254 | pubs_on_map2(prequel, zoom = "europe") 255 | ``` 256 | 257 | ```{r prequel-ne-asia, fig.width=8, fig.height=6, fig.cap="Number of prequel publications in northeast Asia, with top contributing institutions labeled.", out.width="100%"} 258 | pubs_on_map2(prequel, zoom = "ne_asia") 259 | ``` 260 | 261 | This can be better visualized by breaking the map down by species. Here we see locations of some model organism consortia, and that GHOST is the result of a collaboration among multiple Japanese institutions (Figure \@ref(fig:map-species)). 262 | 263 | ```{r map-species, fig.width=10, fig.height=12.5, fig.cap="Number of prequel publications per city broken down by species. Gray points are the overall number as a reference of contributions from each city and region.", out.width="100%"} 264 | pubs_on_map2(prequel, facet_by = "species", n_top = 8, ncol = 2, n_label = 3) + 265 | theme(legend.position = "right") 266 | ``` 267 | 268 | That some institutions disproportionately contributed to one technique can also be shown.
Here it's clear that prequel techniques were used by many different institutions (Figure \@ref(fig:map-tech)). In contrast, as will be shown in Chapter \@ref(current), most current era techniques never spread beyond their institutions of origin. The LCM studies come from Allen's mouse sleep deprivation atlas [@Thompson2010] and human glioblastoma atlas [@Puchalski2018]; although LCM is a current era technique, those two studies are in the prequel sheet because they also have ISH atlases. 269 | 270 | ```{r map-tech, fig.width=10, fig.height=10, fig.cap="Number of prequel publications per city broken down by technique. Gray points are the overall number as a reference of contributions from each city and region.", out.width="100%"} 271 | prequel %>% 272 | unnest_cat(method, other_cols = "short_name") %>% 273 | pubs_on_map2(facet_by = "method", ncol = 2, n_label = 3) + 274 | theme(legend.position = "right") 275 | ``` 276 | -------------------------------------------------------------------------------- /supplement/03-prequel_analysis.Rmd: -------------------------------------------------------------------------------- 1 | # Data analysis in the prequel era {#prequel-analysis} 2 | 3 | ```{block2, note-text2, type='rmdtip', include=TRUE} 4 | Many machine learning and statistics methods are mentioned in this chapter. The names of these methods are linked to articles explaining them for those who are unfamiliar. Some of them are math heavy. 5 | ``` 6 | 7 | ```{r include=FALSE} 8 | knitr::opts_chunk$set(echo = FALSE, fig.keep = "all", 9 | message = FALSE, warning = FALSE, 10 | fig.align = "center") 11 | library(museumst) 12 | library(tidyverse) 13 | library(sf) 14 | library(lubridate) 15 | theme_set(theme_bw()) 16 | ``` 17 | 18 | From the earliest days of enhancer and gene traps to the (WM)ISH atlases, identifying genes with spatially and temporally variable expression patterns, comparing and classifying the patterns, identifying new marker genes of cell types and developmental stages, and using gene expression to redefine cell types have been among the goals of the studies [@OKane1987; @Gossler1989; @Wurst1995a; @Sundaresan1995; @Gawantka1998; @Tomancak2002; @Lein2007]. In the prequel era, these were typically done manually, which, with the growing size of atlases in the 2000s, was time consuming and potentially inconsistent between curators. Thus, computational methods were developed to analyze images from the (WM)ISH atlases. This chapter reviews data analysis methods designed for (WM)ISH atlases and does not involve scRNA-seq data; methods involving both (WM)ISH and scRNA-seq are reviewed in Chapter \@ref(current-analysis) for the current era because scRNA-seq is at present a popular and rapidly growing field, too in vogue to be considered "prequel". If our collection is representative, then the rise of prequel data analysis methods arrived much later than that of data collection (Figure \@ref(fig:prequel-poly)). 19 | 20 | ```{r} 21 | all_prequel <- read_metadata(sheet_use = c("Prequel", "Prequel analysis"), 22 | update = TRUE) 23 | all_prequel <- all_prequel %>% 24 | mutate(Type = case_when(sheet == "Prequel analysis" ~ "analysis", 25 | TRUE ~ "data")) 26 | ``` 27 | 28 | ```{r prequel-poly, fig.width=7, fig.height=4, fig.cap="Comparing trends in data collection and data analysis in the prequel era. Bin width is 365 days.
The x-shaped points show the number of publications from the last bin, which is not yet full."} 29 | era_freqpoly(all_prequel, Type, preprints = TRUE, binwidth = 365) + 30 | scale_x_date(date_breaks = "2 years", date_labels = "%Y") + 31 | scale_y_continuous(expand = expansion(c(0,0.05))) 32 | ``` 33 | 34 | Except for one study on *Platynereis dumerilii* in 2014 [@Pettit2014], one on *Xenopus tropicalis* in 2018 [@Patrushev2018], and one on post mortem human brain in 2021 [@Abed-Esfahani2021], all data analysis methods in our collection were designed for either *Drosophila melanogaster* or *Mus musculus* (Figure \@ref(fig:pa-species)). There seem to have been two waves: the first for *Drosophila*, peaking in the late 2000s, mostly concerning the BDGP in situ atlas, and the second for mice, peaking in the early 2010s, mostly concerning ABA (Figure \@ref(fig:pa-species)). The apparent rise since 2019 is in part driven by deep learning methods to annotate gene expression patterns or infer gene interactions. Given the small number of publications in this category and the potential incompleteness of the curation, the trends should be taken with a grain of salt. 35 | 36 | ```{r} 37 | prequel_analysis <- read_metadata("Prequel analysis", update = TRUE) 38 | ``` 39 | 40 | ```{r} 41 | p4b_annot <- tibble(facets = c("Drosophila melanogaster", "Mus musculus"), 42 | xint = ymd(c("2002-12-23", "2006-12-06")), 43 | x = xint, 44 | label = c("BDGP", "ABA")) 45 | ``` 46 | 47 | ```{r pa-species, fig.width=6, fig.height=6, fig.cap="The gray histogram in the background is the overall histogram of the prequel data analysis literature. The number of publications in each time bin for each species is highlighted in the facets."} 48 | pubs_per_year(prequel_analysis, facet_by = "species", fill_by = "species", 49 | n_top = 2, preprints = TRUE) + 50 | geom_vline(data = p4b_annot, aes(xintercept = xint), color = "gray70") + 51 | geom_text(data = p4b_annot, aes(label = label, x = x, y = 4)) + 52 | theme(legend.position = "top") 53 | ``` 54 | 55 | ## Gene patterns {#prequel-gene-patterns} 56 | The most common goal of these data analysis methods was to annotate and compare gene expression patterns, especially to automate annotation of the BDGP atlas (Figure \@ref(fig:pa-category)). It seems reasonable to focus on 4 phases in this category: first, in the early to mid 2000s, after image registration, the images were binarized into "expressed" and "not expressed" regions, and the shapes of the expressed regions were summarized and compared. Metrics to summarize the shapes included [moment invariants](https://towardsdatascience.com/introduction-to-the-invariant-moment-and-its-application-to-the-feature-extraction-ee991f39ec) [@Jayaraman2001; @Gurunathan2004], Hamming distance [@Kumar2002], and a weighted score involving the [L1 distance](https://iq.opengenus.org/manhattan-distance/) between column or row histograms of two images [@Liu2007]. These unsupervised methods enabled clustering of patterns and querying genes with similar patterns to a given gene.
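To make these shape comparisons concrete, below is a minimal sketch of two of the metrics mentioned above applied to toy binarized images; it only illustrates the flavor of the metrics, not the published implementations, and the matrices `img1` and `img2` are hypothetical stand-ins for registered, binarized (WM)ISH images.

```{r, eval=FALSE}
# Toy binarized "images" after registration: 1 = expressed, 0 = not expressed
set.seed(1)
img1 <- matrix(rbinom(100, size = 1, prob = 0.3), nrow = 10)
img2 <- matrix(rbinom(100, size = 1, prob = 0.3), nrow = 10)

# Hamming distance: number of pixels where the two binarized patterns disagree
hamming <- sum(img1 != img2)

# L1 distance between row histograms and between column histograms (marginal sums),
# one way to summarize and compare the shapes of the expressed regions
l1_hist <- sum(abs(rowSums(img1) - rowSums(img2))) +
  sum(abs(colSums(img1) - colSums(img2)))

c(hamming = hamming, l1_hist = l1_hist)
```

In the published methods, the images were first registered to a common template and the histogram distances were weighted, so the actual scores are more involved than this sketch.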
57 | 58 | ```{r} 59 | categories <- unnest_cat(prequel_analysis, category, other_cols = "species") 60 | ``` 61 | 62 | ```{r pa-category, fig.width=6, fig.height=6, fig.cap="The number of publications in each time bin for each category of data analysis is highlighted in the facets."} 63 | pubs_per_year(categories, facet_by = "category", n_top = 5, 64 | fill_by = "species", sort_by = "count", preprints = TRUE) + 65 | theme(legend.position = "top") 66 | ``` 67 | 68 | Second, from the mid 2000s to the mid 2010s, many supervised and unsupervised methods for gene expression pattern annotation or comparison were developed. In supervised methods, feature engineering more sophisticated than binarization was performed on registered images, and the features were fed into machine learning classifiers to annotate the images. These methods were trained with existing BDGP annotations and developed to automatically annotate the BDGP expression patterns with a controlled vocabulary (CV) of anatomical regions where genes were expressed. In BDGP, a gene gets annotated with a CV term if the gene was deemed expressed in the anatomical region and developmental stage denoted by the term, so the annotation typically contained a list of CV terms. 69 | 70 | The feature engineering can be based on the wavelet transform [@Zhou2007a] and Fourier coefficients [@Heffel2008a], but a particularly popular feature engineering method was the [scale-invariant feature transform (SIFT)](https://docs.opencv.org/master/da/df5/tutorial_py_sift_intro.html) [@Lowe2004; @Ji2008; @Li2009; @Ji2009]. A method published in 2009 that used SIFT followed by [bag of words](https://machinelearningmastery.com/gentle-introduction-bag-words-model/), in which a "word" is a [k means](https://medium.com/analytics-vidhya/k-means-clustering-explained-419ee66d095e) cluster (the code book), was quite influential [@Ji2009]; several later methods were inspired by this method, with improved code books [@Sun2013; @Ji2009a; @Yuan2012; @Liscovitch2013a]. The most common classifier taking in the features to predict annotations is the [support vector machine (SVM)](https://towardsdatascience.com/support-vector-machine-introduction-to-machine-learning-algorithms-934a444fca47) [@Sun2013; @Yuan2012] or multi-label variants of it [@Ji2008; @Ji2009]. 71 | 72 | Unsupervised methods rely on clustering algorithms after images are registered on a common mesh, such as [affinity propagation clustering](https://towardsdatascience.com/unsupervised-machine-learning-affinity-propagation-algorithm-explained-d1fef85f22c8) [@Frise2010] and co-clustering (rows and columns of a matrix are clustered simultaneously) [@Jagalur2007; @Zhang2013]. 73 | 74 | Third, another notable type of feature engineering is dimension reduction. In 2006, some methods applied dimension reduction techniques such as [principal component analysis (PCA)](https://liorpachter.wordpress.com/2014/05/26/what-is-principal-component-analysis/) and [independent component analysis (ICA)](http://wwwf.imperial.ac.uk/~nsjones/TalkSlides/HyvarinenSlides.pdf) to the registered images to find "eigen" patterns [@Pan2006; @Peng2006]. Instead of PCA or ICA, the dimension reduction can also be performed with sparse Bayesian factor analysis [@Pruteanu-Malinici2011], sparse dictionary learning [@Li2017], or [non-negative matrix factorization (NMF)](http://www.cs.cmu.edu/~11755/lectures/Lee_Seung_NMF.pdf) [@Noto2017; @Wu2016].
The dimension reduction can be used for unsupervised clustering of genes [@Pan2006; @Peng2006; @Pruteanu-Malinici2011], as well as for supervised classification methods such as SVM and [logistic regression](https://towardsdatascience.com/logistic-regression-detailed-overview-46c4da4303bc) to annotate gene expression patterns with the controlled vocabulary [@Pruteanu-Malinici2011; @Wu2016]. Notably, in NMF, both the matrix for basis patterns and the coefficient matrix for the genes tend to exhibit block structures; the blocks in the gene coefficient matrix have been used to cluster genes [@Noto2017]. 75 | 76 | Fourth, since 2015, [convolutional neural networks (CNNs)](https://towardsdatascience.com/a-comprehensive-guide-to-convolutional-neural-networks-the-eli5-way-3bd2b1164a53) have been adopted to analyze gene expression patterns. Typically, a pre-trained model, such as ResNet50, OverFeat, or AlexNet, is used. With some modifications or retraining of the original model, the model can be used to extract features for gene pattern annotation with logistic regression [@Zeng2015], to classify new patterns [@Andonian2019; @Long2021], and to predict interactions between genes [@Yang2019]. 77 | 78 | ## Spatial regions {#prequel-spatial-regions} 79 | Closely related to classifying gene expression patterns are these questions: What are the implications of gene expression patterns for traditional anatomical regions as in the CV? Can we discover novel anatomical regions from gene expression? How well do expression-based regions correspond to the traditional regions? A few studies, which we call "spatial region", tried to answer these questions in the ABA (Figure \@ref(fig:pa-category)). Clusters of expression patterns of cell type specific genes [@Ko2013] or of the most localized genes [@Grange2014], principal components of the patterns [@Bohland2010a], or patterns of coexpression modules [@Grange2014] were compared to traditional anatomy. At least in the mouse brain, with the principal components, these clusters may correspond to traditional anatomy quite well [@Bohland2010a]. However, when cell types are taken into account in clustering, gene expression seems to be able to refine traditional anatomy [@Ko2013; @Grange2014]. 80 | 81 | A clustering strategy for identifying spatial regions that takes the spatial neighborhood into account is the [Markov random field (MRF)](https://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-867-machine-learning-fall-2006/lecture-notes/lec23.pdf). In MRFs, nearby voxels can be made more likely to share a label, which can be a cell type or histological region, and the probability of a voxel taking each of the labels depends only on the labels of neighboring voxels. MRFs were used to delineate spatial regions in a 3D FISH atlas of the developing [*Platynereis dumerilii*](https://platynereis.github.io/) brain [@Pettit2014], with 86 high quality genes. The images in the atlas were aligned into a 3D model and broken into voxels 3 $\mu$m per side, which is smaller than a typical single cell; the spatial neighborhood graph is the 3D square grid of the voxels. As FISH is not very quantitative, the gene expression was manually binarized. Expression of each gene at each voxel is modeled with a [Bernoulli distribution](https://mathworld.wolfram.com/BernoulliDistribution.html), and the 86 genes are assumed to be independent.
Cluster label assignment is modeled with the [Potts model](https://en.wikipedia.org/wiki/Potts_model), a type of MRF in which only neighboring voxels with the same label contribute to the probability distribution of the labels, thus favoring neighbors with the same label. The parameters, such as the interaction strength between neighboring voxels in the Potts model and the probability parameters of the Bernoulli distributions, are estimated with [expectation maximization (EM)](http://ai.stanford.edu/~chuongdo/papers/em_tutorial.pdf). 82 | 83 | ## Gene interactions {#prequel-gene-interactions} 84 | While not at single cell resolution, (WM)ISH atlases provide transcriptomes within the tissue at a resolution far higher than that of typical bulk RNA-seq and bulk microarrays, thus opening the way to studying coexpression and interaction between genes within the tissue. There are a few methods that aim to decide whether two genes interact according to (WM)ISH images, some published long before the popularization of scRNA-seq. Already in 2002, an early method that compares binarized gene expression patterns was used to identify interactions among genes by comparing patterns from wild type and mutant backgrounds [@Kumar2002]. 85 | 86 | However, as mutant lines are harder to obtain than wild type images, the simplest method is to set a threshold on the [Pearson correlation](https://www.questionpro.com/blog/pearson-correlation-coefficient/) coefficient between two genes to decide whether an edge should be drawn in the gene coexpression graph [@Wu2016; @Campiteli2013].
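As a concrete illustration of this simplest approach, here is a minimal sketch of thresholding Pearson correlations between genes across voxels to build a coexpression graph; the simulated matrix `expr` and the threshold of 0.7 are hypothetical and not taken from the cited studies.

```{r, eval=FALSE}
set.seed(1)
n_voxels <- 200
# Simulate expression of 4 genes across voxels; gene1 and gene2 share a spatial pattern
shared <- rnorm(n_voxels)
expr <- cbind(gene1 = shared + rnorm(n_voxels, sd = 0.3),
              gene2 = shared + rnorm(n_voxels, sd = 0.3),
              gene3 = rnorm(n_voxels),
              gene4 = rnorm(n_voxels))

# Pairwise Pearson correlation coefficients between genes
cors <- cor(expr)

# Draw an edge between two genes when the correlation exceeds the arbitrary threshold
adj <- cors > 0.7
diag(adj) <- FALSE

# Gene pairs that get an edge in the coexpression graph; igraph could turn adj into a graph
which(adj, arr.ind = TRUE)
```

The published methods differ in how expression is quantified per voxel or per image and in how the threshold is chosen, so this sketch only conveys the general idea.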
Second, a reason why most prequel analysis methods were developed for either BDGP or ABA is that since one gene is stained for in one embryo/section at a time, the images must be registered and standardized for different genes to be comparable; BDGP, through FlyExpress [@Kumar2017], and ABA, provide images that have already been registered and standardized, while many other atlases, such as GEISHA, do not. Due to challenges in image registration in other organisms, the automated gene expression pattern analysis methods can't be applied. Third, lack of usage of these methods can also be due to insufficient accuracy; from 2009 to 2013, the [area under the curve (AUC)](https://towardsdatascience.com/understanding-auc-roc-curve-68b2303cc9c5) of the automated annotations is typically around 0.8 and rarely exceeded 0.9 [@Ji2009; @Pruteanu-Malinici2011; @Yuan2012; @Sun2013], which means when using such tools to annotate new images, extensive human review would still be required. 94 | 95 | ## Geography of prequel data analysis {#geo-prequel-analysis} 96 | 97 | ```{r} 98 | city_gc <- geocode_inst_city(prequel_analysis) 99 | pubs_on_map2 <- partial(pubs_on_map, city_gc = city_gc) 100 | ``` 101 | 102 | If our collection is representative, then contribution to prequel data analysis concentrates in a few institutions (Figure \@ref(fig:pa-map)), not all of which are elite. 103 | 104 | ```{r pa-map, fig.width=10, fig.height=5, fig.cap="Number of publications per city for prequel data analysis.", out.width="100%"} 105 | pubs_on_map2(prequel_analysis) 106 | ``` 107 | 108 | ```{r pa-map-us, fig.width=9, fig.height=6, fig.cap="Number of publications per city for prequel data analysis in the US.", out.width="100%"} 109 | pubs_on_map2(prequel_analysis, zoom = "usa") 110 | ``` 111 | 112 | When broken down by species, it seems that distinct institutions contributed to data analysis of *Drosophila* and mouse data. UC Berkeley and Lawrence Berkeley National Laboratory (LBL) are responsible for BDGP, and Allen is responsible for ABA. However, among the top contributors are other institutions such as Arizona State University (ASU) and Old Dominion University (ODU) (Figure \@ref(fig:pa-species2)). 113 | 114 | ```{r pa-species2, fig.width=10, fig.height=15, fig.cap="Number of publications per city for prequel data analysis broken down by species of interest.", out.width="100%"} 115 | pubs_on_map2(prequel_analysis, facet_by = "species", n_top = 2, ncol = 1, 116 | n_label = 5) 117 | ``` 118 | -------------------------------------------------------------------------------- /supplement/08-future.Rmd: -------------------------------------------------------------------------------- 1 | # (PART) Future perspectives {.unnumbered} 2 | 3 | # From the past to the present to the future {#future} 4 | 5 | The quest to profile the transcriptome in space with high resolution is not new. It started with the enhancer and gene trap screens in the late 1980s and the 1990s, before the genomes of metazoans were sequenced. However, in the prequel era, challenges with the existing technology made the dream of profiling the transcriptome in space hard to reach, as the technologies were not highly-multiplexed and not very quantitative. Over 30 years later, this dream seems to be more within reach, though with some caveats. We have come so far, because of so many strands of ideas and technologies coming together since the late 2010s. 
Highly multiplexed smFISH that can profile 10000 genes at a time would not have been possible without the reference genome sequence to screen for off target binding, the reference transcriptome and genome annotation with which to design the probes, the technology to synthesize DNA oligos, smFISH, confocal microscopy, digital photography, combinatorial barcoding, and the computing resources to store and process terabytes of images. ST and Visium would not have been possible without microarray technology, scRNA-seq techniques designed for small amount of RNA from each spot, NGS, and the computing power to process the data. Some of these strands are older than others, and each of them would not have been possible without more preceding strands coming together. For instance, smFISH would not have been possible without the development of non-radioactive FISH in the late 1970s and the 1980s and techniques to synthesize fluorophore labeled probes. The field of spatial transcriptomics has grown tremendously since the late 2010s, as this is the time when a wide array of technologies truly started to add up to more than the sum of their parts. 6 | 7 | Where are we right now in terms of the development of this rapidly unfolding field? Again, we may take inspiration from and draw parallels with development of other technologies that have much longer histories. From such comparisons, we find that the field of spatial transcriptomics is coming of age. First, in several fields, there have been less successful early attempts to achieve the goal of the field that had never become very popular, and the field did not become vastly popular until the right strands of technologies came together. In the history of cycling, the hobby horse and the penny farthing are among such early attempts which were more dangerous and less efficient, and the breakthrough of the safety bicycle, with the convergence of technologies such as the pneumatic tire, the tangent spoke, and the chain and sprocket, as well as disadvantages of horses and immaturity of the automobile, led to the bike boom in the 1890s, though there are many other important advances such as derailleurs, disc brakes, clipless pedals, and carbon fiber technology, important but not as revolutionary. In the history of elevators, there have been the Archimedes screw and the paternoster, which are no longer commonly seen as passenger elevators due to their disadvantages. The Archimedes screw elevator was very slow and costly, and the paternoster was dangerous. There have also been hand pulled elevators since the era of the Roman Empire. Convergence of several strands of technologies and social changes led to mainstreaming of the elevator, including urbanization, the steam engine, hydraulic propulsion, and electric motors. Here in spatial transcriptomics, considering the drastic growth in the late 2010s, perhaps we may say that for the purpose of profiling expression of large number of genes in tissue, prequel techniques such as enhancer and gene traps, *in situ* reporters, and (WM)ISH are among the less successful early attempts which have never seen the popularity of some current era techniques and which have gone out of favor due to their disadvantages. We have come to a time where the right technologies converge to make achieving the goal of profiling the transcriptome in space efficient enough for a much wider audience, though there are still challenges. 
8 | 9 | Second, in several fields that are no doubt mature, while many different technologies to solve the same problem are available, a small number of such technologies, often sold by a small number of companies, tend to dominate. This could be a sign of maturity of the field, as companies have had enough time to become well-established in the field, and factors that lead to dominance, such as cultural inertia and network effects, have had enough time and popularity to form. That these companies get to dominate at all means that this field is already popular enough to be profitable. The dominating technologies are not necessarily the best all around, and many factors beyond how well the technology or company currently solves the problem (e.g. historical contributions, cost, marketing, cultural inertia, and monopolistic business practices) led to dominance. The obvious example in our field is NGS; while there were many sequencing start ups and many different ways proposed to make sequencing more efficient in the 1990s (e.g. cPAL and SOLiD as already mentioned, and sequencing by hybridization [@Mirzabekov1994]), today Illumina dominates. For scRNA-seq, while there are Drop-seq, inDrops, CEL-seq, MARS-seq, SMART-seq, etc., 10X Chromium dominates and is used in most scRNA-seq studies we have come across. When we began curating the database in January 2020, we were surprised by the common usage of Tomo-seq and LCM, but we have witnessed the rapid rise and spread of Visium and GeoMX DSP over the course of the past year. In the current era, from about 2014 to 2019, a variety of techniques were used to collect new data and it was hard to say which ones dominated. In contrast, since about 2020, a substantial portion of publications for new data used Visium, a portion not seen for any single technique since the golden age of WMISH in the 2000s or at any point since the current era began to take off in the mid 2010s (Figures \@ref(fig:all-methods), \@ref(fig:all-methods-prop)). With many institutions using their products, 10X and Nanostring have recently become well-established in the field of spatial transcriptomics. Especially for Visium, open source developers (e.g. for Seurat, SpatialExperiment, and BayesSpace) are catering to the output format of Space Ranger (the official preprocessing software for Visium). This is a nod to Visium's establishment, akin to the earlier establishment of 10X Chromium and Cell Ranger.
10 | 11 | ```{r, include=FALSE} 12 | knitr::opts_chunk$set(echo = FALSE, fig.keep = "all", 13 | message = FALSE, warning = FALSE, 14 | fig.align = "center") 15 | ``` 16 | 17 | ```{r} 18 | library(tidyverse) 19 | library(museumst) 20 | theme_set(theme_bw()) 21 | ``` 22 | 23 | ```{r} 24 | nms <- c("Prequel", "ROI selection", "NGS barcoding", "smFISH", "ISS", "De novo") 25 | all_methods <- read_metadata(nms, update = TRUE) %>% 26 | unnest_cat(method) 27 | ``` 28 | 29 | ```{r} 30 | all_methods2 <- all_methods %>% 31 | mutate(method2 = fct_lump_n(method, 10) %>% fct_infreq() %>% 32 | fct_relevel("Other", after = Inf) %>% 33 | fct_rev()) 34 | ``` 35 | 36 | ```{r} 37 | # Colorblind friendly palette 38 | dittoseq_colors <- 39 | c("#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", 40 | "#CC79A7", "#666666", "#AD7700", "#1C91D4", "#007756", "#D5C711", 41 | "#005685", "#A04700", "#B14380", "#4D4D4D", "#FFBE2D", "#80C7EF", 42 | "#00F6B3", "#F4EB71", "#06A5FF", "#FF8320", "#D99BBD", "#8C8C8C", 43 | "#FFCB57", "#9AD2F2", "#2CFFC6", "#F6EF8E", "#38B7FF", "#FF9B4D", 44 | "#E0AFCA", "#A3A3A3", "#8A5F00", "#1674A9", "#005F45", "#AA9F0D", 45 | "#00446B", "#803800", "#8D3666", "#3D3D3D") 46 | ``` 47 | 48 | ```{r all-methods, fig.width=7, fig.height=4, fig.cap="Number of publications (including preprints) using each technique to collect new data in both prequel and current era. Only the top 10 in terms of number of publications of all time are colored, and the rest are lumped into Other. Bin width is 180 days, or about half a year. The LCM is for curated LCM literature, which might not be representative of all LCM literature given LCM's long term popularity."} 49 | ggplot(all_methods2, aes(date_published, fill = method2)) + 50 | geom_histogram(binwidth = 180) + 51 | #geom_area(stat = "bin", binwidth = 180) + 52 | scale_fill_manual(values = c("gray70", rev(dittoseq_colors[1:11])), 53 | name = "Method") + 54 | scale_y_continuous(expand = expansion(c(0, 0.05))) + 55 | labs(x = "Date published", y = "Number of publications") 56 | ``` 57 | 58 | ```{r all-methods-prop, fig.width=7, fig.height=4, fig.cap="Proportion of publications per bin using each of the top 10 techniques for data collection."} 59 | ggplot(all_methods2, aes(date_published, fill = method2)) + 60 | geom_histogram(binwidth = 180, position = "fill") + 61 | scale_fill_manual(values = c("gray70", rev(dittoseq_colors[1:11])), 62 | name = "Method") + 63 | scale_y_continuous(expand = expansion(0)) + 64 | scale_x_date(expand = expansion(0)) + 65 | labs(x = "Date published", y = "Proportion of publications") 66 | ``` 67 | 68 | Spatial transcriptomics still faces many challenges. First, there still is the trade off between quantity and quality. ST and Visium, which have limited resolution and low detection efficiency, can be more easily applied to larger areas of tissue and the whole transcriptome. ISS has been applied to whole mouse brain sections, because while it has lower detection efficiency than smFISH, the amplified and less crowded signals can be detected at lower magnification. In contrast, while smFISH based techniques have subcellular resolution and often over 80% detection efficiency, the efficiency is compromised when applied to 10000 genes and these techniques are more difficult to apply to larger areas of tissue. As there are still challenges, new techniques to collect data are constantly being developed. Second, compared to the prequel era, the current era is more elitist. 
While commercial LCM, ST, and Visium have spread far and wide, the various high quality smFISH based techniques mostly failed to spread beyond their usually elite institutions of origin. This might be due to difficulty in building custom equipment, challenges in customizing the protocols to different tissues, limits in the number of genes and cells profiled, lack of core facilities for these techniques, and the lack of a unified, efficient, open source, and well documented software platform to process the data. However, with the rise of commercial platforms for highly multiplexed smFISH such as MERFISH, Rebus Esper, and Molecular Cartography, this might soon change. 69 | 70 | Data analysis has also come a long way, from PCA and ICA in the early 2000s to much more sophisticated techniques today. Many ideas that originated in other fields such as computer vision, machine learning, and statistics, including geospatial statistics, have been adapted to spatial transcriptomics in recent years. Ideas from computer vision include SIFT, NMF, CNNs, and to some extent also PCA and ICA. Ideas from machine learning include SVM, neural networks, bag of words, variational autoencoders (for some cases of latent space), the mixture of experts model, $k$ nearest neighbors, and clustering. Ideas from statistics include CCA, permutation testing, MCMC, factor analysis, generalized linear models, and hierarchical modeling. Ideas from geospatial statistics include the Gaussian process model (usually used for kriging), spatial point processes, and MRFs. Other ideas include the Laplacian score and optimal transport. Conceivably, more ideas can be adapted to spatial transcriptomics. For instance, spatiotemporal statistics can be adapted to analyze multiple aligned sections of the same tissue to address the difference in covariance between the z axis and the x and y axes. Well established methods in geospatial statistics, such as the semivariogram, the J function, the G function, and other point process models, are also promising for spatial transcriptomics. 71 | 72 | We have reviewed many different types of data analysis, using a diverse arsenal of principles. However, integrated analysis pipelines like Seurat are still immature for spatial transcriptomics; Seurat only supports the most rudimentary analyses, and the user still needs to learn different syntax and convert data to different formats to use many of the other more specialized and advanced tools, many of which are not well documented. However, the open source culture is flourishing and growing. Most prequel data analysis publications did not link to a repository of the implementation of the software, while most current era data analysis publications do. While the proprietary MATLAB language is still in use, most current era publications, especially more recent ones, use R, Python, C++, and in some cases Julia and Rust, which are open source and free. Open source software and freely available data may enable less privileged individuals and institutions to perform data analysis and develop new data analysis tools. 73 | 74 | What would an ideal future of spatial transcriptomics look like? Data collection would have subcellular resolution, be transcriptome wide, have nearly 100% detection efficiency, and be scalable to large areas of tissue in 3D. Even better, it would be multi-omic, profiling not only the transcriptome, but also the epigenome, proteome, metabolome, etc., with equally high quality and throughput for the other omics.
Moreover, the data collection technique would be easy to use, such as coming in easy to use kits, and affordable, so it could spread far and wide into non-elite institutions. It should also be open source and transparent, so it would be easier for others to improve it. While we have reviewed many data analysis methods, a comprehensive benchmark of the methods for each analysis task and evaluation of user experience, like in dynverse for scRNA-seq pseudotime analysis [@Saelens2019], would be helpful for users to choose a method to use and for developers to compare their new methods to existing methods. 75 | 76 | Data analysis would have the same user-friendly user interface for different data types and different methods for the same task. Also, the package should be modular, so dependencies are only installed if needed. It should also be extensible, so users can add additional modules or additional tools for existing tasks to the integrative framework. This would be like SeuratWrappers, which provides Seurat interfaces to data integration and RNA velocity methods not implemented by Seurat. Or like caret and tidymodels, which provide a uniform user interface to numerous machine learning methods. This can be achieved with guidelines such as those used by Bioconductor, encouraging developers to reuse existing data structures and methods in Bioconductor rather than reinventing the wheel. It should also be effective at its task, scalable, well documented, open source, unit tested, easy to install, and portable, again, as enforced to some extent by the Bioconductor guidelines. It should be implemented in easy to read code, so developers can more easily fix bugs and improve the package. In addition, it should be interoperable, so tools written in different programming languages can be integrated, combining their strengths and bridging cultural differences between the programming language communities. It should have elegant data visualization, both static for publications and interactive for data exploration and sharing. The data visualization should also be accessible, such as using redundant encoding and colorblind friendly palettes and providing alternatives for those who are visually impaired. Finally, it should be integrated with a graphical user interface (GUI) like iSee so the data can be shared with colleagues who do not code. 77 | 78 | We don't live in an ideal world. Then what might the actual future of spatial transcriptomics look like given current trends? Visium might soon become to spatial transcriptomics what Chromium is to scRNA-seq, while LCM and GeoMX DSP live on by the side for ROI based studies. Perhaps largely with Visium, spatial transcriptomics might soon become as mainstream as scRNA-seq is today. However, just like the cDNA microarray, which was the transcriptomics method of choice in the 2000s and early 2010s and was replaced by RNA-seq, which is more quantitative and sensitive, Visium might be replaced by some other technique in a few years after more technological advances that address Visium's drawbacks, such as the lack of single cell resolution and low detection efficiency, though we don't know what that new technique would be. At present, 10X has plans for a technique that might be based on smFISH or ISS and have single molecule resolution, as well as for Visium HD, which would have single cell resolution. We therefore anticipate 10X to hold a substantial market share of spatial transcriptomics in the near future.
79 | 80 | However, if 10X fails to ride the new trends, or if another company develops something much better, then that company might replace 10X as the dominant company in spatial transcriptomics. Then what might replace Visium? If the commercial highly multiplexed smFISH platforms take off and become adopted by core facilities, so the individual lab no longer has to invest in new equipment and the pricey probe collection, then the possibility that they may compete with Visium can't be ruled out. Moreover, as Illumina sequencing also involves image processing and matching fluorescent spots from different rounds, image processing for smFISH might no longer be a bottleneck in the near future. Commercial probe sets for highly multiplexed smFISH, much like the probe sets on commercial cDNA microarrays, might emerge for use with the automated platforms and core facilities. Back in the golden age of the cDNA microarray, probes of known sequences on the array were used to profile the transcriptome. Also, at present, most scRNA-seq and spatial transcriptomics studies only care about known genes and existing genome annotations, so not being able to find novel isoforms might not be a significant drawback to most users. In contrast, the lack of single cell resolution in Visium is indeed a serious drawback, because cell type deconvolution of the spots is commonly performed and many computational tools have been developed for this purpose. As we don't know how this rapidly developing field will unfold in the next few years, these are just possibilities and we cannot make specific predictions. 81 | 82 | In addition, realistically speaking, where are we in the pursuit of the holy grail of low cost, convenience, high spatial resolution, high detection efficiency, larger areas of tissue, transcriptome wide profiling, 3D tissue, and multi-omics? As already discussed in Section \@ref(comparisons), trade offs can't be avoided at present. Considering the more recent novel techniques, such as CISI, MOSAICA, sci-Space, BOLORAMIS, and PIXEL-seq, we don't find the new techniques in the entire field of spatial transcriptomics going in a single direction in what to prefer in the trade offs. 83 | 84 | Some areas do not seem to pursue some of the objectives of the holy grail. For instance, we do not see smFISH based techniques applied to an increasing number of genes over time (Figure \@ref(fig:smfish-lm-gene)), while there may be more interest in profiling a larger number of cells (Figure \@ref(fig:smfish-lm-cell)) and novel proofs of principle (e.g. in CISI, MOSAICA, and SABER). Instead, highly multiplexed smFISH datasets with a smaller number of genes are complementary to scRNA-seq data from the same studies (e.g. [@LaManno2021; @Bhaduri2021; @Lu2021; @VanBruggen2021]). 85 | 86 | However, there are developments that reduce the competition between some areas of the trade offs without eliminating the trade offs. So far there seems to be less interest in *in situ* sequencing due to its inefficiency. ISS and HybISS were developed by the same group and are both offered by Cartana, but recent atlases that could have used either favored HybISS, which has somewhat higher detection efficiency than ISS and, with RCA amplification, can be applied to larger areas of tissue and imaged at lower magnification despite a relatively low detection efficiency. For *in situ* sequencing, there also seems to be a trend to avoid the inefficiency of reverse transcription, as in HybRISS and BOLORAMIS.
87 | 88 | New NGS barcoding techniques seem to place more emphasis on resolution, whether single cell resolution (e.g. XYZeq, which has single cell but not high spatial resolution) or high spatial resolution, but different studies seem to have different emphases on the other objectives of the holy grail. For instance, while all aiming for higher spatial resolution, the Slide-seq and Stereo-seq papers emphasize scalability to more tissue (indeed these techniques have lower detection efficiency), the PIXEL-seq paper emphasizes not compromising detection efficiency, and the Seq-Scope paper emphasizes being "easy-to-implement". Slide-seq2 then emphasizes better detection efficiency than the first version of Slide-seq, though the improved efficiency is still low. No NGS-based spatial technique has attempted to rival smFISH detection efficiency. Again, the different emphases highlight the trade-offs, which will most likely stay with us for a long time. If that is the case, then spatial transcriptomics might evolve into different branches, with different types of techniques, each with its own trade-offs, better suited to different types of studies. 89 | -------------------------------------------------------------------------------- /supplement/09-references.Rmd: -------------------------------------------------------------------------------- 1 | # References {-} 2 | -------------------------------------------------------------------------------- /supplement/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: museumst_catalogue 2 | Title: museumst book 3 | Version: 0.0.0.9001 4 | Authors@R: 5 | person(given = "Lambda", 6 | family = "Moses", 7 | role = c("aut", "cre"), 8 | email = "dlu2@caltech.edu", 9 | comment = c(ORCID = "0000-0002-7092-9427")) 10 | Description: Not a package, just packages used to render the book.
11 | License: MIT + file LICENSE 12 | Encoding: UTF-8 13 | LazyData: true 14 | Roxygen: list(markdown = TRUE) 15 | RoxygenNote: 7.1.1 16 | Imports: 17 | bookdown, 18 | broom, 19 | bluster, 20 | downlit, 21 | gganatogram, 22 | ggraph, 23 | ggrepel, 24 | ggtextures, 25 | grid, 26 | gridExtra, 27 | gridtext, 28 | gtable, 29 | here, 30 | hexbin, 31 | igraph, 32 | kableExtra, 33 | lubridate, 34 | magick, 35 | museumst, 36 | patchwork, 37 | quanteda, 38 | quanteda.textstats, 39 | reshape2, 40 | rnaturalearthdata, 41 | rnaturalearth, 42 | scales, 43 | sf, 44 | stm, 45 | stminsights, 46 | text2vec, 47 | tidygraph, 48 | tidytext, 49 | tidyverse, 50 | urbnmapr, 51 | uwot, 52 | xfun, 53 | XML 54 | Remotes: 55 | github::pachterlab/museumst, 56 | github::jespermaag/gganatogram, 57 | github::clauswilke/ggtextures, 58 | github::UrbanInstitute/urbnmapr 59 | 60 | -------------------------------------------------------------------------------- /supplement/LCM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/LCM.png -------------------------------------------------------------------------------- /supplement/LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2021 2 | COPYRIGHT HOLDER: Lambda Moses 3 | -------------------------------------------------------------------------------- /supplement/LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2021 Lambda Moses 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /supplement/_bookdown.yml: -------------------------------------------------------------------------------- 1 | book_filename: "museumst" 2 | delete_merged_file: true 3 | language: 4 | ui: 5 | chapter_name: "Chapter " 6 | new_session: yes 7 | -------------------------------------------------------------------------------- /supplement/_output.yml: -------------------------------------------------------------------------------- 1 | bookdown::bs4_book: 2 | theme: 3 | primary: "#B070D5" 4 | repo: 5 | base: https://github.com/pachterlab/LP_2021 6 | branch: main 7 | subdir: supplement 8 | bookdown::pdf_book: 9 | latex_engine: xelatex 10 | keep_tex: yes 11 | includes: 12 | in_header: preamble.tex 13 | -------------------------------------------------------------------------------- /supplement/bdna.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/bdna.png -------------------------------------------------------------------------------- /supplement/cpal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/cpal.png -------------------------------------------------------------------------------- /supplement/current_analysis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/current_analysis.png -------------------------------------------------------------------------------- /supplement/current_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/current_curve.png -------------------------------------------------------------------------------- /supplement/current_hist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/current_hist.png -------------------------------------------------------------------------------- /supplement/current_tech.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/current_tech.png -------------------------------------------------------------------------------- /supplement/dsp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/dsp.png -------------------------------------------------------------------------------- /supplement/exm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/exm.png -------------------------------------------------------------------------------- /supplement/fig13a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/fig13a.png 
-------------------------------------------------------------------------------- /supplement/fig13b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/fig13b.png -------------------------------------------------------------------------------- /supplement/fig1A.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/fig1A.png -------------------------------------------------------------------------------- /supplement/fisseq.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/fisseq.png -------------------------------------------------------------------------------- /supplement/hcr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/hcr.png -------------------------------------------------------------------------------- /supplement/historical_barcoding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/historical_barcoding.png -------------------------------------------------------------------------------- /supplement/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Museum of Spatial Transcriptomics" 3 | author: 4 | - Lambda Moses 5 | - Lior Pachter 6 | date: "`r Sys.Date()`" 7 | site: bookdown::bookdown_site 8 | documentclass: book 9 | bibliography: "museumst.bib" 10 | #csl: nature-genetics.csl 11 | link-citations: yes 12 | description: "Museum of Spatial Transcriptomics" 13 | github-repo: "pachterlab/LP_2021" 14 | --- 15 | 16 | # Preface {-} 17 | 18 | This supplement to [the paper Museum of Spatial Transcriptomics](https://www.nature.com/articles/s41592-022-01409-2) and the associated [database of spatial transcriptomics literature](https://docs.google.com/spreadsheets/d/1sJDb9B7AtYmfKv4-m8XR7uc3XXw_k4kGSout8cqZ8bY/edit?usp=sharing) is inspired by museum catalogs that provide insight and detail to further understanding of the exhibits. The results presented are based on code that can be run interactively on [RStudio Cloud](https://rstudio.cloud/project/2492054). We present key analyses of metadata curated for the database, and provide further analyses and results beyond what could be included here in the `more_analyses` directory of this repository. The markdown that generates this text is on GitHub, and is version controlled so that its development can be tracked now and in the future. Please notify us of errors, omissions, or other suggestions via submission of issues on GitHub: https://github.com/pachterlab/LP_2021 To submit new entries to the database, please fill out this [Google Form](https://forms.gle/HjQD9x6AMjR7C62SA). If the text in some figures are too small to read, then right click on the figure to open in a new tab to zoom in. 
19 | 20 | ## Quick stats 21 | ```{r, echo=FALSE} 22 | knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE) 23 | ``` 24 | 25 | ```{r} 26 | library(museumst) 27 | ``` 28 | 29 | ```{r} 30 | nms <- c("ROI selection", "smFISH", "NGS barcoding", "ISS", "De novo", "Analysis") 31 | data_sheets <- read_metadata(nms, update = TRUE, cache = FALSE) 32 | ``` 33 | 34 | ```{r} 35 | n_pubs <- length(unique(data_sheets$title)) 36 | n_data <- length(unique(data_sheets$title[data_sheets$sheet != "Analysis"])) 37 | n_analysis <- length(unique(data_sheets$title[data_sheets$sheet == "Analysis"])) 38 | ``` 39 | 40 | ```{r} 41 | nms <- c("Prequel", "Prequel analysis") 42 | prequel <- read_metadata(nms, update = TRUE) 43 | n_prequel <- length(unique(prequel$title)) 44 | ``` 45 | 46 | ```{r} 47 | n_current_insts <- length(unique(data_sheets$short_name)) 48 | n_current_cities <- length(unique(paste(data_sheets$country, data_sheets$city))) 49 | n_current_countries <- length(unique(data_sheets$country)) 50 | ``` 51 | 52 | ```{r} 53 | analysis <- read_metadata(sheet_use = "Analysis", update = TRUE) 54 | n_packages <- sum(complete.cases(analysis[,c("repo", "documented", "packaged", "CRAN/Bioc/pip/conda")])) 55 | ``` 56 | 57 | As of `r Sys.Date()`, this database contains: 58 | 59 | * `r n_pubs` current era publications, `r n_data` of which are for data collection and `r n_analysis` are for data analysis (see Chapter 1 for definition of prequel and current eras) 60 | * `r n_prequel` prequel era publications 61 | * Current era publications from `r n_current_insts` institutions[^1] in `r n_current_cities` cities in `r n_current_countries` countries 62 | * `r n_packages` current era data analysis software packages whose source code is available online 63 | 64 | ## Running the code 65 | 66 | This document is built with the `bookdown` package from a collection of R Markdown files. How some of figures look depends on parameters that can be changed, such as size of bins when binning number of publications in time to show a trend. The source code is on [RStudio Cloud](https://rstudio.cloud/project/2492054). The dependencies are pre-installed in the RStudio Cloud project. By default, when the database is queried by code, the most up to date version is used, which can be newer than the rendered static version on github.io. To build the document in RStudio Cloud, run this in the R console: 67 | 68 | ```{r, eval=FALSE, echo=TRUE} 69 | bookdown::render_book("index.Rmd", output_format = "bookdown::bs4_book") 70 | ``` 71 | 72 | If you are cloning this repo into a fresh RStudio Cloud project or a fresh machine, install the packages required to build the book as follows: 73 | 74 | First install `remotes` with `install.packages("remotes")`. Then use `remotes:install_deps(dependencies = TRUE)` to install all required packages from CRAN, Bioconductor, and GitHub. So in short, 75 | 76 | ```{r, eval=FALSE, echo=TRUE} 77 | install.packages("remotes") 78 | remotes::install_deps(dependencies = TRUE) 79 | ``` 80 | 81 | Because many packages are installed, the installation can be sped up with the argument `Ncpus` in `install_deps()` to specify the number of CPU cores to use to install packages in parallel, such as `Ncpus = 2L` for 2 cores. The free plan of RStudio Cloud only has 1 core, but this argument can be used when multiple cores are available. 82 | 83 | By default, the most up to date version of the database is downloaded for analyses in this book. 
However, as the `museumst` R package written for these analyses contains a cached version of the database, historical versions of the database can be viewed by installing older versions of `museumst` and setting `update = FALSE` when calling `museumst::read_metadata()` when running code from this book on RStudio Cloud or your computer. Older versions of `museumst` can be installed with 84 | 85 | ```{r, eval=FALSE, echo=TRUE} 86 | remotes::install_github("pachterlab/museumst", ref = "v0.0.0.9016") 87 | ``` 88 | 89 | where `ref` refers to a release. Release history of `museumst` can be seen [here](https://github.com/pachterlab/museumst/releases). Documentation of `museumst` can be seen [here](https://pachterlab.github.io/museumst/). 90 | 91 | [^1]: Caveat: definition of "institution" is more complicated. While it usually means a university or company, whether named institutions, schools, and hospitals affiliated with a university count as institutions distinct from that university is somewhat inconsistent within this database. Usually those are considered separate institutions if they reside in a different city from the rest of the university or are sometimes listed independently from the university. However, what counts as a city is also somewhat complicated as different countries have different administrative structures. 92 | -------------------------------------------------------------------------------- /supplement/langs_doc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/langs_doc.png -------------------------------------------------------------------------------- /supplement/lcm_topics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/lcm_topics.png -------------------------------------------------------------------------------- /supplement/lcm_words.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/lcm_words.png -------------------------------------------------------------------------------- /supplement/merfish.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/merfish.png -------------------------------------------------------------------------------- /supplement/more_analyses/analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Analysis" 3 | author: "Lambda Moses" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{Analysis} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ```{r setup, include=FALSE} 13 | knitr::opts_chunk$set(echo = TRUE) 14 | ``` 15 | 16 | Here I explore the history and geography of spatial transcriptomics data analysis method through the metadata I collected. 17 | ```{r} 18 | library(museumst) 19 | library(purrr) 20 | library(dplyr) 21 | library(ggplot2) 22 | library(sf) 23 | theme_set(theme_bw()) 24 | ``` 25 | 26 | # Import data 27 | By default, the `read_metadata` function reads the appropriate sheet from a cache within this package. 
This may be outdated, since the Google Sheets are constantly updated. Use the argument `update = TRUE` to download the most up to date version. The default is `FALSE` to avoid API limits during automated CRAN checks. 28 | ```{r} 29 | sheet <- read_metadata("Analysis") 30 | ``` 31 | 32 | # Number of publications 33 | How many publications per year? 34 | 35 | ```{r} 36 | anyDuplicated(sheet$title) 37 | ``` 38 | 39 | How many publications are there in total in this sheet? 40 | ```{r} 41 | nrow(sheet) 42 | ``` 43 | 44 | ```{r} 45 | # For maps later in this notebook 46 | city_gc <- geocode_inst_city(sheet) 47 | pubs_on_map2 <- partial(pubs_on_map, city_gc = city_gc) 48 | ``` 49 | 50 | ## Overall 51 | By default, preprints are excluded from plots of number of publications over time, since the timing of preprints is incoherent to that of published papers. Including preprints will inflate the number of publications in recent months, and if we make the same plot a few months later, that inflation we see now will be gone and moved to a more recent date. You can add the `preprints = TRUE` argument to all functions plotting things over time in this package to include preprints. You can also use the `binwidth` argument to specify bin width in days. The default is 365 days. 52 | ```{r} 53 | pubs_per_year(sheet) 54 | ``` 55 | 56 | ## By category 57 | By the category of problem to address 58 | ```{r} 59 | category <- unnest_cat(sheet, category) 60 | ``` 61 | 62 | ```{r} 63 | pubs_per_cat(category, category) 64 | ``` 65 | 66 | ```{r, fig.width=4, fig.height=4} 67 | pubs_per_year(category, facet_by = "category", n_top = 5, sort_by = "recent_count") 68 | ``` 69 | 70 | ```{r} 71 | principle <- unnest_cat(sheet, core_principle) 72 | ``` 73 | 74 | ```{r} 75 | pubs_per_cat(principle, core_principle) 76 | ``` 77 | 78 | ## By species 79 | These are the species the computational methods were demonstrated on 80 | 81 | ```{r} 82 | species <- unnest_cat(sheet, species) 83 | ``` 84 | 85 | ```{r} 86 | pubs_per_cat(species, species) 87 | ``` 88 | 89 | ```{r} 90 | pubs_per_cat(species, species, n_top = 5, isotype = TRUE, img_unit = 2) 91 | ``` 92 | 93 | I debated for a while which image to use for humans. In order to avoid racist and sexist connotations, I picked the skull as non-experts can't tell the race and sex from the skull. If you think it's bad to use a skull, then I'll use a Black, Hispanic, Middle Eastern, North African, Central Asian, or South Asian woman's portrait. I stand with the oppressed. 94 | 95 | ## By journal 96 | ```{r} 97 | sort(table(sheet$journal)) 98 | ``` 99 | 100 | ## Location 101 | ### General 102 | Just some barplots for number of publications per institution, city, and country. 103 | ```{r} 104 | pubs_per_cat(sheet, country) 105 | ``` 106 | 107 | How about per capita? Actually this probably does not say much since many researchers in the West are immigrants. Then it raises the burning political question of who counts as the "population". 108 | 109 | ```{r} 110 | pubs_per_capita(sheet, plot = "bar") 111 | ``` 112 | 113 | How about country over time? 114 | ```{r, fig.height=12, fig.width=6} 115 | pubs_per_year(sheet, facet_by = "country") 116 | ``` 117 | Now look at cities. 
118 | 119 | ```{r} 120 | pubs_per_cat(sheet, city) 121 | ``` 122 | 123 | Institutions 124 | ```{r} 125 | pubs_per_cat(sheet, institution) 126 | ``` 127 | 128 | ### Worldwide 129 | ```{r} 130 | pubs_on_map2(sheet) 131 | ``` 132 | 133 | Let me also plot the per capita thing on a map, as a choropleth 134 | ```{r} 135 | pubs_per_capita(sheet) 136 | ``` 137 | 138 | Break down by species 139 | ```{r, fig.width=8, fig.height=4} 140 | pubs_on_map2(species, facet_by = "species") 141 | ``` 142 | 143 | ### Europe 144 | Plot a map just for Europe. A problem here is that if I simply filter the `world` sf object, I'll get some islands away from what we usually think is Europe. Kind of feel bad for Russians since Russia is so large that it makes the map look bad and I don't have a paper from Russia in this spreadsheet. 145 | 146 | ```{r} 147 | pubs_on_map(sheet, city_gc, zoom = "europe") 148 | ``` 149 | 150 | ```{r} 151 | pubs_per_capita(sheet, "europe") 152 | ``` 153 | 154 | I didn't realize that Switzerland and Netherlands were pretty great. Again, this is a small sample size, so I take this with a grain of salt. 155 | 156 | ```{r} 157 | pubs_on_map2(species, zoom = "europe", facet_by = "species") 158 | ``` 159 | 160 | Maybe I just made this plot for fun. Not sure what to say about it. 161 | 162 | ### USA 163 | Also a plot just for America. I did not encounter a publication in this spreadsheet that is from Hawaii or Alaska, but I'll include Hawaii and Alaska in the map anyway in case I get one in the future. 164 | 165 | ```{r} 166 | pubs_on_map(sheet, zoom = "usa", city_gc = city_gc) 167 | ``` 168 | 169 | ```{r} 170 | pubs_per_capita(sheet, "usa") 171 | ``` 172 | 173 | ```{r, fig.width=8, fig.height=3} 174 | pubs_on_map2(species, zoom = "usa", facet_by = "species") 175 | ``` 176 | 177 | # Word cloud 178 | ## Titles 179 | ```{r, fig.height=7, fig.width=7} 180 | plot_wordcloud(sheet) 181 | ``` 182 | 183 | ## Summaries 184 | ```{r, fig.height=7, fig.width=7} 185 | plot_wordcloud(sheet, summary) 186 | ``` 187 | 188 | ```{r, fig.height=7, fig.width=7} 189 | plot_wordcloud(sheet, core_principle, scale = c(5, 0.1)) 190 | ``` 191 | 192 | ## Tissues 193 | ```{r, fig.width=4, fig.height=4} 194 | plot_wordcloud(sheet, tissue) 195 | ``` 196 | 197 | ## Over time 198 | 199 | ```{r} 200 | range(sheet$date_published) 201 | ``` 202 | 203 | ```{r, fig.height=6, fig.width=6} 204 | plot_wordcloud(sheet, year_min = 2009, year_max = 2015, scale = c(5, 0.1)) 205 | ``` 206 | 207 | ```{r, fig.height=7, fig.width=7} 208 | plot_wordcloud(sheet, year_min = 2015, year_max = 2021) 209 | ``` 210 | 211 | ## Department names 212 | 213 | ```{r, fig.height=7, fig.width=7} 214 | plot_wordcloud(sheet, col_use = "department", other_stop_words = inst_words, 215 | scale = c(5, 0.1)) 216 | ``` 217 | 218 | # Programming languages 219 | ```{r} 220 | langs <- unnest_cat(sheet, language, c("documented", "packaged", "CRAN/Bioc/pip/conda")) 221 | ``` 222 | 223 | Here each icon stands for 2 publications. 224 | ```{r} 225 | pubs_per_cat(langs, language, n_top = 5, isotype = TRUE, img_unit = 2) 226 | ``` 227 | 228 | Not sure of MathWorks will get mad at me for using their logo this way, since they don't want people to put their logo into a pattern and I'm not entirely sure if this counts as pattern. 229 | 230 | # Data and code availability 231 | 232 | How many publications have provided a code repo? 
233 | ```{r} 234 | hist_bool(sheet, !is.na(repo), preprints = TRUE) 235 | ``` 236 | 237 | It seems that more recent publications are more likely to provide a repo. Is this significant? 238 | ```{r} 239 | test_year_bool(sheet, !is.na(repo)) 240 | ``` 241 | 242 | Yep, it's significant. How about whether the code is well-documented? 243 | 244 | ```{r, fig.width=6, fig.height=9} 245 | pubs_per_year(langs, "language", binwidth = 150) 246 | ``` 247 | 248 | ```{r} 249 | hist_bool_line(langs, documented, facet_by = "language", preprints = TRUE, n_top = 5) 250 | ``` 251 | 252 | It seems that R, Python, and C++ packages are more likely to be well-documented, and MATLAB ones are less likely. Is that MATLAB culture? 253 | ```{r} 254 | hist_bool_line(langs, `CRAN/Bioc/pip/conda`, "language", preprints = TRUE, n_top = 5) 255 | ``` 256 | 257 | It seems that most packages are not on those public repos, not even R packages. Why? Too lazy to write documentations? 258 | ```{r} 259 | langs %>% 260 | filter(complete.cases(.)) %>% 261 | glm(data = ., documented ~ `CRAN/Bioc/pip/conda`, family = "binomial") %>% 262 | summary() 263 | ``` 264 | 265 | So it seems that packages on those public repos sort of tend to be better documented, though this is not really significant. 266 | 267 | ```{r} 268 | sessionInfo() 269 | ``` 270 | 271 | -------------------------------------------------------------------------------- /supplement/more_analyses/array.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "NGS barcoding" 3 | author: "Lambda Moses" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{Array} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ```{r setup, include=FALSE} 13 | knitr::opts_chunk$set(echo = TRUE) 14 | ``` 15 | 16 | Here I explore the history and geography of spatial transcriptomics based on array capture through the metadata I collected. 17 | ```{r} 18 | library(museumst) 19 | library(ggplot2) 20 | library(purrr) 21 | library(dplyr) 22 | library(sf) 23 | theme_set(theme_bw()) 24 | ``` 25 | 26 | # Import data 27 | By default, the `read_metadata` function reads the appropriate sheet from a cache within this package. This may be outdated, since the Google Sheets are constantly updated. Use the argument `update = TRUE` to download the most up to date version. The default is `FALSE` to avoid API limits during automated CRAN checks. 28 | ```{r} 29 | sheet <- read_metadata("NGS barcoding") 30 | ``` 31 | 32 | # Number of publications 33 | How many publications per year? 34 | ```{r} 35 | # Remove duplicates as different datasets in the same publication have their own rows 36 | publications <- get_pubs_df(sheet, c("repo", "accession")) 37 | ``` 38 | 39 | How many publications are there in total in this sheet? 40 | ```{r} 41 | nrow(publications) 42 | ``` 43 | 44 | ```{r} 45 | # For maps later in this notebook 46 | city_gc <- geocode_inst_city(publications) 47 | pubs_on_map2 <- partial(pubs_on_map, city_gc = city_gc) 48 | ``` 49 | 50 | ## Overall 51 | Number of publications per year. Preprints are excluded from plots over years as it takes months to publish so the dates of the preprints are incoherent with those for the other papers. 
52 | ```{r} 53 | events <- read_major_events() 54 | ``` 55 | 56 | ```{r, fig.width=7, fig.height=3} 57 | events %>% 58 | filter(category == "array") %>% 59 | plot_timeline(rep(c(1, -1), length.out = nrow(.)), 60 | expand_x = c(0.1, 0.1), expand_y = c(0.05, 0.05)) 61 | ``` 62 | 63 | By default, preprints are excluded from plots of number of publications over time, since the timing of preprints is incoherent to that of published papers. Including preprints will inflate the number of publications in recent months, and if we make the same plot a few months later, that inflation we see now will be gone and moved to a more recent date. You can add the `preprints = TRUE` argument to all functions plotting things over time in this package to include preprints. You can also use the `binwidth` argument to specify bin width in days. The default is 365 days. 64 | ```{r} 65 | pubs_per_year(publications, binwidth = 100, preprints = TRUE) 66 | ``` 67 | 68 | Again, preprints are not included here since their timing is different from that of publications. 69 | 70 | ## By method 71 | How about when broken down by method (not for data analysis sheets)? 72 | 73 | ```{r} 74 | methods <- unnest_cat(sheet, method) 75 | ``` 76 | 77 | ```{r, fig.width=4, fig.height=4} 78 | pubs_per_year(methods, facet_by = "method", binwidth = 100, preprints = TRUE, 79 | sort_by = "recent") 80 | ``` 81 | 82 | I don't know how much this actually says due to the small sample size. This is also kind of confusing since Geo-seq really is a form of LCM coupled with SMART-seq, so it should be categorized as LCM, but the authors gave it a new name. 83 | 84 | ## By species 85 | 86 | ```{r} 87 | species <- unnest_cat(sheet, species) 88 | ``` 89 | 90 | ```{r} 91 | pubs_per_cat(species, species) 92 | ``` 93 | 94 | ```{r, fig.width=4, fig.height=2} 95 | pubs_per_cat(species, species, isotype = TRUE, img_unit = 2) 96 | ``` 97 | 98 | I debated for a while which image to use for humans. In order to avoid racist and sexist connotations, I picked the skull as non-experts can't tell the race and sex from the skull. If you think it's bad to use a skull, then I'll use a Black, Hispanic, Middle Eastern, North African, Central Asian, or South Asian woman's portrait. I stand with the oppressed. 99 | 100 | ## By journal 101 | ```{r} 102 | sort(table(publications$journal)) 103 | ``` 104 | 105 | ## Location 106 | ### General 107 | Just some barplots for number of publications per institution, city, and country. 108 | ```{r} 109 | pubs_per_cat(publications, country) 110 | ``` 111 | 112 | How about per capita? Actually this probably does not say much since many researchers in the West are immigrants. Then it raises the burning political question of who counts as the "population". 113 | 114 | ```{r} 115 | pubs_per_capita(publications, plot = "bar") 116 | ``` 117 | 118 | How about country over time? There might not be enough publications to show a trend. 119 | ```{r} 120 | pubs_per_year(publications, facet_by = "country", binwidth = 150) 121 | ``` 122 | Now look at cities. 123 | 124 | ```{r} 125 | pubs_per_cat(publications, city) 126 | ``` 127 | 128 | Institutions 129 | ```{r} 130 | pubs_per_cat(publications, institution) 131 | ``` 132 | 133 | OK, now here comes the maps! 
134 | 135 | ### Worldwide 136 | ```{r} 137 | pubs_on_map2(publications) 138 | ``` 139 | 140 | Let me also plot the per capita thing on a map, as a choropleth 141 | ```{r} 142 | pubs_per_capita(publications) 143 | ``` 144 | 145 | Break down by species 146 | ```{r, fig.width=8, fig.height=3} 147 | pubs_on_map2(species, facet_by = "species") 148 | ``` 149 | 150 | By method 151 | ```{r, fig.width=8, fig.height=4} 152 | pubs_on_map2(methods, facet_by = "method") 153 | ``` 154 | 155 | It seems that ST is the only method that did spread. I think Visium did too; it's just that the papers haven't been published yet. 156 | 157 | ### Europe 158 | Plot a map just for Europe. A problem here is that if I simply filter the `world` sf object, I'll get some islands away from what we usually think is Europe. Kind of feel bad for Russians since Russia is so large that it makes the map look bad and I don't have a paper from Russia in this spreadsheet. 159 | 160 | ```{r} 161 | pubs_on_map2(publications, zoom = "europe") 162 | ``` 163 | 164 | ```{r} 165 | pubs_per_capita(publications, "europe") 166 | ``` 167 | 168 | ```{r} 169 | pubs_on_map2(species, zoom = "europe", facet_by = "species") 170 | ``` 171 | 172 | Maybe I just made this plot for fun. Not sure what to say about it. 173 | 174 | ```{r} 175 | pubs_on_map2(methods, zoom = "europe", facet_by = "method") 176 | ``` 177 | 178 | So all the other methods are American 179 | 180 | ### USA 181 | Also a plot just for America. I did not encounter a publication in this spreadsheet that is from Hawaii or Alaska, but I'll include Hawaii and Alaska in the map anyway in case I get one in the future. 182 | 183 | ```{r} 184 | pubs_on_map2(publications, zoom = "usa") 185 | ``` 186 | 187 | ```{r} 188 | pubs_per_capita(publications, "usa") 189 | ``` 190 | 191 | ```{r} 192 | pubs_on_map2(species, zoom = "usa", facet_by = "species") 193 | ``` 194 | 195 | ```{r} 196 | pubs_on_map2(methods, zoom = "usa", facet_by = "method") 197 | ``` 198 | 199 | # Word cloud 200 | ## Titles 201 | ```{r, fig.height=6, fig.width=6} 202 | plot_wordcloud(sheet) 203 | ``` 204 | 205 | ```{r, fig.height=5, fig.width=5} 206 | plot_wordcloud(sheet, species_use = "Mus musculus") 207 | ``` 208 | 209 | ```{r, fig.height=7, fig.width=7} 210 | plot_wordcloud(sheet, species_use = "Homo sapiens") 211 | ``` 212 | 213 | I think the prominence of "spatial" is probably driven by ST. 214 | 215 | ## Tissues 216 | ```{r, fig.height=6, fig.width=6} 217 | plot_wordcloud(sheet, col_use = "tissue", scale = c(5, 0.1)) 218 | ``` 219 | 220 | ```{r} 221 | plot_wordcloud(sheet, species_use = "Mus musculus", col_use = "tissue", scale = c(5, 0.1)) 222 | ``` 223 | 224 | ```{r, fig.height=5, fig.width=5} 225 | plot_wordcloud(sheet, species_use = "Homo sapiens", col_use = "tissue", scale = c(5, 0.1)) 226 | ``` 227 | 228 | ## Department names 229 | 230 | ```{r, fig.height=6, fig.width=6} 231 | plot_wordcloud(sheet, col_use = "department", other_stop_words = inst_words, scale = c(5, 0.1)) 232 | ``` 233 | 234 | I think that's driven by Department of Gene Technology from Sweden 235 | 236 | ## Downstream analyses 237 | ```{r, fig.height=5, fig.width=5} 238 | plot_wordcloud(sheet, "downstream") 239 | ``` 240 | 241 | # Programming languages 242 | ```{r} 243 | langs <- unnest_cat(sheet, language) 244 | ``` 245 | 246 | ```{r} 247 | pubs_per_cat(langs, language, isotype = TRUE, img_unit = 2) 248 | ``` 249 | 250 | # Data and code availability 251 | 252 | How many publications have provided a code repo? 
As with any plot over time, preprints are excluded. 253 | ```{r} 254 | hist_bool(publications, !is.na(repo), binwidth = 100, preprints = TRUE) 255 | ``` 256 | 257 | How many publications have provided an accession for sequencing data? 258 | ```{r} 259 | hist_bool(publications, !is.na(accession), binwidth = 100, preprints = TRUE) 260 | ``` 261 | 262 | ```{r} 263 | hist_bool_line(publications, !is.na(accession), binwidth = 100, preprints = TRUE) 264 | ``` 265 | 266 | I don't think it's significant. 267 | ```{r} 268 | sessionInfo() 269 | ``` 270 | 271 | -------------------------------------------------------------------------------- /supplement/more_analyses/comparison.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Comparisons between sheets" 3 | author: "Lambda Moses" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{Comparisons between sheets} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ```{r setup, include=FALSE} 13 | knitr::opts_chunk$set(echo = TRUE) 14 | ``` 15 | 16 | Here I compare different sheets and do the analysis for all sheets in the original trilogy together. Here select plots from the notebooks for the individual sheets are plotted together for easier comparison. 17 | 18 | ```{r} 19 | library(museumst) 20 | library(gganimate) 21 | library(purrr) 22 | library(dplyr) 23 | library(tidytext) 24 | library(tidyr) 25 | library(ggplot2) 26 | library(forcats) 27 | library(scales) 28 | library(ggrepel) 29 | library(sf) 30 | theme_set(theme_bw()) 31 | ``` 32 | 33 | # Data 34 | How does the trend in number of papers in the original trilogy compare to that in the prequel? 35 | 36 | By default, the `read_metadata` function reads the appropriate sheet from a cache within this package. This may be outdated, since the Google Sheets are constantly updated. Use the argument `update = TRUE` to download the most up to date version. The default is `FALSE` to avoid API limits during automated CRAN checks. 37 | ```{r} 38 | nms <- c("Prequel", "ROI selection", "smFISH", "NGS barcoding", "ISS", "No imaging") 39 | data_sheets <- read_metadata(nms, update = TRUE) 40 | ``` 41 | 42 | ```{r} 43 | data_sheets <- data_sheets %>% 44 | mutate(era = case_when(sheet == "Prequel" ~ "prequel", 45 | TRUE ~ "current")) 46 | ``` 47 | 48 | ## Number of publications over time 49 | 50 | See the number of publications; for original trilogy sheets, each dataset has its own row so often one publication has multiple rows. 51 | ```{r} 52 | publications <- get_pubs_df(data_sheets, "era") 53 | ``` 54 | 55 | Number of publications in each sheet 56 | ```{r} 57 | data_sheets %>% 58 | select(title, sheet) %>% 59 | distinct() %>% 60 | count(sheet) 61 | ``` 62 | 63 | ```{r} 64 | # For maps later in this notebook 65 | city_gc <- geocode_inst_city(publications) 66 | pubs_on_map2 <- partial(pubs_on_map, city_gc = city_gc) 67 | ``` 68 | 69 | By default, preprints are excluded from plots of number of publications over time, since the timing of preprints is incoherent to that of published papers. Including preprints will inflate the number of publications in recent months, and if we make the same plot a few months later, that inflation we see now will be gone and moved to a more recent date. You can add the `preprints = TRUE` argument to all functions plotting things over time in this package to include preprints. You can also use the `binwidth` argument to specify bin width in days. 
The default is 365 days. 70 | ```{r} 71 | era_freqpoly(publications, era, preprints = TRUE) 72 | ``` 73 | 74 | Here we see that what's currently known as spatial transcriptomics is growing more than prequel spatial transcriptomics back in the 1990s and 2000s. Or maybe my prequel collection is incomplete, so I'm not so sure how big that peak is. What's more remarkable is the number of publications in 2020. Now it's near the end of May. We're almost half way through the year. Considering that most publications are from Western countries, for which December is the holiday season, I expect this number to double by the end of 2020. 75 | 76 | Let me plot these lines not with the absolute year, but how many years since the first publication in the category to see how quickly these things grew. This will definitely change once I make the LCM collection more complete. Since using LCM + transcriptomics started at least as early as 2001 (the oldest paper I can find that did this is from 2001), I would expect the FALSE curve to rise earlier than the TRUE curve. 77 | ```{r} 78 | era_freqpoly(publications, era, since_first = TRUE, preprints = TRUE) 79 | ``` 80 | 81 | So far they seem pretty parallel (which most likely won't be the case when I make the LCM collection more complete), though I certainly don't think the current era has peaked yet. Based on this, I think the current era will be much greater than the prequel era. 82 | 83 | Here I'm making an animation on a map for prequel and original trilogy separately for number of publications per institution. These take a while to run to render all those frames, so not run for CRAN checks. 84 | 85 | ```{r, eval=FALSE} 86 | world_prequel <- publications %>% 87 | filter(era == "prequel") %>% 88 | pubs_on_map(inst_gc, city_gc, per_year = TRUE) 89 | animate(world_prequel, nframes = 200) 90 | anim_save("output/world_prequel.gif") 91 | ``` 92 | 93 | ```{r, eval=FALSE} 94 | europe_prequel <- publications %>% 95 | filter(era == "prequel") %>% 96 | pubs_on_map(inst_gc, city_gc, zoom = "europe", per_year = TRUE) 97 | animate(europe_prequel, nframes = 200) 98 | anim_save("output/europe.gif") 99 | ``` 100 | 101 | ```{r, eval=FALSE} 102 | usa_prequel <- publications %>% 103 | filter(era == "prequel") %>% 104 | pubs_on_map(inst_gc, city_gc, zoom = "usa", per_year = TRUE) 105 | animate(usa_prequel, nframes = 200) 106 | anim_save("output/usa.gif") 107 | ``` 108 | 109 | ```{r, eval=FALSE} 110 | world_current <- publications %>% 111 | filter(era == "current") %>% 112 | pubs_on_map(inst_gc, city_gc, per_year = TRUE) 113 | animate(world_current, nframes = 150) 114 | anim_save("output/world_current.gif") 115 | ``` 116 | 117 | ```{r, eval=FALSE} 118 | europe_current <- publications %>% 119 | filter(era == "current") %>% 120 | pubs_on_map(inst_gc, city_gc, per_year = TRUE, zoom = "europe") 121 | animate(europe_current, nframes = 80) 122 | anim_save("output/europe_current.gif") 123 | ``` 124 | 125 | ```{r, eval=FALSE} 126 | usa_current <- publications %>% 127 | filter(era == "current") %>% 128 | pubs_on_map(inst_gc, city_gc, per_year = TRUE, zoom = "usa") 129 | animate(usa_current) 130 | anim_save("output/usa_current.gif") 131 | ``` 132 | 133 | ## For all original trilogy or current era data 134 | ```{r} 135 | current <- data_sheets %>% 136 | filter(era == "current") %>% 137 | select(date_published:journal, country:year, sheet) %>% 138 | distinct() 139 | ``` 140 | 141 | ```{r} 142 | current2 <- publications %>% filter(era == "current") 143 | ``` 144 | 145 | How many per sheet? 
Again, I definitely don't think the LCM collection (within Microdissection) is anywhere close to complete, so microdissection is definitely the most popular one so far. LCM abstracts from PubMed search are analyzed separately. 146 | ```{r} 147 | pubs_per_cat(current, sheet) 148 | ``` 149 | 150 | How about when broken down by the specific methods, like Visium and MERFISH? 151 | ```{r} 152 | methods_current <- data_sheets %>% 153 | select(title, date_published, method, sheet, era) %>% 154 | filter(era == "current", !is.na(method)) %>% 155 | distinct() 156 | ``` 157 | 158 | ```{r, fig.width=6, fig.height=5} 159 | pubs_per_cat(methods_current, method) 160 | ``` 161 | 162 | I'm not sure what to do with LCM, since I did a separate analysis on LCM literature from PubMed search. Here we see Visium has really taken off though it just got commercialized late last year. 163 | 164 | Sheet over time. Those on RStudio Cloud: Please try different binwidths! 165 | ```{r, fig.width=5, fig.height=5} 166 | pubs_per_year(current, "sheet", binwidth = 150, sort_by = "recent") 167 | ``` 168 | 169 | Also break this down further: this is to reproduce this plot in the other notebooks, faceted by methods. 170 | ```{r, fig.width=6, fig.height=6} 171 | binwidths <- tribble(~ sheet, ~ binwidth, 172 | "Prequel", 365, 173 | "ROI selection", 365, 174 | "smFISH", 365, 175 | "NGS barcoding", 100, 176 | "ISS", 200) 177 | pmap(binwidths, ~ data_sheets %>% 178 | filter(sheet == ..1) %>% 179 | unnest_cat(col_use = method) %>% 180 | pubs_per_year(facet_by = "method", binwidth = ..2) + 181 | ggtitle(..1)) 182 | ``` 183 | 184 | Also plot the most common methods 185 | ```{r, fig.width=4, fig.height=4} 186 | methods_current %>% 187 | filter(method != "LCM") %>% 188 | pubs_per_year(facet_by = "method", binwidth = 150, preprints = TRUE, n_top = 4, 189 | sort_by = "recent") 190 | ``` 191 | 192 | Species in all the current sheets. For prequels, it has already been plotted in the prequel notebook. 193 | ```{r} 194 | species <- data_sheets %>% 195 | filter(era == "current", !is.na(species)) %>% 196 | unnest_cat(species) 197 | ``` 198 | 199 | ```{r} 200 | pubs_per_cat(species, species) 201 | ``` 202 | 203 | I thought that there would be more mouse papers than human ones, but there're slightly more human ones. 204 | 205 | Another fun isotype plot: 206 | ```{r} 207 | pubs_per_cat(species, species, n_top = 5, isotype = TRUE, img_unit = 5) 208 | ``` 209 | 210 | ```{r} 211 | pubs_per_cat(current2, country) 212 | ``` 213 | 214 | ```{r, fig.width=6, fig.height=6} 215 | pubs_per_cat(current2, city) 216 | ``` 217 | 218 | ```{r, fig.width=6, fig.height=5} 219 | pubs_per_cat(current2, institution) 220 | ``` 221 | 222 | ```{r} 223 | pubs_per_capita(current2) 224 | ``` 225 | 226 | Unlike LCM, this is more concentrated in the West. We do have some LCM papers from countries like Egypt, India, and Turkey. Even prequel papers are more spread out. 227 | ```{r} 228 | pubs_per_capita(current2, plot = "bar") 229 | ``` 230 | 231 | ```{r} 232 | pubs_per_capita(current, "europe") 233 | ``` 234 | 235 | ```{r} 236 | pubs_per_capita(current, "usa") 237 | ``` 238 | 239 | ```{r} 240 | pubs_per_capita(current, "usa", "bar") 241 | ``` 242 | 243 | ```{r} 244 | pubs_on_map2(current) 245 | ``` 246 | 247 | Bug: Need to distinguish between Cambridge, UK and Cambridge, MA in bar plots, though this is not a problem on maps. 
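A minimal sketch of one way to avoid this ambiguity in the bar plots, assuming `current2` (defined above) has the `city` and `country` columns used elsewhere in this notebook; this is an illustration rather than code from the book:

```{r, eval=FALSE}
library(dplyr)
# Count publications by city *and* country so that, for example,
# Cambridge, UK and Cambridge, MA (USA) are tallied separately.
current2 %>%
  mutate(city_label = paste(city, country, sep = ", ")) %>%
  count(city_label, sort = TRUE)
```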
248 | ```{r} 249 | pubs_on_map2(current, zoom = "europe") 250 | ``` 251 | 252 | ```{r} 253 | pubs_on_map2(current, zoom = "usa") 254 | ``` 255 | 256 | ### Text mining titles 257 | ```{r, fig.width=5, fig.height=5} 258 | data_sheets %>% 259 | filter(era == "current") %>% 260 | plot_wordcloud(min.freq = 1) 261 | ``` 262 | 263 | Unlike in Prequel, "database" is not at all prominent here. Which words have the most different frequencies between prequel and current era data papers? Here I use proportion of words to normalize the different sizes of the corpus. The most extreme case is when a word is used a lot in one era and not used at all in the other. 264 | ```{r} 265 | publications %>% 266 | count(era) 267 | ``` 268 | 269 | ```{r} 270 | word_prop_scatter(publications, title, era = era) 271 | ``` 272 | 273 | Cool. It seems that the more frequent words diverge more from y = x. Here we see in the current era, there's more emphasis on single cell, transcriptomics, and the word "spatial", while in the prequel era, there was more emphasis on database, patterns, and the very notion of "gene expression". Is it that the prequel era grew in the same period when the human and mouse genomes were sequenced and culminated around the same time when the human reference genome was published; back then gene expression was considered really cool, though now this notion is so common place that we no longer make a big deal out of it? Is this why the "spatial" and "single cell" are emphasized, because it's been so hard that we make a big deal out of it when it's done? Well, I don't think many people emphasize H&E in a title, though even if it's central to a paper, because it's so easy and cheap. They would rather say "pathology" or "diagnostics" since the cool part is not H&E, but the insights of the pathologist who analyzed the images. Will there be a day when "spatial" is so common place that people no longer make a big deal out of it, and mention it just like saying "I texted my friend", which is pretty trivial now, though texting was once a big deal? 274 | 275 | ```{r, fig.width=6, fig.height=6} 276 | data_sheets %>% 277 | filter(era == "current") %>% 278 | plot_wordcloud(tissue, min.freq = 1) 279 | ``` 280 | 281 | Also compare tissues in the current era to those in prequel: 282 | ```{r} 283 | tissues <- unnest_cat(data_sheets, tissue, other_cols = "era") 284 | ``` 285 | 286 | ```{r} 287 | word_prop_scatter(tissues, tissue, era) 288 | ``` 289 | 290 | It seems that there's more Drosophila in the prequel era. But also mouse tissues other than the brain, such as the retina and genitourinary tracts (the "tract" and "system" are probably from the GUDMAP atlas). The "bulb" comes from mouse olfactory bulb, which is the go to organ to test array capture based techniques. Cell cultures also showed up here. But also note "cancer", since spatial transcriptomics in tumors is not possible with prequel era technology (except LCM, which I include in current era though the oldest LCM transcriptomics study I found is from 2001) because tumors don't have a stereotypical structure. Actually I already knew this as I was curating the papers, long before making this plot, but this plot confirms my suspicions. 291 | 292 | For current era data papers, which programming languages (if any stated in the paper) are the most popular? 
293 | ```{r} 294 | # prequel sheet doesn't have programming language information anyway though it has that columnn 295 | langs <- unnest_cat(data_sheets, language) 296 | ``` 297 | 298 | ```{r} 299 | pubs_per_cat(langs, language, n_top = 5, isotype = TRUE, img_unit = 5) 300 | ``` 301 | 302 | Just imagining MathWorks getting mad at me for making a pattern out of their logo. Finally, how about department names? 303 | ```{r} 304 | word_prop_scatter(publications, department, era, other_stop_words = inst_words) 305 | ``` 306 | 307 | I already know that prequel era and current era institutions are different, and that there's more emphasis on developmental biology in the prequel era. Here "zoology" makes sense, since there're more "weird" species in the prequel era, like Ciona intestinalis. The "medical" also makes sense, since many current era datasets (more specifically, ST) are from clinical biopsies or autopsies, such as for cancer, gingivis, and ALS. 308 | 309 | # Analysis 310 | ```{r} 311 | nms <- c("Analysis", "Prequel analysis") 312 | analysis_sheets <- read_metadata(nms) 313 | ``` 314 | 315 | Just to see how number of publications changed through time here 316 | ```{r} 317 | pubs_per_year(analysis_sheets, "sheet", binwidth = 180, preprints = TRUE) 318 | ``` 319 | 320 | ```{r} 321 | era_freqpoly(analysis_sheets, sheet, preprints = TRUE) 322 | ``` 323 | 324 | ```{r} 325 | era_freqpoly(analysis_sheets, sheet, since_first = TRUE, preprints = TRUE) 326 | ``` 327 | 328 | Again, in the current era, data analysis papers grow more than in the prequel era (if my collection is complete, which I'm not sure). I don't think we have reached the peak yet. 329 | 330 | Here, again, I'll compare word frequency between current era analysis and prequel analysis 331 | ```{r} 332 | word_prop_scatter(analysis_sheets, era = sheet) 333 | ``` 334 | 335 | Just like for the datasets, there's more emphasis on pattern and gene expression in prequel analysis, and more emphasis on single cell and the very word "spatial" in the current era. For gene expression in prequel analysis, note that most papers from that era in this collection are about automatic classification and annotation of gene expression patterns, so it makes sense to mention "gene expression". 336 | 337 | How about summaries of the methods? 338 | ```{r} 339 | word_prop_scatter(analysis_sheets, summary) 340 | ``` 341 | 342 | Take this with a grain of salt since what I wrote in the summaries in part depends on my mood. Like when I was tired, I would be more concise, and when I was happy, I would be more verbose. Here we see in the current era, with single cell resolution, "cell" is more emphasized, while the BDGP atlas and ABA are not single cell resolution so the word "voxel" is used instead. 343 | 344 | Again, since both prequel analysis and current era analysis sheets have only about 60 titles, I think the dataset is too small for helpful modeling of how word frequencies change through time. 345 | 346 | Finally, department names. 347 | ```{r} 348 | word_prop_scatter(analysis_sheets, department, other_stop_words = inst_words, n_top = 15) 349 | ``` 350 | 351 | We see that a lot of the prequel analysis is from computer science departments. 
I wonder whether this reflects a sociological change, since biologists trained as recently as the 2000s often don't have much computational background and it seems that in the 2010s, especially after about 2015, biology has been becoming more and more computational, so more of the data analysis methods were developed within biology departments. I don't know if this trend is real of whether it's simply the timeline of my personal journy from experimental biology to computational biology. Back in the first half of 2015, I did H&E and Masson's trichrome staining. In September 2015, I changed lab, intending to learn bioinformatics, learnt R, and then started analyzing phenotype data from mouse cohorts before getting into GWAS in late 2016. So probably I only see biology becoming more and more computational because the sort of biology I've been dealing with became more and more computational. Or maybe it is real, since back in 2017, Eleazar Eskin told me about his idea of doing a master's program for postdoc biologists who want to catch up with the computational stuff, and then in 2018, the first time I TA'ed, it was the sort of intro to molecular biology class that I have taken back in my sophomore year in college (2014), and to my surprise, my section only had 2 students and the professor was dismayed that these days some "biologists" are actually mathematicians who don't even know how to do PCR. I was surprised that this class wasn't full and mandatory for biology majors, as was the case when I was a sophomore. 352 | 353 | These are some visual explorations. Burning question: are the differences statistically significant? I think these plots are already compelling enough for now. I may need to look more into text mining literature to see what kind of tests are the best. 354 | 355 | Also, how about plotting how number of publications about spatial data and number of publications about data analysis together? 356 | ```{r} 357 | all_current <- rbind(current, 358 | analysis_sheets[analysis_sheets$sheet == "Analysis", 359 | names(current)]) 360 | ``` 361 | 362 | ```{r} 363 | all_current <- all_current %>% 364 | mutate(type = case_when(sheet == "Analysis" ~ "analysis", 365 | TRUE ~ "data")) %>% 366 | select(-sheet) %>% 367 | distinct() 368 | ``` 369 | 370 | ```{r} 371 | era_freqpoly(all_current, type, preprints = TRUE, binwidth = 120) + 372 | scale_x_date(breaks = scales::breaks_pretty(10)) 373 | ``` 374 | 375 | For those who viewing this vignette on RStudio Cloud, PLEASE do play around with different values of `binwidth`, since different values can give different interpretations. But I do think while before 2019, analysis did lack behind data, analysis has really taken off since 2019. However, this plot does not tell the whole story. It only says how many papers were posted online, but not the quality. Data analysis can still be behind if we don't have quality methods. But there's a fine line between "everything is so bad" and "this field is challenging" when judging that "behind". Another problem with this is that while some types of analysis methods have taken off, some other types have not. But I do think data integration and cell type inference for non-single cell resolution data have taken off. Spatially variable genes, finding characteristic gene patterns? Not really. Preprocessing? Maybe, but still not good enough for smFISH and ISS based data, though for arrayed based data, there's ST Pipeline and Space Ranger. 376 | 377 | How about prequel data vs. analysis? 
378 | ```{r} 379 | all_prequel <- rbind(data_sheets[data_sheets$sheet == "Prequel", names(current)], 380 | analysis_sheets[analysis_sheets$sheet == "Prequel analysis", names(current)]) 381 | ``` 382 | 383 | ```{r} 384 | all_prequel <- all_prequel %>% 385 | mutate(type = case_when(sheet == "Prequel analysis" ~ "analysis", 386 | TRUE ~ "data")) 387 | ``` 388 | 389 | ```{r} 390 | era_freqpoly(all_prequel, type, preprints = TRUE, binwidth = 365) + 391 | scale_x_date(breaks = scales::breaks_pretty(10)) 392 | ``` 393 | 394 | So the analysis was indeed behind. 395 | 396 | ```{r} 397 | sessionInfo() 398 | ``` 399 | -------------------------------------------------------------------------------- /supplement/more_analyses/iss.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "ISS" 3 | author: "Lambda Moses" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{ISS} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ```{r setup, include=FALSE} 13 | knitr::opts_chunk$set(echo = TRUE) 14 | ``` 15 | 16 | Here I explore the history and geography of spatial transcriptomics based on in situ sequencing through the metadata I collected. 17 | ```{r} 18 | library(museumst) 19 | library(ggplot2) 20 | library(purrr) 21 | library(dplyr) 22 | library(sf) 23 | theme_set(theme_bw()) 24 | ``` 25 | 26 | # Import data 27 | By default, the `read_metadata` function reads the appropriate sheet from a cache within this package. This may be outdated, since the Google Sheets are constantly updated. Use the argument `update = TRUE` to download the most up to date version. The default is `FALSE` to avoid API limits during automated CRAN checks. 28 | ```{r} 29 | sheet <- read_metadata("ISS") 30 | ``` 31 | 32 | # Number of publications 33 | How many publications per year? 34 | ```{r} 35 | # Remove duplicates as different datasets in the same publication have their own rows 36 | publications <- get_pubs_df(sheet, other_cols = "repo") 37 | ``` 38 | 39 | How many publications are there in total in this sheet? 40 | ```{r} 41 | nrow(publications) 42 | ``` 43 | 44 | That's not a lot. So seriously, take every trend in this notebook with a grain of salt! 45 | 46 | ```{r} 47 | # For maps later in this notebook 48 | city_gc <- geocode_inst_city(publications) 49 | pubs_on_map2 <- partial(pubs_on_map, city_gc = city_gc) 50 | ``` 51 | 52 | ## Overall 53 | Number of publications per year. Preprints are excluded from plots over years as it takes months to publish so the dates of the preprints are incoherent with those for the other papers. 54 | ```{r} 55 | events <- read_major_events() 56 | ``` 57 | 58 | ```{r, fig.width=6, fig.height=3} 59 | events %>% 60 | filter(category == "ISS") %>% 61 | plot_timeline(rep(c(1, -1), length.out = nrow(.)), 62 | expand_x = c(0.1, 0.1), expand_y = c(0.05, 0.05)) 63 | ``` 64 | 65 | By default, preprints are excluded from plots of number of publications over time, since the timing of preprints is incoherent to that of published papers. Including preprints will inflate the number of publications in recent months, and if we make the same plot a few months later, that inflation we see now will be gone and moved to a more recent date. You can add the `preprints = TRUE` argument to all functions plotting things over time in this package to include preprints. You can also use the `binwidth` argument to specify bin width in days. The default is 365 days. 
66 | ```{r} 67 | pubs_per_year(publications, binwidth = 200, preprints = TRUE) 68 | ``` 69 | 70 | ## By method 71 | How about when broken down by method (not for data analysis sheets)? 72 | 73 | ```{r} 74 | methods <- unnest_cat(sheet, method) 75 | ``` 76 | 77 | ```{r, fig.width=6, fig.height=4} 78 | pubs_per_year(methods, facet_by = "method", binwidth = 200) 79 | ``` 80 | I don't know how much this actually says due to the small sample size. This is also kind of confusing since Geo-seq really is a form of LCM coupled with SMART-seq, so it should be categorized as LCM, but the authors gave it a new name. 81 | 82 | ## By species 83 | 84 | ```{r} 85 | species <- unnest_cat(sheet, species) 86 | ``` 87 | 88 | ```{r} 89 | pubs_per_cat(species, species) 90 | ``` 91 | 92 | ```{r} 93 | pubs_per_cat(species, species, isotype = TRUE, img_unit = 1) 94 | ``` 95 | 96 | 97 | ## By journal 98 | ```{r} 99 | sort(table(publications$journal)) 100 | ``` 101 | 102 | ## Location 103 | ### General 104 | Just some barplots for number of publications per institution, city, and country. 105 | ```{r} 106 | pubs_per_cat(publications, country) 107 | ``` 108 | 109 | How about per capita? Actually this probably does not say much since many researchers in the West are immigrants. Then it raises the burning political question of who counts as the "population". 110 | 111 | ```{r} 112 | pubs_per_capita(publications, plot = "bar") 113 | ``` 114 | 115 | Why is Israel doing so great? Why not freedom to Palestine? Yeah, freedom to Palestine! 116 | 117 | How about country over time? There might not be enough publications to show a trend. 118 | ```{r, fig.height=4, fig.width=6} 119 | pubs_per_year(publications, facet_by = "country", binwidth = 200) 120 | ``` 121 | Now look at cities. 122 | 123 | ```{r} 124 | pubs_per_cat(publications, city) 125 | ``` 126 | 127 | Institutions 128 | ```{r} 129 | pubs_per_cat(publications, institution) 130 | ``` 131 | 132 | OK, now here comes the maps! 133 | 134 | ### Worldwide 135 | ```{r} 136 | pubs_on_map2(publications) 137 | ``` 138 | 139 | Let me also plot the per capita thing on a map, as a choropleth 140 | ```{r} 141 | pubs_per_capita(publications) 142 | ``` 143 | 144 | Break down by species 145 | ```{r, fig.width=8, fig.height=6} 146 | pubs_on_map2(species, facet_by = "species", ncol = 2) 147 | ``` 148 | 149 | By method 150 | ```{r, fig.width=8, fig.height=6} 151 | pubs_on_map2(methods, facet_by = "method", ncol = 2) 152 | ``` 153 | 154 | So while the 2013 method of ISS has spread, the other methods did not. Well, FISSEQ sort of did, since untargeted ExSeq is expansion microscopy + FISSEQ. 155 | 156 | ### Europe 157 | Plot a map just for Europe. A problem here is that if I simply filter the `world` sf object, I'll get some islands away from what we usually think is Europe. Kind of feel bad for Russians since Russia is so large that it makes the map look bad and I don't have a paper from Russia in this spreadsheet. 158 | 159 | ```{r} 160 | pubs_on_map2(publications, zoom = "europe") 161 | ``` 162 | 163 | ```{r, fig.width=6, fig.height=4} 164 | pubs_on_map2(species, zoom = "europe", facet_by = "species") 165 | ``` 166 | 167 | Maybe I just made this plot for fun. Not sure what to say about it. 168 | 169 | ```{r} 170 | pubs_on_map2(methods, zoom = "europe", facet_by = "method") 171 | ``` 172 | 173 | ### USA 174 | Also a plot just for America. 
I did not encounter a publication in this spreadsheet that is from Hawaii or Alaska, but I'll include Hawaii and Alaska in the map anyway in case I get one in the future. 175 | 176 | ```{r} 177 | pubs_on_map2(publications, zoom = "usa") 178 | ``` 179 | 180 | ```{r} 181 | pubs_per_capita(publications, "usa") 182 | ``` 183 | 184 | ```{r, fig.width=8, fig.height=6} 185 | pubs_on_map2(species, zoom = "usa", facet_by = "species", ncol = 2) 186 | ``` 187 | 188 | ```{r, fig.width=8, fig.height=6} 189 | pubs_on_map2(methods, zoom = "usa", facet_by = "method", ncol = 2) 190 | ``` 191 | 192 | # Word cloud 193 | ## Titles 194 | ```{r, fig.height=6, fig.width=6} 195 | plot_wordcloud(sheet) 196 | ``` 197 | 198 | ```{r, fig.height=6, fig.width=6} 199 | plot_wordcloud(sheet, species_use = "Mus musculus", scale = c(5, 0.1)) 200 | ``` 201 | 202 | ```{r, fig.height=6, fig.width=6} 203 | plot_wordcloud(sheet, species_use = "Homo sapiens") 204 | ``` 205 | 206 | ## Tissues 207 | ```{r, fig.height=6, fig.width=6} 208 | plot_wordcloud(sheet, col_use = "tissue") 209 | ``` 210 | 211 | I think that's in part due to STARmap, but a ISS preprint that demonstrated the scalability of a preprocessing pipeline also used the cortex. 212 | 213 | ```{r, fig.height=5, fig.width=5} 214 | plot_wordcloud(sheet, species_use = "Mus musculus", col_use = "tissue") 215 | ``` 216 | 217 | ```{r} 218 | plot_wordcloud(sheet, species_use = "Homo sapiens", col_use = "tissue", scale = c(5, 0.1)) 219 | ``` 220 | 221 | ## Over time 222 | 223 | ```{r} 224 | range(sheet$date_published) 225 | ``` 226 | 227 | Probably not worth the effort to split by time period. 228 | 229 | ## Department names 230 | 231 | ```{r} 232 | plot_wordcloud(sheet, department, other_stop_words = inst_words, scale = c(5, 0.1)) 233 | ``` 234 | 235 | ## Downstream analyses 236 | ```{r, fig.height=7, fig.width=7} 237 | plot_wordcloud(sheet, downstream, scale = c(4, 0.1)) 238 | ``` 239 | 240 | # Programming languages 241 | ```{r} 242 | langs <- unnest_cat(sheet, language) 243 | ``` 244 | 245 | ```{r} 246 | pubs_per_cat(langs, language, isotype = TRUE, img_unit = 1) 247 | ``` 248 | 249 | # Data and code availability 250 | 251 | How many publications have provided a code repo? 252 | ```{r} 253 | hist_bool(publications, !is.na(repo), binwidth = 200) 254 | ``` 255 | 256 | Since data availability status can differ among datasets for the same publications, here the number of datasets is used. 257 | ```{r} 258 | hist_bool(sheet, has_matrix, binwidth = 200) 259 | ``` 260 | 261 | Is this significant? Probably not. 262 | ```{r} 263 | test_year_bool(publications, !is.na(repo)) 264 | ``` 265 | 266 | # Number of genes and cells 267 | ```{r} 268 | sheet %>% 269 | filter(!is.na(n_genes), !is.na(n_cells)) %>% 270 | ggplot(aes(date_published, n_genes, color = method)) + 271 | geom_point() 272 | ``` 273 | 274 | ```{r} 275 | sheet %>% 276 | filter(!is.na(n_genes), !is.na(n_cells)) %>% 277 | ggplot(aes(date_published, n_cells, color = method)) + 278 | geom_point() 279 | ``` 280 | 281 | I don't see any trend that the number of genes and cells is increasing over time. Too lazy to formally test it. Plus the number of datasets is too small for me to draw conclusions. 
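Though if I ever stop being lazy, a rough check could be as simple as regressing the counts on the date published, much like the linear model used for the smFISH sheet later in this supplement. This is only a sketch; `sheet`, `n_genes`, `n_cells`, and `date_published` are the same columns plotted above, and with this few ISS datasets the p-values would be illustrative at best.

```{r}
# Rough sketch: is there a linear trend in genes or cells over publication date?
# With so few ISS datasets, treat these p-values as illustrative only.
summary(lm(n_genes ~ date_published, data = sheet))
summary(lm(n_cells ~ date_published, data = sheet))
```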
282 | -------------------------------------------------------------------------------- /supplement/more_analyses/microdissection.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "ROI selection" 3 | author: "Lambda Moses" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{Microdissection} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ```{r setup, include=FALSE} 13 | knitr::opts_chunk$set(echo = TRUE) 14 | ``` 15 | 16 | Here I explore the history and geography of spatial transcriptomics based on microdissection through the metadata I collected. 17 | ```{r} 18 | library(museumst) 19 | library(ggplot2) 20 | library(purrr) 21 | library(dplyr) 22 | library(sf) 23 | theme_set(theme_bw()) 24 | ``` 25 | 26 | # Import data 27 | By default, the `read_metadata` function reads the appropriate sheet from a cache within this package. This may be outdated, since the Google Sheets are constantly updated. Use the argument `update = TRUE` to download the most up to date version. The default is `FALSE` to avoid API limits during automated CRAN checks. 28 | ```{r} 29 | sheet <- read_metadata("ROI selection") 30 | ``` 31 | 32 | # Number of publications 33 | How many publications per year? 34 | ```{r} 35 | # Remove duplicates as different datasets in the same publication have their own rows 36 | publications <- get_pubs_df(sheet, other_cols = c("repo", "accession")) 37 | ``` 38 | 39 | How many publications are there in total in this sheet? 40 | ```{r} 41 | nrow(publications) 42 | ``` 43 | 44 | ```{r} 45 | # For maps later in this notebook 46 | city_gc <- geocode_inst_city(publications) 47 | pubs_on_map2 <- partial(pubs_on_map, city_gc = city_gc) 48 | ``` 49 | 50 | ## Overall 51 | Number of publications per year. Preprints are excluded from plots over years as it takes months to publish so the dates of the preprints are incoherent with those for the other papers. 52 | ```{r} 53 | events <- read_major_events() 54 | ``` 55 | 56 | ```{r, fig.width=8, fig.height=7} 57 | events %>% 58 | filter(category == "microdissection") %>% 59 | plot_timeline(c(0.3, -0.3, 0.5, -0.3, 0.3, -0.7, 1, -0.9, 0.5), 60 | expand_x = c(0.1, 0.1), expand_y = c(0.05, 0.05)) 61 | ``` 62 | 63 | By default, preprints are excluded from plots of number of publications over time, since the timing of preprints is incoherent to that of published papers. Including preprints will inflate the number of publications in recent months, and if we make the same plot a few months later, that inflation we see now will be gone and moved to a more recent date. You can add the `preprints = TRUE` argument to all functions plotting things over time in this package to include preprints. You can also use the `binwidth` argument to specify bin width in days. The default is 365 days. 64 | 65 | ```{r} 66 | pubs_per_year(publications) 67 | ``` 68 | 69 | ## By method 70 | How about when broken down by method (not for data analysis sheets)? 71 | 72 | Get this into another R Markdown file since the analysis ones don't have the method column. For LCM, this should be taken with a grain of salt, since I don't think my collection of LCM papers is complete. LCM started far earlier than this. Because even I don't believe this myself, I don't think this figure will go into the paper. 
73 | 74 | I'm not sure if many of those papers even count as spatial, since they simply used H&E to find an ROI, used LCM to isolate it for RNA-seq, and that was it, sort of forgetting about the spatial part. But Niche-seq is also used this way and those who made Niche-seq called it spatial. I guess it's just that you know where the sample comes from in the tissue and what's next to it. Well, suppose you say, "I'm visiting LA. Not just LA, but more specifically Little Bangladesh." Is that spatial? I know where Little Bangladesh is and what's around it. I don't think geostatisticians will consider that statement spatial, because it does not use spatial statistics, just like you can fit non-spatial linear regression models to data from different neighborhoods even though you know where those neighborhoods are. I suppose it's just an example of how the same word has different meanings in different contexts, and I'll stick to the LCM context since so far, spatial statistics has not been all that widely used in transcriptomics. 75 | 76 | ```{r} 77 | methods <- unnest_cat(sheet, method) 78 | ``` 79 | 80 | ```{r, fig.width=6, fig.height=12} 81 | pubs_per_year(methods, facet_by = "method") 82 | ``` 83 | 84 | I don't know how much this actually says due to the small sample size. 85 | 86 | ## By species 87 | 88 | ```{r} 89 | species <- unnest_cat(sheet, species) 90 | ``` 91 | 92 | ```{r} 93 | pubs_per_cat(species, species) 94 | ``` 95 | 96 | ```{r} 97 | pubs_per_cat(species, species, n_top = 5, isotype = TRUE, img_unit = 2) 98 | ``` 99 | 100 | ## By journal 101 | ```{r} 102 | sort(table(publications$journal)) 103 | ``` 104 | 105 | ## Location 106 | ### General 107 | Just some barplots for number of publications per institution, city, and country. 108 | ```{r} 109 | pubs_per_cat(publications, country) 110 | ``` 111 | 112 | How about per capita? Actually this probably does not say much since many researchers in the West are immigrants. Then it raises the burning political question of who counts as the "population". 113 | 114 | ```{r} 115 | pubs_per_capita(publications, plot = "bar") 116 | ``` 117 | 118 | How about country over time? There might not be enough publications to show a trend. 119 | ```{r, fig.height=12, fig.width=6} 120 | pubs_per_year(publications, facet_by = "country") 121 | ``` 122 | Now look at cities. 123 | 124 | ```{r} 125 | pubs_per_cat(publications, city) 126 | ``` 127 | 128 | Institutions 129 | ```{r} 130 | pubs_per_cat(publications, institution) 131 | ``` 132 | 133 | 12345678, U, C, L, A, UCLA fight fight fight! We beat 'SC in this plot! 134 | 135 | OK, now here come the maps! 136 | 137 | ### Worldwide 138 | ```{r} 139 | pubs_on_map2(publications) 140 | ``` 141 | 142 | Let me also plot the per capita thing on a map, as a choropleth. 143 | ```{r} 144 | pubs_per_capita(publications) 145 | ``` 146 | 147 | Break down by species 148 | ```{r, fig.width=8, fig.height=8} 149 | pubs_on_map2(species, facet_by = "species") 150 | ``` 151 | 152 | 153 | Probably this does not tell us much since there's only 1 study for some species. Again, we see the received wisdom that research is mostly confined to the West, especially Western Europe, the west coast of the US, and New England, but not as much in the other parts of the "West". 154 | 155 | By method 156 | ```{r, fig.width=8, fig.height=8} 157 | pubs_on_map2(methods, facet_by = "method") 158 | ``` 159 | 160 | I suppose this merely tells us where those techniques were invented and how they spread.
For instance, voxelation of the brain did not spread outside UCLA to my best knowledge, while Tomo-seq and Niche-seq sort of spread beyond the lab where they originated. 161 | 162 | ### Europe 163 | Plot a map just for Europe. A problem here is that if I simply filter the `world` sf object, I'll get some islands away from what we usually think is Europe. Alas, the sad legacy of colonialism! Kind of feel bad for Russians since Russia is so large that it makes the map look bad and I don't have a paper from Russia in this spreadsheet. 164 | 165 | ```{r} 166 | pubs_on_map2(publications, zoom = "europe") 167 | ``` 168 | 169 | ```{r} 170 | pubs_per_capita(publications, "europe") 171 | ``` 172 | 173 | I didn't realize that Switzerland and Netherlands were pretty great. Again, this is a small sample size, so I take this with a grain of salt until I plot the other sheets. From the other sheets, I already know that Sweden, the UK, and Germany have many contributions. 174 | 175 | ```{r, fig.width=6, fig.height=8} 176 | pubs_on_map2(species, zoom = "europe", facet_by = "species") 177 | ``` 178 | 179 | Maybe I just made this plot for fun. Not sure what to say about it. 180 | 181 | ```{r, fig.width=8, fig.height=6} 182 | pubs_on_map2(methods, zoom = "europe", facet_by = "method") 183 | ``` 184 | 185 | ### USA 186 | Also a plot just for America. I did not encounter a publication in this spreadsheet that is from Hawaii or Alaska, but I'll include Hawaii and Alaska in the map anyway in case I get one in the future. 187 | 188 | ```{r} 189 | pubs_on_map2(publications, zoom = "usa") 190 | ``` 191 | 192 | ```{r} 193 | pubs_per_capita(publications, "usa") 194 | ``` 195 | 196 | ```{r, fig.width=8, fig.height=6} 197 | pubs_on_map2(species, zoom = "usa", facet_by = "species") 198 | ``` 199 | 200 | ```{r, fig.width=8, fig.height=6} 201 | pubs_on_map2(methods, zoom = "usa", facet_by = "method") 202 | ``` 203 | 204 | # Word cloud 205 | ## Titles 206 | ```{r, fig.height=7, fig.width=7} 207 | plot_wordcloud(sheet) 208 | ``` 209 | 210 | So not all use of the term "spatial" in the current era is from ST. 
211 | 212 | ```{r, fig.height=7, fig.width=7} 213 | plot_wordcloud(sheet, species_use = "Mus musculus", scale = c(5, 0.1)) 214 | ``` 215 | 216 | ```{r, fig.height=7, fig.width=7} 217 | plot_wordcloud(sheet, species_use = "Homo sapiens", scale = c(5, 0.1)) 218 | ``` 219 | 220 | ## Tissues 221 | ```{r, fig.height=5, fig.width=5} 222 | plot_wordcloud(sheet, col_use = "tissue") 223 | ``` 224 | 225 | ```{r, fig.height=5, fig.width=5} 226 | plot_wordcloud(sheet, species_use = "Mus musculus", col_use = "tissue") 227 | ``` 228 | 229 | ```{r, fig.height=5, fig.width=5} 230 | plot_wordcloud(sheet, species_use = "Homo sapiens", col_use = "tissue") 231 | ``` 232 | 233 | ## Over time 234 | 235 | ```{r} 236 | range(sheet$date_published) 237 | ``` 238 | 239 | ```{r, fig.height=6, fig.width=6} 240 | plot_wordcloud(sheet, year_min = 2002, year_max = 2010) 241 | ``` 242 | 243 | ```{r, fig.height=7, fig.width=7} 244 | plot_wordcloud(sheet, year_min = 2010, year_max = 2015) 245 | ``` 246 | 247 | ```{r, fig.height=7, fig.width=7} 248 | plot_wordcloud(sheet, year_min = 2015, year_max = 2021) 249 | ``` 250 | 251 | ## Department names 252 | 253 | ```{r, fig.height=7, fig.width=7} 254 | plot_wordcloud(sheet, col_use = "department", other_stop_words = inst_words) 255 | ``` 256 | 257 | ## Downstream analyses 258 | ```{r, fig.height=5, fig.width=5} 259 | plot_wordcloud(sheet, "downstream") 260 | ``` 261 | 262 | # Programming languages 263 | ```{r} 264 | langs <- unnest_cat(sheet, language) 265 | ``` 266 | 267 | ```{r} 268 | pubs_per_cat(langs, language, isotype = TRUE, img_unit = 5) 269 | ``` 270 | 271 | # Data and code availability 272 | 273 | How many publications have provided a code repo? 274 | ```{r} 275 | hist_bool(publications, !is.na(repo)) 276 | ``` 277 | 278 | How many publications have provided an accession for sequencing data? 279 | ```{r} 280 | hist_bool(publications, !is.na(accession)) 281 | ``` 282 | 283 | It seems that more recent publications are more likely to provide an accession. Is this significant? Here I fit a logistic regression model using date published to predict whether accession is available and test if the coefficients are 0. 284 | ```{r} 285 | test_year_bool(publications, !is.na(repo)) 286 | ``` 287 | 288 | ```{r} 289 | test_year_bool(publications, !is.na(accession)) 290 | ``` 291 | 292 | So there sort of is a positive association between when the paper was published and whether accession is available, and more recent publications are more likely to have accessions. 293 | -------------------------------------------------------------------------------- /supplement/more_analyses/prequel2.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Prequel" 3 | author: "Lambda Moses" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{Prequel} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | 11 | --- 12 | 13 | ```{r setup, include=FALSE} 14 | knitr::opts_chunk$set(echo = TRUE) 15 | ``` 16 | 17 | Here I explore the history and geography of spatial transcriptomics in the prequel era through the metadata I collected, namely things like in situ atlases. 18 | ```{r} 19 | library(museumst) 20 | library(ggplot2) 21 | library(purrr) 22 | library(stringr) 23 | library(dplyr) 24 | library(sf) 25 | theme_set(theme_bw()) 26 | ``` 27 | 28 | # Import data 29 | By default, the `read_metadata` function reads the appropriate sheet from a cache within this package. 
This may be outdated, since the Google Sheets are constantly updated. Use the argument `update = TRUE` to download the most up to date version. The default is `FALSE` to avoid API limits during automated CRAN checks. 30 | ```{r} 31 | sheet <- read_metadata("Prequel", update = TRUE) 32 | ``` 33 | 34 | # Number of publications 35 | 36 | ```{r} 37 | anyDuplicated(sheet$title[!is.na(sheet$title)]) 38 | ``` 39 | 40 | How many publications are there in total in this sheet? 41 | ```{r} 42 | nrow(sheet) 43 | ``` 44 | 45 | ```{r} 46 | # For maps later in this notebook 47 | city_gc <- geocode_inst_city(sheet) 48 | pubs_on_map2 <- partial(pubs_on_map, city_gc = city_gc) 49 | ``` 50 | 51 | ## Overall 52 | ```{r} 53 | events <- read_major_events() 54 | ``` 55 | 56 | ```{r, fig.width=8, fig.height=4} 57 | events %>% 58 | filter(category == "trap") %>% 59 | plot_timeline(rep(c(1, -1), length.out = nrow(.)), 60 | expand_x = c(0.1, 0.1), expand_y = c(0.05, 0.05)) 61 | ``` 62 | 63 | ```{r, fig.width=8, fig.height=5} 64 | events %>% 65 | filter(category == "ISH atlas") %>% 66 | plot_timeline(c(0.3, -0.3, 0.3, -0.3, 0.3, -0.6, 0.3, -0.5, 0.6, -0.7, 0.3, -0.2, 0.6, -0.5, 0.2), 67 | expand_x = c(0.1, 0.1), expand_y = c(0.05, 0.05)) 68 | ``` 69 | 70 | By default, preprints are excluded from plots of number of publications over time, since the timing of preprints is incoherent to that of published papers. Including preprints will inflate the number of publications in recent months, and if we make the same plot a few months later, that inflation we see now will be gone and moved to a more recent date. You can add the `preprints = TRUE` argument to all functions plotting things over time in this package to include preprints. You can also use the `binwidth` argument to specify bin width in days. The default is 365 days. 71 | ```{r} 72 | pubs_per_year(sheet) 73 | ``` 74 | 75 | ## By method 76 | 77 | ```{r} 78 | methods <- unnest_cat(sheet, method) 79 | ``` 80 | 81 | ```{r} 82 | pubs_per_cat(methods, method) 83 | ``` 84 | 85 | ```{r, fig.width=6, fig.height=6} 86 | pubs_per_year(methods, facet_by = "method") 87 | ``` 88 | 89 | ## By species 90 | ```{r} 91 | species <- unnest_cat(sheet, species) 92 | ``` 93 | 94 | ```{r} 95 | pubs_per_cat(species, species) 96 | ``` 97 | 98 | ```{r} 99 | pubs_per_cat(species, species, n_top = 5, isotype = TRUE, img_unit = 10) 100 | ``` 101 | 102 | Just for WMISH and ISH 103 | ```{r} 104 | sheet %>% 105 | filter(str_detect(method, "ISH")) %>% 106 | unnest_cat(species) %>% 107 | pubs_per_cat(species) 108 | ``` 109 | 110 | Also just for WMISH and ISH, for species over time 111 | ```{r, fig.width=6, fig.height=6} 112 | sheet %>% 113 | filter(str_detect(method, "ISH")) %>% 114 | unnest_cat(species) %>% 115 | pubs_per_year(facet_by = "species", n_top = 6) 116 | ``` 117 | 118 | I suppose different species are responsible for the apparent 2 peaks; Ciona and zebrafish are responsible for the first one. 119 | 120 | ## By journal 121 | ```{r} 122 | cat_heatmap(methods, journal, method) 123 | ``` 124 | 125 | ```{r} 126 | cat_heatmap(species, journal, species) 127 | ``` 128 | 129 | ## Location 130 | ### General 131 | Just some barplots for number of publications per institution, city, and country. 132 | ```{r} 133 | pubs_per_cat(sheet, country) 134 | ``` 135 | 136 | How about per capita? Actually this probably does not say much since many researchers in the West are immigrants. Then it raises the burning political question of who counts as the "population". 
137 | 138 | ```{r} 139 | pubs_per_capita(sheet, plot = "bar") 140 | ``` 141 | 142 | How about country over time? There might not be enough publications to show a trend. 143 | ```{r, fig.height=12, fig.width=6} 144 | pubs_per_year(sheet, facet_by = "country") 145 | ``` 146 | Now look at cities. 147 | 148 | ```{r, fig.width=6, fig.height=8} 149 | pubs_per_cat(sheet, city) 150 | ``` 151 | 152 | Bar Harbor is for GXD (JAX), and Edinburgh is for EMAGE. 153 | Institutions 154 | ```{r, fig.width=6, fig.height=9} 155 | pubs_per_cat(sheet, institution) 156 | ``` 157 | 158 | ### Worldwide 159 | ```{r} 160 | pubs_on_map2(sheet) 161 | ``` 162 | 163 | ```{r} 164 | pubs_on_map2(sheet, plot = "hex") 165 | ``` 166 | 167 | Still, it's mostly in the US and Western Europe, but this time we see some more in Asia. Still no Africa and South America. 168 | 169 | Let me also plot the per capita thing on a map, as a choropleth. 170 | ```{r} 171 | pubs_per_capita(sheet) 172 | ``` 173 | 174 | Break down by species 175 | ```{r, fig.width=8, fig.height=8} 176 | pubs_on_map2(species, facet_by = "species", ncol = 4) 177 | ``` 178 | 179 | Ciona is more centered in Japan, and chicken more in Arizona. I already know that. 180 | 181 | Probably this does not tell us much since there's only 1 study for some species. Again, we see the received wisdom that research is mostly confined to the West, especially Western Europe, the west coast of the US, and New England, but not as much in the other parts of the "West". 182 | 183 | ### Europe 184 | Plot a map just for Europe. A problem here is that if I simply filter the `world` sf object, I'll get some islands away from what we usually think of as Europe. 185 | 186 | ```{r} 187 | pubs_on_map2(sheet, zoom = "europe") 188 | ``` 189 | 190 | ```{r} 191 | pubs_per_capita(sheet, "europe") 192 | ``` 193 | 194 | I'm kind of amazed that Sweden didn't contribute to the prequel stuff, yet it's doing so well in the newer spatial transcriptomics technologies. 195 | 196 | ```{r, fig.width=5.5, fig.height=5.5} 197 | pubs_on_map2(species, zoom = "europe", facet_by = "species", ncol = 5) 198 | ``` 199 | 200 | Looks like some French labs are quite interested in unconventional model organisms. That's consistent with our collaboration with that French lab on the jellyfish Clytia project. I still wonder why. Is it a French thing? 201 | 202 | ### USA 203 | Also a plot just for America. I did not encounter a publication in this spreadsheet that is from Hawaii or Alaska, but I'll include Hawaii and Alaska in the map anyway in case I get one in the future. 204 | 205 | ```{r} 206 | pubs_on_map2(sheet, zoom = "usa") 207 | ``` 208 | 209 | ```{r} 210 | pubs_per_capita(sheet, "usa") 211 | ``` 212 | 213 | ```{r, fig.width=8, fig.height=6} 214 | pubs_on_map2(species, zoom = "usa", facet_by = "species", ncol = 3) 215 | ``` 216 | 217 | ZFIN is based in Eugene, Oregon. GEISHA is based in Tempe, Arizona. GXD is based at JAX in Bar Harbor, Maine. There're a number of Drosophila projects from Berkeley. And we see that on this map. I still wonder why the prequel stuff is more spread out around the world than the current era stuff. Is that because ISH and WMISH, including the robotic versions, are much cheaper than, say, ST, Visium, ISS, MERFISH, and seqFISH? But how about LCM, which is also quite spread out? LCM certainly requires expensive specialized equipment.
But back at UCLA, I did hear that the histology core did LCM, which made it easier and cheaper for labs that don't have the equipment, while at present, I don't think any other current era spatial transcriptomics technology is supported by core facilities. 218 | 219 | # Word cloud 220 | ## Titles 221 | ```{r, fig.height=6, fig.width=6} 222 | plot_wordcloud(sheet) 223 | ``` 224 | 225 | ## Tissues 226 | ```{r} 227 | plot_wordcloud(sheet, tissue) 228 | ``` 229 | 230 | ```{r, fig.height=6, fig.width=6} 231 | plot_wordcloud(sheet, tissue, species_use = "Drosophila melanogaster") 232 | ``` 233 | 234 | ```{r} 235 | plot_wordcloud(sheet, tissue, species_use = "Mus musculus") 236 | ``` 237 | 238 | ## Over time 239 | 240 | ```{r} 241 | range(sheet$date_published) 242 | ``` 243 | 244 | ```{r, fig.height=7, fig.width=7} 245 | plot_wordcloud(sheet, year_min = 1987, year_max = 1995, scale = c(4, 0.1)) 246 | ``` 247 | 248 | ```{r, fig.height=6, fig.width=6} 249 | plot_wordcloud(sheet, year_min = 1995, year_max = 2000) 250 | ``` 251 | 252 | ```{r, fig.height=6, fig.width=6} 253 | plot_wordcloud(sheet, year_min = 2000, year_max = 2005) 254 | ``` 255 | 256 | ```{r, fig.height=6, fig.width=6} 257 | plot_wordcloud(sheet, year_min = 2005, year_max = 2010) 258 | ``` 259 | 260 | ```{r, fig.height=6, fig.width=6} 261 | plot_wordcloud(sheet, year_min = 2010, year_max = 2021, scale = c(5, 0.1)) 262 | ``` 263 | 264 | ## Department names 265 | 266 | ```{r, fig.height=6, fig.width=6} 267 | plot_wordcloud(sheet, col_use = "department", other_stop_words = inst_words) 268 | ``` 269 | 270 | # Data availability 271 | 272 | How many of the resources are still available? 273 | ```{r} 274 | hist_bool(sheet, still_available) 275 | ``` 276 | 277 | It seems that more recent ones are more likely to be still available. The NAs are for those atlases that have never been made available. Well, actually that might not be the case, since the data from those papers has most likely now been deposited in one of the databases like GXD and ZFIN. It seems that the NAs tend to be older. 278 | ```{r} 279 | test_year_bool(sheet, still_available) 280 | ``` 281 | 282 | Yes, it is significant. More recent databases are more likely to be still available. 283 | 284 | # Number of genes 285 | For the old enhancer and gene trap studies, there weren't genome annotations, so the number of lines was reported rather than the number of genes. For old ISH atlases, the number of clones was reported. 286 | 287 | ```{r} 288 | pubs_per_year(sheet, facet_by = "item_type", n_top = 3) 289 | ``` 290 | 291 | Yeah, genes became prevalent later, and the prevalence of clones is quite transient. However, lines are not dead, since enhancer and gene traps live on as high throughput perturbations in addition to visualization. 292 | 293 | Do more recent studies tend to do more genes? 294 | 295 | ```{r} 296 | sheet %>% 297 | filter(!str_detect(method, "collection"), item_type == "gene") %>% 298 | ggplot(aes(date_published, n_items)) + 299 | geom_point() 300 | ``` 301 | 302 | Not really. 303 | 304 | I know that studies focusing on a particular type of gene tend to have fewer genes. Which types of genes? 305 | ```{r} 306 | sheet %>% 307 | filter(!is.na(gene_type)) %>% 308 | pubs_per_cat(gene_type) 309 | ``` 310 | 311 | Are more recent publications more likely to focus on some type of genes?
312 | ```{r} 313 | sheet %>% 314 | filter(str_detect(method, "ISH")) %>% 315 | hist_bool(!is.na(gene_type)) 316 | ``` 317 | 318 | ```{r} 319 | test_year_bool(sheet, !is.na(gene_type)) 320 | ``` 321 | 322 | I don't think so. 323 | -------------------------------------------------------------------------------- /supplement/more_analyses/prequel_analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Prequel Analysis" 3 | author: "Lambda Moses" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{Prequel Analysis} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | 11 | --- 12 | 13 | ```{r setup, include=FALSE} 14 | knitr::opts_chunk$set(echo = TRUE) 15 | ``` 16 | 17 | Here I explore the history and geography of spatial transcriptomics data analysis for prequel data through the metadata I collected. 18 | ```{r} 19 | library(museumst) 20 | library(purrr) 21 | library(dplyr) 22 | library(ggplot2) 23 | library(sf) 24 | theme_set(theme_bw()) 25 | ``` 26 | 27 | # Import data 28 | By default, the `read_metadata` function reads the appropriate sheet from a cache within this package. This may be outdated, since the Google Sheets are constantly updated. Use the argument `update = TRUE` to download the most up to date version. The default is `FALSE` to avoid API limits during automated CRAN checks. 29 | ```{r} 30 | sheet <- read_metadata("Prequel analysis") 31 | ``` 32 | 33 | # Number of publications 34 | How many publications per year? 35 | ```{r} 36 | anyDuplicated(sheet$title) 37 | ``` 38 | 39 | How many publications are there in total in this sheet? 40 | ```{r} 41 | nrow(sheet) 42 | ``` 43 | 44 | ```{r} 45 | # For maps later in this notebook 46 | city_gc <- geocode_inst_city(sheet) 47 | pubs_on_map2 <- partial(pubs_on_map, city_gc = city_gc) 48 | ``` 49 | 50 | ## Overall 51 | By default, preprints are excluded from plots of number of publications over time, since the timing of preprints is incoherent to that of published papers. Including preprints will inflate the number of publications in recent months, and if we make the same plot a few months later, that inflation we see now will be gone and moved to a more recent date. You can add the `preprints = TRUE` argument to all functions plotting things over time in this package to include preprints. You can also use the `binwidth` argument to specify bin width in days. The default is 365 days. 52 | ```{r} 53 | pubs_per_year(sheet, preprints = TRUE) 54 | ``` 55 | 56 | Apparently, the golden age of prequel data analysis is gone. 57 | 58 | ## By category 59 | By the category of problem to address 60 | ```{r} 61 | category <- unnest_cat(sheet, category) 62 | ``` 63 | 64 | ```{r} 65 | pubs_per_cat(category, category) 66 | ``` 67 | 68 | ```{r, fig.width=5, fig.height=5} 69 | pubs_per_year(category, facet_by = "category", preprints = TRUE) 70 | ``` 71 | 72 | ## By species 73 | 74 | ```{r} 75 | species <- unnest_cat(sheet, species, other_cols = "state/province") 76 | ``` 77 | 78 | ```{r} 79 | pubs_per_cat(species, species) 80 | ``` 81 | 82 | ```{r} 83 | pubs_per_cat(species, species, isotype = TRUE, img_unit = 5) 84 | ``` 85 | 86 | I know, for almost all cases, it's either for the BDGP atlas or the Allen Brain Atlas. 87 | 88 | ```{r} 89 | pubs_per_year(species, facet_by = "species", preprints = TRUE) 90 | ``` 91 | 92 | I don't think it's significant, but it looks like the mouse papers came slightly later than Drosophila papers. 
It seems that the first peak is for Drosophila, while the second is for mice. The first is in response to the BDGP atlas, and the second is in response to the ABA, hence the dates when they started. 93 | 94 | ## By journal 95 | ```{r} 96 | sort(table(sheet$journal), decreasing = TRUE) 97 | ``` 98 | 99 | ## Location 100 | ### General 101 | Just some barplots for number of publications per institution, city, and country. 102 | ```{r} 103 | pubs_per_cat(sheet, country) 104 | ``` 105 | 106 | What, Sweden isn't here? How about per capita? Actually this probably does not say much since many researchers in the West are immigrants. Then it raises the burning political question of who counts as the "population". 107 | 108 | ```{r} 109 | pubs_per_capita(sheet, plot = "bar") 110 | ``` 111 | 112 | How about country over time? There might not be enough publications to show a trend. 113 | ```{r, fig.height=9, fig.width=6} 114 | pubs_per_year(sheet, facet_by = "country") 115 | ``` 116 | 117 | It's mostly an American thing. Now look at cities. 118 | 119 | ```{r} 120 | pubs_per_cat(sheet, city) 121 | ``` 122 | 123 | Where this research is done is quite different from where data analysis methods for current era data are developed. 124 | 125 | Institutions 126 | ```{r} 127 | pubs_per_cat(sheet, institution) 128 | ``` 129 | 130 | It's very different from the institutions for the original trilogy analysis. 131 | 132 | ### Worldwide 133 | ```{r} 134 | pubs_on_map2(sheet) 135 | ``` 136 | 137 | Let me also plot the per capita thing on a map, as a choropleth. 138 | ```{r} 139 | pubs_per_capita(sheet) 140 | ``` 141 | 142 | Break down by species 143 | ```{r, fig.width=6, fig.height=10} 144 | pubs_on_map2(species, facet_by = "species", ncol = 1) 145 | ``` 146 | 147 | ### Europe 148 | Plot a map just for Europe. A problem here is that if I simply filter the `world` sf object, I'll get some islands away from what we usually think of as Europe. Kind of feel bad for Russians since Russia is so large that it makes the map look bad and I don't have a paper from Russia in this spreadsheet. 149 | 150 | ```{r} 151 | pubs_on_map2(sheet, zoom = "europe") 152 | ``` 153 | 154 | ```{r} 155 | pubs_per_capita(sheet, "europe") 156 | ``` 157 | 158 | ```{r} 159 | pubs_on_map2(species, zoom = "europe", facet_by = "species") 160 | ``` 161 | 162 | Maybe I just made this plot for fun. Not sure what to say about it. 163 | 164 | ### USA 165 | Also a plot just for America. I did not encounter a publication in this spreadsheet that is from Hawaii or Alaska, but I'll include Hawaii and Alaska in the map anyway in case I get one in the future. 166 | 167 | ```{r} 168 | pubs_on_map2(sheet, zoom = "usa") 169 | ``` 170 | 171 | ```{r} 172 | pubs_per_capita(sheet, "usa") 173 | ``` 174 | 175 | Compared to anything from the current era or the original trilogy, prequel data analysis is more spread out over different parts of the US, just like prequel data collection. Now I really wonder why this is the case, since it probably can't be explained by cost. 176 | 177 | ```{r} 178 | pubs_on_map2(species, zoom = "usa", facet_by = "species") 179 | ``` 180 | 181 | # Word cloud 182 | ## Titles 183 | ```{r, fig.height=6, fig.width=6} 184 | plot_wordcloud(sheet) 185 | ``` 186 | 187 | ## Summaries 188 | ```{r, fig.height=6, fig.width=6} 189 | plot_wordcloud(sheet, summary) 190 | ``` 191 | 192 | ```{r, fig.height=6, fig.width=6} 193 | plot_wordcloud(sheet, core_principle, scale = c(5, 0.1)) 194 | ``` 195 | 196 | That "bag" comes from bag of words.
197 | 198 | ## Tissues 199 | ```{r} 200 | plot_wordcloud(sheet, tissue) 201 | ``` 202 | 203 | No-brainer (pun intended). Of course it's brain from ABA, since I left the tissue colum empty for whole Drosophila embryos. 204 | 205 | ## By species 206 | ```{r, fig.height=6, fig.width=6} 207 | plot_wordcloud(sheet, species_use = "Drosophila melanogaster") 208 | ``` 209 | 210 | ```{r, fig.height=6, fig.width=6} 211 | plot_wordcloud(sheet, species_use = "Mus musculus") 212 | ``` 213 | 214 | ## Over time 215 | 216 | ```{r} 217 | range(sheet$date_published) 218 | ``` 219 | 220 | ### Titles 221 | ```{r, fig.height=6, fig.width=6} 222 | plot_wordcloud(sheet, year_min = 2001, year_max = 2010) 223 | ``` 224 | 225 | ```{r, fig.height=6, fig.width=6} 226 | plot_wordcloud(sheet, year_min = 2010, year_max = 2015) 227 | ``` 228 | 229 | ```{r, fig.height=6, fig.width=6} 230 | plot_wordcloud(sheet, year_min = 2015, year_max = 2021) 231 | ``` 232 | 233 | ### Summaries 234 | ```{r, fig.height=6, fig.width=6} 235 | plot_wordcloud(sheet, summary, year_min = 2001, year_max = 2010, scale = c(5, 0.1)) 236 | ``` 237 | 238 | ```{r, fig.height=6, fig.width=6} 239 | plot_wordcloud(sheet, summary, year_min = 2010, year_max = 2015) 240 | ``` 241 | 242 | That 2009 is from the 2009 bag of words method. 243 | 244 | ```{r, fig.height=7, fig.width=7} 245 | plot_wordcloud(sheet, summary, year_min = 2015, year_max = 2021) 246 | ``` 247 | 248 | NMF rose to fame. 249 | 250 | ## Department names 251 | 252 | ```{r, fig.height=6, fig.width=6} 253 | plot_wordcloud(sheet, col_use = "department", other_stop_words = inst_words) 254 | ``` 255 | -------------------------------------------------------------------------------- /supplement/more_analyses/smfish.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "smFISH" 3 | author: "Lambda Moses" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{smFISH} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ```{r setup, include=FALSE} 13 | knitr::opts_chunk$set(echo = TRUE) 14 | ``` 15 | 16 | Here I explore the history and geography of spatial transcriptomics based on smFISH through the metadata I collected. 17 | ```{r} 18 | library(museumst) 19 | library(ggplot2) 20 | library(purrr) 21 | library(dplyr) 22 | library(sf) 23 | library(ggrepel) 24 | theme_set(theme_bw()) 25 | ``` 26 | 27 | # Import data 28 | By default, the `read_metadata` function reads the appropriate sheet from a cache within this package. This may be outdated, since the Google Sheets are constantly updated. Use the argument `update = TRUE` to download the most up to date version. The default is `FALSE` to avoid API limits during automated CRAN checks. 29 | ```{r} 30 | sheet <- read_metadata("smFISH") 31 | ``` 32 | 33 | # Number of publications 34 | How many publications per year? 35 | ```{r} 36 | # Remove duplicates as different datasets in the same publication have their own rows 37 | publications <- get_pubs_df(sheet, "repo") 38 | ``` 39 | 40 | How many publications are there in total in this sheet? 41 | ```{r} 42 | nrow(publications) 43 | ``` 44 | 45 | ```{r} 46 | # For maps later in this notebook 47 | city_gc <- geocode_inst_city(publications) 48 | pubs_on_map2 <- partial(pubs_on_map, city_gc = city_gc) 49 | ``` 50 | 51 | ## Overall 52 | Number of publications per year. 
Preprints are excluded from plots over years as it takes months to publish so the dates of the preprints are incoherent with those for the other papers. 53 | ```{r} 54 | events <- read_major_events() 55 | ``` 56 | 57 | ```{r, fig.width=8, fig.height=4} 58 | events %>% 59 | filter(category == "smFISH") %>% 60 | plot_timeline(c(0.8, -0.5, 0.6, -0.5, 1, 0.6, -0.4, 0.7), 61 | expand_x = c(0.1, 0.1), expand_y = c(0.05, 0.05)) 62 | ``` 63 | 64 | ```{r} 65 | pubs_per_year(publications) 66 | ``` 67 | 68 | ## By method 69 | How about when broken down by method (not for data analysis sheets)? 70 | 71 | ```{r} 72 | methods <- unnest_cat(sheet, method) 73 | ``` 74 | 75 | ```{r, fig.width=6, fig.height=12} 76 | pubs_per_year(methods, facet_by = "method") 77 | ``` 78 | 79 | ## By species 80 | 81 | ```{r} 82 | species <- unnest_cat(sheet, species) 83 | ``` 84 | 85 | ```{r} 86 | pubs_per_cat(species, species) 87 | ``` 88 | 89 | ```{r, fig.width=6, fig.height=3} 90 | pubs_per_cat(species, species, isotype = TRUE, img_unit = 1) 91 | ``` 92 | 93 | ## By journal 94 | ```{r} 95 | sort(table(publications$journal)) 96 | ``` 97 | 98 | ## Location 99 | ### General 100 | Just some barplots for number of publications per institution, city, and country. 101 | ```{r} 102 | pubs_per_cat(publications, country) 103 | ``` 104 | 105 | How about per capita? Actually this probably does not say much since many researchers in the West are immigrants. Then it raises the burning political question of who counts as the "population". 106 | 107 | ```{r} 108 | pubs_per_capita(publications, plot = "bar") 109 | ``` 110 | 111 | How about country over time? There might not be enough publications to show a trend. 112 | ```{r, fig.height=6, fig.width=6} 113 | pubs_per_year(publications, facet_by = "country") 114 | ``` 115 | 116 | Well, I know, this is because seqFISH comes from Caltech and MERFISH comes from Harvard. 117 | 118 | Now look at cities. 119 | 120 | ```{r} 121 | pubs_per_cat(publications, city) 122 | ``` 123 | 124 | Cambridge for Harvard, and Pasadena for Caltech. 125 | 126 | Institutions 127 | ```{r} 128 | pubs_per_cat(publications, institution) 129 | ``` 130 | 131 | So thank you, Long Cai. OK, now here comes the maps! 132 | 133 | ### Worldwide 134 | ```{r} 135 | pubs_on_map2(publications) 136 | ``` 137 | 138 | Let me also plot the per capita thing on a map, as a choropleth 139 | ```{r} 140 | pubs_per_capita(publications) 141 | ``` 142 | 143 | Break down by species 144 | ```{r} 145 | pubs_on_map2(species, facet_by = "species", ncol = 2) 146 | ``` 147 | 148 | By method 149 | ```{r, fig.width=8, fig.height=8} 150 | pubs_on_map2(methods, facet_by = "method") 151 | ``` 152 | 153 | I suppose this merely tells us where those techniques were invented and how they spread. It seems that seqFISH and its variants did not spread beyond Caltech; the dataset labeled Harvard is from a collaboration between Long Cai and someone at Harvard. However, MERFISH did spread beyond Harvard, where it originated, to UCLA. 154 | 155 | ### Europe 156 | Plot a map just for Europe. A problem here is that if I simply filter the `world` sf object, I'll get some islands away from what we usually think is Europe. Kind of feel bad for Russians since Russia is so large that it makes the map look bad and I don't have a paper from Russia in this spreadsheet. 
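Just as an illustrative sketch of one way around the island problem (this is not necessarily what `museumst` does internally): crop the country polygons to a rough European bounding box instead of filtering by country name. Here `rnaturalearth` is only a convenient source of polygons, and the bounding box coordinates are arbitrary.

```{r}
# Illustrative sketch only: crop country polygons to a rough European bounding
# box so overseas islands and territories don't stretch the map.
library(sf)
world <- rnaturalearth::ne_countries(returnclass = "sf")
europe_bbox <- st_bbox(c(xmin = -25, ymin = 34, xmax = 45, ymax = 72),
                       crs = st_crs(world))
# st_make_valid guards against invalid polygons; sf::sf_use_s2(FALSE) may be
# needed if s2 still complains about geometry validity.
europe <- st_crop(st_make_valid(world), europe_bbox)
ggplot(europe) + geom_sf()
```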
157 | 158 | ```{r} 159 | pubs_on_map2(publications, zoom = "europe") 160 | ``` 161 | 162 | ```{r} 163 | pubs_per_capita(publications, "europe") 164 | ``` 165 | 166 | ```{r, fig.width=6, fig.height=4} 167 | pubs_on_map2(species, zoom = "europe", facet_by = "species") 168 | ``` 169 | 170 | Maybe I just made this plot for fun. Not sure what to say about it. 171 | 172 | ```{r, fig.width=6, fig.height=6} 173 | pubs_on_map2(methods, zoom = "europe", facet_by = "method", ncol = 2) 174 | ``` 175 | 176 | Note that HybISS and SCRINSHOT both use RCA, which is used in the original 2013 ISS, which comes from Sweden. Is that why? 177 | 178 | ### USA 179 | Also a plot just for America. I did not encounter a publication in this spreadsheet that is from Hawaii or Alaska, but I'll include Hawaii and Alaska in the map anyway in case I get one in the future. 180 | 181 | ```{r} 182 | pubs_on_map2(publications, zoom = "usa") 183 | ``` 184 | 185 | ```{r} 186 | pubs_per_capita(publications, "usa") 187 | ``` 188 | 189 | ```{r} 190 | pubs_on_map2(species, zoom = "usa", facet_by = "species", ncol = 2) 191 | ``` 192 | 193 | ```{r, fig.width=8, fig.height=6} 194 | pubs_on_map2(methods, zoom = "usa", facet_by = "method") 195 | ``` 196 | 197 | # Word cloud 198 | ## Titles 199 | ```{r, fig.height=6, fig.width=6} 200 | plot_wordcloud(sheet) 201 | ``` 202 | 203 | Yeah, single cell. 204 | ```{r, fig.height=6, fig.width=6} 205 | plot_wordcloud(sheet, species_use = "Mus musculus") 206 | ``` 207 | 208 | ```{r, fig.height=6, fig.width=6} 209 | plot_wordcloud(sheet, species_use = "Homo sapiens") 210 | ``` 211 | 212 | ## Tissues 213 | ```{r, fig.height=4, fig.width=4} 214 | plot_wordcloud(sheet, tissue) 215 | ``` 216 | 217 | That 2 comes from U-2 OS, which was used many times to test MERFISH. 218 | ```{r, fig.height=6, fig.width=6} 219 | plot_wordcloud(sheet, tissue, species_use = "Mus musculus") 220 | ``` 221 | 222 | ```{r, fig.height=4, fig.width=4} 223 | plot_wordcloud(sheet, tissue, species_use = "Homo sapiens") 224 | ``` 225 | 226 | Those are all about cell culture. I look forward to seeing this used on more real tissues. 227 | 228 | ## Over time 229 | 230 | ```{r} 231 | range(sheet$date_published) 232 | ``` 233 | 234 | ```{r, fig.height=4, fig.width=4} 235 | plot_wordcloud(sheet, year_min = 2012, year_max = 2016) 236 | ``` 237 | 238 | ```{r, fig.height=6, fig.width=6} 239 | plot_wordcloud(sheet, year_min = 2016, year_max = 2021) 240 | ``` 241 | 242 | ## Department names 243 | 244 | ```{r, fig.height=6, fig.width=6} 245 | plot_wordcloud(sheet, col_use = "department", other_stop_words = inst_words) 246 | ``` 247 | 248 | That's because Long Cai used to be in CCE, though later he moved to BBE. 249 | 250 | ## Downstream analyses 251 | ```{r, fig.height=5, fig.width=5} 252 | plot_wordcloud(sheet, downstream) 253 | ``` 254 | 255 | # Programming languages 256 | ```{r} 257 | langs <- unnest_cat(sheet, language) 258 | ``` 259 | 260 | ```{r} 261 | pubs_per_cat(langs, language, isotype = TRUE, img_unit = 2) 262 | ``` 263 | 264 | # Data and code availability 265 | 266 | How many publications have provided a code repo? 267 | ```{r} 268 | hist_bool(publications, !is.na(repo)) 269 | ``` 270 | 271 | Here one paper can have multiple datasets and the availability status of the datasets can be different, so I'm plotting the datasets rather than publications here. 272 | 273 | ```{r} 274 | hist_bool(sheet, has_matrix) 275 | ``` 276 | 277 | It seems that more recent publications are more likely to provide a repo, but not data. 
Is this significant? Here I fit a logistic regression model to use date published to predict whether publications have repo or accession and test if the coefficients are 0. 278 | ```{r} 279 | test_year_bool(publications, !is.na(repo)) 280 | ``` 281 | 282 | Maybe a bit suggestive, but not significant. 283 | 284 | # Number of genes and cells 285 | 286 | The mean for each study is used since I'm not always sure what counts as a dataset. Sometimes I mean one section, while sometimes I mean multiple sections the authors treated as one dataset. Actually sometimes the authors aren't clear how many sections are there in a "dataset", hence this confusion. 287 | ```{r} 288 | mean_genes <- sheet %>% 289 | group_by(date_published, title, method) %>% 290 | summarize(n_genes = mean(n_genes, na.rm = TRUE)) 291 | ``` 292 | 293 | ```{r} 294 | ggplot(mean_genes, aes(date_published, n_genes, color = method)) + 295 | geom_point() + 296 | geom_text_repel(aes(label = method), segment.alpha = 0.5) + 297 | labs(x = "Date published", y = "Mean number of genes per study") + 298 | theme(legend.position = "none") 299 | ``` 300 | 301 | While the max possible number of genes increased, I don't think people necessarily want to go for the max. 302 | ```{r} 303 | mean_genes %>% 304 | filter(n_genes < 1000) %>% 305 | ggplot(aes(date_published, n_genes)) + 306 | geom_point(aes(color = method)) + 307 | geom_text_repel(aes(label = method, color = method), segment.alpha = 0.5) + 308 | geom_smooth(method = "lm") + 309 | labs(x = "Date published", y = "Mean number of genes per study") + 310 | theme(legend.position = "none") 311 | ``` 312 | 313 | See, without those 3 outliers, there really does not seem to be an increase (if not a decrease). I wonder why people aren't opting for the larger number of genes possible. Is it because the probes are expensive or it's too challenging to process the sheer number of images, which is exacerbated by poor documentation of much of the existing code written for this task and lack of a unified pipeline? Now I wonder if growth of ST is in part because of STPipeline, which makes data preprocessing easier. Now I also wonder how Cell Ranger and Seurat contributed to spread of scRNA-seq in general. 314 | 315 | How about number of cells? Here the total number of cells from each study per method is plotted. 316 | ```{r} 317 | sum_cells <- sheet %>% 318 | group_by(date_published, title, method) %>% 319 | summarize(n_cells = sum(n_cells, na.rm = TRUE)) %>% 320 | filter(n_cells > 0) 321 | ``` 322 | 323 | ```{r} 324 | ggplot(sum_cells, aes(date_published, n_cells, color = method)) + 325 | geom_point() + 326 | geom_text_repel(aes(label = method), segment.alpha = 0.5) + 327 | labs(x = "Date published", y = "Total number of cells per study") + 328 | theme(legend.position = "none") 329 | ``` 330 | 331 | That outlier is the crazy hypothalamus MERFISH study. 332 | 333 | ```{r} 334 | sum_cells2 <- sum_cells %>% 335 | filter(n_cells < 1e5) 336 | ggplot(sum_cells2, aes(date_published, n_cells)) + 337 | geom_point(aes(color = method)) + 338 | geom_text_repel(aes(label = method, color = method), segment.alpha = 0.5) + 339 | geom_smooth(method = "lm") + 340 | labs(x = "Date published", y = "Total number of cells per study") + 341 | theme(legend.position = "none") + 342 | coord_cartesian(ylim = c(0, max(sum_cells2$n_cells))) 343 | ``` 344 | 345 | I don't think it's significant. 
Here I formally test if beta is not 0: 346 | ```{r} 347 | summary(lm(n_cells ~ date_published, data = sum_cells2)) 348 | ``` 349 | 350 | See, it's not significant. So what does this mean? What should our priorities be? Shall we focus on in depth data analysis techniques without worrying too much about scalability yet since people aren't opting for larger amount of data anyway, or shall we prioritize scalability since lack of scalability of data processing and analysis tools is the reason why people aren't opting for larger amount of data? Or is that because the existing software is not very user friendly, so people consider data analysis a really daunting task? Or is it the expensive reagents and instruments and the amount of time to collect data? Or did people make up their minds and decided that it's not necessary to profile that many cells? I don't know. I think we can try both and see what happens in a few years. Well, given how quickly this field is evolving, I suppose months. 351 | 352 | ```{r} 353 | sessionInfo() 354 | ``` 355 | 356 | -------------------------------------------------------------------------------- /supplement/niche.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/niche.png -------------------------------------------------------------------------------- /supplement/note.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/note.png -------------------------------------------------------------------------------- /supplement/preamble.tex: -------------------------------------------------------------------------------- 1 | \usepackage{booktabs} 2 | \usepackage{longtable} 3 | \usepackage{hyperref} 4 | \usepackage[bf,singlelinecheck=off]{caption} 5 | 6 | %\setmainfont[UprightFeatures={SmallCapsFont=AlegreyaSC-Regular}]{Alegreya} 7 | 8 | \usepackage{framed,color} 9 | \definecolor{shadecolor}{RGB}{248,248,248} 10 | 11 | \renewcommand{\textfraction}{0.05} 12 | \renewcommand{\topfraction}{0.8} 13 | \renewcommand{\bottomfraction}{0.8} 14 | \renewcommand{\floatpagefraction}{0.75} 15 | 16 | \renewenvironment{quote}{\begin{VF}}{\end{VF}} 17 | \let\oldhref\href 18 | \renewcommand{\href}[2]{#2\footnote{\url{#1}}} 19 | 20 | \ifxetex 21 | \usepackage{letltxmacro} 22 | \setlength{\XeTeXLinkMargin}{1pt} 23 | \LetLtxMacro\SavedIncludeGraphics\includegraphics 24 | \def\includegraphics#1#{% #1 catches optional stuff (star/opt. arg.) 
25 | \IncludeGraphicsAux{#1}% 26 | }% 27 | \newcommand*{\IncludeGraphicsAux}[2]{% 28 | \XeTeXLinkBox{% 29 | \SavedIncludeGraphics#1{#2}% 30 | }% 31 | }% 32 | \fi 33 | 34 | \makeatletter 35 | \newenvironment{kframe}{% 36 | \medskip{} 37 | \setlength{\fboxsep}{.8em} 38 | \def\at@end@of@kframe{}% 39 | \ifinner\ifhmode% 40 | \def\at@end@of@kframe{\end{minipage}}% 41 | \begin{minipage}{\columnwidth}% 42 | \fi\fi% 43 | \def\FrameCommand##1{\hskip\@totalleftmargin \hskip-\fboxsep 44 | \colorbox{shadecolor}{##1}\hskip-\fboxsep 45 | % There is no \\@totalrightmargin, so: 46 | \hskip-\linewidth \hskip-\@totalleftmargin \hskip\columnwidth}% 47 | \MakeFramed {\advance\hsize-\width 48 | \@totalleftmargin\z@ \linewidth\hsize 49 | \@setminipage}}% 50 | {\par\unskip\endMakeFramed% 51 | \at@end@of@kframe} 52 | \makeatother 53 | 54 | \makeatletter 55 | \@ifundefined{Shaded}{ 56 | }{\renewenvironment{Shaded}{\begin{kframe}}{\end{kframe}}} 57 | \makeatother 58 | 59 | \newenvironment{rmdblock}[1] 60 | { 61 | \begin{itemize} 62 | \renewcommand{\labelitemi}{ 63 | \raisebox{-.7\height}[0pt][0pt]{ 64 | {\setkeys{Gin}{width=3em,keepaspectratio}\includegraphics{#1}} 65 | } 66 | } 67 | \setlength{\fboxsep}{1em} 68 | \begin{kframe} 69 | \item 70 | } 71 | { 72 | \end{kframe} 73 | \end{itemize} 74 | } 75 | 76 | \newenvironment{rmdtip} 77 | {\begin{rmdblock}{tip}} 78 | {\end{rmdblock}} 79 | \newenvironment{rmdnote} 80 | {\begin{rmdblock}{note}} 81 | {\end{rmdblock}} 82 | 83 | \usepackage{makeidx} 84 | \makeindex 85 | 86 | \urlstyle{tt} 87 | 88 | \usepackage{amsthm} 89 | \makeatletter 90 | \def\thm@space@setup{% 91 | \thm@preskip=8pt plus 2pt minus 4pt 92 | \thm@postskip=\thm@preskip 93 | } 94 | \makeatother 95 | 96 | -------------------------------------------------------------------------------- /supplement/prequel_techs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/prequel_techs.png -------------------------------------------------------------------------------- /supplement/rca.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/rca.png -------------------------------------------------------------------------------- /supplement/regular.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/regular.png -------------------------------------------------------------------------------- /supplement/sedal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/sedal.png -------------------------------------------------------------------------------- /supplement/seqfish-plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/seqfish-plus.png -------------------------------------------------------------------------------- /supplement/seqfish.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/seqfish.png 
-------------------------------------------------------------------------------- /supplement/smfish1998.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/smfish1998.png -------------------------------------------------------------------------------- /supplement/smfish_cells.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/smfish_cells.png -------------------------------------------------------------------------------- /supplement/smfish_cells_part.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/smfish_cells_part.png -------------------------------------------------------------------------------- /supplement/smfish_gene.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/smfish_gene.png -------------------------------------------------------------------------------- /supplement/smfish_gene_part.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/smfish_gene_part.png -------------------------------------------------------------------------------- /supplement/solid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/solid.png -------------------------------------------------------------------------------- /supplement/split-fish.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/split-fish.png -------------------------------------------------------------------------------- /supplement/style.css: -------------------------------------------------------------------------------- 1 | p.caption { 2 | color: #777; 3 | margin-top: 10px; 4 | } 5 | p code { 6 | white-space: inherit; 7 | } 8 | pre { 9 | word-break: normal; 10 | word-wrap: normal; 11 | } 12 | pre code { 13 | white-space: inherit; 14 | } 15 | .rmdtip { 16 | padding: 1em 1em 1em 4em; 17 | margin-bottom: 10px; 18 | background: #f5f5f5 5px center/3em no-repeat; 19 | background-image: url("./tip.png"); 20 | } 21 | 22 | .rmdnote { 23 | padding: 1em 1em 1em 4em; 24 | margin-bottom: 10px; 25 | background: #f5f5f5 5px center/3em no-repeat; 26 | background-image: url("./note.png"); 27 | } 28 | -------------------------------------------------------------------------------- /supplement/tip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/tip.png -------------------------------------------------------------------------------- /supplement/tomo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/tomo.png 
-------------------------------------------------------------------------------- /supplement/voxelation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pachterlab/LP_2021/7989d2665e5dcebc8c13e61639363a9268ead1a1/supplement/voxelation.png --------------------------------------------------------------------------------