├── .Rbuildignore
├── .github
    └── workflows
    │   └── basic_checks.yaml
├── .gitignore
├── DESCRIPTION
├── Dockerfile
├── LICENSE
├── NAMESPACE
├── R
    ├── data.R
    ├── imbalance_score.R
    └── import_raw_data.R
├── README.md
├── _pkgdown.yml
├── data
    ├── calendar.rda
    ├── evaluateKResult.png
    └── sce.rda
├── man
    ├── calendar.Rd
    ├── imbalance_score.Rd
    ├── importRawData.Rd
    └── sce.Rd
└── vignettes
    ├── .gitignore
    ├── advent.png
    ├── advent2.JPG
    ├── adventExample.Rmd
    └── workshopTrajectories.Rmd


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^bioc2020trajectories\.Rproj$
2 | ^\.Rproj\.user$
3 | .travis.yml
4 | .github/*


--------------------------------------------------------------------------------
/.github/workflows/basic_checks.yaml:
--------------------------------------------------------------------------------
 1 | on: [push]
 2 | 
 3 | # change this env variable (v...) to manually bust the 
 4 | # cache. Cache busting otherwise occurs with changes in
 5 | # dependencies in the DESCRIPTION file. 
 6 | env:
 7 |   cache-version: v2
 8 | 
 9 | jobs:
10 |   job1:
11 |     runs-on: ubuntu-latest
12 |     container: bioconductor/bioconductor_docker:devel
13 |     steps:
14 |       - uses: actions/checkout@v1
15 | 
16 |       - name: Query dependencies and update old packages
17 |         run: |
18 |           install.packages('remotes')
19 |           options(repos = c(CRAN = 'https://cran.r-project.org'))
20 |           BiocManager::install(ask=FALSE)
21 |           saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
22 |         shell: Rscript {0}
23 |         
24 |       - name: Cache R packages
25 |         if: runner.os == 'Linux'
26 |         uses: actions/cache@v1
27 |         with:
28 |           path: /usr/local/lib/R/site-library
29 |           key: ${{ env.cache-version }}-${{ runner.os }}-r-${{ hashFiles('.github/depends.Rds') }}
30 |           restore-keys: ${{ env.cache-version }}-${{ runner.os }}-r-
31 | 
32 |       # This lets us augment with additional dependencies
33 |       - name: Install system dependencies
34 |         if: runner.os == 'Linux'
35 |         env:
36 |           RHUB_PLATFORM: linux-x86_64-ubuntu-gcc
37 |         run: |
38 |           Rscript -e "remotes::install_github('r-hub/sysreqs')"
39 |           sysreqs=$(Rscript -e "cat(sysreqs::sysreq_commands('DESCRIPTION'))")
40 |           sudo -s eval "$sysreqs"
41 |           
42 |       - name: Install dependencies
43 |         run: |
44 |           remotes::install_deps(dependencies = TRUE, repos = BiocManager::repositories())
45 |           remotes::install_cran("rcmdcheck")
46 |         shell: Rscript {0}
47 | 
48 |       - name: Check
49 |         env:
50 |           _R_CHECK_CRAN_INCOMING_REMOTE_: false
51 |         run: rcmdcheck::rcmdcheck(args = c("--no-manual"), error_on = "error", check_dir = "check")
52 |         shell: Rscript {0}
53 |         
54 |       - uses: docker/build-push-action@v1
55 |         with:
56 |           username: ${{ secrets.DOCKER_USERNAME }}
57 |           password: ${{ secrets.DOCKER_PASSWORD }}
58 |           repository: kstreet13/bioc2020trajectories
59 |           tag_with_ref: true
60 |           tag_with_sha: true
61 |           tags: latest
62 | 
63 |       - name: Build pkgdown
64 |         run: |
65 |            PATH=$PATH:$HOME/bin/ Rscript -e 'pkgdown::build_site(".")'
66 | 
67 |       # deploy needs rsync? Seems so.
68 |       - name: Install deploy dependencies
69 |         run: |
70 |           apt-get update
71 |           apt-get -y install rsync
72 | 
73 |       - name: Deploy 🚀
74 |         uses: JamesIves/github-pages-deploy-action@releases/v3
75 |         with:
76 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
77 |           BRANCH: gh-pages # The branch the action should deploy to.
78 |           FOLDER: docs # The folder the action should deploy.
79 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # History files
 2 | .Rhistory
 3 | .Rapp.history
 4 | 
 5 | # Session Data files
 6 | .RData
 7 | .rds
 8 | *.rds.gz
 9 | 
10 | # tradeSeq fit
11 | data/sce_conditions_4knots.rds
12 | data/sce_new.rds
13 | 
14 | # RStudio files
15 | .Rproj.user/
16 | *.Rproj
17 | 
18 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
19 | .httr-oauth
20 | 
21 | # knitr and R markdown default cache directories
22 | /*_cache/
23 | /cache/
24 | 
25 | # Temporary files created by R markdown
26 | *.utf8.md
27 | *.knit.md
28 | 
29 | # Mac artifacts
30 | *.DS_Store
31 | 
32 | # built README
33 | README.html
34 | .Rproj.user
35 | inst/doc
36 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: bioc2020trajectories
 2 | Title: Trajectory inference across conditions: differential expression and differential progression
 3 | Version: 0.0.0.93
 4 | Authors@R: c(person("Kelly", "Street", role = c("aut", "cre"),
 5 |                      email = "kstreet@ds.dfci.harvard.edu"),
 6 |               person("Koen", "Van den Berge", role = c("aut"),
 7 |                      email = "koenvdberge@berkeley.edu"),
 8 | 	      person("Hector","Roux de Bezieux", role = c("aut"),
 9 | 			email = "hector.rouxdebezieux@berkeley.edu"))
10 | Description: What the package does (one paragraph).
11 | Depends: R (>= 3.5.0)
12 | Encoding: UTF-8
13 | LazyData: true
14 | License: CC BY 4.0  + file LICENSE
15 | RoxygenNote: 7.1.1
16 | Suggests: 
17 |     knitr,
18 |     RColorBrewer,
19 |     rmarkdown,
20 |     scales,
21 |     pkgdown,
22 |     viridis,
23 |     ggplot2,
24 |     UpSetR,
25 |     pheatmap,
26 |     msigdbr,
27 |     fgsea,
28 |     gridExtra,
29 |     tradeSeq
30 | Imports: 
31 |     BiocFileCache,
32 |     slingshot,
33 |     rappdirs,
34 |     SingleCellExperiment,
35 |     HDF5Array,
36 |     igraph,
37 |     RANN,
38 |     magrittr,
39 |     mgcv,
40 |     purrr
41 | VignetteBuilder: knitr
42 | URL: https://kstreet13.github.io/bioc2020trajectories
43 | DockerImage: kstreet13/bioc2020trajectories:latest
44 | Remotes: github::statOmics/tradeSeq
45 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM bioconductor/bioconductor_docker:devel
 2 | 
 3 | WORKDIR /home/rstudio
 4 | 
 5 | COPY --chown=rstudio:rstudio . /home/rstudio/
 6 | 
 7 | RUN apt-get update && apt-get install -y libglpk-dev
 8 | 
 9 | RUN Rscript -e "Sys.setenv(R_REMOTES_NO_ERRORS_FROM_WARNINGS='true');options(repos = c(CRAN = 'https://cran.r-project.org')); BiocManager::install(ask=FALSE)"
10 | 
11 | RUN Rscript -e "Sys.setenv(R_REMOTES_NO_ERRORS_FROM_WARNINGS='true');options(repos = c(CRAN = 'https://cran.r-project.org')); devtools::install('.', dependencies=TRUE, build_vignettes=TRUE, repos = BiocManager::repositories())"
12 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | This work is licensed under the Creative Commons Attribution 4.0 International License. To view a copy of this license, visit http://creativecommons.org/licenses/by/4.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
2 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(imbalance_score)
 4 | export(importRawData)
 5 | import(BiocFileCache)
 6 | import(HDF5Array)
 7 | import(RANN)
 8 | import(SingleCellExperiment)
 9 | import(igraph)
10 | import(purrr)
11 | import(rappdirs)
12 | import(slingshot)
13 | importFrom(magrittr,"%>%")
14 | importFrom(mgcv,gam)
15 | 


--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
 1 | #' The object
 2 | #'
 3 | #' A \code{\link{SingleCellExperiment}} object of 9268 cells, derived from [McFaline-Figueroa, et al.](https://www.nature.com/articles/s41588-019-0489-5)
 4 | #'
 5 | #' @format Can be re-obtained by runing first chunk of the vignette.
 6 | #' @usage data(sce)
 7 | "sce"
 8 | 
 9 | #' Calendar
10 | #'
11 | #' A \code{\link{data.frame}} for the calendar
12 | #'
13 | #' @format Can be re-obtained from the image
14 | #' @usage data("calendar")
15 | "calendar"


--------------------------------------------------------------------------------
/R/imbalance_score.R:
--------------------------------------------------------------------------------
  1 | # Inspired from the EMT package from Uwe Menzel
  2 | # Uwe Menzel (2013). EMT: Exact Multinomial Test: Goodness-of-Fit Test for Discrete
  3 | # Multivariate data. R package version 1.1. https://CRAN.R-project.org/package=EMT
  4 | # citation(EMT)
  5 | 
  6 | .findVectors <- function(groups, size) {
  7 |   if (groups == 1) {
  8 |     mat <- size
  9 |   }
 10 |   else {
 11 |     mat <- matrix(rep(0, groups - 1), nrow = 1)
 12 |     for (i in 1:size) {
 13 |       mat <- rbind(mat, .findVectors(groups - 1, i))
 14 |     }
 15 |     mat <- cbind(mat, size - rowSums(mat))
 16 |   }
 17 |   return(mat)
 18 | }
 19 | 
 20 | .multinomial.test <- function(clMatrix, groups, props) {
 21 |   size <- ncol(clMatrix)
 22 |   eventMat <- .findVectors(length(groups), size)
 23 |   eventProb <- apply(eventMat, 1, function(x) {
 24 |     dmultinom(x,  size = size, prob = props)
 25 |   })
 26 |   pvalues <- apply(clMatrix, 1, function(conds){
 27 |     real <- as.vector(table(factor(conds, levels = groups)))
 28 |     pObs <- dmultinom(real, size, props)
 29 |     p.value <- sum(eventProb[eventProb <= pObs])
 30 |     return(p.value)
 31 |   })
 32 |   res <- -stats::qnorm(unlist(pvalues) / 2)
 33 |   return(res)
 34 | }
 35 | 
 36 | #' @title Imbalance score.
 37 | #'
 38 | #' @description Compute an imbalance score to show whether nearby cells have the
 39 | #' same condition of not
 40 | #' 
 41 | #' @return A list with two components:
 42 | #' \itemize{
 43 | #'   \item \code{scores} is the raw score, a vector with one value per cell.
 44 | #'   \item \code{scaled_scores} is the score after local smoothing. A vector with one value per cell.
 45 | #' }
 46 | #' 
 47 | #' @details The score is computed in two steps. First, a score is computed for 
 48 | #' each cell. The distribution of labels among the \code{k}-nearest- neighbours 
 49 | #' is computed to the overall distribution for  all cells. This yields a p-value 
 50 | #' based on the multinomial distribution, which is squared to return the scores. 
 51 | #' 
 52 | #' Then, splines are used to smooth the scores along the reduced dimension space, 
 53 | #' with \code{smooth} nodes.  This yields the scaled_scores.
 54 | #' 
 55 | #' @param rd The reduced dimension matrix of the cells
 56 | #' @param cl the vector of conditions
 57 | #' @param k The number of neighbours to consider when computing the score.
 58 | #'  Default to 10.
 59 | #' @param smooth The smoothing parameter. Default to k. Lower values mean that
 60 | #' we smooth more.
 61 | #' @importFrom magrittr %>%
 62 | #' @import RANN purrr
 63 | #' @importFrom mgcv gam
 64 | #' @import igraph
 65 | #' @export
 66 | imbalance_score <- function(rd, cl, k = 10, smooth = k) {
 67 |   # Code inspired from the monocle3 package
 68 |   # https://github.com/cole-trapnell-lab/monocle3/blob/9becd94f60930c2a9b51770e3818c194dd8201eb/R/cluster_cells.R#L194
 69 | 
 70 |   props <- as.vector(table(cl) / length(cl))
 71 |   groups <- unique(cl)
 72 |   if (length(groups) == 1) stop("cl should have at least 2 classes")
 73 | 
 74 |   # Get the graph
 75 |   # We need to add 1 because by default, nn2 counts each cell as its own
 76 |   # neighbour
 77 |   tmp <- RANN::nn2(rd, rd, k + 1, searchtype = "standard")
 78 |   neighborMatrix <- tmp[[1]]
 79 |   # Remove each cell from being it's own neighbour
 80 |   distMatrix <- tmp[[2]][, -1]
 81 |   distMatrix <- 1 / distMatrix
 82 |   distMatrix <- distMatrix / rowSums(distMatrix)
 83 |   clMatrix <- matrix(factor(cl)[neighborMatrix], ncol = k + 1)
 84 |   simMatrix <- clMatrix == clMatrix[, 1]
 85 |   # Remove each cell from being it's own neighbour
 86 |   simMatrix <- simMatrix[, -1]
 87 | 
 88 |   # Get the basic scores
 89 |   scores <- rowMeans(simMatrix)
 90 | 
 91 |   # Get the smoothed scores
 92 |   scaled_scores <- .multinomial.test(clMatrix, groups, props)
 93 |   scaled_scores <- unlist(scaled_scores)
 94 |   names(scaled_scores) <- rownames(rd)
 95 |   formula <- paste0("scaled_scores ~ s(",
 96 |                     paste0("rd[, ", seq_len(ncol(rd)), "], ", collapse = ""),
 97 |                     "k = smooth)")
 98 |   mm <- mgcv::gam(as.formula(formula))
 99 |   scaled_scores <- predict(mm, type = "response")
100 | 
101 |   return(list("scores" = scores, "scaled_scores" = scaled_scores))
102 | }
103 | 


--------------------------------------------------------------------------------
/R/import_raw_data.R:
--------------------------------------------------------------------------------
 1 | #' @title Download and import the raw dataset.
 2 | #'
 3 | #' @description Download the dataset from GEO, filter, and create a
 4 | #' \code{SingleCellExperiment object}
 5 | #'
 6 | #' @export
 7 | #' @import BiocFileCache SingleCellExperiment rappdirs slingshot HDF5Array
 8 | importRawData <- function(){
 9 |   url <- "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE114687&format=file&file=GSE114687%5Fpseudospace%5Fcds%2Erds%2Egz"
10 |   path <- paste0(rappdirs::user_cache_dir(), basename(url))
11 |   bfc <- BiocFileCache::BiocFileCache(path, ask = FALSE)
12 |   addCds <- bfcadd(bfc, "cds", fpath = url)
13 |   con <- gzcon(gzfile(addCds))
14 |   cds <- readRDS(con)
15 | 
16 |   # Extract useful info from the cellDataSet object
17 |   counts <- cds@assayData$exprs
18 |   phenoData <- pData(cds@phenoData)
19 |   rd <- SimpleList(
20 |     tSNEorig = cbind(cds@phenoData@data$TSNE.1, cds@phenoData@data$TSNE.2)
21 |   )
22 |   rm(cds) ; gc(verbose = FALSE)
23 |   filt <- apply(counts, 1, function(x){
24 |     sum(x >= 2) >= 15
25 |   })
26 |   counts <- counts[filt, ]
27 |   sce <- SingleCellExperiment::SingleCellExperiment(
28 |     assays = list(counts = counts),
29 |     colData = phenoData,
30 |     reducedDims = rd)
31 |   return(sce)
32 | }
33 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Trajectory inference across conditions: differential expression and differential progression
 2 | 
 3 | [![.github/workflows/basic_checks.yaml](https://github.com/kstreet13/bioc2020trajectories/workflows/.github/workflows/basic_checks.yaml/badge.svg)](https://github.com/kstreet13/bioc2020trajectories/actions)
 4 | 
 5 | 
 6 | # Instructors
 7 | 
 8 |  - Kelly Street (kstreet@ds.dfci.harvard.edu)
 9 |  - Koen Van den Berge (koenvdberge@berkeley.edu)
10 |  - Hector Roux de Bézieux (hector.rouxdebezieux@berkeley.edu)
11 |  
12 | # Workshop Description
13 | 
14 | In single-cell RNA-sequencing (scRNA-seq), gene expression is assessed at the level of single cells. In dynamic biological systems, it may not be appropriate to assign cells to discrete groups, but rather a continuum of cell states may be observed, e.g. the differentiation of a stem cell population into mature cell types. This is often represented as a trajectory in reduced dimension.
15 | 
16 | Many methods have been suggested for trajectory inference. However, in this setting, it is often unclear how one should handle multiple biological groups or conditions, e.g. constructing and comparing the differentiation trajectory of a wild type versus a knock-out stem cell population.
17 | 
18 | In this workshop, we will explore methods for comparing multiple conditions in a trajectory inference analysis. We start by integrating datasets from multiple conditions into a single trajectory. By comparing the conditions along the trajectory's path, we can detect large-scale changes, indicative of differential progression. We also demonstrate how to detect subtler changes by finding genes that exhibit different behaviors between these conditions along a differentiation path.
19 | 
20 | [This vignette](https://bioconductor.org/packages/release/bioc/vignettes/slingshot/inst/doc/conditionsVignette.html) provides a more complete problem description and proposes a few analytical approaches, which will serve as the basis of our workshop.
21 | 
22 | ## Pre-requisites
23 | 
24 | Software:
25 | 
26 | * Basic knowledge of _R_ syntax
27 | * Familiarity with single-cell RNA-sequencing
28 | * Familiarity with the `SingleCellExperiment` class
29 | 
30 | Background reading:
31 | 
32 | * The textbook "Orchestrating Single-Cell Analysis with Bioconductor" is a great reference for single-cell analysis using Bioconductor packages.
33 | * [Slingshot paper](https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-018-4772-0)
34 | * [tradeSeq paper](https://www.nature.com/articles/s41467-020-14766-3)
35 | 
36 | 
37 | ## Workshop Participation
38 | 
39 | The workshop will start with an introduction to the problem and the dataset using presentation slides. Following this, we will have a lab session on how one may tackle the problem of handling multiple conditions in trajectory inference and in downstream analysis involving differential progression and differential expression.
40 | 
41 | ## _R_ / _Bioconductor_ packages used
42 | 
43 | * The workshop will focus on Bioconductor packages [SingleCellExperiment](https://bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html), [Slingshot](https://bioconductor.org/packages/release/bioc/html/slingshot.html), and [tradeSeq](https://bioconductor.org/packages/release/bioc/html/tradeSeq.html) 
44 | 
45 | ## Time outline
46 | 
47 | 
48 | | Activity                     | Time |
49 | |------------------------------|------|
50 | | Introduction                     | 15m  |
51 | | Data Integration and Trajectory Inference   | 10m  |
52 | | Differential Progression                    | 15m  |
53 | | Differential Expression          | 15m  |
54 | | Wrap-up and Conclusions          | 5m  |
55 | 
56 | 
57 | # Workshop goals and objectives
58 | 
59 | Participants will learn how to reason about trajectories in single-cell RNA-seq data and how they may be used for interpretation of complex scRNA-seq datasets.
60 | 
61 | 
62 | ## Learning goals
63 | 
64 | 
65 | * Reason about dynamic biological systems.
66 | * Grasp the complexity of analyzing large scRNA-seq datasets with the goal of extracting relevant biological information.
67 | * Understand the concepts of differential progression and differential expression along a trajectory path.
68 | 
69 | ## Learning objectives
70 | 
71 | 
72 | * Learn how to analyze single-cell RNA-seq data using Bioconductor packages.
73 | * Import and explore large scRNA-seq datasets.
74 | * Understand the challenges of trajectory inference.
75 | * Compose analysis pipeline that allows interpretation of complex scRNA-seq datasets.
76 | * Assess the added complexity of handling multiple conditions in these dynamic systems and how it influences the analysis pipeline.
77 | * Discuss how the analysis pipeline can incorporate this change and evaluate it.
78 | 


--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
 1 | url: https://kstreet13.github.io/bioc2020trajectories
 2 | 
 3 | template:
 4 |   params:
 5 |     bootswatch: flatly
 6 | 
 7 | home:
 8 |   title: "bioctrajectories2020"
 9 |   type: inverse
10 | 
11 | 
12 | navbar:
13 |   right:
14 |    - icon: fa-github
15 |      href: https://github.com/kstreet13/bioc2020trajectories
16 |      
17 | 
18 | 


--------------------------------------------------------------------------------
/data/calendar.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kstreet13/bioc2020trajectories/d0a386f8c1411ebc40e6edc505531926c46f354e/data/calendar.rda


--------------------------------------------------------------------------------
/data/evaluateKResult.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kstreet13/bioc2020trajectories/d0a386f8c1411ebc40e6edc505531926c46f354e/data/evaluateKResult.png


--------------------------------------------------------------------------------
/data/sce.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kstreet13/bioc2020trajectories/d0a386f8c1411ebc40e6edc505531926c46f354e/data/sce.rda


--------------------------------------------------------------------------------
/man/calendar.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{calendar}
 5 | \alias{calendar}
 6 | \title{Calendar}
 7 | \format{
 8 | Can be re-obtained from the image
 9 | }
10 | \usage{
11 | data("calendar")
12 | }
13 | \description{
14 | A \code{\link{data.frame}} for the calendar
15 | }
16 | \keyword{datasets}
17 | 


--------------------------------------------------------------------------------
/man/imbalance_score.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/imbalance_score.R
 3 | \name{imbalance_score}
 4 | \alias{imbalance_score}
 5 | \title{Imbalance score.}
 6 | \usage{
 7 | imbalance_score(rd, cl, k = 10, smooth = k)
 8 | }
 9 | \arguments{
10 | \item{rd}{The reduced dimension matrix of the cells}
11 | 
12 | \item{cl}{the vector of conditions}
13 | 
14 | \item{k}{The number of neighbours to consider when computing the score.
15 | Default to 10.}
16 | 
17 | \item{smooth}{The smoothing parameter. Default to k. Lower values mean that
18 | we smooth more.}
19 | }
20 | \value{
21 | A list with two components:
22 | \itemize{
23 |   \item \code{scores} is the raw score, a vector with one value per cell.
24 |   \item \code{scaled_scores} is the score after local smoothing. A vector with one value per cell.
25 | }
26 | }
27 | \description{
28 | Compute an imbalance score to show whether nearby cells have the
29 | same condition of not
30 | }
31 | \details{
32 | The score is computed in two steps. First, a score is computed for 
33 | each cell. The distribution of labels among the \code{k}-nearest- neighbours 
34 | is computed to the overall distribution for  all cells. This yields a p-value 
35 | based on the multinomial distribution, which is squared to return the scores. 
36 | 
37 | Then, splines are used to smooth the scores along the reduced dimension space, 
38 | with \code{smooth} nodes.  This yields the scaled_scores.
39 | }
40 | 


--------------------------------------------------------------------------------
/man/importRawData.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/import_raw_data.R
 3 | \name{importRawData}
 4 | \alias{importRawData}
 5 | \title{Download and import the raw dataset.}
 6 | \usage{
 7 | importRawData()
 8 | }
 9 | \description{
10 | Download the dataset from GEO, filter, and create a
11 | \code{SingleCellExperiment object}
12 | }
13 | 


--------------------------------------------------------------------------------
/man/sce.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{sce}
 5 | \alias{sce}
 6 | \title{The object}
 7 | \format{
 8 | Can be re-obtained by runing first chunk of the vignette.
 9 | }
10 | \usage{
11 | data(sce)
12 | }
13 | \description{
14 | A \code{\link{SingleCellExperiment}} object of 9268 cells, derived from [McFaline-Figueroa, et al.](https://www.nature.com/articles/s41588-019-0489-5)
15 | }
16 | \keyword{datasets}
17 | 


--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 | 


--------------------------------------------------------------------------------
/vignettes/advent.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kstreet13/bioc2020trajectories/d0a386f8c1411ebc40e6edc505531926c46f354e/vignettes/advent.png


--------------------------------------------------------------------------------
/vignettes/advent2.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kstreet13/bioc2020trajectories/d0a386f8c1411ebc40e6edc505531926c46f354e/vignettes/advent2.JPG


--------------------------------------------------------------------------------
/vignettes/adventExample.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Trajectory inference across conditions: toy example"
  3 | output:
  4 |   rmarkdown::html_document:
  5 |     highlight: pygments
  6 |     toc: true
  7 |     toc_depth: 3
  8 |     fig_width: 5
  9 | vignette: >
 10 |   %\VignetteIndexEntry{Toy Example}
 11 |   %\VignetteEngine{knitr::rmarkdown}
 12 |   %\VignetteEncoding[utf8]{inputenc}
 13 | ---
 14 | 
 15 | # Overviews
 16 | 
 17 | ## Instructor(s) name(s) and contact information
 18 | 
 19 |  - Kelly Street (kstreet@ds.dfci.harvard.edu)
 20 |  - Koen Van den Berge (koenvdberge@berkeley.edu)
 21 |  - Hector Roux de Bézieux (hector.rouxdebezieux@berkeley.edu)
 22 | 
 23 | ## Workshop Description
 24 | 
 25 | In single-cell RNA-sequencing (scRNA-seq), gene expression is assessed at the level of single cells. In dynamic biological systems, it may not be appropriate to assign cells to discrete groups, but rather a continuum of cell states may be observed, e.g. the differentiation of a stem cell population into mature cell types. This is often represented as a trajectory in a reduced dimension of the scRNA-seq dataset.
 26 | 
 27 | Many methods have been suggested for trajectory inference. However, in this setting, it is often unclear how one should handle multiple biological groups or conditions, e.g. constructing and comparing the differentiation trajectory of a wild type versus a knock-out stem cell population.
 28 | 
 29 | In this workshop, we will explore methods for comparing multiple conditions in a trajectory inference analysis. We start by integrating datasets from multiple conditions into a single trajectory. By comparing the conditions along the trajectory's path, we can detect large-scale changes, indicative of differential progression. We also demonstrate how to detect subtler changes by finding genes that exhibit different behaviors between these conditions along a differentiation path.
 30 |  
 31 | ## Goal of this vignette
 32 | 
 33 | This vignette is meant as a soft introduction to our main vignette "Trajectory inference across conditions: differential expression and differential progression" and should be taken with the implied levity. A compiled version of the main vignette is available on the [workshop website](https://kstreet13.github.io/bioc2020trajectories/articles/workshopTrajectories.html).
 34 | 
 35 | ```{r}
 36 | suppressPackageStartupMessages({
 37 |   library(knitr)
 38 |   library(ggplot2)
 39 | })
 40 | ```
 41 | 
 42 | # Setting
 43 | 
 44 | Late November, I bought an advent calendar with my partner and we realized we were facing a very challenging situation: how to properly partition the daily chocolates? Given the inherent difficulties of cutting a piece of hard chocolate into two equal blocks, we quickly settled on alternate days: I would get the treat on odd days and they’ll be eating the chocolate on even days. 
 45 | 
 46 | Below is a picture of our event calendar 
 47 | 
 48 | ```{r, out.width = "50%", fig.pos="h", fig.align='center', echo = FALSE}
 49 | knitr::include_graphics("advent.png")
 50 | ```
 51 | 
 52 | We will use the 2D representation of the calendar as a proxy example for our workflow. A mock calendar data is available with the package:
 53 | 
 54 | ```{r}
 55 | data("calendar", package = "bioc2020trajectories")
 56 | head(calendar)
 57 | ```
 58 | 
 59 | # Differential topology (of the odd and even slots in the calendar)
 60 | 
 61 | We can first see the display of the calendar, colored by odd or even attribution.
 62 | 
 63 | ```{r, fig.align='center'}
 64 | calendar$is_even <- (calendar$Day %% 2) == 0
 65 | ggplot(calendar, aes(x = x, y = y, fill = is_even)) +
 66 |   geom_tile(width = .8, height = .8) +
 67 |   scale_fill_brewer(type = "qual") +
 68 |   theme_void()
 69 | ```
 70 | 
 71 | It's hard to judge if the distribution of coordinates is truly independent from the labels. To help assess it visually, we devised an __imbalance score__. 
 72 | 
 73 | ```{r, fig.align='center'}
 74 | rd <- as.matrix(as.data.frame(calendar[, 2:3]))
 75 | scores <- bioc2020trajectories::imbalance_score(rd = rd, cl = calendar$is_even,
 76 |                                                 k = 8, smooth = 3)
 77 | calendar$scores <- scores$scaled_scores
 78 | 
 79 | ggplot(calendar, aes(x = x, y = y, fill = scores)) +
 80 |   geom_tile(width = .8, height = .8) +
 81 |   scale_fill_viridis_c() +
 82 |   theme_void()
 83 | ```
 84 | 
 85 | The scores are definitely not distributed independently from the coordinates: the manufacturer of the calendar may not have used a random process to assign the numbers. Note however that our imbalance score is definitively not the most appropriate tool in this specific setting (where $n=24$).
 86 | 
 87 | # Differential progression (of days where we eat a chocolate)
 88 | 
 89 | Of course, we know how days work so we know that the distribution of days between us is going to be the same. However, we can still compare it to pursue the comparison
 90 | 
 91 | ```{r, fig.align='center', fig.height=4}
 92 | ggplot(calendar, aes(x = Day, fill = is_even)) +
 93 |   geom_density(alpha = .5) +
 94 |   theme_minimal() +
 95 |   scale_fill_brewer(type = "qual")
 96 | ```
 97 | 
 98 | This seems about right. We can check if the two distributions are indeed identical using the *Kolmogorov-Smirnov* test.
 99 | 
100 | ```{r}
101 | ks.test(
102 |   x = calendar$Day[calendar$is_even],
103 |   y = calendar$Day[!calendar$is_even]
104 | )
105 | ```
106 | 
107 | We do indeed fail to reject the null.
108 | 
109 | # Differential (chocolated) expression
110 | 
111 | Now that we looked at the global differences, we can also focus on more granular differences, as days progressed. 
112 | 
113 | ## Weight of the chocolate
114 | 
115 | The first thing we can look at is the weight of the chocolate as the days progressed. For genes, we would use a more complex Negative Binomial - Generalized Additive Model (NB-GAM) to estimate the mean gene expression along pseudotime. 
116 | 
117 | 
118 | ```{r}
119 | ggplot(calendar, aes(x = Day, y = choco_weight, col = is_even)) +
120 |   geom_point(size = 4) +
121 |   theme_minimal() +
122 |   geom_smooth(method = "gam") +
123 |   scale_color_brewer(type = "qual")
124 | ```
125 | 
126 | Here we can just do a gam model with gaussian noise. We won't get into testing here but we can visually see that the confidence interval of the two fit always overlap, which suggests that there is no difference between us in regard to chocolate!!
127 | 
128 | ## Percent of cocoa in the chocolate
129 | 
130 | Everyone has their own tastes with regard to chocolate but, for an advent calendar, in my opinion, diversity is key. Let's see if that's the case for both of us.
131 | 
132 | ```{r}
133 | ggplot(calendar, aes(x = Day, y = pct_cocoa, col = is_even)) +
134 |   geom_point(size = 4) +
135 |   theme_minimal() +
136 |   geom_smooth(method = "gam") +
137 |   scale_color_brewer(type = "qual")
138 | ```
139 | 
140 | While the percentage of cocoa seems constant across the days, there is a clear difference between the odd and even days. Well, it's all for the best, I prefer the higher cocoa content anyway!!
141 | 
142 | # How it's going 17 days later....
143 | 
144 | ```{r, out.width = "50%", fig.pos="h", fig.align='center', echo = FALSE}
145 | knitr::include_graphics("advent2.JPG")
146 | ```
147 | 
148 | 
149 | # Final notes
150 | 
151 | This is just a toy example, of course. For a more rigorous, serious and probably helpful workflow, please go the the [main page of the workshop](https://kstreet13.github.io/bioc2020trajectories/articles/workshopTrajectories.html).
152 | 
153 | ```{r}
154 | sessionInfo()
155 | ```
156 | 


--------------------------------------------------------------------------------
/vignettes/workshopTrajectories.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Trajectory inference across conditions: differential expression and differential progression"
  3 | output:
  4 |   rmarkdown::html_document:
  5 |     highlight: pygments
  6 |     toc: true
  7 |     toc_depth: 3
  8 |     fig_width: 5
  9 | #bibliography: "`r system.file(package='bioc2020trajectories', 'vignettes', 'bibliography.bib')`"
 10 | vignette: >
 11 |   %\VignetteIndexEntry{bioc2020trajectories}
 12 |   %\VignetteEngine{knitr::rmarkdown}
 13 |   %\VignetteEncoding[utf8]{inputenc}
 14 | ---
 15 | 
 16 | # Overview
 17 | 
 18 | ## Instructor(s) name(s) and contact information
 19 | 
 20 |  - Kelly Street (kstreet@ds.dfci.harvard.edu)
 21 |  - Koen Van den Berge (koenvdberge@berkeley.edu)
 22 |  - Hector Roux de Bézieux (hector.rouxdebezieux@berkeley.edu)
 23 |  
 24 | ## Workshop Description
 25 | 
 26 | In single-cell RNA-sequencing (scRNA-seq), gene expression is assessed at the level of single cells. In dynamic biological systems, it may not be appropriate to assign cells to discrete groups, but rather a continuum of cell states may be observed, e.g. the differentiation of a stem cell population into mature cell types. This is often represented as a trajectory in a reduced dimension of the scRNA-seq dataset.
 27 | 
 28 | Many methods have been suggested for trajectory inference. However, in this setting, it is often unclear how one should handle multiple biological groups or conditions, e.g. constructing and comparing the differentiation trajectory of a wild type versus a knock-out stem cell population.
 29 | 
 30 | In this workshop, we will explore methods for comparing multiple conditions in a trajectory inference analysis. We start by integrating datasets from multiple conditions into a single trajectory. By comparing the conditions along the trajectory's path, we can detect large-scale changes, indicative of differential progression. We also demonstrate how to detect subtler changes by finding genes that exhibit different behaviors between these conditions along a differentiation path.
 31 | 
 32 | [This vignette](https://bioconductor.org/packages/release/bioc/vignettes/slingshot/inst/doc/conditionsVignette.html) provides a more complete problem description and proposes a few analytical approaches, which will serve as the basis of our workshop.
 33 | 
 34 | ## Pre-requisites
 35 | 
 36 | 
 37 | Software:
 38 | 
 39 | * Basic knowledge of _R_ syntax
 40 | * Familiarity with single-cell RNA-sequencing
 41 | * Familiarity with the `SingleCellExperiment` class
 42 | 
 43 | Background reading:
 44 | 
 45 | * The textbook "Orchestrating Single-Cell Analysis with Bioconductor" is a great reference for single-cell analysis using Bioconductor packages.
 46 | * The data used in this vignette was originally published in [McFaline-Figueroa, et al.](https://www.nature.com/articles/s41588-019-0489-5) "A pooled single-cell genetic screen identifies regulatory checkpoints in the continuum of the epithelial-to-mesenchymal transition."
 47 | * [Slingshot paper](https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-018-4772-0)
 48 | * [tradeSeq paper](https://www.nature.com/articles/s41467-020-14766-3)
 49 | 
 50 | 
 51 | ## Workshop Participation
 52 | 
 53 | The workshop will start with an introduction to the problem and the dataset using presentation slides. Following this, we will have a lab session on how one may tackle the problem of handling multiple conditions in trajectory inference and in downstream analysis involving differential progression and differential expression.
 54 | 
 55 | ## _R_ / _Bioconductor_ packages used
 56 | 
 57 | * The workshop will focus on Bioconductor packages [SingleCellExperiment](https://bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html), [Slingshot](https://bioconductor.org/packages/release/bioc/html/slingshot.html), and [tradeSeq](https://bioconductor.org/packages/release/bioc/html/tradeSeq.html) 
 58 | 
 59 | ### Time outline
 60 | 
 61 | 
 62 | | Activity                     | Time |
 63 | |------------------------------|------|
 64 | | Introduction                     | 15m  |
 65 | | Small example                     | 10m  |
 66 | | Data Integration and Trajectory Inference   | 15m  |
 67 | | Differential Progression                    | 15m  |
 68 | | Differential Expression          | 20m  |
 69 | | Wrap-up and Conclusions          | 10m  |
 70 | 
 71 | 
 72 | ## Workshop goals and objectives
 73 | 
 74 | Participants will learn how to reason about trajectories in single-cell RNA-seq data and how they may be used for interpretation of complex scRNA-seq datasets. Participants can follow along in the following fashions:
 75 | 
 76 | + Run the code online by going to [orchestra](http://app.orchestra.cancerdatasci.org/1) and launching the workshop titled __Trajectory inference across conditions: differential expression and differential progression__
 77 | + Clone the [repository](https://github.com/kstreet13/bioc2020trajectories) from github and run the code locally
 78 | + USe docker and get the container kstreet13/bioc2020trajectories:latest
 79 | 
 80 | ## Learning goals
 81 | 
 82 | * Reason about dynamic biological systems.
 83 | * Grasp the complexity of analyzing large scRNA-seq datasets with the goal of extracting relevant biological information.
 84 | * Understand the concepts of differential progression and differential expression along a trajectory path.
 85 | 
 86 | ## Learning objectives
 87 | 
 88 | * Learn how to analyze single-cell RNA-seq data using Bioconductor packages.
 89 | * Import and explore large scRNA-seq datasets.
 90 | * Understand the challenges of trajectory inference.
 91 | * Compose analysis pipeline that allows interpretation of complex scRNA-seq datasets.
 92 | * Assess the added complexity of handling multiple conditions in these dynamic systems and how it influences the analysis pipeline.
 93 | * Discuss how the analysis pipeline can incorporate this change and evaluate it.
 94 | 
 95 | ```{r}
 96 | suppressPackageStartupMessages({
 97 |   library(slingshot); library(SingleCellExperiment)
 98 |   library(RColorBrewer); library(scales)
 99 |   library(viridis); library(UpSetR)
100 |   library(pheatmap); library(msigdbr)
101 |   library(fgsea); library(knitr)
102 |   library(ggplot2); library(gridExtra)
103 |   library(tradeSeq)
104 | })
105 | ```
106 | 
107 | # Analysis
108 | 
109 | ## Dataset
110 | 
111 | The dataset we will be working with concerns a single-cell RNA-sequencing dataset consisting of two different experiments, which correspond to two treatments. [McFaline-Figueroa et al.](https://www.nature.com/articles/s41588-019-0489-5) studied the epithelial-to-mesenchymal transition (EMT), where cells spatially migrate from the epithelium to the mesenchyme during development. This process will be described by a trajectory, reflecting the gene expression changes occurring during this migration. The authors furthermore studied both a control (`Mock`) condition, and a condition under activation of transforming growth factor $\beta$ (TGFB).
112 | 
113 | In summary, we will be investigating a trajectory consisting of a single lineage that represents the EMT. This lineage is studied in two different conditions; a control condition and a TGFB-induced condition.
114 | 
115 | ## Integration
116 | 
117 | Our dataset contains cells collected from samples undergoing two different treatment conditions which were necessarily collected separately. Hence, we will start with an integration step to combine these two sets of cells, similar to batch correction. Our goal is to remove the technical effects of the different sample collections while preserving any true, biological differences between the two treatment groups.
118 | 
119 | Data integration and normalization are complex problems and there are a variety of methods addressing each. Interested participants can explore [the corresponding chapter of the Bioconductor Ebook](http://bioconductor.org/books/release/OSCA/integrating-datasets.html). However, since neither is the main focus of this workshop, we elected to use an existing pipeline for these tasks. The full Seurat data integration workflow with SCTransform normalization is described in [this vignette](https://satijalab.org/seurat/v3.1/integration.html).
120 | 
121 | Since this whole step is quite slow, it will not be run during the workshop but the code is provided below, along with a function to download and preprocess the public data from GEO.
122 | 
123 | ```{r integration, eval=FALSE}
124 | sce <- bioc2020trajectories::importRawData()
125 | 
126 | ########################
127 | ### Split by condition and convert to Seurat
128 | ########################
129 | assays(sce)$logcounts <- log1p(assays(sce)$counts)
130 | sceMock <- sce[ ,colData(sce)$pheno$treatment_id=='Mock']
131 | sceTGFB <- sce[ ,colData(sce)$pheno$treatment_id=='TGFB']
132 | library(Seurat)
133 | soMock <- as.Seurat(sceMock)
134 | soTGFB <- as.Seurat(sceTGFB)
135 | 
136 | ########################
137 | ### Normalize
138 | ########################
139 | soMock <- SCTransform(soMock, verbose = FALSE)
140 | soTGFB <- SCTransform(soTGFB, verbose = FALSE)
141 | 
142 | ########################
143 | ### Integrate
144 | ########################
145 | dtlist <- list(Mock = soMock, TGFB = soTGFB)
146 | intfts <- SelectIntegrationFeatures(object.list = dtlist, nfeatures = nrow(sce)) # maxes out at 4080 (why?)
147 | dtlist <- PrepSCTIntegration(object.list = dtlist,
148 | 							 anchor.features = intfts)
149 | anchors <- FindIntegrationAnchors(object.list = dtlist, normalization.method = "SCT",
150 | 								  anchor.features = intfts)
151 | integrated <- IntegrateData(anchorset = anchors, normalization.method = "SCT")
152 | integrated <- RunPCA(integrated)
153 | integrated <- RunUMAP(integrated, dims = 1:50)
154 | 
155 | ## convert back to singleCellExperiment
156 | sce <- as.SingleCellExperiment(integrated, assay = "RNA")
157 | ```
158 | 
159 | 
160 | ## Import dataset
161 | 
162 | We have made the pre-processed, integrated dataset available as a `SingleCellExperiment` object in the workshop package, which we import below.
163 | 
164 | ```{r}
165 | data("sce", package = "bioc2020trajectories")
166 | ```
167 | 
168 | 
169 | ## Differential Topology
170 | 
171 | Once the two datasets have been integrated, we can visualize all the single cells in a shared reduced dimensional space. 
172 | We also visualize the distribution of cells in this space according to the treatment (control and TGFB) and spatial location (inner cells versus outer cells).
173 | 
174 | 
175 | ```{r compute scores, eval = TRUE, fig.width=7}
176 | shuffle <- sample(ncol(sce))
177 | layout(matrix(1:2, nrow = 1))
178 | par(mar = c(4.5,4,1,1))
179 | 
180 | plot(reducedDims(sce)$UMAP[shuffle, ],
181 |   asp = 1, pch = 16, xlab = "UMAP-1", ylab = "UMAP-2",
182 |   col = alpha(c(1:2)[factor(colData(sce)$pheno$treatment_id)][shuffle], alpha = .5))
183 | legend("topright", pch = 16, col = 1:2, bty = "n", 
184 |        legend = levels(factor(colData(sce)$pheno$treatment_id)))
185 | plot(reducedDims(sce)$UMAP[shuffle, ], asp = 1, pch = 16, xlab = "UMAP-1", ylab = "UMAP-2", 
186 |      col = alpha(c(3, 4)[factor(colData(sce)$pheno$spatial_id)][shuffle], alpha = .5))
187 | legend("topright", pch = 16, col = 3:4, bty = "n", legend = levels(factor(colData(sce)$pheno$spatial_id)))
188 | layout(1)
189 | par(mar = c(5, 4, 4, 2) + .1)
190 | ```
191 | 
192 | 
193 | We know from biological knowledge that the EMT development goes from the inner to the outer cells. The question is: should we fit a separate trajectory for each condition? We might expect the trajectory itself to be changed by the treatment if the treatment effect is systematically large. Otherwise, the treatment may impact the expression profile of some genes but the overall trajectory will be preserved. 
194 | 
195 | To help assess this, we devised an imbalance score. Regions with a high score indicate that the local cell distribution according to treatment label is unbalanced compared the overall distribution. Here, we see that, while there are some small regions of imbalance, the global path along the development axis is well-balanced. This means that we can fit a global trajectory to the full dataset. This choice allows us to use the maximal amount of data in the construction of our trajectory, which should lead to more robust results than separate, potentially noisy trajectories constructed on subsets of the data. As we will see, not all cell types are necessarily present in every condition, so this approach ensures that our trajectory accounts for all cell types present in the overall data. 
196 | 
197 | ```{r, eval = TRUE}
198 | scores <- bioc2020trajectories::imbalance_score(
199 |   rd = reducedDims(sce)$UMAP, 
200 |   cl = colData(sce)$pheno$treatment_id,
201 |   k = 20, smooth = 40)
202 | grad <- viridis::plasma(10, begin = 0, end = 1)
203 | names(grad) <- levels(cut(scores$scaled_scores, breaks = 10))
204 | plot(reducedDims(sce)$UMAP, col = grad[cut(scores$scaled_scores, breaks = 10)],
205 |      asp = 1, pch = 16, xlab = "UMAP-1", ylab = "UMAP-2", cex = .8)
206 | legend("topleft", legend = names(grad), col = grad, pch = 16, bty = "n", cex = 2 / 3)
207 | ```
208 | 
209 | For more information on the score, run `help("imbalance_score", "bioc2020trajectories")`
210 | 
211 | ## Trajectory Inference
212 | 
213 | We perform trajectory inference to order the cells according to EMT progression. We  use `slingshot` for trajectory inference, with the cells' position (inner or outer) serving as the cluster identifier. This ensures that we will only find a single lineage while still allowing sufficient flexibility to correctly orient the pseudotime axis.
214 | 
215 | Note that we perform trajectory inference using cells from both conditions, rather than splitting the data into two groups, as discussed above. 
216 | 
217 | ```{r slingshot, eval = TRUE}
218 | library(slingshot)
219 | sce <- slingshot(sce, reducedDim = 'UMAP', clusterLabels = colData(sce)$pheno$spatial_id,
220 |                  start.clus = 'inner', approx_points = 150)
221 | ```
222 | 
223 | 
224 | ```{r plotSlingshot, echo = FALSE, fig.width =  8}
225 | layout(matrix(c(1, 1, 2, 3), 2))
226 | par(mar = c(4.5, 4, 1, 1))
227 | plot(reducedDims(sce)$UMAP[shuffle, ], asp = 1, pch = 16, xlab = "UMAP-1", ylab = "UMAP-2",
228 |   col = hcl.colors(100, alpha = .5)[cut(sce$slingPseudotime_1, breaks = 100)][shuffle])
229 | lines(SlingshotDataSet(sce))
230 | # Pseudotime densities (by spatial)
231 | ds <- list( Inner = density(slingPseudotime(sce)[colData(sce)$pheno$spatial_id == "inner", 1]),
232 |             Outer = density(slingPseudotime(sce)[colData(sce)$pheno$spatial_id == "outer", 1]))
233 | xlim <- range(c(ds$Inner$x, ds$Outer$x))
234 | ylim <- range(c(ds$Inner$y, ds$Outer$y))
235 | plot(xlim, ylim, col = "white", xlab = "Pseudotime", ylab = "")
236 | polygon(c(min(ds$Inner$x), ds$Inner$x, max(ds$Inner$x)), c(0, ds$Inner$y, 0),
237 |   col = alpha(brewer.pal(4, "Set1")[3], alpha = .5))
238 | polygon(c(min(ds$Outer$x), ds$Inner$x, max(ds$Inner$x)), c(0, ds$Outer$y, 0),
239 |         col = alpha(brewer.pal(4, "Set1")[2], alpha = .5))
240 | legend("topleft", legend = c("Inner", "Outer"), 
241 |        fill = alpha(brewer.pal(4, "Set1")[3:2], alpha = .5), bty = "n")
242 | 
243 | plot(reducedDims(sce)$UMAP[shuffle, ], asp = 1, pch = 16, xlab = "UMAP-1", ylab = "UMAP-2", 
244 |      col = alpha(c(3, 4)[factor(colData(sce)$pheno$spatial_id)][shuffle], alpha = .5))
245 | lines(SlingshotDataSet(sce), type = 'lineages', show.constraints = TRUE)
246 | legend("topright", pch = 16, col = 3:4, bty = "n", legend = levels(factor(colData(sce)$pheno$spatial_id)))
247 | 
248 | layout(1)
249 | par(mar = c(5, 4, 4, 2) + .1)
250 | ```
251 | 
252 | ## Differential progression
253 | 
254 | Now that we have ordered the cells by EMT progression, we can begin to address the main question: how is this progression affected by TGF-$\beta$ treatment? In this section, we interpret this question as a univariate analysis of the pseudotime values between the two groups.
255 | 
256 | ```{r plotDensities, echo=FALSE, fig.height=4}
257 | # Pseudotime densities (by treatment)
258 | ds <- list(Mock = density(slingPseudotime(sce)[colData(sce)$pheno$treatment_id == "Mock", 1]),
259 |            TGFB = density(slingPseudotime(sce)[colData(sce)$pheno$treatment_id == "TGFB", 1]))
260 | xlim <- range(c(ds$Mock$x, ds$TGFB$x))
261 | ylim <- range(c(ds$Mock$y, ds$TGFB$y))
262 | plot(xlim, ylim, col = "white", xlab = "Pseudotime", ylab = "")
263 | polygon(c(min(ds$Mock$x),ds$Mock$x,max(ds$Mock$x)),
264 | 		c(0,ds$Mock$y,0), col = rgb(0,0,0,.5))
265 | polygon(c(min(ds$TGFB$x),ds$Mock$x,max(ds$Mock$x)),
266 | 		c(0,ds$TGFB$y,0), col = alpha(brewer.pal(4,'Set1')[1], alpha = .5))
267 | legend("topright", legend = c("Mock", "TGFB"), 
268 |        fill = alpha(c(1, brewer.pal(3, "Set1")[1]), alpha = .5), bty = "n")
269 | ```
270 | 
271 | The density estimates for the two groups show a trimodal distribution for the untreated cells, but a tendency toward later pseudotime values in the TGF-$\beta$ treated cells. The difference is striking enough that a standard T-test would likely be significant, but we are we are interested more generally in differences between the two distributions, not just the difference of means (one could imagine a scenario in which the treated group tended toward the extremes, but the means were the same). Hence, we propose a Kolmogorov-Smirnov Test to assess whether the two groups of pseudotime values are derived from the same distribution. For more info on the Kolmogorov-Smirnov Test, see [here](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test).
272 | 
273 | ```{r KStest, eval = FALSE}
274 | ########################
275 | ### Kolmogorov-Smirnov Test
276 | ########################
277 | ks.test(slingPseudotime(sce)[colData(sce)$pheno$treatment_id == "Mock", 1],
278 |         slingPseudotime(sce)[colData(sce)$pheno$treatment_id == "TGFB", 1])
279 | ```
280 | 
281 | As we might expect from the plot, this test is highly significant, so we can conclude that there are differences between the two distributions.
282 | 
283 | ## Differential expression
284 | 
285 | We will now proceed to discover genes whose expression is associated with the inferred trajectory. We will look for genes that (i) change in gene expression along the trajectory, and (ii) are differentially expressed between the two conditions along the trajectory. The differential expression analysis uses the Bioconductor package `tradeSeq`. This analysis relies on a new version of `tradeSeq`, which we have recently updated to allow for multiple conditions.
286 | 
287 | For each condition (i.e., control and TGF-Beta), a smooth average expression profile along pseudotime will be estimated for each gene, using a negative binomial generalized additive model (NB-GAM).
288 | Each differential expression hypothesis of interest will then be translated into testing specific features (a linear combination of the parameters) of this smoothed expression estimate.
289 | See [here](https://www.maths.ed.ac.uk/~swood34/mgcv/) for more information on smoothers and the `mgcv` package which we are using to estimate the smoothers.
290 | 
291 | The next two paragraphs can be time-consuming so we will not run them during the workshop, however, their output is already present in the data object that was loaded at the start of this workshop. They can be easily parallelized, relying on the `BiocParallel` bioconductor package. See [here](https://statomics.github.io/tradeSeq/articles/fitGAM.html#parallel-computing-1) for more details.
292 | 
293 | ### Select number of knots
294 | 
295 | Before we can fit these smoothed expression profiles, we need to get a sense of how complex the expression patterns are in this dataset. This is translated into selecting a number of knots for the NB-GAMs, where a higher number of knots allows for more complex expression patterns. Here, we pick $5$ knots.
296 | 
297 | 
298 | ```{r evaluateK, eval=FALSE}
299 | set.seed(3)
300 | icMat <- evaluateK(counts = as.matrix(assays(sce)$counts),
301 |                    pseudotime = colData(sce)$slingshot$pseudotime,
302 |                    cellWeights = colData(sce)$slingshot$cellWeights.V1,
303 |                    conditions = factor(colData(sce)$pheno$treatment_id),
304 |                    nGenes = 300,
305 |                    k = 3:7)
306 | ```
307 | 
308 | ```{r, echo=FALSE}
309 | knitr::include_graphics("../data/evaluateKResult.png")
310 | ```
311 | 
312 | The plot above shows the graphical output from running `evaluateK`. The left panel shows the distribution of gene-level AIC values as compared to their average AIC over the range of `k`. The second and third panel plot the average AIC and relative AIC with respect to the lowest value of `k` (i.e., 3), respectively, as a function of `k`. Finally, the right panel plots the number of genes whose AIC is lowest at a particular value of `k`.
313 | 
314 | Choosing an appropriate value of `k` can be seen as analogous to choosing the number of principal components based on a scree plot: we look for an 'elbow point', where the decrease starts attenuating. Here, we choose `k=5` to allow for flexible, yet simple, functions while limiting the computational burden. In general, we found the influence of choosing the exact value of `k` to be rather limited, unless `k` is arbitrarily small or large. In our evaluations, most datasets fall within the range of $4$ to $8$ knots.
315 | 
316 | ### Fit GAM
317 | 
318 | Next, we fit the NB-GAMs using 5 knots, based on the pseudotime and cell-level weights estimated by Slingshot. We use the `conditions` argument to fit separate smoothers for each condition.
319 | 
320 | ```{r, eval=FALSE}
321 | set.seed(3)
322 | sce <- fitGAM(counts = as.matrix(assays(sce)$counts),
323 |               sds = as.SlingshotDataSet(colData(sce)$slingshot),
324 |               conditions = factor(colData(sce)$pheno$treatment_id),
325 |               nknots = 5)
326 | mean(rowData(sce)$tradeSeq$converged)
327 | ```
328 | 
329 | 
330 | 
331 | ### Assess DE along pseudotime (or pseudospace)
332 | 
333 | Note that the axis represented by the trajectory in this dataset is actually the migration of cells from the epithelium to the mesenchyme and therefore could also be looked at as a space dimension, although it is likely to be correlated with chronological time, too.
334 | 
335 | To assess significant changes in gene expression as a function of pseudotime within each lineage, we use the `associationTest`, which tests the null hypothesis that gene expression is not a function of pseudotime, i.e., whether the estimated smoothers are significantly varying as a function of pseudotime within each lineage. The `lineages=TRUE` argument specifies that we would like the results for each lineage separately, asides from the default global test, which tests for significant associations across all lineages in the trajectory simultaneously. Further, we specify a log2 fold change cut-off to test against using the `l2fc` argument. 
336 | 
337 | On a lineage-specific basis, there are over twice as much DE genes in the mock lineage (2398) as compared to the TGFB lineage (1013). Many of the DE genes in the TGFB condition are also DE in the mock condition, around 80%. 
338 | 
339 | The authors of the original paper found $1105$ DE genes for the mock condition on a FDR level of $1e-10$ and a cut-off of 1 on the absolute value of the log2 fold change.
340 | 
341 | ```{r, eval = FALSE}
342 | rowData(sce)$assocRes <- associationTest(sce, lineages = TRUE, l2fc = log2(2))
343 | ```
344 | 
345 | ```{r, eval=TRUE}
346 | library(tradeSeq)
347 | assocRes <- rowData(sce)$assocRes
348 | mockGenes <-  rownames(assocRes)[
349 |   which(p.adjust(assocRes$pvalue_lineage1_conditionMock, "fdr") <= 0.05)
350 | ]
351 | tgfbGenes <-  rownames(assocRes)[
352 |   which(p.adjust(assocRes$pvalue_lineage1_conditionTGFB, "fdr") <= 0.05)
353 | ]
354 | 
355 | length(mockGenes)
356 | length(tgfbGenes)
357 | 
358 | UpSetR::upset(fromList(list(mock = mockGenes, tgfb = tgfbGenes)))
359 | ```
360 | 
361 | #### Visualization of DE genes
362 | 
363 | Below we visualize and cluster the genes whose expression vary over pseudotime, using the smoothed expression patterns
364 | As was also observed in the original manuscript, genes are mainly upregulated at the start- or endpoints of the lineage.
365 | 
366 | ```{r, eval=TRUE}
367 | ### based on mean smoother
368 | yhatSmooth <- predictSmooth(sce, gene = mockGenes, nPoints = 50, tidy = FALSE)
369 | heatSmooth <- pheatmap(t(scale(t(yhatSmooth[, 1:50]))),
370 |                        cluster_cols = FALSE,
371 |                        show_rownames = FALSE, 
372 |                        show_colnames = FALSE)
373 | 
374 | ## the hierarchical trees constructed here, can also be used for 
375 | ## clustering of the genes according to their average expression pattern.
376 | cl <- sort(cutree(heatSmooth$tree_row, k = 6))
377 | table(cl)
378 | ```
379 | 
380 | 
381 | ```{r, eval=FALSE}
382 | conditions <- colData(sce)$pheno$treatment_id
383 | pt1 <- colData(sce)$slingshot$pseudotime
384 |   
385 | ### based on fitted values (plotting takes a while to run)
386 | yhatCell <- predictCells(sce, gene=mockGenes)
387 | yhatCellMock <- yhatCell[,conditions == "Mock"]
388 | # order according to pseudotime
389 | ooMock <- order(pt1[conditions == "Mock"], decreasing=FALSE)
390 | yhatCellMock <- yhatCellMock[,ooMock]
391 | pheatmap(t(scale(t(yhatCellMock))), cluster_cols = FALSE,
392 |           show_rownames = FALSE, show_colnames=FALSE)
393 | ```
394 | 
395 | #### Gene set enrichment analysis on genes from the Mock condition
396 | 
397 | Gene set enrichment analysis on the DE genes within the mock condition confirms the biology on epithelial cell differentiation.
398 | 
399 | ```{r, eval=TRUE}
400 | ## C5 category is according to gene ontology grouping: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4707969/pdf/nihms-743907.pdf
401 | geneSets <- msigdbr(species = "Mus musculus", category = "C5", subcategory = "BP")
402 | ### filter background to only include genes that we assessed.
403 | geneSets$gene_symbol <- toupper(geneSets$gene_symbol)
404 | geneSets <- geneSets[geneSets$gene_symbol %in% names(sce),]
405 | m_list <- geneSets %>% split(x = .$gene_symbol, f = .$gs_name)
406 | stats <- assocRes$waldStat_lineage1_conditionMock
407 | names(stats) <- rownames(assocRes)
408 | eaRes <- fgsea(pathways = m_list, stats = stats, nperm = 5e4, minSize = 10)
409 | ooEA <- order(eaRes$pval, decreasing = FALSE)
410 | kable(head(eaRes[ooEA, 1:3], n = 20))
411 | ```
412 | 
413 | 
414 | ### Differential expression between conditions
415 | 
416 | In the following sections, we will investigate differential expression for each gene, between the two conditions.
417 | 
418 | We will first make exploratory data analysis visualizations to take a look at the expression patterns of genes that were also discussed in the original manuscript.
419 | The paper mentions that CDH1 and CRB3 should be expressed in similar kinetics.
420 | Note that the lower slope of CDH1 is also observed in the paper.
421 | 
422 | ```{r, eval=TRUE}
423 | plotSmoothers(sce, assays(sce)$counts, gene = "CDH1", alpha = 1, border = TRUE) + ggtitle("CDH1")
424 | plotSmoothers(sce, assays(sce)$counts, gene = "CRB3", alpha = 1, border = TRUE) + ggtitle("CRB3")
425 | ```
426 | 
427 | They also mention that 'only cells treated with TGF-Beta and positioned at the outer extreme of the trajectory expressed robust levels of FN1 and CDH2'.
428 | 
429 | ```{r, eval=TRUE}
430 | plotSmoothers(sce, assays(sce)$counts, gene = "FN1", alpha = 1, border = TRUE) + ggtitle("FN1")
431 | plotSmoothers(sce, assays(sce)$counts, gene = "CDH2", alpha = 1, border = TRUE) + ggtitle("CDH2")
432 | ```
433 | 
434 | #### Differential expression analysis
435 | 
436 | To test differential expression between conditions, we use the `conditionTest` function implemented in `tradeSeq`. 
437 | This function tests the null hypothesis that genes have identical expression patterns in each condition.
438 | We discover $1993$ genes that are DE with a fold change higher than $2$ or lower than $1/2$.
439 | 
440 | ```{r, eval=TRUE}
441 | condRes <- conditionTest(sce, l2fc = log2(2))
442 | condRes$padj <- p.adjust(condRes$pvalue, "fdr")
443 | mean(condRes$padj <= 0.05, na.rm = TRUE)
444 | sum(condRes$padj <= 0.05, na.rm = TRUE)
445 | conditionGenes <- rownames(condRes)[condRes$padj <= 0.05]
446 | conditionGenes <- conditionGenes[!is.na(conditionGenes)]
447 | ```
448 | 
449 | #### Visualize most and least significant gene
450 | 
451 | ```{r, eval=TRUE}
452 | # plot genes
453 | oo <- order(condRes$waldStat, decreasing = TRUE)
454 | 
455 | # most significant gene
456 | plotSmoothers(sce, assays(sce)$counts,
457 |               gene = rownames(assays(sce)$counts)[oo[1]],
458 |               alpha = 1, border = TRUE)
459 | 
460 | # least significant gene
461 | plotSmoothers(sce, assays(sce)$counts,
462 |               gene = rownames(assays(sce)$counts)[oo[nrow(sce)]],
463 |               alpha = 1, border = TRUE)
464 | ```
465 | 
466 | ### Heatmaps of genes DE between conditions
467 | 
468 | Below we show heatmaps of the genes DE between conditions. The DE genes in the heatmaps are ordered according to a hierarchical clustering on the TGF-Beta condition.
469 | 
470 | ```{r, eval=TRUE}
471 | ### based on mean smoother
472 | yhatSmooth <- predictSmooth(sce, gene = conditionGenes, nPoints = 50, tidy = FALSE)
473 | yhatSmoothScaled <- t(scale(t(yhatSmooth)))
474 | heatSmooth_TGF <- pheatmap(yhatSmoothScaled[, 51:100],
475 |   cluster_cols = FALSE,
476 |   show_rownames = FALSE, show_colnames = FALSE, main = "TGF-Beta", legend = FALSE,
477 |   silent = TRUE
478 | )
479 | 
480 | matchingHeatmap_mock <- pheatmap(yhatSmoothScaled[heatSmooth_TGF$tree_row$order, 1:50],
481 |   cluster_cols = FALSE, cluster_rows = FALSE,
482 |   show_rownames = FALSE, show_colnames = FALSE, main = "Mock",
483 |   legend = FALSE, silent = TRUE
484 | )
485 | 
486 | grid.arrange(heatSmooth_TGF[[4]], matchingHeatmap_mock[[4]], ncol = 2)
487 | ```
488 | 
489 | ### Gene set enrichment analysis
490 | 
491 | Gene set enrichment analysis on genes that are differentially expressed between conditions finds evidence for cell motility, cell junctions/adhesion and gastrulation. The original paper also focuses on the KRAS signaling pathway, which induces cell migration, amongst others.
492 | Other related processes include morphogenesis, gastrulation and cell adhesion.
493 | 
494 | ```{r, eval=TRUE}
495 | statsCond <- condRes$waldStat
496 | names(statsCond) <- rownames(condRes)
497 | eaRes <- fgsea(pathways = m_list, stats = statsCond, nperm = 5e4, minSize = 10)
498 | ooEA <- order(eaRes$pval, decreasing = FALSE)
499 | kable(head(eaRes[ooEA, 1:3], n = 20))
500 | ```
501 | 
502 | 
503 | # Final notes
504 | 
505 | A compiled version of the vignette is available on the [workshop website](https://kstreet13.github.io/bioc2020trajectories/articles/workshopTrajectories.html).
506 | 
507 | If you have questions that you could not ask during the workshop, feel free to open an issue on the github repository [here](https://github.com/kstreet13/bioc2020trajectories/issues).
508 | 
509 | ```{r}
510 | sessionInfo()
511 | ```
512 | 


--------------------------------------------------------------------------------