├── .Rbuildignore ├── .Renviron ├── .circleci └── config.yml ├── .github ├── .gitignore └── workflows │ ├── R-CMD-check.yaml │ └── rhub.yaml ├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── LICENSE ├── NAMESPACE ├── NOTES.txt ├── R ├── RcppExports.R ├── annotation_heatmap.R ├── betanmf.R ├── ccd.R ├── colors.R ├── datasim.R ├── de_analysis.R ├── embedding_plots.R ├── embeddings.R ├── fit_multinom_model.R ├── fit_poisson_nmf.R ├── fit_topic_model.R ├── homer.R ├── init_poisson_nmf.R ├── lfc.R ├── likelihood.R ├── merge_topics.R ├── misc.R ├── mixem.R ├── multinom2poisson.R ├── newsgroups.R ├── other_plots.R ├── pbmc_facs.R ├── pnmfem.R ├── poismix.R ├── poisson.R ├── poisson2multinom.R ├── predict.R ├── scd.R ├── select.R ├── structure_plot.R ├── summary.R ├── sysdata.rda ├── topicscore.R ├── verify_args.R ├── volcano_plots.R └── zzz.R ├── README.md ├── TODO.txt ├── _pkgdown.yml ├── appveyor.yml ├── data ├── newsgroups.RData └── pbmc_facs.RData ├── docs ├── 404.html ├── LICENSE-text.html ├── articles │ ├── index.html │ ├── relationship.html │ ├── relationship_files │ │ └── figure-html │ │ │ ├── loglik-poisson-vs-multinom-1.png │ │ │ ├── multinom2poisson-1-1.png │ │ │ ├── multinom2poisson-2-1.png │ │ │ └── plot-loglik-1.png │ ├── single_cell_rnaseq_basic.html │ ├── single_cell_rnaseq_basic_files │ │ └── figure-html │ │ │ ├── structure-plot-test-1.png │ │ │ ├── structure-plot-with-celltype-labels-1.png │ │ │ ├── volcano-plot-b-1.png │ │ │ ├── volcano-plot-bcells-1.png │ │ │ ├── volcano-plot-nk-1.png │ │ │ ├── volcano-plot-t-1.png │ │ │ └── volcano-plot-tcells-1.png │ ├── single_cell_rnaseq_practical.html │ ├── single_cell_rnaseq_practical_files │ │ ├── crosstalk-1.0.0 │ │ │ ├── css │ │ │ │ └── crosstalk.css │ │ │ └── js │ │ │ │ ├── crosstalk.js │ │ │ │ ├── crosstalk.js.map │ │ │ │ ├── crosstalk.min.js │ │ │ │ └── crosstalk.min.js.map │ │ ├── figure-html │ │ │ ├── loglik-2-1.png │ │ │ ├── loglik-3-1.png │ │ │ ├── pca-plot-1-1.png │ │ │ ├── pca-plot-2-1.png │ │ │ ├── 
plot-loglik-1.png │ │ │ ├── structure-plot-by-cluster-1-1.png │ │ │ ├── structure-plot-by-cluster-2-1.png │ │ │ ├── structure-plot-by-cluster-3-1.png │ │ │ ├── structure-plot-without-labels-1.png │ │ │ ├── volcano-plot-cd4-1.png │ │ │ ├── volcano-plot-cd8-1.png │ │ │ └── volcano-plot-t-1.png │ │ ├── htmlwidgets-1.5.1 │ │ │ └── htmlwidgets.js │ │ ├── jquery-1.11.3 │ │ │ ├── jquery-AUTHORS.txt │ │ │ ├── jquery.js │ │ │ ├── jquery.min.js │ │ │ └── jquery.min.map │ │ ├── plotly-binding-4.9.2 │ │ │ └── plotly.js │ │ ├── plotly-htmlwidgets-css-1.52.2 │ │ │ └── plotly-htmlwidgets.css │ │ ├── plotly-main-1.52.2 │ │ │ └── plotly-latest.min.js │ │ └── typedarray-0.1 │ │ │ └── typedarray.min.js │ ├── topics_vs_clusters.html │ ├── topics_vs_clusters_files │ │ └── figure-html │ │ │ ├── pca-from-loadings-1.png │ │ │ ├── plot-topic-proportions-1.png │ │ │ ├── tsne-from-counts-1-1.png │ │ │ ├── tsne-from-counts-2-1.png │ │ │ └── tsne-from-loadings-1.png │ └── volcano_plot_t_cells.html ├── authors.html ├── bootstrap-toc.css ├── bootstrap-toc.js ├── docsearch.css ├── docsearch.js ├── index.html ├── link.svg ├── pbmc_de_analysis.html ├── pbmc_facs.RData ├── pkgdown.css ├── pkgdown.js ├── pkgdown.yml ├── reference │ ├── Rplot001.png │ ├── Rplot002.png │ ├── Rplot003.png │ ├── Rplot004.png │ ├── Rplot005.png │ ├── Rplot006.png │ ├── compare_fits.html │ ├── compare_poisson_nmf_fits.html │ ├── de_analysis-1.png │ ├── de_analysis.html │ ├── diff_count_analysis.html │ ├── embedding_plots.html │ ├── embeddings_from_topics-1.png │ ├── embeddings_from_topics-2.png │ ├── embeddings_from_topics-3.png │ ├── embeddings_from_topics-4.png │ ├── embeddings_from_topics-5.png │ ├── embeddings_from_topics-6.png │ ├── embeddings_from_topics.html │ ├── fit_multinom_model.html │ ├── fit_poisson_nmf-1.png │ ├── fit_poisson_nmf-2.png │ ├── fit_poisson_nmf.html │ ├── fit_topic_model.html │ ├── index.html │ ├── likelihood.html │ ├── loadings_plot.html │ ├── merge_topics.html │ ├── multinom2poisson.html │ ├── 
pbmc_4k.html │ ├── pbmc_facs.html │ ├── pca_plot.html │ ├── plot_loglik_vs_rank.html │ ├── plot_progress.html │ ├── plot_progress_poisson_nmf.html │ ├── poisson2multinom.html │ ├── predict-1.png │ ├── predict-2.png │ ├── predict-3.png │ ├── predict-4.png │ ├── predict.html │ ├── run_homer.html │ ├── select_loadings.html │ ├── simulate_count_data.html │ ├── simulate_gene_data.html │ ├── simulate_toy_gene_data.html │ ├── structure_plot.html │ ├── summary.poisson_nmf_fit.html │ ├── tsne_from_topics.html │ ├── tsne_plot.html │ └── volcano_plot.html └── sitemap.xml ├── inst ├── CITATION ├── COPYRIGHTS ├── code │ ├── altsqp_original.R │ ├── check_map.R │ ├── check_poisson_hessian.R │ ├── compile_newsgroups_results_for_annotation.R │ ├── compute_newsgroups_topics.R │ ├── droplet.R │ ├── lda.R │ ├── multinom_demo.R │ ├── pbmc_de_analysis.Rmd │ ├── pbmc_demo.R │ ├── plsi.R │ ├── pois_vs_binom.R │ ├── pois_vs_multinom.R │ ├── poisson_demo.R │ ├── postfit_motif_analysis_Buenrostro2018.R │ ├── pseudocounts.R │ ├── scd.R │ ├── simulate_data_for_sfa.R │ ├── test_hpd.R │ ├── test_poisson_fit.R │ └── test_poisson_fit_basic.R └── datafiles │ ├── newsgroups.RData │ └── newsgroups_topics.RData ├── man ├── annotation_heatmap.Rd ├── compare_fits.Rd ├── de_analysis.Rd ├── embedding_plots.Rd ├── embeddings_from_topics.Rd ├── fit_multinom_model.Rd ├── fit_poisson_nmf.Rd ├── fit_topic_model.Rd ├── likelihood.Rd ├── loadings_plot.Rd ├── merge_topics.Rd ├── multinom2poisson.Rd ├── newsgroups.Rd ├── pbmc_facs.Rd ├── plot_loglik_vs_rank.Rd ├── plot_progress.Rd ├── poisson2multinom.Rd ├── predict.Rd ├── run_homer.Rd ├── select_loadings.Rd ├── simulate_count_data.Rd ├── simulate_gene_data.Rd ├── simulate_toy_gene_data.Rd ├── structure_plot.Rd ├── summary.poisson_nmf_fit.Rd └── volcano_plot.Rd ├── src ├── Makevars ├── Makevars.win ├── RcppExports.cpp ├── ccd.cpp ├── cost.cpp ├── cost.h ├── misc.cpp ├── misc.h ├── mixem.cpp ├── mixem.h ├── pnmfem.cpp ├── poismix.cpp ├── poismix.h ├── poisson.cpp 
└── scd.cpp ├── tests ├── testthat.R └── testthat │ ├── helper_functions.R │ ├── test_de_analysis.R │ ├── test_fit_multinom_model.R │ ├── test_fit_poisson_nmf.R │ ├── test_fit_topic_model.R │ ├── test_likelihood.R │ ├── test_mixem.R │ ├── test_plots.R │ ├── test_poismix.R │ ├── test_poisson2multinom.R │ ├── test_select.R │ └── test_summary.R └── vignettes ├── relationship.Rmd ├── single_cell_rnaseq_basic.Rmd ├── single_cell_rnaseq_practical.Rmd └── topics_vs_clusters.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^TODO\.txt$ 2 | ^NOTES\.txt$ 3 | ^docs$ 4 | ^_pkgdown\.yml$ 5 | ^\.travis\.yml$ 6 | ^appveyor\.yml$ 7 | ^\.circleci$ 8 | ^\.circleci/config\.yml$ 9 | ^\.github$ 10 | ^inst/code$ 11 | ^vignettes/single\_cell\_rnaseq\_practical\.Rmd$ 12 | ^\.Renviron$ 13 | -------------------------------------------------------------------------------- /.Renviron: -------------------------------------------------------------------------------- 1 | R_LIBS_USER = ~/R_libs 2 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | jobs: 3 | build: 4 | docker: 5 | - image: rocker/verse:latest 6 | environment: 7 | R_LIBS: ~/R/Library 8 | R_REMOTES_NO_ERRORS_FROM_WARNINGS: true 9 | _R_CHECK_FORCE_SUGGESTS_: false 10 | NOT_CRAN: true 11 | steps: 12 | - restore_cache: 13 | keys: 14 | - r-pkg-cache-{{ arch }}-{{ .Branch }} 15 | - r-pkg-cache-{{ arch }}- 16 | - checkout 17 | - run: 18 | name: Install package dependencies 19 | command: | 20 | mkdir -p ~/R/Library 21 | Rscript -e 'update.packages("Matrix")' 22 | Rscript -e 'install.packages(c("devtools","remotes","quadprog","gtools","irlba","Rtsne","uwot","dplyr","rlang","tidyr","Rcpp","RcppArmadillo","RcppParallel","progress","pbapply","ggplot2","ggrepel","cowplot","plotly","htmlwidgets","testthat","Ternary","RhpcBLASctl"))' 23 | 
- name: Install dependencies
  run: |
    # The first positional argument of update.packages() is lib.loc
    # (a library directory), so the package to update must be given
    # via oldPkgs; ask = FALSE keeps the step non-interactive.
    update.packages(oldPkgs = "Matrix",ask = FALSE)
    install.packages(c("remotes","rcmdcheck"))
    install.packages(c("devtools","remotes","quadprog","gtools"))
    install.packages(c("irlba","Rtsne","uwot","dplyr","tidyr","rlang"))
    install.packages(c("Rcpp","RcppArmadillo","RcppParallel","pbapply"))
    install.packages(c("progress","ggplot2","ggrepel","cowplot"))
    install.packages(c("plotly","htmlwidgets","testthat","Ternary"))
    install.packages("RhpcBLASctl")
    remotes::install_github("slowkow/ggrepel",upgrade="never",force=TRUE)
    remotes::install_github("stephens999/ashr",upgrade="never",force=TRUE)
    remotes::install_github("linxihui/NNLM",upgrade="never",force=TRUE)
  shell: Rscript {0}
24 | type: string 25 | 26 | jobs: 27 | 28 | setup: 29 | runs-on: ubuntu-latest 30 | outputs: 31 | containers: ${{ steps.rhub-setup.outputs.containers }} 32 | platforms: ${{ steps.rhub-setup.outputs.platforms }} 33 | 34 | steps: 35 | # NO NEED TO CHECKOUT HERE 36 | - uses: r-hub/actions/setup@v1 37 | with: 38 | config: ${{ github.event.inputs.config }} 39 | id: rhub-setup 40 | 41 | linux-containers: 42 | needs: setup 43 | if: ${{ needs.setup.outputs.containers != '[]' }} 44 | runs-on: ubuntu-latest 45 | name: ${{ matrix.config.label }} 46 | strategy: 47 | fail-fast: false 48 | matrix: 49 | config: ${{ fromJson(needs.setup.outputs.containers) }} 50 | container: 51 | image: ${{ matrix.config.container }} 52 | 53 | steps: 54 | - uses: r-hub/actions/checkout@v1 55 | - uses: r-hub/actions/platform-info@v1 56 | with: 57 | token: ${{ secrets.RHUB_TOKEN }} 58 | job-config: ${{ matrix.config.job-config }} 59 | - uses: r-hub/actions/setup-deps@v1 60 | with: 61 | token: ${{ secrets.RHUB_TOKEN }} 62 | job-config: ${{ matrix.config.job-config }} 63 | - uses: r-hub/actions/run-check@v1 64 | with: 65 | token: ${{ secrets.RHUB_TOKEN }} 66 | job-config: ${{ matrix.config.job-config }} 67 | 68 | other-platforms: 69 | needs: setup 70 | if: ${{ needs.setup.outputs.platforms != '[]' }} 71 | runs-on: ${{ matrix.config.os }} 72 | name: ${{ matrix.config.label }} 73 | strategy: 74 | fail-fast: false 75 | matrix: 76 | config: ${{ fromJson(needs.setup.outputs.platforms) }} 77 | 78 | steps: 79 | - uses: r-hub/actions/checkout@v1 80 | - uses: r-hub/actions/setup-r@v1 81 | with: 82 | job-config: ${{ matrix.config.job-config }} 83 | token: ${{ secrets.RHUB_TOKEN }} 84 | - uses: r-hub/actions/platform-info@v1 85 | with: 86 | token: ${{ secrets.RHUB_TOKEN }} 87 | job-config: ${{ matrix.config.job-config }} 88 | - uses: r-hub/actions/setup-deps@v1 89 | with: 90 | job-config: ${{ matrix.config.job-config }} 91 | token: ${{ secrets.RHUB_TOKEN }} 92 | - uses: r-hub/actions/run-check@v1 93 | with: 94 | 
job-config: ${{ matrix.config.job-config }} 95 | token: ${{ secrets.RHUB_TOKEN }} 96 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | src/*.o 2 | src/*.so 3 | inst/derivations/algorithms/algorithms.aux 4 | inst/derivations/algorithms/algorithms.bbl 5 | inst/derivations/algorithms/algorithms.blg 6 | inst/derivations/algorithms/algorithms.log 7 | inst/derivations/algorithms/algorithms.out 8 | inst/derivations/algorithms/algorithms.thm 9 | inst/derivations/altsqp/altsqp.aux 10 | inst/derivations/altsqp/altsqp.bbl 11 | inst/derivations/altsqp/altsqp.blg 12 | inst/derivations/altsqp/altsqp.log 13 | inst/derivations/altsqp/altsqp.out 14 | inst/derivations/altsqp/altsqp.thm 15 | inst/derivations/diffcount/diffcount.aux 16 | inst/derivations/diffcount/diffcount.bbl 17 | inst/derivations/diffcount/diffcount.blg 18 | inst/derivations/diffcount/diffcount.log 19 | inst/derivations/diffcount/diffcount.out 20 | inst/derivations/diffcount/diffcount.thm 21 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: r 2 | cache: packages 3 | latex: false 4 | warnings_are_errors: false 5 | r_build_args: 6 | r_check_args: --as-cran 7 | 8 | # This is the minimal set of R packages needed to run "R CMD check" on 9 | # the package. 
10 | install: 11 | - R -e 'install.packages(c("devtools","covr","testthat","knitr","rmarkdown","quadprog","gtools","irlba","Rtsne","uwot","dplyr","rlang","tidyr","Rcpp","RcppArmadillo","RcppParallel","progress","pbapply","ggplot2","ggrepel","cowplot","plotly","htmlwidgets","Ternary","RhpcBLASctl"))' 12 | - R -e 'devtools::install_github("linxihui/NNLM",upgrade="never",force=TRUE)' 13 | - R -e 'devtools::install_github("slowkow/ggrepel",upgrade="never",force=TRUE)' 14 | - R -e 'devtools::install_github("stephens999/ashr",upgrade="never",force=TRUE)' 15 | 16 | env: 17 | global: 18 | - _R_CHECK_FORCE_SUGGESTS_: false 19 | - R_REMOTES_NO_ERRORS_FROM_WARNINGS: true 20 | 21 | after_success: 22 | - Rscript -e 'library(covr); codecov()' 23 | 24 | branches: 25 | only: 26 | - master 27 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Encoding: UTF-8 2 | Type: Package 3 | Package: fastTopics 4 | Version: 0.7-25 5 | Date: 2025-06-03 6 | Title: Fast Algorithms for Fitting Topic Models and Non-Negative 7 | Matrix Factorizations to Count Data 8 | Authors@R: c(person("Peter","Carbonetto",role=c("aut","cre"), 9 | email="peter.carbonetto@gmail.com"), 10 | person("Kevin","Luo",role="aut"), 11 | person("Kushal","Dey",role="aut"), 12 | person("Joyce","Hsiao",role="ctb"), 13 | person("Abhishek","Sarkar",role="ctb"), 14 | person("Anthony","Hung",role="ctb"), 15 | person("Xihui","Lin",role="ctb"), 16 | person("Paul C.","Boutros",role="ctb"), 17 | person("Minzhe","Wang",role="ctb"), 18 | person("Tracy","Ke",role="ctb"), 19 | person("Eric","Weine",role="ctb"), 20 | person("Matthew","Stephens",role="aut")) 21 | URL: https://stephenslab.github.io/fastTopics/, https://github.com/stephenslab/fastTopics 22 | BugReports: https://github.com/stephenslab/fastTopics/issues 23 | Depends: R (>= 3.3.0) 24 | Description: Implements fast, scalable optimization algorithms for 
    topics. The 'fastTopics' package is a successor to the
    'CountClust' package. For more information, see
    <doi:10.48550/arXiv.2105.13440> and
    <doi:10.1186/s13059-023-03067-9>. Please also see the GitHub
    repository for additional vignettes not included in the package on
    CRAN.
S3method(plot,multinom_topic_model_fit) 4 | S3method(plot,poisson_nmf_fit) 5 | S3method(plot,topic_model_de_analysis) 6 | S3method(predict,multinom_topic_model_fit) 7 | S3method(predict,poisson_nmf_fit) 8 | S3method(print,summary.multinom_topic_model_fit) 9 | S3method(print,summary.poisson_nmf_fit) 10 | S3method(select,multinom_topic_model_fit) 11 | S3method(select,poisson_nmf_fit) 12 | S3method(summary,multinom_topic_model_fit) 13 | S3method(summary,poisson_nmf_fit) 14 | export(annotation_heatmap) 15 | export(compare_fits) 16 | export(cost) 17 | export(de_analysis) 18 | export(de_analysis_control_default) 19 | export(deviance_poisson_nmf) 20 | export(embedding_plot_2d) 21 | export(embedding_plot_2d_ggplot_call) 22 | export(fit_multinom_model) 23 | export(fit_poisson_nmf) 24 | export(fit_poisson_nmf_control_default) 25 | export(fit_topic_model) 26 | export(init_poisson_nmf) 27 | export(init_poisson_nmf_from_clustering) 28 | export(loadings_plot) 29 | export(loadings_plot_ggplot_call) 30 | export(loglik_multinom_topic_model) 31 | export(loglik_poisson_nmf) 32 | export(loglik_vs_rank_ggplot_call) 33 | export(merge_topics) 34 | export(multinom2poisson) 35 | export(pca_from_topics) 36 | export(pca_hexbin_plot) 37 | export(pca_hexbin_plot_ggplot_call) 38 | export(pca_plot) 39 | export(plot_loglik_vs_rank) 40 | export(plot_progress) 41 | export(poisson2multinom) 42 | export(run_homer) 43 | export(select_loadings) 44 | export(simulate_count_data) 45 | export(simulate_multinom_gene_data) 46 | export(simulate_poisson_gene_data) 47 | export(simulate_toy_gene_data) 48 | export(structure_plot) 49 | export(structure_plot_default_embed_method) 50 | export(structure_plot_ggplot_call) 51 | export(tsne_from_topics) 52 | export(tsne_plot) 53 | export(umap_from_topics) 54 | export(umap_plot) 55 | export(volcano_plot) 56 | export(volcano_plot_do_label_default) 57 | export(volcano_plot_ggplot_call) 58 | export(volcano_plot_ly_call) 59 | export(volcano_plotly) 60 | import(Matrix) 61 | 
importFrom(Matrix,colMeans) 62 | importFrom(Matrix,colSums) 63 | importFrom(Matrix,rowMeans) 64 | importFrom(Matrix,rowSums) 65 | importFrom(Matrix,sparseMatrix) 66 | importFrom(Rcpp,evalCpp) 67 | importFrom(RcppParallel,RcppParallelLibs) 68 | importFrom(RcppParallel,defaultNumThreads) 69 | importFrom(RcppParallel,setThreadOptions) 70 | importFrom(RhpcBLASctl,blas_get_num_procs) 71 | importFrom(RhpcBLASctl,blas_set_num_threads) 72 | importFrom(Rtsne,Rtsne) 73 | importFrom(ashr,ash) 74 | importFrom(cowplot,plot_grid) 75 | importFrom(cowplot,theme_cowplot) 76 | importFrom(dplyr,select) 77 | importFrom(ggplot2,aes) 78 | importFrom(ggplot2,aes_q) 79 | importFrom(ggplot2,aes_string) 80 | importFrom(ggplot2,after_stat) 81 | importFrom(ggplot2,element_blank) 82 | importFrom(ggplot2,element_text) 83 | importFrom(ggplot2,geom_boxplot) 84 | importFrom(ggplot2,geom_col) 85 | importFrom(ggplot2,geom_line) 86 | importFrom(ggplot2,geom_point) 87 | importFrom(ggplot2,ggplot) 88 | importFrom(ggplot2,guide_legend) 89 | importFrom(ggplot2,guides) 90 | importFrom(ggplot2,labs) 91 | importFrom(ggplot2,scale_color_manual) 92 | importFrom(ggplot2,scale_fill_gradient2) 93 | importFrom(ggplot2,scale_fill_gradientn) 94 | importFrom(ggplot2,scale_fill_manual) 95 | importFrom(ggplot2,scale_linetype_manual) 96 | importFrom(ggplot2,scale_shape_manual) 97 | importFrom(ggplot2,scale_size) 98 | importFrom(ggplot2,scale_size_manual) 99 | importFrom(ggplot2,scale_x_continuous) 100 | importFrom(ggplot2,scale_y_continuous) 101 | importFrom(ggplot2,stat_bin_hex) 102 | importFrom(ggplot2,theme) 103 | importFrom(ggplot2,waiver) 104 | importFrom(ggrepel,geom_text_repel) 105 | importFrom(graphics,plot) 106 | importFrom(gtools,rdirichlet) 107 | importFrom(htmlwidgets,saveWidget) 108 | importFrom(irlba,irlba) 109 | importFrom(methods,as) 110 | importFrom(parallel,splitIndices) 111 | importFrom(pbapply,pblapply) 112 | importFrom(pbapply,pboptions) 113 | importFrom(plotly,hide_colorbar) 114 | 
importFrom(plotly,layout) 115 | importFrom(plotly,plot_ly) 116 | importFrom(progress,progress_bar) 117 | importFrom(quadprog,solve.QP) 118 | importFrom(reshape2,melt) 119 | importFrom(stats,dpois) 120 | importFrom(stats,formula) 121 | importFrom(stats,glm) 122 | importFrom(stats,glm.control) 123 | importFrom(stats,kmeans) 124 | importFrom(stats,pnorm) 125 | importFrom(stats,poisson) 126 | importFrom(stats,prcomp) 127 | importFrom(stats,predict) 128 | importFrom(stats,quantile) 129 | importFrom(stats,rmultinom) 130 | importFrom(stats,rnorm) 131 | importFrom(stats,rpois) 132 | importFrom(stats,runif) 133 | importFrom(stats,summary.glm) 134 | importFrom(utils,combn) 135 | importFrom(utils,modifyList) 136 | importFrom(utils,read.table) 137 | importFrom(utils,write.table) 138 | importFrom(uwot,umap) 139 | useDynLib(fastTopics) 140 | -------------------------------------------------------------------------------- /NOTES.txt: -------------------------------------------------------------------------------- 1 | # For Matrix version 1.4-2. 
2 | export R_BUILD_ENVIRON=$HOME/git/fastTopics/.Renviron 3 | export R_CHECK_ENVIRON=$HOME/git/fastTopics/.Renviron 4 | R CMD build fastTopics 5 | R CMD check fastTopics_0.6.138.tar.gz 6 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | ccd_update_factors_rcpp <- function(V, W, H, e) { 5 | .Call('_fastTopics_ccd_update_factors_rcpp', PACKAGE = 'fastTopics', V, W, H, e) 6 | } 7 | 8 | ccd_update_factors_sparse_rcpp <- function(V, W, H, e) { 9 | .Call('_fastTopics_ccd_update_factors_sparse_rcpp', PACKAGE = 'fastTopics', V, W, H, e) 10 | } 11 | 12 | ccd_update_factors_parallel_rcpp <- function(V, W, H, e) { 13 | .Call('_fastTopics_ccd_update_factors_parallel_rcpp', PACKAGE = 'fastTopics', V, W, H, e) 14 | } 15 | 16 | ccd_update_factors_sparse_parallel_rcpp <- function(V, W, H, e) { 17 | .Call('_fastTopics_ccd_update_factors_sparse_parallel_rcpp', PACKAGE = 'fastTopics', V, W, H, e) 18 | } 19 | 20 | cost_rcpp <- function(X, A, B, e, poisson) { 21 | .Call('_fastTopics_cost_rcpp', PACKAGE = 'fastTopics', X, A, B, e, poisson) 22 | } 23 | 24 | cost_sparse_rcpp <- function(X, A, B, e, poisson) { 25 | .Call('_fastTopics_cost_sparse_rcpp', PACKAGE = 'fastTopics', X, A, B, e, poisson) 26 | } 27 | 28 | le_diff_rcpp <- function(X) { 29 | .Call('_fastTopics_le_diff_rcpp', PACKAGE = 'fastTopics', X) 30 | } 31 | 32 | x_over_crossprod_rcpp <- function(i, j, x, A, B, e) { 33 | .Call('_fastTopics_x_over_crossprod_rcpp', PACKAGE = 'fastTopics', i, j, x, A, B, e) 34 | } 35 | 36 | mixem_rcpp <- function(L, w, x0, numiter) { 37 | .Call('_fastTopics_mixem_rcpp', PACKAGE = 'fastTopics', L, w, x0, numiter) 38 | } 39 | 40 | pnmfem_update_factors_rcpp <- function(X, F, L, j, numiter) { 41 | 
.Call('_fastTopics_pnmfem_update_factors_rcpp', PACKAGE = 'fastTopics', X, F, L, j, numiter) 42 | } 43 | 44 | pnmfem_update_factors_sparse_rcpp <- function(X, F, L, j, numiter) { 45 | .Call('_fastTopics_pnmfem_update_factors_sparse_rcpp', PACKAGE = 'fastTopics', X, F, L, j, numiter) 46 | } 47 | 48 | pnmfem_update_factors_parallel_rcpp <- function(X, F, L, j, numiter) { 49 | .Call('_fastTopics_pnmfem_update_factors_parallel_rcpp', PACKAGE = 'fastTopics', X, F, L, j, numiter) 50 | } 51 | 52 | pnmfem_update_factors_sparse_parallel_rcpp <- function(X, F, L, j, numiter) { 53 | .Call('_fastTopics_pnmfem_update_factors_sparse_parallel_rcpp', PACKAGE = 'fastTopics', X, F, L, j, numiter) 54 | } 55 | 56 | poismixem_rcpp <- function(L, w, x0, numiter) { 57 | .Call('_fastTopics_poismixem_rcpp', PACKAGE = 'fastTopics', L, w, x0, numiter) 58 | } 59 | 60 | poismixem2_rcpp <- function(L1, w, u, x0, numiter) { 61 | .Call('_fastTopics_poismixem2_rcpp', PACKAGE = 'fastTopics', L1, w, u, x0, numiter) 62 | } 63 | 64 | poismixem3_rcpp <- function(L1, w, u, i, x0, numiter) { 65 | .Call('_fastTopics_poismixem3_rcpp', PACKAGE = 'fastTopics', L1, w, u, i, x0, numiter) 66 | } 67 | 68 | scd_kl_update_rcpp <- function(L, w, x0, numiter, e) { 69 | .Call('_fastTopics_scd_kl_update_rcpp', PACKAGE = 'fastTopics', L, w, x0, numiter, e) 70 | } 71 | 72 | scd_kl_update2_rcpp <- function(L, u, w, x0, numiter, e) { 73 | .Call('_fastTopics_scd_kl_update2_rcpp', PACKAGE = 'fastTopics', L, u, w, x0, numiter, e) 74 | } 75 | 76 | ccd_kl_update_rcpp <- function(L, w, x0, numiter, e) { 77 | .Call('_fastTopics_ccd_kl_update_rcpp', PACKAGE = 'fastTopics', L, w, x0, numiter, e) 78 | } 79 | 80 | ccd_kl_update2_rcpp <- function(L, u, w, x0, numiter, e) { 81 | .Call('_fastTopics_ccd_kl_update2_rcpp', PACKAGE = 'fastTopics', L, u, w, x0, numiter, e) 82 | } 83 | 84 | simulate_posterior_poisson_rcpp <- function(x, L, f, D, U, M, s, e) { 85 | .Call('_fastTopics_simulate_posterior_poisson_rcpp', PACKAGE = 'fastTopics', 
# Multiplicative update rule for the loadings matrix (the
# "activations"), A, where the matrix X is approximated by the matrix
# product A*B. Inputs X, A and B should not be sparse matrices
# ("is.matrix" should return TRUE).
#
# The return value is the updated loadings matrix, obtained by passing
# the unnormalized update and the weights 1/rowSums(B) to scale.cols
# (presumably this rescales column k by 1/rowSums(B)[k]; confirm
# against the definition of scale.cols).
betanmf_update_loadings <- function (X, A, B) {

  # Element-wise ratio of the data to the current approximation.
  ratio <- X / (A %*% B)

  # Unnormalized multiplicative update; tcrossprod(ratio,B) is the
  # same as ratio %*% t(B).
  numer <- A * tcrossprod(ratio,B)
  return(scale.cols(numer,1/rowSums(B)))
}
# This function implements the cyclic co-ordinate descent (CCD) update
# for the factors matrix (the "basis vectors"), H, in which the matrix
# V is approximated by the matrix product W*H. Inputs W and H should
# be dense matrices ("is.matrix" should return TRUE). V may be a dense
# matrix or a sparse matrix (as determined by "is.sparse.matrix").
# Input argument "e" is a non-negative scalar specifying the minimum
# value of the updated factors.
#
# The return value is the updated matrix H. If V is neither a dense
# nor a sparse matrix, or if nc < 1, no update is performed and H is
# returned unchanged.
#
# Note that a single EM update of each factor is performed before
# running the CCD updates.
#
# Also note that the RcppParallel multithreading (specified by
# argument "nc") will only work correctly if the number of threads is
# set beforehand using RcppParallel::setThreadOptions.
#
#' @importFrom Rcpp evalCpp
#' @importFrom RcppParallel RcppParallelLibs
#'
ccd_update_factors <- function (V, W, H, nc = 1, e = 1e-15) {

  # Column indices shifted down by one (presumably because the C++
  # routines index from zero). seq_len is used instead of 1:m so that
  # a matrix with no columns yields an empty index vector rather than
  # the out-of-range indices c(0,-1).
  j <- seq_len(ncol(V)) - 1
  if (nc == 1) {
    if (is.matrix(V)) {
      H <- pnmfem_update_factors_rcpp(V,H,W,j,1)
      H <- ccd_update_factors_rcpp(V,W,H,e)
    } else if (is.sparse.matrix(V)) {
      H <- pnmfem_update_factors_sparse_rcpp(V,H,W,j,1)
      H <- ccd_update_factors_sparse_rcpp(V,W,H,e)
    }
  } else if (nc > 1) {
    if (is.matrix(V)) {
      H <- pnmfem_update_factors_parallel_rcpp(V,H,W,j,1)
      H <- ccd_update_factors_parallel_rcpp(V,W,H,e)
    } else if (is.sparse.matrix(V)) {
      H <- pnmfem_update_factors_sparse_parallel_rcpp(V,H,W,j,1)
      H <- ccd_update_factors_sparse_parallel_rcpp(V,W,H,e)
    }
  }
  return(H)
}
# is approximated by the matrix product W*H. Inputs W and H should
# not be sparse matrices ("is.matrix" should return TRUE). Input
# argument "e" is a non-negative scalar specifying the minimum value
# of the updated loadings.
#
# Note that a single EM update of the loadings is performed before
# running the CCD updates.
#
# The update is implemented by transposing V, W and H, then calling
# the same C++ routines that are used to update the factors.
#
# Also note that the RcppParallel multithreading (specified by
# argument "nc") will only work correctly if the number of threads is
# set beforehand using RcppParallel::setThreadOptions.
#
#' @importFrom Rcpp evalCpp
#' @importFrom RcppParallel RcppParallelLibs
#'
ccd_update_loadings <- function (V, W, H, nc = 1, e = 1e-15) {

  # All rows of W are updated. seq_len avoids the c(1,0) index set
  # that 1:n would produce when V has no rows.
  i <- seq_len(nrow(V))
  V <- t(V)
  W <- t(W)
  H <- t(H)
  if (nc == 1) {
    if (is.matrix(V)) {
      W <- pnmfem_update_factors_rcpp(V,W,H,i-1,1)
      W <- ccd_update_factors_rcpp(V,H,W,e)
    } else if (is.sparse.matrix(V)) {
      W <- pnmfem_update_factors_sparse_rcpp(V,W,H,i-1,1)
      W <- ccd_update_factors_sparse_rcpp(V,H,W,e)
    }
  } else if (nc > 1) {
    if (is.matrix(V)) {
      W <- pnmfem_update_factors_parallel_rcpp(V,W,H,i-1,1)
      W <- ccd_update_factors_parallel_rcpp(V,H,W,e)
    } else if (is.sparse.matrix(V)) {
      W <- pnmfem_update_factors_sparse_parallel_rcpp(V,W,H,i-1,1)
      W <- ccd_update_factors_sparse_parallel_rcpp(V,H,W,e)
    }
  }
  return(t(W))
}

# --------------------------------------------------------------------------------
# /R/colors.R:
# --------------------------------------------------------------------------------
# Code in this file is adapted from
# https://github.com/btupper/catecolors
# by Ben Tupper.

# K. L. Kelly. Twenty two colors of maximum contrast. Color
# Engineering, 3:26-27, 1965.
# http://www.iscc.org/pdf/PC54_1724_001.pdf
#
# Retrieve one or more rows of the KELLYLUT look-up table; all rows
# are returned when "index" is not given. Additional arguments are
# passed to get_lut.
kelly <- function (index, ...) {
  if (missing(index))
    index <- seq_len(nrow(KELLYLUT))
  return(get_lut(KELLYLUT,index,...))
}

# Retrieve one or more of Glasbey et al 256 color specifications. All
# rows of GLASBEYLUT are returned when "index" is not given.
# Additional arguments are passed to get_lut.
glasbey <- function (index, ...) {
  if (missing(index))
    index <- seq_len(nrow(GLASBEYLUT))
  return(get_lut(GLASBEYLUT,index,...))
}

# Retrieve one or more of color specifications as hex, rgb triplets or
# a data.frame.
#
# Input LUT is the look-up table; the code reads its columns "hex",
# "red", "green" and "blue", and its row names. Input "index" selects
# the rows of LUT (all rows by default). Input "form" selects the
# output: "hex" gives the vector LUT[index,"hex"]; "rgb" gives a
# matrix with columns "red", "green" and "blue"; any other value gives
# the selected rows of LUT. When name = TRUE, the "hex" vector is
# named by the row names of LUT, and the rows of the "rgb" matrix are
# labeled by the row names of LUT.
get_lut <- function (LUT, index,
                     form = c("hex", "rgb", "data.frame")[1],
                     name = FALSE) {
  if (missing(LUT))
    stop("LUT is required")
  if (missing(index))
    index <- seq_len(nrow(LUT))
  form <- tolower(form[1])
  if (form == "hex") {
    x <- LUT[index,"hex"]
    if (name)
      names(x) <- rownames(LUT)[index]
  } else if (form == "rgb") {
    x <- as.matrix(LUT[index,c("red","green","blue")])

    # Label the rows, not the individual matrix elements: calling
    # "names<-" on a matrix attaches a "names" attribute padded with
    # NAs to the length of the matrix.
    if (name)
      rownames(x) <- rownames(LUT)[index]
  } else
    x <- LUT[index,]
  return(x)
}

# --------------------------------------------------------------------------------
# /R/fit_multinom_model.R:
# --------------------------------------------------------------------------------
#' @title Fit Simple Multinomial Model
#'
#' @description Fit a simple multinomial model for count data, in
#'   which each sample (\emph{i.e.}, a row of the data matrix \code{X})
#'   is assigned to a cluster. Under this simple multinomial model,
#'   \eqn{x_{ij}} assigned to cluster \eqn{k} is multinomial with sample
#'   size \eqn{s_i = x_{i1} + ... + x_{im}} and multinomial
#'   probabilities \eqn{p_{1k}, ..., p_{mk}}. This is a special case of
#'   the multinomial topic model in which all the mixture proportions
#'   are either 0 or 1. The maximum-likelihood estimates (MLEs) of the
#'   multinomial probabilities have a closed-form solution; no
#'   iterative algorithm is needed to fit this simple model.
#'
#' @param cluster A factor specifying a grouping, or clustering, of
#'   the rows of \code{X}; e.g., the \dQuote{cluster} output from
#'   \code{\link[stats]{kmeans}}.
#'
#' @param X The n x m matrix of counts; all entries of X should be
#'   non-negative. It can be a sparse matrix (class \code{"dgCMatrix"})
#'   or dense matrix (class \code{"matrix"}), with some exceptions (see
#'   \sQuote{Details}).
#'
#' @param verbose This is passed as the \dQuote{verbose} argument in
#'   the call to \code{\link{init_poisson_nmf}}.
#'
#' @param \dots Additional arguments passed to
#'   \code{\link{init_poisson_nmf}}.
#'
#' @return A multinomial topic model fit.
#'
#' @seealso \code{\link{fit_topic_model}}
#'
#' @importFrom Matrix colSums
#'
#' @export
#'
fit_multinom_model <- function (cluster, X,
                                verbose = c("none","detailed"), ...) {

  # Check the input data matrix.
  verify.count.matrix(X)

  # Check and process input argument "verbose"
  verbose <- match.arg(verbose)

  # If necessary, remove all-zero columns from the counts matrix.
  if (any_allzero_cols(X)) {
    X <- remove.allzero.cols(X)
    warning(sprintf(paste("One or more columns of X are all zero; after",
                          "removing all-zero columns, %d columns will be",
                          "used for model fitting"),ncol(X)))
  }

  # Get the number of rows (n) and columns (m) of the data matrix.
  n <- nrow(X)
  m <- ncol(X)

  # Check the "cluster" input.
  if (!is.factor(cluster))
    cluster <- factor(cluster)
  if (length(cluster) != n)
    stop("Input argument \"cluster\" should have one entry for each row of ",
         "\"X\"")
  if (any(table(cluster) == 0))
    stop("Each level must appear at least once in factor \"cluster\"")

  # Initialize the loadings and factors matrices from the clustering:
  # L[i,j] = 1 if row i is assigned to cluster j, and L[i,j] = 0
  # otherwise. The maximum-likelihood estimates of the factors have a
  # closed-form solution in this case.
  k <- nlevels(cluster)
  F <- matrix(0,m,k)
  L <- matrix(0,n,k)
  rownames(L) <- rownames(X)
  rownames(F) <- colnames(X)
  colnames(L) <- levels(cluster)
  colnames(F) <- levels(cluster)
  for (j in levels(cluster)) {
    i <- which(cluster == j)
    L[i,j] <- 1

    # Column j of F is the average of the rows of X assigned to
    # cluster j. Setting drop = FALSE keeps X[i,] a matrix when the
    # cluster has a single member; otherwise it would be reduced to a
    # vector, and colSums would fail.
    F[,j] <- colSums(X[i,,drop = FALSE])/length(i)
  }

  # Return a multinomial topic model fit.
  return(poisson2multinom(init_poisson_nmf(X,F = F,L = L,
                                           verbose = verbose,...)))
}

# --------------------------------------------------------------------------------
# /R/homer.R:
# --------------------------------------------------------------------------------
#' @title Perform HOMER Motif Enrichment Analysis using DE Genomic Positions
#'
#' @description Run HOMER motif finding algorithm
#'   (\code{findMotifsGenome.pl}) to identify motifs enriched for
#'   differentially expressed (DE) genomic positions. See
#'   \url{http://homer.ucsd.edu} for more information.
#'
#' @param de An object of class \dQuote{topic_model_de_analysis},
#'   usually the result of running \code{\link{de_analysis}}.
#'
#' @param k Use the DE analysis results for this topic.
#'
#' @param positions A table of genomic positions corresponding to rows
#'   of the \code{de_analysis} results.
Specifically, it should be a data 15 | #' frame with four columns: \dQuote{chr}, chromosome name or number; 16 | #' \dQuote{start}, start position of genomic feature; \dQuote{end}, 17 | #' end position of genomic feature; and \dQuote{name}, the name of the 18 | #' genomic feature. If not specified, the genomic positions will be 19 | #' extracted from the row names of \code{de$postmean}, in which the 20 | #' row names are expected to be of the form \code{chr_start_end}. The 21 | #' genomic positions will be written to a BED file (see 22 | #' \url{https://genome.ucsc.edu/FAQ/FAQformat.html} for more 23 | #' information about BED files). 24 | #' 25 | #' @param genome The genome parameter passed to 26 | #' \code{findMotifsGenome.pl}. 27 | #' 28 | #' @param subset A function of the DE statistics (\code{postmean}, \code{lpval}, \code{lfsr}, \code{rank}, \code{quantile}) that returns \code{TRUE} for the positions to keep; by default, positions with \code{lfsr < 0.05} are selected. 29 | #' 30 | #' @param homer.exec The name or file path of the HOMER 31 | #' \code{findMotifsGenome.pl} executable. 32 | #' 33 | #' @param out.dir The positions BED file and HOMER results are written 34 | #' to this directory. 35 | #' 36 | #' @param homer.options Character string used to override default 37 | #' \code{findMotifsGenome.pl} options. 38 | #' 39 | #' @param verbose When \code{verbose = TRUE}, progress information is 40 | #' printed to the console. 41 | #' 42 | #' @return A data frame containing the motif enrichment results. It 43 | #' is created from the \code{knownResults.txt} HOMER output. 44 | #' 45 | #' @importFrom utils read.table 46 | #' @importFrom utils write.table 47 | #' 48 | #' @references 49 | #' Heinz, S., Benner, C., Spann, N., Bertolino, E., Lin, Y. C., Laslo, 50 | #' P., Cheng, J. X., Murre, C., Singh, H. and Glass, C. K. (2010). 51 | #' Simple combinations of lineage-determining transcription factors 52 | #' prime cis-regulatory elements required for macrophage and B cell 53 | #' identities. \emph{Molecular Cell} \bold{38}, 576-589.
#'
#' @export
#'
run_homer <-
  function (de, k, positions, genome = "hg19",
            subset = function (postmean, lpval, lfsr, rank, quantile)
              lfsr < 0.05,
            homer.exec = "findMotifsGenome.pl",
            out.dir = tempdir(),
            homer.options = "-len 8,10,12 -size 200 -mis 2 -S 25 -p 1 -h",
            verbose = TRUE) {

  # Get the positions if they are not provided. Each row name of
  # de$postmean is split at the underscores, and the first three
  # pieces are taken as the chromosome, start and end.
  if (missing(positions)) {
    feature_names <- rownames(de$postmean)
    out <- strsplit(feature_names,"_")
    if (any(lengths(out) < 3))
      stop("Row names of \"de$postmean\" should be of the form ",
           "\"chr_start_end\" when \"positions\" is not provided")
    positions <- data.frame(chr   = vapply(out,"[[",character(1),1),
                            start = vapply(out,"[[",character(1),2),
                            end   = vapply(out,"[[",character(1),3),
                            name  = feature_names,
                            stringsAsFactors = FALSE)
  }

  # Select the differentially expressed positions.
  rows <- select_de_genes(de,k,subset)
  if (verbose)
    cat(sprintf("%d out of %d positions selected\n",
                length(rows),nrow(de$postmean)))

  # Write the selected positions to a BED file.
  pos.file <- file.path(out.dir,"positions.bed")
  if (verbose)
    cat("Writing selected positions to",pos.file,"\n")
  write.table(positions[rows,],pos.file,sep = "\t",quote = FALSE,
              row.names = FALSE,col.names = FALSE)

  # Run the HOMER motif enrichment analysis. The two file paths
  # constructed here are quoted so that the command is still parsed
  # correctly when "out.dir" contains spaces or other special
  # characters.
  homer.dir <- file.path(out.dir,"homer")
  res.file  <- file.path(homer.dir,"knownResults.txt")
  homer.command <- paste(homer.exec,shQuote(pos.file),genome,
                         shQuote(homer.dir),homer.options)
  if (verbose) {
    cat("Performing HOMER motif enrichment analysis:\n")
    cat(homer.command,"\n")
  }
  system(homer.command,ignore.stderr = TRUE,ignore.stdout = TRUE,
         intern = TRUE)

  # The output of HOMER is suppressed, so report a missing results
  # file explicitly instead of leaving it to read.table.
  if (!file.exists(res.file))
    stop("HOMER did not create file ",res.file,"; check that ",
         "\"homer.exec\", \"genome\" and \"homer.options\" are valid")
  res <- read.table(res.file,sep = "\t",comment.char = "",header = TRUE,
                    check.names = FALSE,stringsAsFactors = FALSE)
  return(res)
}

# --------------------------------------------------------------------------------
# /R/merge_topics.R:
# --------------------------------------------------------------------------------
#' @title Combine Topics in Multinomial Topic Model
#'
#' @description Combine two or more topics in a multinomial topic
#'   model fit.
#'
#' @details Mixture proportions are combined by summation, and factors
#'   are combined by averaging.
#'
#' @param fit A multinomial topic model fit.
#'
#' @param k The names or numbers of the topics to be combined. Two or
#'   more distinct topics should be chosen.
#'
#' @return A multinomial topic model fit.
#'
#' @export
#'
merge_topics <- function (fit, k) {

  # Verify input "fit".
  if (!inherits(fit,"multinom_topic_model_fit"))
    stop("Input argument \"fit\" should be an object of class ",
         "\"multinom_topic_model_fit\"")
  verify.fit(fit)

  # Verify and process input "k". Selecting a topic more than once
  # would add its mixture proportions to the combined topic more than
  # once, so this is not allowed.
  msg <- paste("Input argument \"k\" should contain valid topic names or",
               "numbers (column indices of F and L)")
  if (!((is.numeric(k) || is.character(k)) && length(k) >= 2))
    stop(msg)
  if (anyDuplicated(k) > 0)
    stop("Input argument \"k\" should not contain the same topic more ",
         "than once")
  if (is.numeric(k)) {
    if (!all(k >= 1 & k <= ncol(fit$F)))
      stop(msg)
  } else {
    if (!all(is.element(k,colnames(fit$F))))
      stop(msg)
    k <- match(k,colnames(fit$F))
  }

  # Combine the selected topics.
  out1 <- combine_factors(fit$F,fit$L,k)
  out2 <- combine_factors(fit$Fn,fit$Ln,k)
  out3 <- combine_factors(fit$Fy,fit$Ly,k)
  fit$F  <- out1$F
  fit$L  <- out1$L
  fit$Fn <- out2$F
  fit$Ln <- out2$L
  fit$Fy <- out3$F
  fit$Ly <- out3$L
  return(fit)
}

# Combine two or more columns of the factors matrix (F) and loadings
# matrix (L). Loadings are combined by summation, and factors are
# combined by averaging. The combined column is placed last, and, when
# F has column names, it is named by joining the names of the combined
# columns with "+". The return value is a list with elements F and L.
combine_factors <- function (F, L, k) {
  if (is.null(colnames(F)))
    y <- NULL
  else {
    y <- colnames(F)
    y <- c(y[-k],paste(y[k],collapse = "+"))
  }

  # Setting drop = FALSE keeps the selected columns in matrix form
  # when F or L has a single row.
  F <- cbind(F[,-k,drop = FALSE],rowMeans(F[,k,drop = FALSE]))
  L <- cbind(L[,-k,drop = FALSE],rowSums(L[,k,drop = FALSE]))
  colnames(F) <- y
  colnames(L) <- y
  return(list(F = F,L = L))
}

# --------------------------------------------------------------------------------
# /R/mixem.R:
# --------------------------------------------------------------------------------
# Compute a maximum-likelihood estimate (MLE) of the mixture
# proportions in the multinomial mixture model by iterating the EM
# updates for a fixed number of iterations. This is mainly used for
# testing the C++ implementation. See the comments attached to the
# "mixem" C++ function for an explanation of the inputs.
#
# When numiter = 0, no updates are performed and x0 is returned.
mixem <- function (L, w, x0, numiter) {
  L1 <- normalize.cols(L)
  x  <- x0

  # seq_len(0) is empty, whereas 1:0 would run two updates.
  for (i in seq_len(numiter))
    x <- mixem.update(L1,w,x)
  return(x)
}

# Perform a single EM update for the multinomial mixture model. This
# is mainly used for testing the C++ implementation.
mixem.update <- function (L1, w, x) {
  e <- 1e-15
  x <- x/sum(x)
  w <- w/sum(w)

  # Compute the posterior mixture assignment probabilities. A small
  # number is added to the posterior probabilities to prevent any
  # divisions by zero. This is the "E step".
  P <- scale.cols(L1,x)
  P <- normalize.rows.by.max(P) + e
  P <- normalize.rows(P)

  # Update the mixture weights. This is the "M step".
  return(drop(w %*% P))
}

# Find the maximum-likelihood estimate (MLE) for the special case when
# only one of the counts is positive.
mixture.one.nonzero <- function (L, w) {
  j <- which.max(w %*% normalize.cols(L))
  x <- rep(0,ncol(L))
  x[j] <- 1
  return(x)
}

# --------------------------------------------------------------------------------
# /R/multinom2poisson.R:
# --------------------------------------------------------------------------------
#' @title Recover Poisson NMF Fit from Multinomial Topic Model Fit
#'
#' @description This function recovers parameter estimates of the
#'   Poisson non-negative matrix factorization (NMF) given parameter
#'   estimates for a multinomial topic model.
#'
#' @param fit An object of class \dQuote{multinom_topic_model_fit},
#'   such as an output from \code{poisson2multinom}. If a Poisson NMF
#'   fit is provided (that is, an object of class
#'   \dQuote{poisson_nmf_fit}), the fit object is immediately returned
#'   \dQuote{as is}.
#'
#' @param X Optional n x m matrix of counts, or pseudocounts. It can
#'   be a sparse matrix (class \code{"dgCMatrix"}) or dense matrix
#'   (class \code{"matrix"}). This only needs to be provided if the
#'   document sizes \code{fit$s} are not available.
#'
#' @return The return value is the list \code{fit}, in which matrices
#'   \code{fit$F} and \code{fit$L} specify the factors and loadings in
#'   the Poisson non-negative matrix factorization; specifically,
#'   the counts matrix is modeled by the low-rank matrix product
#'   \code{tcrossprod(fit$L,fit$F)}.
#'
#' @importFrom Matrix rowSums
#'
#' @export
#'
multinom2poisson <- function (fit, X) {

  # Check input argument "fit".
  if (inherits(fit,"poisson_nmf_fit"))
    return(fit)
  if (!inherits(fit,"multinom_topic_model_fit"))
    stop("Input argument \"fit\" should be an object of class ",
         "\"multinom_topic_model_fit\"")
  verify.fit(fit)
  F <- fit$F
  L <- fit$L

  # Check input argument "X".
  if (!missing(X))
    verify.fit.and.count.matrix(X,fit)

  # Exactly one of X and fit$s should be provided.
  if (sum(c(!missing(X),is.element("s",names(fit)))) != 1)
    stop("Exactly one of \"X\" and \"fit$s\" should be specified")

  if (missing(X)) {

    # Process the "scale factors", s.
    s <- as.double(fit$s)
  } else {

    # Compute maximum-likelihood estimates of the "document sizes", s,
    # from the counts matrix, X.
    s <- as.double(rowSums(X))
  }

  # Recover F and L for the Poisson non-negative matrix factorization.
  out <- rescale.factors(F,s*L)

  # Update the "fit" object, and return it.
  fit$F <- out$F
  fit$L <- out$L
  fit$s <- NULL
  class(fit) <- c("poisson_nmf_fit","list")
  return(fit)
}

# --------------------------------------------------------------------------------
# /R/newsgroups.R:
# --------------------------------------------------------------------------------
#' @name newsgroups
#'
#' @title Topic modeling results from the \dQuote{20 Newsgroups} data
#'   set.
#'
#' @docType data
#'
#' @description These are topic modeling results from the \dQuote{20
#'   Newsgroups} data, with k = 10 topics. The data were originally
#'   downloaded from \url{http://qwone.com/~jason/20Newsgroups} and
#'   prepared by running code found in an R Markdown file in this
#'   GitHub repository:
#'   \url{https://github.com/stephenslab/fastTopics-experiments}. See
#'   the \dQuote{inst} directory of this package for the scripts used to
#'   generate these results.
16 | #' 17 | #' @format \code{newsgroups} is a list with the following elements: 18 | #' 19 | #' \describe{ 20 | #' 21 | #' \item{topics}{Original labeling of the documents: each document 22 | #' is from one of 20 \dQuote{newsgroups}.} 23 | #' 24 | #' \item{L}{Estimated topic proportions matrix; rows are 25 | #' documents and columns are topics.} 26 | #' 27 | #' \item{F}{Matrix containing posterior mean estimates of log-fold 28 | #' changes (in base-2 logarithm). These were computed using 29 | #' \code{\link{de_analysis}} with \code{lfc.stat = "vsnull"}. Rows 30 | #' are words and columns are topics.}} 31 | #' 32 | #' @keywords data 33 | #' 34 | #' @examples 35 | #' data(newsgroups) 36 | #' table(newsgroups$topics) 37 | #' dim(newsgroups$L) 38 | #' dim(newsgroups$F) 39 | #' 40 | NULL 41 | -------------------------------------------------------------------------------- /R/pbmc_facs.R: -------------------------------------------------------------------------------- 1 | #' @name pbmc_facs 2 | #' 3 | #' @title Mixture of 10 FACS-purified PBMC Single-Cell RNA-seq data 4 | #' 5 | #' @docType data 6 | #' 7 | #' @description These data are a selection of the reference 8 | #' transcriptome profiles generated via single-cell RNA sequencing 9 | #' (RNA-seq) of 10 bead-enriched subpopulations of PBMCs (Donor A), 10 | #' described in Zheng \emph{et al} (2017). The data are unique 11 | #' molecular identifier (UMI) counts for 16,791 genes in 3,774 cells. 12 | #' (Genes with no expression in any of the cells were removed.) Since 13 | #' the majority of the UMI counts are zero, they are efficiently 14 | #' stored as a 3,774 x 16,791 sparse matrix. These data are used in 15 | #' the vignette illustrating how 'fastTopics' can be used to analyze 16 | #' single-cell RNA-seq data. Data for a separate set of 1,000 cells is 17 | #' provided as a \dQuote{test set} to evaluate out-of-sample predictions.
18 | #' 19 | #' @format \code{pbmc_facs} is a list with the following elements: 20 | #' 21 | #' \describe{ 22 | #' 23 | #' \item{counts}{3,774 x 16,791 sparse matrix of UMI counts, with 24 | #' rows corresponding to samples (cells) and columns corresponding to 25 | #' genes. It is an object of class \code{"dgCMatrix"}.} 26 | #' 27 | #' \item{counts_test}{UMI counts for an additional test set of 100 28 | #' cells.} 29 | #' 30 | #' \item{samples}{Data frame containing information about the 31 | #' samples, including cell barcode and source FACS population 32 | #' (\dQuote{celltype} and \dQuote{facs_subpop}).} 33 | #' 34 | #' \item{samples_test}{Sample information for the additional test 35 | #' set of 100 cells.} 36 | #' 37 | #' \item{genes}{Data frame containing information about the genes, 38 | #' including gene symbol and Ensembl identifier.} 39 | #' 40 | #' \item{fit}{Poisson non-negative matrix factorization (NMF) fitted 41 | #' to the UMI count data \code{counts}, with rank \code{k = 6}. See 42 | #' the vignette for how the Poisson NMF model fitting was performed.}} 43 | #' 44 | #' \url{https://www.10xgenomics.com/resources/datasets} 45 | #' 46 | #' @references 47 | #' G. X. Y. Zheng \emph{et al} (2017). Massively parallel digital 48 | #' transcriptional profiling of single cells. \emph{Nature Communications} 49 | #' \bold{8}, 14049.
\doi{10.1038/ncomms14049} 50 | #' 51 | #' @keywords data 52 | #' 53 | #' @examples 54 | #' library(Matrix) 55 | #' data(pbmc_facs) 56 | #' cat(sprintf("Number of cells: %d\n",nrow(pbmc_facs$counts))) 57 | #' cat(sprintf("Number of genes: %d\n",ncol(pbmc_facs$counts))) 58 | #' cat(sprintf("Proportion of counts that are non-zero: %0.1f%%.\n", 59 | #' 100*mean(pbmc_facs$counts > 0))) 60 | #' 61 | NULL 62 | -------------------------------------------------------------------------------- /R/pnmfem.R: -------------------------------------------------------------------------------- 1 | # This function implements the EM updates for the factors matrix, F, 2 | # in which the matrix X is approximated by tcrossprod(L,F). The EM 3 | # updates are equivalent to multiplicative updates, but computation is 4 | # implemented differently. Inputs F and L should be dense matrices 5 | # ("is.matrix" should return TRUE), but for X both dense matrices and 6 | # sparse matrices are supported ("matrix" and "dgCMatrix" classes). 7 | # Input "j" specifies which rows of F to update; by default, all rows 8 | # are updated. Input "numiter" specifies the number of EM updates to 9 | # perform. Input argument "e" is a non-negative scalar specifying the 10 | # minimum value of the updated loadings. A positive value of "e" 11 | # promotes better convergence of the EM updates. 12 | # 13 | # Note that the RcppParallel multithreading (specified by argument 14 | # "nc") will only work correctly if the number of threads is set 15 | # beforehand using RcppParallel::setThreadOptions. 
#
#' @importFrom Rcpp evalCpp
#' @importFrom RcppParallel RcppParallelLibs
#'
pnmfem_update_factors <- function (X, F, L, j = seq_len(ncol(X)),
                                   numiter = 1, nc = 1) {

  # The default for "j" is seq_len(ncol(X)) rather than
  # seq(1,ncol(X)) so that a matrix with no columns gives an empty
  # index set instead of c(1,0). The C++ code is passed the transpose
  # of F and 0-based indices, hence t(F) and j-1.
  #
  # If X is neither a dense nor a sparse matrix, or if nc < 1, none of
  # the branches below applies and F is returned unchanged.
  F <- t(F)
  if (nc == 1) {
    if (is.matrix(X))
      F <- pnmfem_update_factors_rcpp(X,F,L,j-1,numiter)
    else if (is.sparse.matrix(X))
      F <- pnmfem_update_factors_sparse_rcpp(X,F,L,j-1,numiter)
  } else if (nc > 1) {
    if (is.matrix(X))
      F <- pnmfem_update_factors_parallel_rcpp(X,F,L,j-1,numiter)
    else if (is.sparse.matrix(X))
      F <- pnmfem_update_factors_sparse_parallel_rcpp(X,F,L,j-1,numiter)
  }
  return(t(F))
}

# This function implements the EM updates for the loadings matrix, L,
# in which the matrix X is approximated by tcrossprod(L,F). The EM
# updates are equivalent to multiplicative updates, but computation is
# implemented differently. Inputs F and L should be dense matrices
# ("is.matrix" should return TRUE), but for X both dense matrices and
# sparse matrices are supported ("matrix" and "dgCMatrix" classes).
# Input "i" specifies which rows of L to update; by default, all rows
# are updated. Input "numiter" specifies the number of EM updates to
# perform, and input "nc" specifies the number of threads to use in
# the multithreaded updates.
#
# NOTE(review): an input argument "e" (a non-negative scalar giving
# the minimum value of the updated loadings, said to promote better
# convergence of the EM updates) was described here, but the function
# has no such argument; confirm whether a lower bound is applied in
# the C++ code.
#
# Note that the RcppParallel multithreading (specified by argument
# "nc") will only work correctly if the number of threads is set
# beforehand using RcppParallel::setThreadOptions.
#
#' @importFrom Rcpp evalCpp
#' @importFrom RcppParallel RcppParallelLibs
#'
pnmfem_update_loadings <- function (X, F, L, i = seq_len(nrow(X)),
                                    numiter = 1, nc = 1) {

  # The default for "i" is evaluated lazily, so it must be evaluated
  # here, before X is replaced by its transpose; otherwise nrow(X)
  # would give the number of columns of the original matrix.
  force(i)

  # The loadings are updated by calling the C++ routines for updating
  # the factors with X transposed and with the roles of F and L
  # exchanged. The C++ code expects 0-based indices, hence i-1.
  #
  # If X is neither a dense nor a sparse matrix, or if nc < 1, none of
  # the branches below applies and L is returned unchanged.
  X <- t(X)
  L <- t(L)
  if (nc == 1) {
    if (is.matrix(X))
      L <- pnmfem_update_factors_rcpp(X,L,F,i-1,numiter)
    else if (is.sparse.matrix(X))
      L <- pnmfem_update_factors_sparse_rcpp(X,L,F,i-1,numiter)
  } else if (nc > 1) {
    if (is.matrix(X))
      L <- pnmfem_update_factors_parallel_rcpp(X,L,F,i-1,numiter)
    else if (is.sparse.matrix(X))
      L <- pnmfem_update_factors_sparse_parallel_rcpp(X,L,F,i-1,numiter)
  }
  return(t(L))
}

# --------------------------------------------------------------------------------
# /R/poismix.R:
# --------------------------------------------------------------------------------
# Compute a maximum-likelihood estimate (MLE) of the mixture weights
# in a Poisson mixture model by iterating the multinomial mixture
# model EM updates for a fixed number of iterations. This is mainly
# used for testing the C++ implementation. See the comments attached
# to the "poismixem" C++ function for an explanation of the inputs.
poismixem <- function (L, w, x0, numiter) {
  x <- x0

  # Recover the mixture weights of the multinomial mixture model from
  # the mixture weights of the Poisson mixture model. Here, s is the
  # "scale factor".
  s <- sum(L %*% x)
  u <- colSums(L)
  L <- normalize.cols(L)
  x <- x*u/s

  # Perform one or more EM updates for the multinomial mixture model.
  x <- mixem(L,w,x,numiter)

  # Recover the mixture weights of the Poisson mixture model from the
  # mixture weights of the multinomial mixture model.
  s <- sum(w)
  return(s*x/u)
}

# Find the maximum-likelihood estimate (MLE) for the special case when
# only one of the counts is positive.
poismix.one.nonzero <- function (L, w) {

  # The single nonzero mixture weight is in the position selected by
  # mixture.one.nonzero; it is set to sum(w) divided by the sum of the
  # corresponding column of L.
  x <- mixture.one.nonzero(L,w)
  j <- which.max(x)
  x[j] <- sum(w)/sum(L[,j])
  return(x)
}

# --------------------------------------------------------------------------------
# /R/poisson2multinom.R:
# --------------------------------------------------------------------------------
#' @title Recover Multinomial Topic Model Fit from Poisson NMF fit
#'
#' @description This function recovers parameter estimates of the
#'   multinomial topic model given parameter estimates for a Poisson
#'   non-negative matrix factorization (NMF).
#'
#' @param fit An object of class \dQuote{poisson_nmf_fit}, such as an
#'   output from \code{fit_poisson_nmf}. It does not make sense for a
#'   multinomial topic model to have less than two topics, so an error
#'   will be reported when k < 2, where k is the rank of the matrix
#'   factorization. If a multinomial topic model fit is provided (that
#'   is, an object of class \dQuote{multinom_topic_model_fit}), the fit
#'   object is immediately returned \dQuote{as is}.
#'
#' @return The return value is the list \code{fit}, in which
#'   \code{fit$F} and \code{fit$L} are the parameters of the multinomial
#'   topic model; specifically, \code{fit$L[i,]} gives the topic
#'   probabilities for sample or document i, and \code{fit$F[,k]} gives
#'   the term probabilities for topic k. An additional vector
#'   \code{fit$s} of length n is returned giving the "size factors".
#'
#' @export
#'
poisson2multinom <- function (fit) {

  # Check input argument "fit".
  if (inherits(fit,"multinom_topic_model_fit"))
    return(fit)
  if (!inherits(fit,"poisson_nmf_fit"))
    stop("Input argument \"fit\" should be an object of class ",
         "\"poisson_nmf_fit\"")
  verify.fit(fit)
  if (ncol(fit$F) < 2 || ncol(fit$L) < 2)
    stop("Input matrices \"fit$F\" and \"fit$L\" should have 2 or more ",
         "columns")

  # Recover F and L for the multinomial model. Here, s gives the
  # Poisson rates for generating the "document sizes".
  out <- get_multinom_from_pnmf(fit$F,fit$L)
  fit$F <- out$F
  fit$L <- out$L
  fit$s <- out$s

  # Return the updated fit.
  class(fit) <- c("multinom_topic_model_fit","list")
  return(fit)
}

# Get the parameters of the multinomial topic model from the
# parameters of the Poisson NMF model. The return value is a list
# with three elements: F, in which each column is divided by its sum;
# s, the row sums of L after its columns have been multiplied by the
# column sums of the original F; and L, in which each row of this
# rescaled matrix is divided by its sum.
get_multinom_from_pnmf <- function (F, L) {
  u <- colSums(F)
  F <- scale.cols(F,1/u)
  L <- scale.cols(L,u)
  s <- rowSums(L)
  L <- L / s
  return(list(F = F,L = L,s = s))
}

# --------------------------------------------------------------------------------
# /R/scd.R:
# --------------------------------------------------------------------------------
# This function implements a sequential co-ordinate descent (SCD)
# update for the factors matrix (the "basis vectors"), H, in which the
# matrix A is approximated by the matrix product W*H. Inputs W and H
# should be dense matrices ("is.matrix" should return TRUE). Input "j"
# specifies which columns of H to update; by default, all columns are
# updated. Input "numiter" specifies the number of inner-loop
# iterations to perform. Input argument "e" is a non-negative scalar
# included in the computations to prevent NaNs due to division by
# zero.
#
# Note that a single EM update of each factor is performed before
# running the SCD updates (unless runem = FALSE).
#
# Also note that the RcppParallel multithreading (specified by
# argument "nc") will only work correctly if the number of threads is
# set beforehand using RcppParallel::setThreadOptions.
#
#' @importFrom Rcpp evalCpp
#' @importFrom RcppParallel RcppParallelLibs
#'
scd_update_factors <- function (A, W, H, j = seq_len(ncol(A)), numiter = 1,
                                nc = 1, e = 1e-16, runem = TRUE) {

  # The default for "j" is seq_len(ncol(A)) rather than
  # seq(1,ncol(A)) so that a matrix with no columns gives an empty
  # index set instead of c(1,0).
  if (!is.numeric(j))
    stop("Input argument \"j\" should be a numeric vector")

  # The C++ code expects 0-based indices, hence j-1. If A is neither a
  # dense nor a sparse matrix, or if nc < 1, none of the branches
  # below applies and H is returned unchanged.
  if (nc == 1) {
    if (is.matrix(A)) {
      if (runem)
        H <- pnmfem_update_factors_rcpp(A,H,W,j-1,1)
      H <- scd_update_factors_rcpp(A,W,H,j-1,numiter,e)
    } else if (is.sparse.matrix(A)) {
      if (runem)
        H <- pnmfem_update_factors_sparse_rcpp(A,H,W,j-1,1)
      H <- scd_update_factors_sparse_rcpp(A,W,H,j-1,numiter,e)
    }
  } else if (nc > 1) {
    if (is.matrix(A)) {
      if (runem)
        H <- pnmfem_update_factors_parallel_rcpp(A,H,W,j-1,1)
      H <- scd_update_factors_parallel_rcpp(A,W,H,j-1,numiter,e)
    } else if (is.sparse.matrix(A)) {
      if (runem)
        H <- pnmfem_update_factors_sparse_parallel_rcpp(A,H,W,j-1,1)
      H <- scd_update_factors_sparse_parallel_rcpp(A,W,H,j-1,numiter,e)
    }
  }
  return(H)
}

# This function implements a sequential co-ordinate descent (SCD)
# update for the loadings matrix (the "activations"), W, in which the
# matrix A is approximated by the matrix product W*H. Inputs W and H
# should be dense matrices ("is.matrix" should return TRUE). Input "i"
# specifies which rows of W to update; by default, all rows are
# updated. Input "numiter" specifies the number of inner-loop
# iterations to perform. Input argument "e" is a non-negative scalar
# included in the computations to prevent NaNs due to division by
# zero.
#
# Note that a single EM update of the loadings is performed before
# running the SCD updates (unless runem = FALSE).
#
# Also note that the RcppParallel multithreading (specified by
# argument "nc") will only work correctly if the number of threads is
# set beforehand using RcppParallel::setThreadOptions.
#
#' @importFrom Rcpp evalCpp
#' @importFrom RcppParallel RcppParallelLibs
#'
scd_update_loadings <- function (A, W, H, i = seq(1,nrow(A)), numiter = 1,
                                 nc = 1, e = 1e-16, runem = TRUE) {
  if (!is.numeric(i))
    stop("Input argument \"i\" should be a numeric vector")

  # The compiled routines update columns of the "factors" matrix, so
  # the loadings are updated by working with the transposed problem,
  # t(A) ~ t(H)*t(W), in which t(W) takes the place of the factors.
  At <- t(A)
  Wt <- t(W)
  Ht <- t(H)

  # nc = 1 selects the serial routines and nc > 1 the multithreaded
  # ones; for any other value of nc, W is returned without an update.
  if (nc == 1)
    parallel <- FALSE
  else if (nc > 1)
    parallel <- TRUE
  else
    return(t(Wt))

  # The compiled routines are given i - 1 (presumably because the C++
  # code uses 0-based indices).
  rows <- i - 1

  # Pick the EM and SCD routines that match the storage format of the
  # (transposed) data. For any other type of input, W is returned
  # without an update.
  if (is.matrix(At)) {
    if (parallel) {
      em_update  <- pnmfem_update_factors_parallel_rcpp
      scd_update <- scd_update_factors_parallel_rcpp
    } else {
      em_update  <- pnmfem_update_factors_rcpp
      scd_update <- scd_update_factors_rcpp
    }
  } else if (is.sparse.matrix(At)) {
    if (parallel) {
      em_update  <- pnmfem_update_factors_sparse_parallel_rcpp
      scd_update <- scd_update_factors_sparse_parallel_rcpp
    } else {
      em_update  <- pnmfem_update_factors_sparse_rcpp
      scd_update <- scd_update_factors_sparse_rcpp
    }
  } else
    return(t(Wt))

  # Optionally run a single EM update first, then the SCD updates.
  if (runem)
    Wt <- em_update(At,Wt,Ht,rows,1)
  Wt <- scd_update(At,Ht,Wt,rows,numiter,e)
  return(t(Wt))
}

-------------------------------------------------------------------------------- /R/select.R: --------------------------------------------------------------------------------
#' @rdname select_loadings
#'
#' @title Extract or Re-order Data Rows in Poisson NMF or Multinomial Topic Model Fit
#'
#' @description This function can be used to extract estimates for a
#'   subset of the count data, or to re-order the rows of the loadings
#'   matrix.
#'
#' @param .data Poisson NMF or Multinomial Topic Model fit; that is,
#'   an object of class \dQuote{poisson_nmf_fit} or
#'   \dQuote{multinom_topic_model_fit}, such as an output from
#'   \code{\link{fit_poisson_nmf}} or \code{\link{fit_topic_model}}.
#'
#' @param loadings Indices (names or numbers) giving data rows to
#'   keep. If not specified, all rows are kept.
#'
#' @param \dots Other arguments accepted for compatibility with the
#'   generic select function; they are passed on to
#'   \code{select_loadings}, which does not currently use them.
#'
#' @return A Poisson NMF or multinomial topic model fit containing the
#'   selected data rows only.
#'
#' @importFrom dplyr select
#'
#' @aliases select
#'
#' @method select poisson_nmf_fit
#'
#' @export
#'
select.poisson_nmf_fit <- function (.data, loadings, ...)
  select_loadings(.data,loadings,...)

#' @rdname select_loadings
#'
#' @method select multinom_topic_model_fit
#'
#' @export
#'
select.multinom_topic_model_fit <- function (.data, loadings, ...)
  select_loadings(.data,loadings,...)

#' @rdname select_loadings
#'
#' @export
#'
select_loadings <- function (.data, loadings, ...) {

  # Check the input; the error message names the argument as it
  # appears in the function signature (".data").
  if (!inherits(.data,c("poisson_nmf_fit","multinom_topic_model_fit")))
    stop("Input \".data\" should be an object of class \"poisson_nmf_fit\" or ",
         "\"multinom_topic_model_fit\"")
  verify.fit(.data)

  # By default, keep all rows. seq_len is used so that a fit with zero
  # rows yields an empty selection rather than the invalid c(1,0).
  n <- nrow(.data$L)
  if (missing(loadings))
    loadings <- seq_len(n)

  # Subset every per-row component of the fit. Components that are
  # not present (NULL) remain absent, since indexing NULL gives NULL.
  # Any indexing failure (e.g., an unknown row name or an out-of-range
  # number) is reported as an invalid selection.
  tryCatch({
    .data$L  <- .data$L[loadings,,drop = FALSE]
    .data$Ln <- .data$Ln[loadings,,drop = FALSE]
    .data$Ly <- .data$Ly[loadings,,drop = FALSE]
    .data$s  <- .data$s[loadings]
  },error = function (e) stop("Invalid selection of loadings"))
  return(.data)
}

-------------------------------------------------------------------------------- /R/sysdata.rda: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/R/sysdata.rda
-------------------------------------------------------------------------------- /R/topicscore.R: --------------------------------------------------------------------------------
# Much of the code contained here is based on the TopicScore package
# source code developed by Minzhe Wang and Tracy Ke, distributed under
# the MIT license.

# Estimates the word-topic matrix (A) from the word-document matrix
# (X) using the Topic SCORE algorithm.
#
# The inputs are: X, the n x m counts matrix (may be sparse or dense);
# k, the number of topics; k0, the number of greedy search steps to
# use in Vertex Hunting; m, the number of centers in the k-means step
# of Vertex Hunting; and Mquantile, the quantile (a number between 0
# and 1) of the diagonal entries of matrix M that is used as an upper
# limit to truncate the diagonal entries of M. When it is zero, this
# reduces to the case in which there is no normalization. When it is
# 1, there is no truncation.
#
# The return value is a word-topic matrix with k columns and one row
# for each column of X.
#
#' @importFrom stats quantile
#' @importFrom irlba irlba
#'
topic_score <- function (X, k, k0 = ifelse(k < 10,ceiling(1.5*k),k + 2),
                         m = 3*k, nstart = 4, Mquantile = 0) {

  # M0 = D0*1/n, where D0(j,i) is the expected frequency of word j
  # in document i. The entries of M0 are truncated from above at the
  # chosen quantile.
  M0 <- colMeans(X)
  M0 <- pmin(M0,quantile(M0,Mquantile))

  # Compute the k right singular vectors of the normalized counts matrix.
  X <- scale.cols(X,1/sqrt(M0))
  V <- irlba(X,k)$v

  # Step 1: Recover the left-scaling matrix (LSM). Each of the
  # remaining singular vectors is divided entry-wise by the magnitude
  # of the first singular vector.
  v1 <- abs(V[,1])
  R  <- V[,-1,drop = FALSE]/v1

  # Step 2: Perform "Vertex Hunting". Input nstart is the number of
  # random starts used in the k-means step.
  V <- vertex_hunting(R,k0,m,nstart)

  # Step 3: Recover the normalized topic matrix (NTM). Negative
  # entries are set to zero, then each row is scaled to sum to 1.
  P <- cbind(R,1) %*% solve(cbind(V,1))
  P <- pmax(P,0)
  P <- P / rowSums(P)

  # Step 4: Recover the unscaled topic matrix.
  A <- sqrt(M0)*v1*P

  # Step 5: Return the scaled topic matrix.
  return(normalize.cols(A))
}

# The Vertex Hunting algorithm for Topic-SCORE. It finds a simplex
# with k vertices that best approximates the given p data points in a
# (k-1) dimensional space.
#
# The inputs are: R, the p x k-1 data matrix, with each row being a
# data point; k0, the number of greedy search steps (this is the
# number of candidate vertices retained, so it should be at least k
# and no greater than m); m, the number of centers in the k-means
# step; and nstart, the number of random starts used by k-means.
#
# The output is the k x k-1 vertices matrix, with each row being a
# vertex in the found simplex.
#
#' @importFrom utils combn
#' @importFrom stats kmeans
#'
vertex_hunting <- function (R, k0, m, nstart) {
  k <- ncol(R) + 1

  # Step 2a. Summarize the data points by m cluster centers.
  X <- kmeans(R,m,iter.max = 100,nstart = nstart)$centers

  # Step 2b'. Greedily select k0 candidate vertices from the centers.
  # Begin with the two centers that are farthest apart (D contains
  # the squared Euclidean distances between all pairs of centers).
  Y  <- tcrossprod(X)
  D  <- matrix(diag(Y),m,m)
  D  <- D + t(D) - 2*Y
  i  <- drop(arrayInd(which.max(D),dim(D)))
  X0 <- X[i,,drop = FALSE]
  X  <- X[-i,,drop = FALSE]

  # Then repeatedly add the remaining center with the largest average
  # distance to the candidates selected so far. (The term that does
  # not depend on the remaining center is left out since it does not
  # affect which center attains the maximum.)
  if (k0 > 2) {
    for (j in 3:k0) {
      D  <- matrix(diag(tcrossprod(X)),j-1,nrow(X),byrow = TRUE)
      D  <- D - 2*tcrossprod(X0,X)
      i  <- which.max(colMeans(D))
      X0 <- rbind(X0,X[i,])
      X  <- X[-i,,drop = FALSE]
    }
  }

  # From here on, X holds the k0 candidate vertices. This assignment
  # must also be made when k0 = 2, otherwise the search below would
  # index the centers that were *not* selected.
  X <- X0

  # Step 2b. Among all subsets of k candidates, choose the one whose
  # simplex minimizes the largest distance to any of the k0
  # candidates. A subset for which the distance calculation fails is
  # assigned an infinite distance.
  B <- combn(1:k0,k)
  n <- ncol(B)
  v <- rep(0,n)
  for (i in seq_len(n))
    for (j in seq_len(k0)) {
      u <- tryCatch(simplex_dist(X[j,],X[B[,i],,drop = FALSE]),
                    error = function (e) Inf)
      v[i] <- max(u,v[i])
    }
  i <- which.min(v)
  return(X[B[,i],])
}

# This function computes the shortest (Euclidean) distance between the
# given point (x) and any point in the simplex (V). Input x is a
# vector, and V is a matrix in which each row is a vertex of the
# simplex.
#
# The closest point is written as v + b[1]*(V[1,] - v) + ... +
# b[n-1]*(V[n-1,] - v), where v = V[n,] is the last vertex, and the
# weights b are found by solving a quadratic program subject to the
# constraints b >= 0 and sum(b) <= 1.
#
#' @importFrom quadprog solve.QP
#'
simplex_dist <- function (x, V) {
  n <- nrow(V)
  v <- V[n,]

  # A is used twice: A %*% V gives the differences V[i,] - v, and the
  # columns of A (with b0) encode the constraints b >= 0, sum(b) <= 1.
  A  <- cbind(diag(n-1),-1)
  VV <- A %*% V
  M  <- tcrossprod(VV)
  d  <- drop(VV %*% (x - v))
  b0 <- rep(0,n)
  b0[n] <- -1

  # solve.QP minimizes b'*M*b/2 - d'*b, so the squared distance is
  # ||x - v||^2 plus twice the optimal value. The max with zero
  # guards against a slightly negative value due to rounding error.
  f <- solve.QP(M,d,A,b0)$value
  return(sqrt(max(sum((x - v)^2) + 2*f,0)))
}

-------------------------------------------------------------------------------- /R/verify_args.R: --------------------------------------------------------------------------------
# Verify that x is a vector with positive entries. An error is thrown
# if x is not numeric, or if any entry is missing, infinite, or not
# greater than zero; otherwise, TRUE is returned. Input arg.name is
# the name used to refer to x in the error message.
verify.positive.vector <- function (x, arg.name = deparse(substitute(x))) {
  arg.name <- sprintf("\"%s\"",arg.name)
  msg <- paste("Input argument",arg.name,"should be a numeric vector in",
               "which all entries are finite, non-missing and positive")
  if (!is.numeric(x))
    stop(msg)
  else if (anyNA(x) || any(is.infinite(x)) || any(x <= 0))
    stop(msg)
  return(TRUE)
}

# Verify that x is a non-negative matrix.
verify.nonnegative.matrix <- function (x, arg.name = deparse(substitute(x))) {

  # Throws an error unless x is a numeric "matrix" or a sparse matrix
  # (as determined by is.sparse.matrix) in which all entries are
  # non-negative, finite and non-missing; otherwise returns TRUE.
  arg.name <- sprintf("\"%s\"",arg.name)
  msg <- paste("Input argument",arg.name,"should be a non-negative,",
               "numeric matrix (a \"matrix\" or a \"dgCMatrix\"), and",
               "all entries should be finite and non-missing")
  is_dense  <- is.matrix(x) & is.numeric(x)
  is_sparse <- is.sparse.matrix(x)
  if (!(is_dense | is_sparse))
    stop(msg)
  if (any(x < 0) || any(is.infinite(x)) || anyNA(x))
    stop(msg)
  return(TRUE)
}

# Verify that x is a valid count matrix; that is, a non-negative
# matrix with at least 2 rows and at least 2 columns.
verify.count.matrix <- function (x, arg.name = deparse(substitute(x))) {
  verify.nonnegative.matrix(x,arg.name)
  arg.name <- sprintf("\"%s\"",arg.name)
  if (nrow(x) < 2 || ncol(x) < 2)
    stop(paste("Input matrix",arg.name,"should have at least 2 rows",
               "and 2 columns"))
  return(TRUE)
}

# Verify that x is a valid multinomial topic model fit or Poisson
# non-negative matrix factorization: a list containing non-negative
# matrices F and L with the same number of columns and, optionally, a
# vector s of positive "scale factors" with one entry per row of L.
verify.fit <- function (x, arg.name = deparse(substitute(x))) {
  add_quotes <- function (s)
    sprintf("\"%s\"",s)
  name_F <- paste0(arg.name,"$F")
  name_L <- paste0(arg.name,"$L")
  name_s <- paste0(arg.name,"$s")
  msg <- paste("Input argument",add_quotes(arg.name),
               "should be a list containing",
               "non-negative matrices \"F\" and \"L\"")
  if (!is.list(x) || !all(c("F","L") %in% names(x)))
    stop(msg)

  # Check the factors and loadings matrices.
  verify.nonnegative.matrix(x$F,name_F)
  verify.nonnegative.matrix(x$L,name_L)
  if (ncol(x$F) != ncol(x$L))
    stop(paste("Input matrices",add_quotes(name_F),"and",add_quotes(name_L),
               "should have","the same number of columns"))

  # Check the vector of "scale factors", s, if it is provided.
  if ("s" %in% names(x)) {
    verify.positive.vector(x$s,name_s)
    if (length(x$s) != nrow(x$L))
      stop(paste("The length of input vector",add_quotes(name_s),
                 "should equal the","number of rows in",add_quotes(name_L)))
  }
  return(TRUE)
}

# Verify that x is a valid count matrix and "fit" is a valid topic model
# fit or non-negative matrix factorization, and that the dimensions
# and the dimnames of x agree with those of fit$F and fit$L.
verify.fit.and.count.matrix <-
  function (x, fit,
            arg.name.x = deparse(substitute(x)),
            arg.name.fit = deparse(substitute(fit))) {
  verify.count.matrix(x,arg.name.x)
  verify.fit(fit,arg.name.fit)
  name_x <- sprintf("\"%s\"",arg.name.x)
  name_F <- sprintf("\"%s$F\"",arg.name.fit)
  name_L <- sprintf("\"%s$L\"",arg.name.fit)

  # Rows of L should correspond to rows of x, and rows of F should
  # correspond to columns of x.
  if (nrow(fit$L) != nrow(x) || nrow(fit$F) != ncol(x))
    stop(paste("Dimensions of input matrices",name_x,",",name_F,
               "and",name_L,"do not agree"))
  names_agree <- identical(rownames(fit$L),rownames(x)) &&
                 identical(rownames(fit$F),colnames(x))
  if (!names_agree)
    stop(paste("Dimnames of input matrices",name_x,",",name_F,
               "and",name_L,"are not consistent"))
  return(TRUE)
}

# Return TRUE if x is a finite scalar with no missing entries.
is.scalar <- function (x) {

  # Returns TRUE if x is a numeric vector of length 1 whose value is
  # finite and not missing, and FALSE for any other input. The
  # short-circuit operator && is used so that the later checks are
  # evaluated only for a numeric vector of length 1; this way, inputs
  # such as lists give FALSE instead of an error from is.finite.
  is.numeric(x) &&
    length(x) == 1 &&
    !is.na(x) &&
    is.finite(x)
}

-------------------------------------------------------------------------------- /R/zzz.R: --------------------------------------------------------------------------------
# Set the "Matrix.warnDeprecatedCoerce" option whenever the package is
# loaded. NOTE(review): a value of 2 presumably asks the Matrix
# package to treat deprecated coercions as errors -- confirm against
# the Matrix package documentation.
.onLoad <- function (lib, pkg) {
  options(Matrix.warnDeprecatedCoerce = 2)
}
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fastTopics 2 | 3 | [![R-CMD-check](https://github.com/stephenslab/fastTopics/workflows/R-CMD-check/badge.svg)](https://github.com/stephenslab/fastTopics/actions) 4 | [![CircleCI](https://dl.circleci.com/status-badge/img/gh/stephenslab/fastTopics/tree/master.svg?style=svg)](https://app.circleci.com/pipelines/github/stephenslab/fastTopics?branch=master) 5 | [![codecov](https://codecov.io/gh/stephenslab/fastTopics/branch/master/graph/badge.svg)](https://app.codecov.io/gh/stephenslab/fastTopics) 6 | 7 | fastTopics is an R package implementing fast, scalable optimization 8 | algorithms for fitting topic models and non-negative matrix 9 | factorizations to count data. The methods exploit the 10 | [close relationship][vignette-close-relationship] between the topic 11 | model and Poisson non-negative matrix factorization. The package also 12 | provides tools to compare, annotate and visualize model fits, 13 | including functions to create "structure plots" and functions to 14 | identify distinctive features of topics. The fastTopics package is a 15 | successor to the [CountClust package][countclust]. 16 | 17 | If you find a bug, or you have a question or feedback on this software, 18 | please post an [issue][issues]. 19 | 20 | ## Citing this work 21 | 22 | If you find the fastTopics package or any of the source code in this 23 | repository useful for your work, please cite: 24 | 25 | > K. K. Dey, C. J. Hsiao and M. 
Stephens (2017). [Visualizing the 26 | > structure of RNA-seq expression data using grade of membership 27 | > models.][countclust-paper] PLoS Genetics 13, e1006599. 28 | > 29 | > P. Carbonetto, A. Sarkar, Z. Wang and M. Stephens (2021). 30 | > [Non-negative matrix factorization algorithms greatly improve topic 31 | > model fits.][fasttopics-paper] arXiv 2105.13440. 32 | 33 | If you used the `de_analysis` function in fastTopics, please cite: 34 | 35 | > P. Carbonetto, K. Luo, A. Sarkar, A. Hung, K. Tayeb, S. Pott and 36 | > M. Stephens (2023). [GoM DE: interpreting structure in sequence 37 | > count data with differential expression analysis allowing for 38 | > grades of membership.][singlecell-topics-paper] 39 | > Genome Biology 24, 236. 40 | 41 | ## License 42 | 43 | Copyright (c) 2019-2025, Peter Carbonetto and Matthew Stephens. 44 | 45 | All source code and software in this repository are made available 46 | under the terms of the [MIT license][mit-license]. 47 | 48 | ## Quick Start 49 | 50 | Install and load the package from CRAN: 51 | 52 | ```R 53 | install.packages("fastTopics") 54 | library(fastTopics) 55 | ``` 56 | 57 | Alternatively, install the latest version from GitHub: 58 | 59 | ```R 60 | remotes::install_github("stephenslab/fastTopics") 61 | library(fastTopics) 62 | ``` 63 | 64 | Note that installing the package will require a C++ compiler setup 65 | that is appropriate for the version of R installed on your 66 | computer. For details, refer to the documentation on the 67 | [CRAN website][cran]. 68 | 69 | For guidance on using fastTopics to analyze gene expression data, see 70 | the [single-cell RNA-seq vignette, part 1][vignette-scrnaseq-1] and 71 | [part 2][vignette-scrnaseq-2]. 72 | 73 | Also, try running the small example that illustrates the fast model 74 | fitting algorithms: 75 | 76 | ```R 77 | example("fit_poisson_nmf") 78 | ``` 79 | 80 | See the [package documentation][pkgdown] for more information. 
81 | 82 | ## Developer notes 83 | 84 | To prepare the package for CRAN, remove both single-cell vignettes, 85 | then run `R CMD build fastTopics` to build the source package. 86 | 87 | This is the command used to check the package before submitting to 88 | CRAN: 89 | 90 | ```r 91 | library(rhub) 92 | check_for_cran(".",show_status = TRUE, 93 | env_vars = c(`_R_CHECK_FORCE_SUGGESTS_` = "false", 94 | `_R_CHECK_CRAN_INCOMING_USE_ASPELL_` = "true")) 95 | ``` 96 | 97 | ## Credits 98 | 99 | The fastTopics R package was developed by [Peter Carbonetto][peter], 100 | [Matthew Stephens][matthew] and others. 101 | 102 | [fasttopics]: https://github.com/stephenslab/fastTopics 103 | [mit-license]: https://opensource.org/license/mit 104 | [issues]: https://github.com/stephenslab/fastTopics/issues 105 | [peter]: https://pcarbo.github.io 106 | [kevin]: https://github.com/kevinlkx 107 | [matthew]: http://stephenslab.uchicago.edu 108 | [uchicago]: https://www.uchicago.edu 109 | [cran]: https://cran.r-project.org 110 | [countclust]: https://github.com/kkdey/CountClust 111 | [countclust-paper]: https://doi.org/10.1371/journal.pgen.1006599 112 | [fasttopics-paper]: https://arxiv.org/abs/2105.13440 113 | [singlecell-topics-paper]: https://doi.org/10.1186/s13059-023-03067-9 114 | [pkgdown]: https://stephenslab.github.io/fastTopics/ 115 | [vignette-close-relationship]: https://stephenslab.github.io/fastTopics/articles/relationship.html 116 | [vignette-scrnaseq-1]: https://stephenslab.github.io/fastTopics/articles/single_cell_rnaseq_basic.html 117 | [vignette-scrnaseq-2]: https://stephenslab.github.io/fastTopics/articles/single_cell_rnaseq_practical.html 118 | -------------------------------------------------------------------------------- /TODO.txt: -------------------------------------------------------------------------------- 1 | to do 2 | ===== 3 | 4 | + Implement function to fit LDA model by initializing with MLEs for 5 | Poisson NMF or multinomial topic model. 
6 | 7 | + Update the vignettes. 8 | 9 | + Update pkgdown site. 10 | 11 | + Implement KKT-based stopping criterion for fit_poisson_nmf. 12 | 13 | + Implement "t" S3 method to transpose the rows and columns of a 14 | Poisson NMF fit. 15 | 16 | + Implement integrations for Seurat, Signac and/or ArchR. 17 | 18 | + Implement backtracking line search option for SCD algorithm. 19 | 20 | + Create vignette giving more details about the differential 21 | expression analysis. 22 | 23 | + Create vignette giving more details about the different optimization 24 | algorithms. (Then simplify the fit_poisson_nmf example.) 25 | 26 | + Create vignette illustrating application of fastTopics to text data, 27 | with a focus on the different ways to visualize the results. 28 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | navbar: 2 | left: 3 | - text: "Home" 4 | href: index.html 5 | - text: "Vignettes" 6 | href: articles/index.html 7 | - text: "Functions" 8 | href: reference/index.html 9 | right: 10 | - text: "Source" 11 | href: https://github.com/stephenslab/fastTopics 12 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | init: 2 | ps: | 3 | $ErrorActionPreference = "Stop" 4 | Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1" 5 | Import-Module '..\appveyor-tool.ps1' 6 | 7 | branches: 8 | only: 9 | - master 10 | 11 | install: 12 | ps: Bootstrap 13 | 14 | environment: 15 | global: 16 | R_REMOTES_NO_ERRORS_FROM_WARNINGS: true 17 | _R_CHECK_FORCE_SUGGESTS_: false 18 | USE_RTOOLS: yes 19 | WARNINGS_ARE_ERRORS: 20 | R_VERSION: release 21 | R_ARCH: x64 22 | R_CHECK_ARGS: --no-manual --no-examples --as-cran 23 | 24 | # This is the minimal set of R packages 
needed to run "R CMD check" on 25 | # the package. 26 | build_script: 27 | - R -e install.packages(c('devtools','testthat','quadprog','gtools','irlba','Rtsne','uwot','dplyr','rlang','tidyr','Rcpp','RcppArmadillo','RcppParallel','RhpcBLASctl','progress','pbapply','ggplot2','cowplot','plotly','htmlwidgets'),head(.libPaths(),1),'http://cran.wustl.edu') 28 | - R -e devtools::install_github('slowkow/ggrepel',upgrade='never',force=TRUE) 29 | - R -e devtools::install_github('stephens999/ashr',upgrade='never',force=TRUE) 30 | 31 | test_script: 32 | - travis-tool.sh run_tests 33 | 34 | on_failure: 35 | - 7z a failure.zip *.Rcheck\* 36 | - appveyor PushArtifact failure.zip 37 | 38 | artifacts: 39 | - path: '*.Rcheck\**\*.log' 40 | name: Logs 41 | 42 | - path: '*.Rcheck\**\*.out' 43 | name: Logs 44 | 45 | - path: '*.Rcheck\**\*.fail' 46 | name: Logs 47 | 48 | - path: '*.Rcheck\**\*.Rout' 49 | name: Logs 50 | 51 | - path: '\*_*.tar.gz' 52 | name: Bits 53 | 54 | - path: '\*_*.zip' 55 | name: Bits 56 | -------------------------------------------------------------------------------- /data/newsgroups.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/data/newsgroups.RData -------------------------------------------------------------------------------- /data/pbmc_facs.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/data/pbmc_facs.RData -------------------------------------------------------------------------------- /docs/LICENSE-text.html: -------------------------------------------------------------------------------- 1 | 2 | License • fastTopics 6 | 7 | 8 |
9 |
40 | 41 | 42 | 43 |
44 |
45 | 48 | 49 |
YEAR: 2019
50 | COPYRIGHT HOLDER: Peter Carbonetto and Matthew Stephens
51 | 
52 | 53 |
54 | 55 | 58 | 59 |
60 | 61 | 62 | 63 |
66 | 67 |
68 |

Site built with pkgdown 2.0.7.

69 |
70 | 71 |
72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /docs/articles/relationship_files/figure-html/loglik-poisson-vs-multinom-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/articles/relationship_files/figure-html/loglik-poisson-vs-multinom-1.png -------------------------------------------------------------------------------- /docs/articles/relationship_files/figure-html/multinom2poisson-1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/articles/relationship_files/figure-html/multinom2poisson-1-1.png -------------------------------------------------------------------------------- /docs/articles/relationship_files/figure-html/multinom2poisson-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/articles/relationship_files/figure-html/multinom2poisson-2-1.png -------------------------------------------------------------------------------- /docs/articles/relationship_files/figure-html/plot-loglik-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/articles/relationship_files/figure-html/plot-loglik-1.png -------------------------------------------------------------------------------- /docs/articles/single_cell_rnaseq_basic_files/figure-html/structure-plot-test-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/articles/single_cell_rnaseq_basic_files/figure-html/structure-plot-test-1.png -------------------------------------------------------------------------------- /docs/articles/single_cell_rnaseq_basic_files/figure-html/structure-plot-with-celltype-labels-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/articles/single_cell_rnaseq_basic_files/figure-html/structure-plot-with-celltype-labels-1.png -------------------------------------------------------------------------------- /docs/articles/single_cell_rnaseq_basic_files/figure-html/volcano-plot-b-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/articles/single_cell_rnaseq_basic_files/figure-html/volcano-plot-b-1.png -------------------------------------------------------------------------------- /docs/articles/single_cell_rnaseq_basic_files/figure-html/volcano-plot-bcells-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/articles/single_cell_rnaseq_basic_files/figure-html/volcano-plot-bcells-1.png -------------------------------------------------------------------------------- /docs/articles/single_cell_rnaseq_basic_files/figure-html/volcano-plot-nk-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/articles/single_cell_rnaseq_basic_files/figure-html/volcano-plot-nk-1.png -------------------------------------------------------------------------------- 
/docs/articles/single_cell_rnaseq_basic_files/figure-html/volcano-plot-t-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/articles/single_cell_rnaseq_basic_files/figure-html/volcano-plot-t-1.png -------------------------------------------------------------------------------- /docs/articles/single_cell_rnaseq_basic_files/figure-html/volcano-plot-tcells-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/articles/single_cell_rnaseq_basic_files/figure-html/volcano-plot-tcells-1.png -------------------------------------------------------------------------------- /docs/articles/single_cell_rnaseq_practical_files/crosstalk-1.0.0/css/crosstalk.css: -------------------------------------------------------------------------------- 1 | /* Adjust margins outwards, so column contents line up with the edges of the 2 | parent of container-fluid. */ 3 | .container-fluid.crosstalk-bscols { 4 | margin-left: -30px; 5 | margin-right: -30px; 6 | white-space: normal; 7 | } 8 | 9 | /* But don't adjust the margins outwards if we're directly under the body, 10 | i.e. we were the top-level of something at the console. 
*/ 11 | body > .container-fluid.crosstalk-bscols { 12 | margin-left: auto; 13 | margin-right: auto; 14 | } 15 | 16 | .crosstalk-input-checkboxgroup .crosstalk-options-group .crosstalk-options-column { 17 | display: inline-block; 18 | padding-right: 12px; 19 | vertical-align: top; 20 | } 21 | 22 | @media only screen and (max-width:480px) { 23 | .crosstalk-input-checkboxgroup .crosstalk-options-group .crosstalk-options-column { 24 | display: block; 25 | padding-right: inherit; 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /docs/articles/single_cell_rnaseq_practical_files/figure-html/loglik-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/articles/single_cell_rnaseq_practical_files/figure-html/loglik-2-1.png -------------------------------------------------------------------------------- /docs/articles/single_cell_rnaseq_practical_files/figure-html/loglik-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/articles/single_cell_rnaseq_practical_files/figure-html/loglik-3-1.png -------------------------------------------------------------------------------- /docs/articles/single_cell_rnaseq_practical_files/figure-html/pca-plot-1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/articles/single_cell_rnaseq_practical_files/figure-html/pca-plot-1-1.png -------------------------------------------------------------------------------- /docs/articles/single_cell_rnaseq_practical_files/figure-html/pca-plot-2-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/articles/single_cell_rnaseq_practical_files/figure-html/pca-plot-2-1.png -------------------------------------------------------------------------------- /docs/articles/single_cell_rnaseq_practical_files/figure-html/plot-loglik-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/articles/single_cell_rnaseq_practical_files/figure-html/plot-loglik-1.png -------------------------------------------------------------------------------- /docs/articles/single_cell_rnaseq_practical_files/figure-html/structure-plot-by-cluster-1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/articles/single_cell_rnaseq_practical_files/figure-html/structure-plot-by-cluster-1-1.png -------------------------------------------------------------------------------- /docs/articles/single_cell_rnaseq_practical_files/figure-html/structure-plot-by-cluster-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/articles/single_cell_rnaseq_practical_files/figure-html/structure-plot-by-cluster-2-1.png -------------------------------------------------------------------------------- /docs/articles/single_cell_rnaseq_practical_files/figure-html/structure-plot-by-cluster-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/articles/single_cell_rnaseq_practical_files/figure-html/structure-plot-by-cluster-3-1.png 
-------------------------------------------------------------------------------- /docs/articles/single_cell_rnaseq_practical_files/figure-html/structure-plot-without-labels-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/articles/single_cell_rnaseq_practical_files/figure-html/structure-plot-without-labels-1.png -------------------------------------------------------------------------------- /docs/articles/single_cell_rnaseq_practical_files/figure-html/volcano-plot-cd4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/articles/single_cell_rnaseq_practical_files/figure-html/volcano-plot-cd4-1.png -------------------------------------------------------------------------------- /docs/articles/single_cell_rnaseq_practical_files/figure-html/volcano-plot-cd8-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/articles/single_cell_rnaseq_practical_files/figure-html/volcano-plot-cd8-1.png -------------------------------------------------------------------------------- /docs/articles/single_cell_rnaseq_practical_files/figure-html/volcano-plot-t-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/articles/single_cell_rnaseq_practical_files/figure-html/volcano-plot-t-1.png -------------------------------------------------------------------------------- /docs/articles/single_cell_rnaseq_practical_files/plotly-htmlwidgets-css-1.52.2/plotly-htmlwidgets.css: 
-------------------------------------------------------------------------------- 1 | /* 2 | just here so that plotly works 3 | correctly with ioslides. 4 | see https://github.com/ropensci/plotly/issues/463 5 | */ 6 | 7 | slide:not(.current) .plotly.html-widget{ 8 | display: none; 9 | } 10 | -------------------------------------------------------------------------------- /docs/articles/topics_vs_clusters_files/figure-html/pca-from-loadings-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/articles/topics_vs_clusters_files/figure-html/pca-from-loadings-1.png -------------------------------------------------------------------------------- /docs/articles/topics_vs_clusters_files/figure-html/plot-topic-proportions-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/articles/topics_vs_clusters_files/figure-html/plot-topic-proportions-1.png -------------------------------------------------------------------------------- /docs/articles/topics_vs_clusters_files/figure-html/tsne-from-counts-1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/articles/topics_vs_clusters_files/figure-html/tsne-from-counts-1-1.png -------------------------------------------------------------------------------- /docs/articles/topics_vs_clusters_files/figure-html/tsne-from-counts-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/articles/topics_vs_clusters_files/figure-html/tsne-from-counts-2-1.png 
-------------------------------------------------------------------------------- /docs/articles/topics_vs_clusters_files/figure-html/tsne-from-loadings-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/articles/topics_vs_clusters_files/figure-html/tsne-from-loadings-1.png -------------------------------------------------------------------------------- /docs/bootstrap-toc.css: -------------------------------------------------------------------------------- 1 | /*! 2 | * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) 3 | * Copyright 2015 Aidan Feldman 4 | * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ 5 | 6 | /* modified from https://github.com/twbs/bootstrap/blob/94b4076dd2efba9af71f0b18d4ee4b163aa9e0dd/docs/assets/css/src/docs.css#L548-L601 */ 7 | 8 | /* All levels of nav */ 9 | nav[data-toggle='toc'] .nav > li > a { 10 | display: block; 11 | padding: 4px 20px; 12 | font-size: 13px; 13 | font-weight: 500; 14 | color: #767676; 15 | } 16 | nav[data-toggle='toc'] .nav > li > a:hover, 17 | nav[data-toggle='toc'] .nav > li > a:focus { 18 | padding-left: 19px; 19 | color: #563d7c; 20 | text-decoration: none; 21 | background-color: transparent; 22 | border-left: 1px solid #563d7c; 23 | } 24 | nav[data-toggle='toc'] .nav > .active > a, 25 | nav[data-toggle='toc'] .nav > .active:hover > a, 26 | nav[data-toggle='toc'] .nav > .active:focus > a { 27 | padding-left: 18px; 28 | font-weight: bold; 29 | color: #563d7c; 30 | background-color: transparent; 31 | border-left: 2px solid #563d7c; 32 | } 33 | 34 | /* Nav: second level (shown on .active) */ 35 | nav[data-toggle='toc'] .nav .nav { 36 | display: none; /* Hide by default, but at >768px, show it */ 37 | padding-bottom: 10px; 38 | } 39 | nav[data-toggle='toc'] .nav .nav > li > a { 40 | padding-top: 1px; 41 | 
padding-bottom: 1px; 42 | padding-left: 30px; 43 | font-size: 12px; 44 | font-weight: normal; 45 | } 46 | nav[data-toggle='toc'] .nav .nav > li > a:hover, 47 | nav[data-toggle='toc'] .nav .nav > li > a:focus { 48 | padding-left: 29px; 49 | } 50 | nav[data-toggle='toc'] .nav .nav > .active > a, 51 | nav[data-toggle='toc'] .nav .nav > .active:hover > a, 52 | nav[data-toggle='toc'] .nav .nav > .active:focus > a { 53 | padding-left: 28px; 54 | font-weight: 500; 55 | } 56 | 57 | /* from https://github.com/twbs/bootstrap/blob/e38f066d8c203c3e032da0ff23cd2d6098ee2dd6/docs/assets/css/src/docs.css#L631-L634 */ 58 | nav[data-toggle='toc'] .nav > .active > ul { 59 | display: block; 60 | } 61 | -------------------------------------------------------------------------------- /docs/docsearch.js: -------------------------------------------------------------------------------- 1 | $(function() { 2 | 3 | // register a handler to move the focus to the search bar 4 | // upon pressing shift + "/" (i.e. "?") 5 | $(document).on('keydown', function(e) { 6 | if (e.shiftKey && e.keyCode == 191) { 7 | e.preventDefault(); 8 | $("#search-input").focus(); 9 | } 10 | }); 11 | 12 | $(document).ready(function() { 13 | // do keyword highlighting 14 | /* modified from https://jsfiddle.net/julmot/bL6bb5oo/ */ 15 | var mark = function() { 16 | 17 | var referrer = document.URL ; 18 | var paramKey = "q" ; 19 | 20 | if (referrer.indexOf("?") !== -1) { 21 | var qs = referrer.substr(referrer.indexOf('?') + 1); 22 | var qs_noanchor = qs.split('#')[0]; 23 | var qsa = qs_noanchor.split('&'); 24 | var keyword = ""; 25 | 26 | for (var i = 0; i < qsa.length; i++) { 27 | var currentParam = qsa[i].split('='); 28 | 29 | if (currentParam.length !== 2) { 30 | continue; 31 | } 32 | 33 | if (currentParam[0] == paramKey) { 34 | keyword = decodeURIComponent(currentParam[1].replace(/\+/g, "%20")); 35 | } 36 | } 37 | 38 | if (keyword !== "") { 39 | $(".contents").unmark({ 40 | done: function() { 41 | 
$(".contents").mark(keyword); 42 | } 43 | }); 44 | } 45 | } 46 | }; 47 | 48 | mark(); 49 | }); 50 | }); 51 | 52 | /* Search term highlighting ------------------------------*/ 53 | 54 | function matchedWords(hit) { 55 | var words = []; 56 | 57 | var hierarchy = hit._highlightResult.hierarchy; 58 | // loop to fetch from lvl0, lvl1, etc. 59 | for (var idx in hierarchy) { 60 | words = words.concat(hierarchy[idx].matchedWords); 61 | } 62 | 63 | var content = hit._highlightResult.content; 64 | if (content) { 65 | words = words.concat(content.matchedWords); 66 | } 67 | 68 | // return unique words 69 | var words_uniq = [...new Set(words)]; 70 | return words_uniq; 71 | } 72 | 73 | function updateHitURL(hit) { 74 | 75 | var words = matchedWords(hit); 76 | var url = ""; 77 | 78 | if (hit.anchor) { 79 | url = hit.url_without_anchor + '?q=' + escape(words.join(" ")) + '#' + hit.anchor; 80 | } else { 81 | url = hit.url + '?q=' + escape(words.join(" ")); 82 | } 83 | 84 | return url; 85 | } 86 | -------------------------------------------------------------------------------- /docs/link.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 8 | 12 | 13 | -------------------------------------------------------------------------------- /docs/pbmc_facs.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/pbmc_facs.RData -------------------------------------------------------------------------------- /docs/pkgdown.js: -------------------------------------------------------------------------------- 1 | /* http://gregfranko.com/blog/jquery-best-practices/ */ 2 | (function($) { 3 | $(function() { 4 | 5 | $('.navbar-fixed-top').headroom(); 6 | 7 | $('body').css('padding-top', $('.navbar').height() + 10); 8 | $(window).resize(function(){ 9 | $('body').css('padding-top', $('.navbar').height() + 10); 10 | }); 
11 | 12 | $('[data-toggle="tooltip"]').tooltip(); 13 | 14 | var cur_path = paths(location.pathname); 15 | var links = $("#navbar ul li a"); 16 | var max_length = -1; 17 | var pos = -1; 18 | for (var i = 0; i < links.length; i++) { 19 | if (links[i].getAttribute("href") === "#") 20 | continue; 21 | // Ignore external links 22 | if (links[i].host !== location.host) 23 | continue; 24 | 25 | var nav_path = paths(links[i].pathname); 26 | 27 | var length = prefix_length(nav_path, cur_path); 28 | if (length > max_length) { 29 | max_length = length; 30 | pos = i; 31 | } 32 | } 33 | 34 | // Add class to parent <li>, and enclosing <li> if in dropdown 35 | if (pos >= 0) { 36 | var menu_anchor = $(links[pos]); 37 | menu_anchor.parent().addClass("active"); 38 | menu_anchor.closest("li.dropdown").addClass("active"); 39 | } 40 | }); 41 | 42 | function paths(pathname) { 43 | var pieces = pathname.split("/"); 44 | pieces.shift(); // always starts with / 45 | 46 | var end = pieces[pieces.length - 1]; 47 | if (end === "index.html" || end === "") 48 | pieces.pop(); 49 | return(pieces); 50 | } 51 | 52 | // Returns -1 if not found 53 | function prefix_length(needle, haystack) { 54 | if (needle.length > haystack.length) 55 | return(-1); 56 | 57 | // Special case for length-0 haystack, since for loop won't run 58 | if (haystack.length === 0) { 59 | return(needle.length === 0 ? 0 : -1); 60 | } 61 | 62 | for (var i = 0; i < haystack.length; i++) { 63 | if (needle[i] != haystack[i]) 64 | return(i); 65 | } 66 | 67 | return(haystack.length); 68 | } 69 | 70 | /* Clipboard --------------------------*/ 71 | 72 | function changeTooltipMessage(element, msg) { 73 | var tooltipOriginalTitle=element.getAttribute('data-original-title'); 74 | element.setAttribute('data-original-title', msg); 75 | $(element).tooltip('show'); 76 | element.setAttribute('data-original-title', tooltipOriginalTitle); 77 | } 78 | 79 | if(ClipboardJS.isSupported()) { 80 | $(document).ready(function() { 81 | var copyButton = ""; 82 | 83 | $("div.sourceCode").addClass("hasCopyButton"); 84 | 85 | // Insert copy buttons: 86 | $(copyButton).prependTo(".hasCopyButton"); 87 | 88 | // Initialize tooltips: 89 | $('.btn-copy-ex').tooltip({container: 'body'}); 90 | 91 | // Initialize clipboard: 92 | var clipboardBtnCopies = new ClipboardJS('[data-clipboard-copy]', { 93 | text: function(trigger) { 94 | return trigger.parentNode.textContent.replace(/\n#>[^\n]*/g, ""); 95 | } 96 | }); 97 | 98 | clipboardBtnCopies.on('success', function(e) { 99 | changeTooltipMessage(e.trigger, 'Copied!'); 100 | e.clearSelection(); 101 | }); 102 | 103 | 
clipboardBtnCopies.on('error', function() { 104 | changeTooltipMessage(e.trigger,'Press Ctrl+C or Command+C to copy'); 105 | }); 106 | }); 107 | } 108 | })(window.jQuery || window.$) 109 | -------------------------------------------------------------------------------- /docs/pkgdown.yml: -------------------------------------------------------------------------------- 1 | pandoc: 2.17.1.1 2 | pkgdown: 2.0.2 3 | pkgdown_sha: ~ 4 | articles: 5 | relationship: relationship.html 6 | single_cell_rnaseq_basic: single_cell_rnaseq_basic.html 7 | single_cell_rnaseq_practical: single_cell_rnaseq_practical.html 8 | topics_vs_clusters: topics_vs_clusters.html 9 | last_built: 2023-03-13T20:41Z 10 | 11 | -------------------------------------------------------------------------------- /docs/reference/Rplot001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/reference/Rplot001.png -------------------------------------------------------------------------------- /docs/reference/Rplot002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/reference/Rplot002.png -------------------------------------------------------------------------------- /docs/reference/Rplot003.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/reference/Rplot003.png -------------------------------------------------------------------------------- /docs/reference/Rplot004.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/reference/Rplot004.png 
-------------------------------------------------------------------------------- /docs/reference/Rplot005.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/reference/Rplot005.png -------------------------------------------------------------------------------- /docs/reference/Rplot006.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/reference/Rplot006.png -------------------------------------------------------------------------------- /docs/reference/de_analysis-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/reference/de_analysis-1.png -------------------------------------------------------------------------------- /docs/reference/embeddings_from_topics-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/reference/embeddings_from_topics-1.png -------------------------------------------------------------------------------- /docs/reference/embeddings_from_topics-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/reference/embeddings_from_topics-2.png -------------------------------------------------------------------------------- /docs/reference/embeddings_from_topics-3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/reference/embeddings_from_topics-3.png -------------------------------------------------------------------------------- /docs/reference/embeddings_from_topics-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/reference/embeddings_from_topics-4.png -------------------------------------------------------------------------------- /docs/reference/embeddings_from_topics-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/reference/embeddings_from_topics-5.png -------------------------------------------------------------------------------- /docs/reference/embeddings_from_topics-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/reference/embeddings_from_topics-6.png -------------------------------------------------------------------------------- /docs/reference/fit_poisson_nmf-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/reference/fit_poisson_nmf-1.png -------------------------------------------------------------------------------- /docs/reference/fit_poisson_nmf-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/reference/fit_poisson_nmf-2.png -------------------------------------------------------------------------------- /docs/reference/predict-1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/reference/predict-1.png -------------------------------------------------------------------------------- /docs/reference/predict-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/reference/predict-2.png -------------------------------------------------------------------------------- /docs/reference/predict-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/reference/predict-3.png -------------------------------------------------------------------------------- /docs/reference/predict-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/docs/reference/predict-4.png -------------------------------------------------------------------------------- /docs/sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | /404.html 5 | 6 | 7 | /LICENSE-text.html 8 | 9 | 10 | /articles/index.html 11 | 12 | 13 | /articles/relationship.html 14 | 15 | 16 | /articles/single_cell_rnaseq_basic.html 17 | 18 | 19 | /articles/single_cell_rnaseq_practical.html 20 | 21 | 22 | /articles/topics_vs_clusters.html 23 | 24 | 25 | /articles/volcano_plot_t_cells.html 26 | 27 | 28 | /authors.html 29 | 30 | 31 | /index.html 32 | 33 | 34 | /pbmc_de_analysis.html 35 | 36 | 37 | /reference/compare_fits.html 38 | 39 | 40 | /reference/compare_poisson_nmf_fits.html 41 | 42 | 43 | /reference/de_analysis.html 44 | 45 | 46 | /reference/diff_count_analysis.html 47 | 48 | 49 | 
/reference/embedding_plots.html 50 | 51 | 52 | /reference/embeddings_from_topics.html 53 | 54 | 55 | /reference/fit_multinom_model.html 56 | 57 | 58 | /reference/fit_poisson_nmf.html 59 | 60 | 61 | /reference/fit_topic_model.html 62 | 63 | 64 | /reference/index.html 65 | 66 | 67 | /reference/likelihood.html 68 | 69 | 70 | /reference/loadings_plot.html 71 | 72 | 73 | /reference/merge_topics.html 74 | 75 | 76 | /reference/multinom2poisson.html 77 | 78 | 79 | /reference/pbmc_4k.html 80 | 81 | 82 | /reference/pbmc_facs.html 83 | 84 | 85 | /reference/pca_plot.html 86 | 87 | 88 | /reference/plot_loglik_vs_rank.html 89 | 90 | 91 | /reference/plot_progress.html 92 | 93 | 94 | /reference/plot_progress_poisson_nmf.html 95 | 96 | 97 | /reference/poisson2multinom.html 98 | 99 | 100 | /reference/predict.html 101 | 102 | 103 | /reference/run_homer.html 104 | 105 | 106 | /reference/select_loadings.html 107 | 108 | 109 | /reference/simulate_count_data.html 110 | 111 | 112 | /reference/simulate_gene_data.html 113 | 114 | 115 | /reference/simulate_toy_gene_data.html 116 | 117 | 118 | /reference/structure_plot.html 119 | 120 | 121 | /reference/summary.poisson_nmf_fit.html 122 | 123 | 124 | /reference/tsne_from_topics.html 125 | 126 | 127 | /reference/tsne_plot.html 128 | 129 | 130 | /reference/volcano_plot.html 131 | 132 | 133 | -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | citHeader("To cite the fastTopics package, please use:") 2 | 3 | bibentry(bibtype = "Article", 4 | title = paste("Visualizing the structure of RNA-seq expression data", 5 | "using grade of membership models"), 6 | author = c(person("Kushal K. 
Dey"), 7 | person("Chiaowen Joyce Hsiao"), 8 | person("Matthew Stephens")), 9 | journal = "PLoS Genetics", 10 | volume = 13, 11 | number = 3, 12 | pages = "e1006599", 13 | year = "2017", 14 | url = "https://doi.org/10.1371/journal.pgen.1006599", 15 | textVersion = 16 | paste("Kushal K. Dey, Chiaowen Joyce Hsiao and Matthew Stephens", 17 | "(2017). Visualizing the structure of RNA-seq expression", 18 | "data using grade of membership models. PLoS Genetics", 19 | "13(3), e1006599, doi:10.1371/journal.pgen.1006599")) 20 | 21 | bibentry(bibtype = "Article", 22 | title = paste("Non-negative matrix factorization algorithms greatly", 23 | "improve topic model fits"), 24 | author = c(person("Peter Carbonetto"), 25 | person("Abhishek Sarkar"), 26 | person("Zihao Wang"), 27 | person("Matthew Stephens")), 28 | journal = "arXiv", 29 | volume = "2105.13440", 30 | eprint = "2105.13440", 31 | year = "2021", 32 | archivePrefix = "arXiv", 33 | url = "https://arxiv.org/abs/2105.13440", 34 | textVersion = paste("Peter Carbonetto, Abhishek Sarkar, Zihao Wang", 35 | "and Matthew Stephens (2021). Non-negative matrix", 36 | "factorization algorithms greatly improve topic model", 37 | "fits. 
arXiv 2105.13440.")) 38 | 39 | bibentry(header = "If de_analysis is used, please also cite:", 40 | bibtype = "Article", 41 | title = paste("Interpreting structure in sequence count data with", 42 | "differential expression analysis allowing for grades of", 43 | "membership"), 44 | author = c(person("Peter Carbonetto"), 45 | person("Kaixuan Luo"), 46 | person("Abhishek Sarkar"), 47 | person("Anthony Hung"), 48 | person("Karl Tayeb"), 49 | person("Sebastian Pott"), 50 | person("Matthew Stephens")), 51 | journal = "Genome Biology", 52 | doi = "10.1186/s13059-023-03067-9", 53 | volume = 24, 54 | pages = 236, 55 | year = 2023, 56 | textVersion = paste("Peter Carbonetto, Kaixuan Luo, Abhishek Sarkar,", 57 | "Anthony Hung, Karl Tayeb, Sebastian Pott and Matthew Stephens.", 58 | "GoM DE: interpreting structure in sequence count data with differential", 59 | "expression analysis allowing for grades of membership. Genome Biology", 60 | "24: 236 (2023). https://doi.org/10.1186/s13059-023-03067-9")) 61 | 62 | -------------------------------------------------------------------------------- /inst/COPYRIGHTS: -------------------------------------------------------------------------------- 1 | This file contains additional copyright information about code adapted 2 | from other R packages. 3 | 4 | Portions of the C++ code in src/poismix.cpp and src/scd.cpp were 5 | adapted from R and C++ code developed by Eric Xihui Lin and Paul 6 | C. Boutros, which is available for download at 7 | https://github.com/linxihui/NNLM. This code is distributed under the 8 | 2-Clause BSD license and retains the following copyright: 9 | 10 | YEAR: 2015 11 | COPYRIGHT HOLDER: Eric Xihui Lin, the Boutros Lab and the Ontario Institute for Cancer Research 12 | 13 | Portions of the R code in R/topicscore.R were adapted from R code 14 | developed by Minzhe Wang and Tracy Ke. 
This code is distributed under 15 | the MIT license and retains the following copyright: 16 | 17 | YEAR: 2019 18 | COPYRIGHT HOLDER: Minzhe Wang 19 | -------------------------------------------------------------------------------- /inst/code/check_map.R: -------------------------------------------------------------------------------- 1 | library(Compositional) 2 | 3 | # Simulate a 80 x 100 data set. 4 | set.seed(1) 5 | n <- 80 6 | m <- 100 7 | k <- 3 8 | dat <- simulate_count_data(n,m,k) 9 | X <- dat$X 10 | L <- dat$L 11 | F <- dat$F 12 | a <- matrix(abs(rnorm(m*k)) + 1,m,k) 13 | b <- abs(rnorm(k)) 14 | 15 | N <- 100 16 | f0 <- rep(0,N) 17 | f1 <- rep(0,N) 18 | f2 <- rep(0,N) 19 | f3 <- rep(0,N) 20 | for (i in 1:N) { 21 | 22 | # Compute the penalized likelihood for the multinomial topic model 23 | # with a Dirichlet prior. 24 | fit <- list(L = L,F = F) 25 | class(fit) <- c("poisson_nmf_fit","list") 26 | f0[i] <- sum(loglik_multinom_topic_model(X,poisson2multinom(fit),e = 0)) 27 | for (j in 1:k) 28 | f0[i] <- f0[i] + ddiri(fit$F[,j],a[,j],logged = TRUE) 29 | 30 | # Compute the multinomial topic model likelihood with "pseudodata". 31 | Y <- rbind(X,t(a - 1)) 32 | fit2 <- fit 33 | u <- colSums(a - 1)/b 34 | fit2$L <- rbind(fit$L,diag(k)) 35 | fit2$F <- scale.cols(fit2$F,u) 36 | fit2$L <- scale.cols(fit2$L,1/u) 37 | f1[i] <- sum(loglik_multinom_topic_model(Y,poisson2multinom(fit2),e = 0)) 38 | 39 | # Compute the penalized Poisson NMF likelihood with a gamma prior. 40 | f2[i] <- sum(loglik_poisson_nmf(X,fit,e = 0)) 41 | for (j in 1:k) 42 | f2[i] <- f2[i] + sum(dgamma(fit$F[,j],a[,j],b[j],log = TRUE)) 43 | 44 | # Compute Poisson NMF likelihood with "pseudodata". 45 | f3[i] <- sum(loglik_poisson_nmf(Y,fit2,e = 0)) 46 | 47 | # Tweak the fit. 
48 | L <- L * matrix(exp(rnorm(n*k,sd = 0.1)),n,k) 49 | F <- F * matrix(exp(rnorm(m*k,sd = 0.1)),m,k) 50 | } 51 | 52 | # The multinomial penalized log-likelihoods and the multinomial 53 | # log-likelihoods with pseudodata should be equal up to a constant. 54 | plot(f0,f1,pch = 20) 55 | 56 | # The Poisson NMF penalized log-likelihoods and the Poisson NMF 57 | # log-likelihoods with pseudodata should be equal up to a constant. 58 | plot(f2,f3,pch = 20) 59 | -------------------------------------------------------------------------------- /inst/code/check_poisson_hessian.R: -------------------------------------------------------------------------------- 1 | # Verify gradient and Hessian calculations for the "single gene" 2 | # Poisson model. 3 | library(pracma) 4 | 5 | # Simulate data x ~ Pois(u), with u = l0*f0 + l1*f1. 6 | set.seed(1) 7 | n <- 40 8 | f0 <- 0.1 9 | f1 <- 1 10 | s <- sample(10,n,replace = TRUE) 11 | u <- runif(n) 12 | l0 <- s*(1-u) 13 | l1 <- s*u 14 | x <- rpois(n,l0*f0 + l1*f1) 15 | 16 | # Compute the log-likelihood under the model x ~ Pois(u), with 17 | # Poisson rates u = l0*f0 + l1*f1. 18 | loglik <- function (x, l0, l1, f0, f1) 19 | sum(dpois(x,l0*f0 + l1*f1,log = TRUE)) 20 | 21 | # Compute the gradient of the log-likelihood with respect to log(f0) 22 | # and log(f1). 23 | loglik_grad <- function (x, l0, l1, f0, f1) { 24 | u <- l0*f0 + l1*f1 25 | y <- x/u - 1 26 | return(c(f0*sum(l0*y), 27 | f1*sum(l1*y))) 28 | } 29 | 30 | # Compute the MLEs of f0 and f1. 31 | control <- glm.control(epsilon = 1e-10, maxit = 100) 32 | dat <- data.frame(x = x,f0 = l0,f1 = l1) 33 | fit <- glm(x ~ f0 + f1 - 1,family = poisson(link = "identity"), 34 | data = dat,start = c(0.5,0.5),control = control) 35 | f0 <- coef(fit)["f0"] 36 | f1 <- coef(fit)["f1"] 37 | 38 | # Compare loglik_grad and loglik_hessian against numerical gradients 39 | # calculated using finite differences. 
40 | cat("gradient:\n") 41 | print(grad(function (v) loglik(x,l0,l1,exp(v[1]),exp(v[2])),log(c(f0,f1))), 42 | digits = 12) 43 | print(loglik_grad(x,l0,l1,f0,f1),digits = 12) 44 | cat("Hessian:\n") 45 | print(rbind(grad(function (v) loglik_grad(x,l0,l1,exp(v[1]),exp(v[2]))[1], 46 | log(c(f0,f1))), 47 | grad(function (v) loglik_grad(x,l0,l1,exp(v[1]),exp(v[2]))[2], 48 | log(c(f0,f1)))), 49 | digits = 12) 50 | print(-solve(compute_poisson_covariance(x,cbind(l0,l1),coef(fit))),digits = 12) 51 | -------------------------------------------------------------------------------- /inst/code/compile_newsgroups_results_for_annotation.R: -------------------------------------------------------------------------------- 1 | # I run this after compute_newsgroups_topics.R to compile the key 2 | # matrices I would like to keep for subsequent analyses. 3 | library(tools) 4 | library(fastTopics) 5 | 6 | # Load the newsgroups data. 7 | load("../data/newsgroups.RData") 8 | 9 | # Load the output generated by the compute_newsgroups_topics.R script. 10 | load("../output/newsgroups_topics.RData") 11 | 12 | # Get the topic proportions matrix. 13 | L <- poisson2multinom(pnmf)$L 14 | 15 | # Get the posterior mean log-fold changes (compared to the mean word 16 | # frequencies), and call this the F matrix. Note that here we are 17 | # using the base-2 logarithm. 18 | F <- de_vsnull$postmean 19 | 20 | # Save the compiled results to an .Rdata file. 21 | newsgroups <- list(topics = topics,L = L,F = F) 22 | save(list = "newsgroups",file = "newsgroups.RData") 23 | resaveRdaFiles("newsgroups.RData") 24 | -------------------------------------------------------------------------------- /inst/code/compute_newsgroups_topics.R: -------------------------------------------------------------------------------- 1 | # Analyze the "20 Newsgroups" data using fastTopics. 
2 | # 3 | # sinteractive --mem=24G -c 8 --time=24:00:00 4 | # module load R/4.2.0 5 | # .libPaths()[1] 6 | # /home/pcarbo/R_libs_4_20 7 | library(tools) 8 | library(Matrix) 9 | library(fastTopics) 10 | load("../datafiles/newsgroups.RData") 11 | set.seed(1) 12 | 13 | # Remove words that appear in fewer than 10 documents. 14 | x <- colSums(counts > 0) 15 | j <- which(x > 9) 16 | counts <- counts[,j] 17 | 18 | # Fit a Poisson NMF using fastTopics, with k = 10 factors/topics. 19 | pnmf <- fit_poisson_nmf(counts,k = 10,numiter = 100,method = "em", 20 | control = list(numiter = 4,nc = 8,extrapolate = FALSE), 21 | init.method = "random",verbose = "detailed") 22 | pnmf <- fit_poisson_nmf(counts,fit0 = pnmf,numiter = 100,method = "scd", 23 | control = list(numiter = 4,nc = 8,extrapolate = TRUE), 24 | verbose = "detailed") 25 | 26 | # Perform the "grade of membership" differential expression analysis 27 | # using the fitted Poisson NMF model. 28 | de_le <- de_analysis(pnmf,counts,shrink.method = "ash", 29 | lfc.stat = "le",pseudocount = 0.1, 30 | control = list(ns = 1e4,nc = 8,nsplit = 1000)) 31 | de_vsnull <- de_analysis(pnmf,counts,shrink.method = "ash", 32 | lfc.stat = "vsnull",pseudocount = 0.1, 33 | control = list(ns = 1e4,nc = 8,nsplit = 1000)) 34 | 35 | # Save the outputs to an .Rdata file. 
36 | session_info <- sessionInfo() 37 | save(list = c("pnmf","de_le","de_vsnull","session_info"), 38 | file = "newsgroups_topics.RData") 39 | resaveRdaFiles("newsgroups_topics.RData") 40 | -------------------------------------------------------------------------------- /inst/code/droplet.R: -------------------------------------------------------------------------------- 1 | #' @name droplet 2 | #' 3 | #' @title Droplet single-cell RNA-seq read count data from Montoro 4 | #' \emph{et al} (2018) 5 | #' 6 | #' @docType data 7 | #' 8 | #' @description These data are gene expression profiles of trachea 9 | #' epithelial cells in C57BL/6 mice obtained using droplet-based 3' 10 | #' single-cell RNA-seq. They were prepared from file 11 | #' \code{GSE103354_Trachea_droplet_UMIcounts.txt.gz} downloaded from 12 | #' the Gene Expression Omnibus (GEO) website, accession GSE103354. 13 | #' 14 | #' @format \code{droplet} is a 7,193 x 17,133 sparse matrix of read 15 | #' counts, with rows corresponding to samples (cells), and columns 16 | #' corresponding to genes. 17 | #' 18 | #' @references 19 | #' 20 | #' D. T. Montoro \emph{et al} (2018). A revised airway epithelial 21 | #' hierarchy includes CFTR-expressing ionocytes. \emph{Nature} \bold{560}, 22 | #' 319--324. 23 | #' 24 | #' @keywords data 25 | #' 26 | #' @examples 27 | #' 28 | #' # Roughly 10% of the read counts are greater than zero. 29 | #' data(droplet) 30 | #' nnzero(droplet)/length(droplet) 31 | #' 32 | NULL 33 | -------------------------------------------------------------------------------- /inst/code/lda.R: -------------------------------------------------------------------------------- 1 | lda <- function (X, F, L, alpha = rep(1,ncol(F)), numiter = 1000) { 2 | 3 | # Get the number of rows (n) and columns (m) of X, and the number of 4 | # topics. 
5 | n <- nrow(X) 6 | m <- ncol(X) 7 | k <- ncol(F) 8 | 9 | # This variable is used to keep track of the algorithm's progress; 10 | # it stores the value of the objective (the variational lower bound, 11 | # or "ELBO") at each iteration. 12 | value <- rep(0,numiter) 13 | 14 | # Iterate the E and M steps. 15 | cat("iter --objective(ELBO)-- max.diff\n") 16 | for (iter in seq_len(numiter)) { 17 | L0 <- L 18 | F0 <- F 19 | 20 | # E STEP 21 | # ------ 22 | # Update the expected topic counts (N) and expected word counts (M). 23 | N <- matrix(0,n,k) 24 | M <- matrix(0,m,k) 25 | for (i in seq_len(n)) { 26 | P <- scale.cols(F,exp(digamma(L[i,]))) 27 | P <- P / rowSums(P) 28 | N[i,] <- X[i,] %*% P 29 | M <- M + X[i,] * P 30 | } 31 | 32 | # M STEP 33 | # ------ 34 | # Update the topic proportions (loadings); alpha[j] is added to column j. 35 | L <- sweep(N,2,alpha,"+") 36 | 37 | # Update the word probabilities (factors). 38 | F <- scale.cols(M + 1e-6) 39 | 40 | # Compute the variational lower bound at the current solution; elbo.lda returns one term per row of X, so sum them. 41 | value[iter] <- sum(elbo.lda(X,F,L,alpha)) 42 | cat(sprintf("%4d %+0.12e %0.2e\n",iter,value[iter], 43 | max(max(abs(L - L0)),max(abs(F - F0))))) 44 | } 45 | 46 | # Return the estimates of the topic proportions (L) and word 47 | # probabilities (F), and the value of the objective at each 48 | # iteration ("value"). 
49 | return(list(F = F,L = L,value = value)) 50 | } 51 | 52 | elbo.lda <- function (X, F, L, alpha) { 53 | n <- nrow(X) 54 | f <- rep(0,n) 55 | for (i in 1:n) { 56 | L[i,] <- L[i,] * (sum(alpha) + sum(X[i,])) 57 | P <- scale.cols(F,exp(digamma(L[i,]))) 58 | P <- P / rowSums(P) 59 | u <- digamma(L[i,]) - digamma(sum(L[i,])) 60 | f[i] <- (lgamma(sum(alpha)) - lgamma(sum(L[i,])) 61 | + sum(lgamma(L[i,])) - sum(lgamma(alpha)) 62 | + sum((alpha - L[i,]) * u) 63 | + sum(X[i,] %*% (scale.cols(P,u) + P*log(F) - P*log(P)))) 64 | } 65 | return(f) 66 | } 67 | -------------------------------------------------------------------------------- /inst/code/multinom_demo.R: -------------------------------------------------------------------------------- 1 | # Short script to verify implementation of the differential expression 2 | # (DE) analysis methods applied to data simulated from a multinomial 3 | # topic model. 4 | library(Matrix) 5 | library(ggplot2) 6 | library(cowplot) 7 | 8 | # Simulate data. 9 | set.seed(1) 10 | n <- 400 11 | m <- 1000 12 | k <- 4 13 | dat <- simulate_multinom_gene_data(n,m,k,sparse = TRUE) 14 | X <- dat$X 15 | L <- dat$L 16 | 17 | # Fit a Poisson model (approximating a binomial model) to each gene 18 | # (row) j, and compute the log-fold change statistics. 19 | fit <- init_poisson_nmf(X,L = L,init.method = "random") 20 | de1 <- de_analysis(fit,X,fit.method = "glm") 21 | de2 <- de_analysis(fit,X,fit.method = "scd") 22 | de3 <- de_analysis(fit,X,fit.method = "em") 23 | 24 | # Compare the glm and scd estimates of the model parameters. 25 | plot(de1$F + 1e-4,de2$F + 1e-4,pch = 4,cex = 0.5,log = "xy",xlab = "glm", 26 | ylab = "scd") 27 | abline(a = 0,b = 1,col = "magenta",lty = "dotted") 28 | 29 | # Compare the glm and EM estimates of the model parameters. 
30 | plot(de1$F + 1e-4,de3$F + 1e-4,pch = 4,cex = 0.5,log = "xy",xlab = "glm", 31 | ylab = "em") 32 | abline(a = 0,b = 1,col = "magenta",lty = "dotted") 33 | 34 | # Compare the scd estimates against the probabilities used to simulate 35 | # the data. 36 | plot(dat$F + 1e-4,de2$F + 1e-4,pch = 4,cex = 0.5,log = "xy",xlab = "true", 37 | ylab = "estimated") 38 | abline(a = 0,b = 1,col = "magenta",lty = "dotted") 39 | 40 | # Here we show that the z-score varies (as expected) with the log-fold 41 | # change estimate and the average expression level. 42 | pdat <- data.frame(f0 = rep(de3$f0,4), 43 | postmean = as.vector(de3$postmean), 44 | z = as.vector(de3$z)) 45 | print(ggplot(pdat,aes(x = f0,y = postmean,fill = z)) + 46 | geom_point(size = 2,shape = 21,color = "white") + 47 | geom_abline(intercept = 0,slope = 0,color = "black",linetype = "dotted") + 48 | scale_x_continuous(trans = "log10") + 49 | scale_fill_gradient2(low = "darkblue",mid = "lightskyblue", 50 | high = "orangered",midpoint = 0) + 51 | labs(x = "average expression",y = "log-fold change",fill = "z-score") + 52 | theme_cowplot(12)) 53 | -------------------------------------------------------------------------------- /inst/code/pbmc_demo.R: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | library(ggrepel) 3 | library(cowplot) 4 | library(Matrix) 5 | 6 | # Load the data. 7 | set.seed(1) 8 | data(pbmc_facs) 9 | genes <- pbmc_facs$genes 10 | X <- pbmc_facs$counts 11 | fit <- pbmc_facs$fit 12 | 13 | # Perform the differential expression analysis, with and without 14 | # shrinkage. 15 | set.seed(1) 16 | out1 <- de_analysis(fit,X,shrink.method = "none", 17 | control = list(nc = 4,nsplit = 400)) 18 | set.seed(1) 19 | out2 <- de_analysis(fit,X,shrink.method = "ash", 20 | control = list(nc = 4,nsplit = 400)) 21 | 22 | # Plot the distribution of MCMC acceptance rates. 23 | hist(out2$ar,n = 64) 24 | 25 | # Compare the LFC estimates with and without shrinkage. 
26 | pdat <- data.frame(postmean1 = as.vector(out1$postmean), 27 | postmean2 = as.vector(out2$postmean)) 28 | p1 <- ggplot(pdat,aes(x = postmean1,y = postmean2)) + 29 | geom_point(shape = 21,color = "white",fill = "darkblue",na.rm = TRUE) + 30 | geom_abline(intercept = 0,slope = 1,color = "magenta",linetype = "dotted") + 31 | labs(x = "posterior mean estimate",y = "stabilized posterior estimate") + 32 | theme_cowplot(font_size = 10) 33 | 34 | # Create a volcano plot to visualize the DE results for topic k = 4. 35 | k <- "k4" 36 | pdat <- data.frame(gene = genes$symbol, 37 | postmean = out2$postmean[,k], 38 | z = pmin(40,abs(out2$z[,k])), 39 | lfsr = cut(out2$lfsr[,k],c(-1,0.001,0.01,0.05,Inf)), 40 | stringsAsFactors = FALSE) 41 | rows <- which(with(pdat,!(postmean > 3 | (postmean > 0 & z > 10)))) 42 | pdat[rows,"gene"] <- "" 43 | p2 <- ggplot(pdat,aes(x = postmean,y = z,fill = lfsr,label = gene)) + 44 | geom_point(color = "white",stroke = 0.3,shape = 21, 45 | na.rm = TRUE) + 46 | geom_text_repel(color = "darkgray",size = 2.25,fontface = "italic", 47 | segment.color = "darkgray",segment.size = 0.25, 48 | min.segment.length = 0,max.overlaps = Inf, 49 | na.rm = TRUE) + 50 | scale_y_continuous(trans = "sqrt",breaks = c(0,1,2,5,10,20,50)) + 51 | scale_fill_manual(values = c("deepskyblue","gold","orange","tomato"), 52 | na.value = "gainsboro") + 53 | labs(x = "log-fold change",y = "|z-score|") + 54 | theme_cowplot(font_size = 10) 55 | -------------------------------------------------------------------------------- /inst/code/plsi.R: -------------------------------------------------------------------------------- 1 | # Small script to verify that the Poisson NMF multiplicative updates 2 | # are equivalent to the pLSI EM updates. 3 | 4 | # Simulate a 100 x 200 counts matrix. 
5 | set.seed(1) 6 | n <- 100 7 | m <- 200 8 | k <- 3 9 | out <- simulate_count_data(n,m,k) 10 | X <- out$X 11 | A <- out$L 12 | B <- t(out$F) 13 | 14 | # Apply the EM updates for pLSI and Poisson NMF in parallel. 15 | N <- rowSums(X) 16 | for (iter in 1:20) { 17 | out <- get_multinom_from_pnmf(t(B),A) 18 | L <- out$L 19 | F <- out$F 20 | 21 | # Apply the multiplicative (EM) update for L. 22 | A <- scale.cols(A * tcrossprod(X / (A %*% B),B),1/rowSums(B)) 23 | 24 | # Apply the pLSI EM update for L. 25 | P <- matrix(0,m,k) 26 | for (i in 1:n) { 27 | for (j in 1:m) 28 | P[j,] <- F[j,]*L[i,]/sum(F[j,]*L[i,]) 29 | L[i,] <- (X[i,] %*% P)/N[i] 30 | } 31 | 32 | # Compare the updated L matrices. 33 | out <- get_multinom_from_pnmf(t(B),A) 34 | cat(sprintf("%0.1e ",max(abs(out$L - L)))) 35 | 36 | # Apply the multiplicative (EM) update for F. 37 | B <- B * crossprod(A,X / (A %*% B)) / colSums(A) 38 | 39 | # Apply the pLSI EM update for F. 40 | P <- matrix(0,n,k) 41 | for (j in 1:m) { 42 | for (i in 1:n) 43 | P[i,] <- F[j,]*L[i,]/sum(F[j,]*L[i,]) 44 | F[j,] <- X[,j] %*% P 45 | } 46 | F <- normalize.cols(F) 47 | 48 | # Compare the updated F matrices 49 | out <- get_multinom_from_pnmf(t(B),A) 50 | cat(sprintf("%0.1e\n",max(abs(out$F - F)))) 51 | } 52 | -------------------------------------------------------------------------------- /inst/code/pois_vs_binom.R: -------------------------------------------------------------------------------- 1 | # Small script to illustrate the Poisson approximation to the binomial 2 | # likelihood. 3 | 4 | # Simulate binomial data. 5 | set.seed(1) 6 | n <- 1000 7 | p <- 0.1 8 | x <- rbinom(1,n,p) 9 | 10 | # Plot binomial and Poisson likelihoods. 
11 | p <- seq(0,0.25,length.out = 1000) 12 | f1 <- dbinom(x,n,p,log = TRUE) 13 | f2 <- dpois(x,n*p,log = TRUE) 14 | f1 <- exp(f1 - max(f1)) 15 | f2 <- exp(f2 - max(f2)) 16 | f1 <- f1/sum(f1) 17 | f2 <- f2/sum(f2) 18 | plot(p,f1,type = "l",col = "darkorange",lwd = 2) 19 | lines(p,f2,col = "darkblue",lwd = 2,lty = "dashed") 20 | -------------------------------------------------------------------------------- /inst/code/pois_vs_multinom.R: -------------------------------------------------------------------------------- 1 | # A short script to verify the fit_poisson_models computations against 2 | # fit_topic_model. 3 | # 4 | # Use the commented-out code to show that, in smaller data sets, the f 5 | # parameters in the Poisson glm do not always come close to summing to 6 | # 1; that is, the approximation is not as good for smaller samples. 7 | library(Matrix) 8 | 9 | # Simulate data. 10 | set.seed(1) 11 | n <- 120 # 20 12 | m <- 1000 # 8 13 | k <- 4 14 | dat <- simulate_multinom_gene_data(n,m,k,sparse = TRUE) 15 | # dat <- simulate_poisson_gene_data(n,m,k,s = rep(10,n),sparse = FALSE) 16 | X <- dat$X 17 | L <- dat$L 18 | 19 | # Fit a multinomial topic model, with k = 4 topics. 20 | fit <- fit_topic_model(X,k = 4,init.method = "random") 21 | 22 | # Ensure that none of the topic proportions are exactly zero or 23 | # exactly one. 24 | L <- fit$L 25 | L <- pmax(L,1e-8) 26 | L <- pmin(L,1 - 1e-8) 27 | 28 | s <- rowSums(X) 29 | out <- add_pseudocounts(X,s*L,0.01) 30 | X <- out$X 31 | L <- out$L 32 | 33 | # For each column j of the counts matrix, compute MLEs of the 34 | # parameters in the Poisson glm, x ~ Poisson(u), in which the 35 | # Poisson rates are u = sum(L*f), and f = F[j,]. 36 | F <- fit_poisson_models(X,L,"scd",1e-8,100,1e-8,1) 37 | F <- pmax(F,1e-8) 38 | 39 | # Compare the estimates obtained by computing MLEs under the 40 | # multinomial topic model against the estimates obtained by running 41 | # fit_poisson_models. 
42 | plot(fit$F + 1e-6,F + 1e-6,pch = 4,cex = 0.5,log = "xy") 43 | abline(a = 0,b = 1,col = "skyblue",lty = "dotted") 44 | 45 | # Note that the model parameters estimated in fit_poisson_models no 46 | # longer represent frequencies, but they come close. 47 | print(colSums(fit$F)) 48 | print(colSums(F)) 49 | -------------------------------------------------------------------------------- /inst/code/poisson_demo.R: -------------------------------------------------------------------------------- 1 | # Short script to verify implementation of the differential expression 2 | # (DE) analysis methods applied to data simulated from a Poisson NMF 3 | # model. 4 | library(Matrix) 5 | library(ggplot2) 6 | library(cowplot) 7 | 8 | # Simulate data. 9 | set.seed(1) 10 | n <- 800 11 | m <- 1000 12 | k <- 4 13 | s <- 10^runif(n,-1,1) 14 | dat <- simulate_poisson_gene_data(n,m,k,s) 15 | X <- dat$X 16 | L <- dat$L 17 | Y <- as(X,"dgCMatrix") 18 | mu <- colMeans(X) 19 | f0 <- colSums(X)/sum(s) 20 | 21 | # Add "pseudocounts" to the data. 22 | out <- add_pseudocounts(X,s*L,0.01) 23 | X <- out$X 24 | L <- out$L 25 | 26 | # Fit a Poisson model for each gene. 27 | F1 <- fit_poisson_models(X,L,method = "glm") 28 | F2 <- fit_poisson_models(X,L,method = "scd",nc = 4) 29 | print(range(F1 - F2)) 30 | 31 | # Compare the estimates against the Poisson rates used to simulate the 32 | # data. 33 | e <- 1e-4 34 | i <- 1 35 | plot(dat$F + e,F1 + e,pch = 20,log = "xy",xlab = "true frequency", 36 | ylab = "estimated frequency") 37 | abline(a = 0,b = 1,col = "dodgerblue",lty = "dotted") 38 | 39 | # Compute the log-fold change statistics for each gene j and topic k. 40 | out <- compute_lfc_stats(X,F1,L,f0) 41 | 42 | # Here we show that the z-score varies, as expected, with the log-fold 43 | # change estimate and the average expression level. 
44 | pdat <- data.frame(x = mu,lfc = out$est[,1],z = out$z[,1]) 45 | print(ggplot(pdat,aes(x = x,y = lfc,fill = z)) + 46 | geom_point(size = 2,shape = 21,color = "white") + 47 | geom_abline(intercept = 0,slope = 0,color = "black",linetype = "dotted") + 48 | scale_x_continuous(trans = "log10") + 49 | scale_fill_gradient2(low = "darkblue",mid = "skyblue", 50 | high = "orangered",midpoint = 0) + 51 | labs(x = "average expression",y = "log-fold change",fill = "z-score") + 52 | theme_cowplot(font_size = 12)) 53 | 54 | # Create a volcano plot in which log-fold change is shown on the 55 | # x-axis and the z-score is shown on the y-axis. To illustrate the 56 | # impact of (mean) gene expression level on the z-scores, the (log) 57 | # average expression level is shown by a colour gradient. 58 | print(ggplot(pdat,aes(x = lfc,y = abs(z),fill = log10(x))) + 59 | geom_point(size = 2,shape = 21,color = "white") + 60 | labs(x = "log-fold change",y = "|z-score|",fill = "log10(mean)") + 61 | scale_y_continuous(trans = "sqrt") + 62 | scale_fill_gradient2(low = "skyblue",mid = "gold",high = "orangered", 63 | midpoint = 0) + 64 | theme_cowplot(font_size = 12)) 65 | -------------------------------------------------------------------------------- /inst/code/postfit_motif_analysis_Buenrostro2018.R: -------------------------------------------------------------------------------- 1 | # Perform differential accessibility analysis for ATAC-seq regions (peaks), 2 | # and perform TF motif enrichment analysis using HOMER. 3 | de <- readRDS("DA_regions_topics_noshrinkage_10000iters.rds") 4 | 5 | # For each topic, perform TF motif enrichment analysis using HOMER 6 | # hypergeometric test. 
7 | select_small_pvals <- function (postmean,lpval,lfsr,rank,quantile) 8 | lpval > 1 9 | res <- run_homer(de,k = 4,subset = select_small_pvals) 10 | -------------------------------------------------------------------------------- /inst/code/pseudocounts.R: -------------------------------------------------------------------------------- 1 | # Simulate a 100 x 200 counts matrix. 2 | set.seed(1) 3 | n <- 100 4 | m <- 200 5 | k <- 3 6 | out <- simulate_count_data(n,m,k) 7 | X <- out$X 8 | F <- out$F 9 | L <- out$L 10 | 11 | # Add pseudocounts. 12 | a <- 1.1 13 | b <- 1.1 14 | X <- rbind(X,matrix(a - 1,k,m)) 15 | L <- rbind(L,diag(k)) 16 | X <- cbind(X,matrix(b - 1,n+k,k)) 17 | F <- rbind(F,1e-4 * diag(k)) 18 | 19 | # Fit a multinomial topic model, with k = 3. 20 | s <- rowSums(L) 21 | fit <- init_poisson_nmf(X,F = F,L = L) 22 | fit <- fit_poisson_nmf(X,fit0 = fit,numiter = 400, 23 | update.factors = 1:m, 24 | update.loadings = 1:n, 25 | control = list(extrapolate = TRUE)) 26 | fit.multinom <- poisson2multinom(fit) 27 | 28 | # Apply the pLSI EM update for L. 29 | X <- X[1:n,1:m] 30 | L <- fit.multinom$L[1:n,] 31 | F <- fit.multinom$F[1:m,] 32 | F <- normalize.cols(F) 33 | P <- matrix(0,m,k) 34 | for (i in 1:n) { 35 | for (j in 1:m) 36 | P[j,] <- F[j,]*L[i,]/sum(F[j,]*L[i,]) 37 | L[i,] <- X[i,] %*% P + b - 1 38 | } 39 | L <- normalize.rows(L) 40 | print(range(L - fit.multinom$L[1:n,])) 41 | 42 | # Apply the pLSI EM update for F. 
43 | P <- matrix(0,n,k) 44 | for (j in 1:m) { 45 | for (i in 1:n) 46 | P[i,] <- F[j,]*L[i,]/sum(F[j,]*L[i,]) 47 | F[j,] <- X[,j] %*% P + a - 1 48 | } 49 | F <- normalize.cols(F) 50 | print(range(F - normalize.cols(fit.multinom$F[1:m,]))) 51 | -------------------------------------------------------------------------------- /inst/code/scd.R: -------------------------------------------------------------------------------- 1 | # A small script to illustrate the co-ordinate ascent updates in the 2 | # sequential co-ordinate descent (SCD) algorithm described by Lin & 3 | # Boutros (2018). 4 | # 5 | # Here I enhance the SCD algorithm with a simple backtracking line 6 | # search to guarantee that the objective decreases at each iteration. 7 | # 8 | 9 | # SCRIPT PARAMETERS 10 | # ----------------- 11 | n <- 20 12 | numiter <- 20 13 | line.search <- TRUE 14 | 15 | # SIMULATE DATA 16 | # ------------- 17 | set.seed(49) 18 | w <- rpois(n,2) 19 | a <- abs(rnorm(n)) 20 | b <- abs(rnorm(n)) 21 | 22 | # Solve the following 1-d optimization problem: 23 | # 24 | # minimize f(x) = sum(b*x - w*log(y)) 25 | # subject to y = a + b*x, 26 | # x >= 0. 27 | # 28 | # using a simple sequential quadratic programming (SQP) method. 29 | x <- 1 30 | e <- 1e-15 31 | f <- rep(0,numiter) 32 | for (i in 1:numiter) { 33 | 34 | # Compute the value of the objective at x. 35 | y <- a + b*x; 36 | f[i] <- sum(b*x - w*log(y)) 37 | 38 | # Compute the gradient and Hessian at x. 39 | u <- b/y 40 | h <- sum(w*u^2) 41 | g <- sum(b - w*u) 42 | 43 | # Optionally, perform backtracking line search to determine a 44 | # suitable step size. 
45 | p <- -g/h 46 | if (line.search) { 47 | if (p >= -e) 48 | s <- 1 49 | else 50 | s <- min(1,-x/p) 51 | smin <- e 52 | while (TRUE) { 53 | xnew <- x + s*p 54 | ynew <- a + b*xnew 55 | fnew <- sum(b*xnew - w*log(ynew)) 56 | if (s < smin) { 57 | xnew <- x 58 | s <- 0 59 | break 60 | } else if (fnew < f[i]) 61 | break 62 | else 63 | s <- s/2 64 | } 65 | } else 66 | xnew <- max(0,x + p) 67 | 68 | # Update x. 69 | x <- xnew 70 | } 71 | cat(sprintf("solution: %0.6f\n",x)) 72 | 73 | # Plot the improvement in the solution over time. 74 | y <- f - min(f) + 1e-15 75 | plot(1:numiter,y,type = "l",col = "dodgerblue",lwd = 1,log = "y", 76 | xlab = "iteration",ylab = "distance to minimum") 77 | points(1:numiter,y,pch = 20,col = "dodgerblue") 78 | 79 | # This is a plot I created to compare the progression with and without 80 | # the backtracking line search. 81 | # 82 | # f <- min(c(f.nols,f.ls)) 83 | # y <- f.nols - f + 1e-8 84 | # plot(1:numiter,y,type = "l",col = "dodgerblue",lwd = 1,log = "y", 85 | # xlab = "iteration",ylab = "distance to minimum") 86 | # points(1:numiter,y,pch = 20,col = "dodgerblue") 87 | # y <- f.ls - f + 1e-8 88 | # lines(1:numiter,y,col = "darkorange",lwd = 1) 89 | # points(1:numiter,y,pch = 20,col = "darkorange") 90 | # 91 | -------------------------------------------------------------------------------- /inst/code/simulate_data_for_sfa.R: -------------------------------------------------------------------------------- 1 | library(R.matlab) 2 | set.seed(1) 3 | n <- 400 4 | m <- 1000 5 | k <- 6 6 | dat <- simulate_multinom_gene_data(n,m,k) 7 | X <- dat$X 8 | L <- dat$L 9 | writeMat("sim_multinom.mat",X=X,L=L) 10 | -------------------------------------------------------------------------------- /inst/code/test_hpd.R: -------------------------------------------------------------------------------- 1 | set.seed(1) 2 | n <- 1e4 3 | x <- rnorm(n) 4 | print(hpd(x)) 5 | -------------------------------------------------------------------------------- 
/inst/code/test_poisson_fit.R: -------------------------------------------------------------------------------- 1 | # Verify the Poisson model computations on a small example with k = 2. 2 | library(ggplot2) 3 | library(cowplot) 4 | 5 | # Simulate a Poisson data set. 6 | set.seed(1) 7 | n <- 200 8 | f1 <- 0.1 9 | f2 <- 1 10 | s <- sample(10,n,replace = TRUE) 11 | q <- runif(n) 12 | u <- (1-q)*f1 + q*f2 13 | x <- rpois(n,s*u) 14 | 15 | # Fit the generalized linear model. 16 | control <- glm.control(epsilon = 1e-10, maxit = 100) 17 | L <- cbind(s*(1-q),s*q) 18 | dat <- data.frame(x = x,f1 = L[,1],f2 = L[,2]) 19 | fit <- glm(x ~ f1 + f2 - 1,family = poisson(link = "identity"), 20 | data = dat,start = c(0.5,0.5),control = control) 21 | print(log(coef(fit))) 22 | 23 | # Fit the model parameters using glm with family = poisson(link = 24 | # "identity"). 25 | out <- fit_poisson_glm(x,L) 26 | print(log(out$coef)) 27 | 28 | # Compute the covariance of log(f). 29 | cat("Cov(log(f)) estimated via Laplace approximation:\n") 30 | print(compute_poisson_covariance(x,L,out$coef)) 31 | 32 | # Draw samples from the posterior using random-walk Metropolis. 33 | ns <- 1e5 34 | D <- matrix(rnorm(2*ns),ns,2) 35 | U <- matrix(runif(2*ns),ns,2) 36 | M <- matrix(sample(2,2*ns,replace = TRUE),ns,2) - 1 37 | sim <- simulate_posterior_poisson_rcpp(x,L,out$coef,D,U,M,0.3,1e-15) 38 | cat("Acceptance rates:\n") 39 | print(drop(sim$ar)) 40 | 41 | cat("MCMC estimate of Cov(log(f)):\n") 42 | print(cov(sim$samples)) 43 | 44 | # Get 90% HPD intervals. 45 | cat("MCMC estimates of 90% HPD intervals:\n") 46 | print(hpd(sim$samples[,1],0.9)) 47 | print(hpd(sim$samples[,2],0.9)) 48 | 49 | # Plot the likelihood surface. 
50 | dat <- expand.grid(t1 = seq(-4,1,0.05),t2 = seq(-4,1,0.02)) 51 | n <- nrow(dat) 52 | dat$lik <- 0 53 | for (i in 1:n) { 54 | f <- exp(c(dat[i,1],dat[i,2])) 55 | u <- drop(L %*% f) 56 | dat[i,"lik"] <- sum(dpois(x,u,log = TRUE)) 57 | } 58 | dat$lik <- exp(dat$lik - max(dat$lik)) 59 | p1 <- ggplot(dat,aes(x = t1,y = t2,z = lik)) + 60 | geom_contour(color = "black",bins = 16) + 61 | geom_point(data = as.data.frame(t(log(out$coef))), 62 | mapping = aes(x = f1,y = f2), 63 | color = "red",shape = 4, 64 | inherit.aes = FALSE) + 65 | labs(x = "log(f1)",y = "log(f2)") + 66 | theme_cowplot(font_size = 10) 67 | 68 | # Plot the MCMC density estimate. 69 | sim$samples <- as.data.frame(sim$samples) 70 | names(sim$samples) <- c("k1","k2") 71 | p2 <- ggplot(sim$samples,aes(x = k1,y = k2)) + 72 | geom_density_2d(color = "black") + 73 | geom_point(data = as.data.frame(t(log(out$coef))), 74 | mapping = aes(x = f1,y = f2), 75 | color = "red",shape = 4, 76 | inherit.aes = FALSE) + 77 | labs(x = "log(f1)",y = "log(f2)") + 78 | theme_cowplot(font_size = 10) 79 | print(plot_grid(p1,p2)) 80 | -------------------------------------------------------------------------------- /inst/code/test_poisson_fit_basic.R: -------------------------------------------------------------------------------- 1 | # Simulate binomial data, x ~ binom(s*p0). 2 | set.seed(1) 3 | n <- 50 4 | s <- ceiling(10*runif(n)) 5 | p0 <- 0.05 6 | q <- runif(n) 7 | x <- rbinom(n,s,p0) 8 | 9 | # Fit the basic Poisson model x ~ Pois(s*f0) using glm. 10 | fit <- glm(x ~ f0 - 1,family = poisson(link = "identity"), 11 | data = data.frame(x = x,f0 = s),start = 0.5, 12 | control = glm.control(epsilon = 1e-10, maxit = 100)) 13 | 14 | # Compute the MLE of f0. 15 | f0 <- sum(x)/sum(s) 16 | 17 | # The glm estimate should be the same as f0. 18 | cat(coef(fit),f0,"\n") 19 | 20 | # Compute the s.e. of log(f0) using the Laplace approximation. 21 | se <- 1/sqrt(f0*sum(s)) 22 | 23 | # Compute the s.e. 
of log(f0) using numerical integration. 24 | ns <- 1000 25 | t <- seq(-6,0,length.out = ns) 26 | w <- rep(0,ns) 27 | for (i in 1:ns) 28 | w[i] <- sum(dpois(x,s*exp(t[i]),log = TRUE)) 29 | w <- exp(w - max(w)) 30 | w <- w/sum(w) 31 | mu <- sum(w*t) 32 | se_mc <- sqrt(sum(w*(t - mu)^2)) 33 | 34 | # The two s.e. calculations should be pretty close. 35 | cat(se,se_mc,"\n") 36 | -------------------------------------------------------------------------------- /inst/datafiles/newsgroups.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/inst/datafiles/newsgroups.RData -------------------------------------------------------------------------------- /inst/datafiles/newsgroups_topics.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenslab/fastTopics/3a35d33d3e5ea0314d11965a1700e66035f1d18d/inst/datafiles/newsgroups_topics.RData -------------------------------------------------------------------------------- /man/annotation_heatmap.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/annotation_heatmap.R 3 | \name{annotation_heatmap} 4 | \alias{annotation_heatmap} 5 | \title{Annotation Heatmap} 6 | \usage{ 7 | annotation_heatmap( 8 | effects_matrix, 9 | select_features = c("largest", "distinctive", "both", "all"), 10 | feature_sign = c("both", "positive", "negative"), 11 | dims = colnames(effects_matrix), 12 | compare_dims = colnames(effects_matrix), 13 | n = 2, 14 | show_dims = colnames(effects_matrix), 15 | zero_value = 0.01, 16 | font_size = 10, 17 | verbose = TRUE 18 | ) 19 | } 20 | \arguments{ 21 | \item{effects_matrix}{n x d numeric matrix, where n is the number 22 | of features and d is the number of dimensions. 
This could be for 23 | example the word frequencies matrix \code{F} from a multinomial 24 | topic model fitted using \code{\link{fit_topic_model}}. The rows 25 | and columns of this matrix must be named, otherwise this function 26 | will throw an error.} 27 | 28 | \item{select_features}{This may be a character vector specifying 29 | the features to plot (rows of the effects matrix). Or it may be one 30 | of the following: \code{"largest"}, which automatically selects the 31 | largest effects for each chosen dimension; \code{"distinctive"}, 32 | which automatically selects the \dQuote{most distinctive} effects 33 | for each chosen dimension; or \code{"both"}, which uses both 34 | criteria to select features. Distinctive features are defined as 35 | rows of the effects matrix that are much larger in magnitude than 36 | the effects in the other dimensions that also share the same sign.} 37 | 38 | \item{feature_sign}{For automated selection of features, this 39 | option determines whether to consider positive effects only 40 | (\code{"positive"}), negative effects only (\code{"negative"}), or 41 | both (\code{"both"}).} 42 | 43 | \item{dims}{The dimensions (columns of the effect matrix) to 44 | consider for automatic feature selection. This should be dimension 45 | names (not numbers).} 46 | 47 | \item{compare_dims}{This should be dimension names (not numbers).} 48 | 49 | \item{n}{For automated feature selection, the number of features to 50 | select of each type and for each dimension. (see arguments 51 | \code{select_features} and \code{feature_sign}).} 52 | 53 | \item{show_dims}{The dimensions (columns) to include in the 54 | plot. 
This should be dimension names (not numbers).} 55 | 56 | \item{zero_value}{Numbers smaller than \code{zero_value} (in 57 | magnitude) are not shown in the plot.} 58 | 59 | \item{font_size}{Specifies the font size for the plot.} 60 | 61 | \item{verbose}{When \code{verbose = TRUE}, the list of selected 62 | features (rows) is printed.} 63 | } 64 | \value{ 65 | A \code{ggplot} object. 66 | } 67 | \description{ 68 | This is a generic plotting utility (not specific to 69 | the topic model) for comparing \dQuote{effects} across multiple 70 | dimensions (e.g., topics). The function has several options for 71 | selecting the features to compare. 72 | } 73 | \examples{ 74 | data(newsgroups) 75 | p1 <- annotation_heatmap(newsgroups$F,feature_sign = "positive") 76 | 77 | } 78 | -------------------------------------------------------------------------------- /man/compare_fits.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/summary.R 3 | \name{compare_fits} 4 | \alias{compare_fits} 5 | \title{Summarize and Compare Model Fits} 6 | \usage{ 7 | compare_fits(fits) 8 | } 9 | \arguments{ 10 | \item{fits}{An object of class \code{"poisson_nmf_fit"} or 11 | \code{"multinom_topic_model_fit"}, or a non-empty, named list in 12 | which all list elements are Poisson NMF model fits or all 13 | multinomial topic model fits.} 14 | } 15 | \value{ 16 | A data frame with one row per element of \code{fits}, and 17 | with the following columns: 18 | 19 | \item{k}{The rank of the matrix factorization.} 20 | 21 | \item{loglik}{The log-likelihood (either Poisson NMF or multinomial topic 22 | model likelihood) achieved at the last model fitting update.} 23 | 24 | \item{dev}{For Poisson NMF model fits only, the deviance achieved 25 | at the last model fitting update.} 26 | 27 | \item{res}{The maximum residual of the Karush-Kuhn-Tucker (KKT) 28 | system achieved at the last model 
fitting update; small values 29 | indicate that the solution is close to a local maximum, or 30 | stationary point, of the likelihood.} 31 | 32 | \item{loglik.diff}{The improvement in the log-likelihood relative 33 | to the model fit with the smallest log-likelihood.} 34 | 35 | \item{dev.diff}{The improvement in the deviance relative to the 36 | model fit with the largest deviance.} 37 | 38 | \item{nonzeros.f}{The rate of nonzeros in the factors matrix, as 39 | determined by \code{control$zero.threshold}.} 40 | 41 | \item{nonzeros.l}{The rate of nonzeros in the loadings matrix, as 42 | determined by \code{control$zero.threshold}.} 43 | 44 | \item{numiter}{The number of loadings and/or factor updates 45 | performed.} 46 | 47 | \item{runtime}{The total runtime (in s) of the model fitting 48 | updates.} 49 | } 50 | \description{ 51 | Create a table summarizing the results of fitting one 52 | or more Poisson non-negative matrix factorizations or multinomial 53 | topic models. 54 | } 55 | \seealso{ 56 | \code{\link{fit_poisson_nmf}}, 57 | \code{\link{fit_topic_model}} 58 | } 59 | -------------------------------------------------------------------------------- /man/fit_multinom_model.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fit_multinom_model.R 3 | \name{fit_multinom_model} 4 | \alias{fit_multinom_model} 5 | \title{Fit Simple Multinomial Model} 6 | \usage{ 7 | fit_multinom_model(cluster, X, verbose = c("none", "detailed"), ...) 8 | } 9 | \arguments{ 10 | \item{cluster}{A factor specifying a grouping, or clustering, of 11 | the rows of \code{X}; e.g., the \dQuote{cluster} output from 12 | \code{\link[stats]{kmeans}}.} 13 | 14 | \item{X}{The n x m matrix of counts; all entries of X should be 15 | non-negative. 
It can be a sparse matrix (class \code{"dgCMatrix"}) 16 | or dense matrix (class \code{"matrix"}), with some exceptions (see 17 | \sQuote{Details}).} 18 | 19 | \item{verbose}{This is passed as the \dQuote{verbose} argument in 20 | the call to \code{\link{init_poisson_nmf}}.} 21 | 22 | \item{\dots}{Additional arguments passed to 23 | \code{\link{init_poisson_nmf}}.} 24 | } 25 | \value{ 26 | A multinomial topic model fit. 27 | } 28 | \description{ 29 | Fit a simple multinomial model for count data, in 30 | which each sample (\emph{i.e.}, a row of the data matrix \code{X}) 31 | is assigned to a cluster. Under this simple multinomial model, 32 | \eqn{x_{ij}} assigned to cluster \eqn{k} is multinomial with sample 33 | size \eqn{s_i = x_{i1} + ... + x_{im}} and multinomial 34 | probabilities \eqn{p_{1k}, ..., p_{mk}}. This is a special case of 35 | the multinomial topic model in which all the mixture proportions 36 | are either 0 or 1. The maximum-likelihood estimates (MLEs) of the 37 | multinomial probabilities have a closed-form solution; no 38 | iterative algorithm is needed to fit this simple model. 39 | } 40 | \seealso{ 41 | \code{\link{fit_topic_model}} 42 | } 43 | -------------------------------------------------------------------------------- /man/likelihood.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/likelihood.R 3 | \name{loglik_poisson_nmf} 4 | \alias{loglik_poisson_nmf} 5 | \alias{loglik_multinom_topic_model} 6 | \alias{deviance_poisson_nmf} 7 | \alias{cost} 8 | \title{NMF and Topic Model Likelihoods and Deviances} 9 | \usage{ 10 | loglik_poisson_nmf(X, fit, e = 1e-08) 11 | 12 | loglik_multinom_topic_model(X, fit, e = 1e-08) 13 | 14 | deviance_poisson_nmf(X, fit, e = 1e-08) 15 | 16 | cost(X, A, B, e = 1e-08, family = c("poisson", "multinom"), version) 17 | } 18 | \arguments{ 19 | \item{X}{The n x m matrix of counts or pseudocounts. 
It can be a 20 | sparse matrix (class \code{"dgCMatrix"}) or dense matrix (class 21 | \code{"matrix"}).} 22 | 23 | \item{fit}{A Poisson NMF or multinomial topic model fit, such as an 24 | output from \code{\link{fit_poisson_nmf}} or 25 | \code{\link{fit_topic_model}}.} 26 | 27 | \item{e}{A small, non-negative number added to the terms inside the 28 | logarithms to avoid computing logarithms of zero. This prevents 29 | numerical problems at the cost of introducing a very small 30 | inaccuracy in the computation.} 31 | 32 | \item{A}{The n x k matrix of loadings. It should be a dense matrix.} 33 | 34 | \item{B}{The k x m matrix of factors. It should be a dense matrix.} 35 | 36 | \item{family}{If \code{family = "poisson"}, the loss function values 37 | corresponding to the Poisson non-negative matrix factorization are 38 | computed; if \code{family = "multinom"}, the multinomial topic model 39 | loss function values are returned.} 40 | 41 | \item{version}{When \code{version == "R"}, the computations are 42 | performed entirely in R; when \code{version == "Rcpp"}, an Rcpp 43 | implementation is used. The R version is typically faster when 44 | \code{X} is a dense matrix, whereas the Rcpp version is faster and 45 | more memory-efficient when \code{X} is a large, sparse matrix. When 46 | not specified, the most suitable version is called depending on 47 | whether \code{X} is dense or sparse.} 48 | } 49 | \value{ 50 | A numeric vector with one entry per row of \code{X}. 51 | } 52 | \description{ 53 | Compute log-likelihoods and deviances for assessing 54 | fit of a topic model or a non-negative matrix factorization (NMF). 55 | } 56 | \details{ 57 | Function \code{cost} computes loss functions proportional 58 | to the negative log-likelihoods, and is mainly for internal use to 59 | quickly compute log-likelihoods and deviances; it should not be 60 | used directly unless you know what you are doing. In particular, 61 | little argument checking is performed by \code{cost}. 
62 | } 63 | \examples{ 64 | 65 | # Generate a small counts matrix. 66 | set.seed(1) 67 | out <- simulate_count_data(10,20,3) 68 | X <- out$X 69 | fit <- out[c("F","L")] 70 | class(fit) <- c("poisson_nmf_fit","list") 71 | 72 | # Compute the Poisson log-likelihoods and deviances. 73 | data.frame(loglik = loglik_poisson_nmf(X,fit), 74 | deviance = deviance_poisson_nmf(X,fit)) 75 | 76 | # Compute multinomial log-likelihoods. 77 | loglik_multinom_topic_model(X,fit) 78 | 79 | } 80 | -------------------------------------------------------------------------------- /man/loadings_plot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/other_plots.R 3 | \name{loadings_plot} 4 | \alias{loadings_plot} 5 | \alias{loadings_plot_ggplot_call} 6 | \title{Loadings Plot} 7 | \usage{ 8 | loadings_plot( 9 | fit, 10 | x, 11 | k, 12 | ggplot_call = loadings_plot_ggplot_call, 13 | plot_grid_call = function(plots) do.call(plot_grid, plots) 14 | ) 15 | 16 | loadings_plot_ggplot_call(dat, topic.label, font.size = 9) 17 | } 18 | \arguments{ 19 | \item{fit}{An object of class \dQuote{poisson_nmf_fit} or 20 | \dQuote{multinom_topic_model_fit}.} 21 | 22 | \item{x}{A categorical variable represented as a 23 | \code{\link{factor}}. It should have the same number of elements as 24 | the number of rows in \code{fit$L}.} 25 | 26 | \item{k}{The topic, or topics, selected by number or name. When not 27 | specified, all topics are plotted.} 28 | 29 | \item{ggplot_call}{The function used to create the plot. Replace 30 | \code{loadings_plot_ggplot_call} with your own function to 31 | customize the appearance of the plot.} 32 | 33 | \item{plot_grid_call}{When multiple topics are selected, this is 34 | the function used to arrange the plots into a grid using 35 | \code{\link[cowplot]{plot_grid}}. 
It should be a function accepting 36 | a single argument, \code{plots}, a list of \code{ggplot} objects.} 37 | 38 | \item{dat}{A data frame passed as input to 39 | \code{\link[ggplot2]{ggplot}}, containing, at a minimum, columns 40 | \dQuote{x} and \dQuote{loading}.} 41 | 42 | \item{topic.label}{The name or number of the topic being plotted. 43 | Only used to determine the plot title.} 44 | 45 | \item{font.size}{Font size used in plot.} 46 | } 47 | \value{ 48 | A \code{ggplot} object. 49 | } 50 | \description{ 51 | Generate one or more boxplots to visualize the 52 | relationship between the loadings or mixture proportions and a 53 | selected categorical variable (a factor). 54 | } 55 | \details{ 56 | This is a lightweight interface primarily intended to 57 | expedite creation of boxplots for investigating relationships 58 | between topics and a categorical variable of interest without 59 | having to spend a great deal of time worrying about the plotting 60 | settings; most of the \dQuote{heavy lifting} is done by 61 | \sQuote{ggplot2} (specifically, function 62 | \code{\link[ggplot2]{geom_boxplot}} in the \sQuote{ggplot2} 63 | package). For more control over the plot's appearance, the plot can 64 | be customized by modifying the \code{ggplot_call} and 65 | \code{plot_grid_call} arguments. 66 | } 67 | -------------------------------------------------------------------------------- /man/merge_topics.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/merge_topics.R 3 | \name{merge_topics} 4 | \alias{merge_topics} 5 | \title{Combine Topics in Multinomial Topic Model} 6 | \usage{ 7 | merge_topics(fit, k) 8 | } 9 | \arguments{ 10 | \item{fit}{A multinomial topic model fit.} 11 | 12 | \item{k}{The names or numbers of the topics to be combined. Two or 13 | more topics should be chosen.} 14 | } 15 | \value{ 16 | A multinomial topic model fit.
17 | } 18 | \description{ 19 | Combine two or more topics in a multinomial topic 20 | model fit. 21 | } 22 | \details{ 23 | Mixture proportions are combined by summation, and factors 24 | are combined by averaging. 25 | } 26 | -------------------------------------------------------------------------------- /man/multinom2poisson.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/multinom2poisson.R 3 | \name{multinom2poisson} 4 | \alias{multinom2poisson} 5 | \title{Recover Poisson NMF Fit from Multinomial Topic Model Fit} 6 | \usage{ 7 | multinom2poisson(fit, X) 8 | } 9 | \arguments{ 10 | \item{fit}{An object of class \dQuote{multinom_topic_model_fit}, 11 | such as an output from \code{poisson2multinom}. If a Poisson NMF 12 | fit is provided (that is, an object of class 13 | \dQuote{poisson_nmf_fit}), the fit object is immediately returned 14 | \dQuote{as is}.} 15 | 16 | \item{X}{Optional n x m matrix of counts, or pseudocounts. It can 17 | be a sparse matrix (class \code{"dgCMatrix"}) or dense matrix 18 | (class \code{"matrix"}). This only needs to be provided if the 19 | document sizes \code{fit$s} are not available.} 20 | } 21 | \value{ 22 | The return value is the list \code{fit}, in which matrices 23 | \code{fit$F} and \code{fit$L} specify the factors and loadings in 24 | the Poisson non-negative matrix factorization; specifically, 25 | the counts matrix is modeled by the low-rank matrix product 26 | \code{tcrossprod(fit$L,fit$F)}. 27 | } 28 | \description{ 29 | This function recovers parameter estimates of the 30 | Poisson non-negative matrix factorization (NMF) given parameter 31 | estimates for a multinomial topic model. 
32 | } 33 | -------------------------------------------------------------------------------- /man/newsgroups.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/newsgroups.R 3 | \docType{data} 4 | \name{newsgroups} 5 | \alias{newsgroups} 6 | \title{Topic modeling results from the \dQuote{20 Newsgroups} data 7 | set.} 8 | \format{ 9 | \code{newsgroups} is a list with the following elements: 10 | 11 | \describe{ 12 | 13 | \item{topics}{Original labeling of the documents: each document 14 | is from one of 20 \dQuote{newsgroups}.} 15 | 16 | \item{L}{Estimated topic proportions matrix; rows are 17 | documents and columns are topics.} 18 | 19 | \item{F}{Matrix containing posterior mean estimates of log-fold 20 | changes (in base-2 logarithm). These were computed using 21 | \code{\link{de_analysis}} with \code{lfc.stat = "vsnull"}. Rows 22 | are words and columns are topics.}} 23 | } 24 | \description{ 25 | These are topic modeling results from the \dQuote{20 26 | Newsgroups} data, with k = 10 topics. The data were originally 27 | downloaded from \url{http://qwone.com/~jason/20Newsgroups} and 28 | prepared by running code found in an R Markdown file in this 29 | GitHub repository: 30 | \url{https://github.com/stephenslab/fastTopics-experiments}. See 31 | the \dQuote{inst} directory of this package for the scripts used to 32 | generate these results.
33 | } 34 | \examples{ 35 | data(newsgroups) 36 | table(newsgroups$topics) 37 | dim(newsgroups$L) 38 | dim(newsgroups$F) 39 | 40 | } 41 | \keyword{data} 42 | -------------------------------------------------------------------------------- /man/pbmc_facs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pbmc_facs.R 3 | \docType{data} 4 | \name{pbmc_facs} 5 | \alias{pbmc_facs} 6 | \title{Mixture of 10 FACS-purified PBMC Single-Cell RNA-seq data} 7 | \format{ 8 | \code{pbmc_facs} is a list with the following elements: 9 | 10 | \describe{ 11 | 12 | \item{counts}{3,774 x 16,791 sparse matrix of UMI counts, with 13 | rows corresponding to samples (cells) and columns corresponding to 14 | genes. It is an object of class \code{"dgCMatrix"}.} 15 | 16 | \item{counts_test}{UMI counts for an additional test set of 100 17 | cells.} 18 | 19 | \item{samples}{Data frame containing information about the 20 | samples, including cell barcode and source FACS population 21 | (\dQuote{celltype} and \dQuote{facs_subpop}).} 22 | 23 | \item{samples_test}{Sample information for the additional test 24 | set of 100 cells.} 25 | 26 | \item{genes}{Data frame containing information about the genes, 27 | including gene symbol and Ensembl identifier.} 28 | 29 | \item{fit}{Poisson non-negative matrix factorization (NMF) fitted 30 | to the UMI count data \code{counts}, with rank \code{k = 6}. See 31 | the vignette for how the Poisson NMF model fitting was performed.}} 32 | 33 | \url{https://www.10xgenomics.com/resources/datasets} 34 | } 35 | \description{ 36 | These data are a selection of the reference 37 | transcriptome profiles generated via single-cell RNA sequencing 38 | (RNA-seq) of 10 bead-enriched subpopulations of PBMCs (Donor A), 39 | described in Zheng \emph{et al} (2017). The data are unique 40 | molecular identifier (UMI) counts for 16,791 genes in 3,774 cells.
41 | (Genes with no expression in any of the cells were removed.) Since 42 | the majority of the UMI counts are zero, they are efficiently 43 | stored as a 3,774 x 16,791 sparse matrix. These data are used in 44 | the vignette illustrating how 'fastTopics' can be used to analyze 45 | single-cell RNA-seq data. Data for a separate set of 1,000 cells is 46 | provided as a \dQuote{test set} to evaluate out-of-sample predictions. 47 | } 48 | \examples{ 49 | library(Matrix) 50 | data(pbmc_facs) 51 | cat(sprintf("Number of cells: \%d\n",nrow(pbmc_facs$counts))) 52 | cat(sprintf("Number of genes: \%d\n",ncol(pbmc_facs$counts))) 53 | cat(sprintf("Proportion of counts that are non-zero: \%0.1f\%\%.\n", 54 | 100*mean(pbmc_facs$counts > 0))) 55 | 56 | } 57 | \references{ 58 | G. X. Y. Zheng \emph{et al} (2017). Massively parallel digital 59 | transcriptional profiling of single cells. \emph{Nature Communications} 60 | \bold{8}, 14049. \doi{10.1038/ncomms14049} 61 | } 62 | \keyword{data} 63 | -------------------------------------------------------------------------------- /man/plot_loglik_vs_rank.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/other_plots.R 3 | \name{plot_loglik_vs_rank} 4 | \alias{plot_loglik_vs_rank} 5 | \alias{loglik_vs_rank_ggplot_call} 6 | \title{Plot Log-Likelihood Versus Rank} 7 | \usage{ 8 | plot_loglik_vs_rank(fits, ggplot_call = loglik_vs_rank_ggplot_call) 9 | 10 | loglik_vs_rank_ggplot_call(dat, font.size = 9) 11 | } 12 | \arguments{ 13 | \item{fits}{A list with 2 or more list elements, in which each list 14 | element is an object of class \code{"poisson_nmf_fit"} or 15 | \code{"multinom_topic_model_fit"}. If two or more fits share the 16 | same rank, or number of topics, the largest log-likelihood is 17 | plotted.} 18 | 19 | \item{ggplot_call}{The function used to create the plot.
Replace 20 | \code{loglik_vs_rank_ggplot_call} with your own function to 21 | customize the appearance of the plot.} 22 | 23 | \item{dat}{A data frame passed as input to 24 | \code{\link[ggplot2]{ggplot}}, containing, at a minimum, columns 25 | \dQuote{x} and \dQuote{y}.} 26 | 27 | \item{font.size}{Font size used in plot.} 28 | } 29 | \value{ 30 | A \code{ggplot} object. 31 | } 32 | \description{ 33 | Create a plot showing the improvement in the 34 | log-likelihood as the rank of the matrix factorization or the 35 | number of topics (\dQuote{k}) increases. 36 | } 37 | -------------------------------------------------------------------------------- /man/plot_progress.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/other_plots.R 3 | \name{plot_progress} 4 | \alias{plot_progress} 5 | \title{Plot Progress of Model Fitting Over Time} 6 | \usage{ 7 | plot_progress( 8 | fits, 9 | x = c("timing", "iter"), 10 | y = c("loglik", "dev", "res"), 11 | add.point.every = 20, 12 | colors = c("#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"), 13 | linetypes = "solid", 14 | linesizes = 0.5, 15 | shapes = 19, 16 | fills = "white", 17 | e = 0.01, 18 | theme = function() theme_cowplot(12) 19 | ) 20 | } 21 | \arguments{ 22 | \item{fits}{An object of class \code{"poisson_nmf_fit"} or 23 | \code{"multinom_topic_model_fit"}, or a non-empty, named list in 24 | which all list elements are objects of class 25 | \code{"poisson_nmf_fit"} or all objects of class 26 | \code{"multinom_topic_model_fit"}.} 27 | 28 | \item{x}{Choose \code{"timing"} to plot improvement in the solution 29 | over time, or choose \code{"iter"} to plot improvement in the 30 | solution per iteration.} 31 | 32 | \item{y}{Column of the "progress" data frame used to assess 33 | progress of the Poisson NMF optimization method(s).
Should be one 34 | of \code{"loglik"} (Poisson NMF or multinomial topic model 35 | log-likelihood), \code{"dev"} (deviance) or \code{"res"} (maximum 36 | residual of KKT conditions). The deviance is only valid for Poisson 37 | NMF model fits.} 38 | 39 | \item{add.point.every}{A positive integer giving the iteration 40 | interval for drawing points on the progress curves. Set to 41 | \code{Inf} to prevent points from being drawn on the plot.} 42 | 43 | \item{colors}{Colours used to draw progress curves; passed as the 44 | \code{values} input to \code{\link[ggplot2]{scale_color_manual}}. 45 | If fewer colours than "fits" are given, the colours are recycled.} 46 | 47 | \item{linetypes}{Line types used to draw progress curves; passed as 48 | the \code{values} input to \code{\link[ggplot2]{scale_linetype_manual}}. 49 | If fewer line types than \dQuote{fits} are given, the line types are 50 | recycled.} 51 | 52 | \item{linesizes}{Line sizes used to draw progress curves; passed as 53 | the \code{values} input to \code{\link[ggplot2]{scale_size_manual}}. 54 | If fewer line sizes than \dQuote{fits} are given, the line sizes are 55 | recycled.} 56 | 57 | \item{shapes}{Shapes used to draw points at the selected 58 | iterations; passed as the \code{values} input to 59 | \code{\link[ggplot2]{scale_shape_manual}}. If fewer shapes than 60 | \dQuote{fits} are given, the shapes are recycled.} 61 | 62 | \item{fills}{Fill colours used to draw points at the selected 63 | iterations; passed as the \code{values} input to 64 | \code{\link[ggplot2]{scale_fill_manual}}. If fewer fill colours 65 | than \dQuote{fits} are given, the fill colours are recycled.} 66 | 67 | \item{e}{A small, positive number added to the vertical axis (for 68 | \code{y = "loglik"} and \code{y = "dev"} only) so that the 69 | logarithmic scale does not over-emphasize very small differences.} 70 | 71 | \item{theme}{The \sQuote{ggplot2} \dQuote{theme}.} 72 | } 73 | \value{ 74 | A \code{ggplot} object. 
75 | } 76 | \description{ 77 | Create a plot showing improvement in one or more 78 | Poisson NMF or multinomial topic model fits over time. 79 | } 80 | \details{ 81 | The horizontal axis shows the recorded runtime (in s), and 82 | the vertical axis shows some quantity measuring the quality of the 83 | fit: the log-likelihood, deviance or maximum residual of the 84 | Karush-Kuhn-Tucker (KKT) first-order optimality conditions. To 85 | better visualize log-likelihoods and deviances, log-likelihood and 86 | deviance differences are shown on the logarithmic scale. 87 | Differences are calculated with respect to the best value achieved 88 | over all the fits compared. 89 | 90 | Note that only minimal argument checking is performed. 91 | } 92 | \seealso{ 93 | \code{\link{fit_poisson_nmf}} 94 | } 95 | -------------------------------------------------------------------------------- /man/poisson2multinom.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/poisson2multinom.R 3 | \name{poisson2multinom} 4 | \alias{poisson2multinom} 5 | \title{Recover Multinomial Topic Model Fit from Poisson NMF fit} 6 | \usage{ 7 | poisson2multinom(fit) 8 | } 9 | \arguments{ 10 | \item{fit}{An object of class \dQuote{poisson_nmf_fit}, such as an 11 | output from \code{fit_poisson_nmf}. It does not make sense for a 12 | multinomial topic model to have less than two topics, so an error 13 | will be reported when k < 2, where k is the rank of the matrix 14 | factorization. 
If a multinomial topic model fit is provided (that 15 | is, an object of class \dQuote{multinom_topic_model_fit}), the fit 16 | object is immediately returned \dQuote{as is}.} 17 | } 18 | \value{ 19 | The return value is the list \code{fit}, in which 20 | \code{fit$F} and \code{fit$L} are the parameters of the multinomial 21 | topic model; specifically, \code{fit$L[i,]} gives the topic 22 | probabilities for sample or document i, and \code{fit$F[,k]} gives 23 | the term probabilities for topic k. An additional vector 24 | \code{fit$s} of length n is returned giving the "size factors". 25 | } 26 | \description{ 27 | This function recovers parameter estimates of the 28 | multinomial topic model given parameter estimates for a Poisson 29 | non-negative matrix factorization (NMF). 30 | } 31 | -------------------------------------------------------------------------------- /man/predict.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/predict.R 3 | \name{predict.poisson_nmf_fit} 4 | \alias{predict.poisson_nmf_fit} 5 | \alias{predict.multinom_topic_model_fit} 6 | \title{Predict Methods for Poisson NMF and Multinomial Topic Model} 7 | \usage{ 8 | \method{predict}{poisson_nmf_fit}(object, newdata, numiter = 20, ...) 9 | 10 | \method{predict}{multinom_topic_model_fit}(object, newdata, numiter = 20, ...) 11 | } 12 | \arguments{ 13 | \item{object}{An object of class \dQuote{poisson_nmf_fit} or 14 | \dQuote{multinom_topic_model_fit}.} 15 | 16 | \item{newdata}{An optional counts matrix. If omitted, the loadings 17 | estimated in the original data are returned.} 18 | 19 | \item{numiter}{The number of updates to perform.} 20 | 21 | \item{\dots}{Additional arguments passed to 22 | \code{\link{fit_poisson_nmf}}.} 23 | } 24 | \value{ 25 | A loadings matrix with one row for each data point and one 26 | column for each topic or factor. 
For 27 | \code{predict.multinom_topic_model_fit}, the output can also be 28 | interpreted as a matrix of estimated topic proportions, in which 29 | \code{L[i,j]} is the proportional contribution of topic j to data 30 | point i. 31 | } 32 | \description{ 33 | Predict loadings based on previously fit Poisson NMF, 34 | or predict topic proportions based on previously fit multinomial 35 | topic model. This can be thought of as projecting data points onto 36 | a previously estimated set of factors \code{fit$F}. 37 | } 38 | \examples{ 39 | \donttest{ 40 | # Simulate a 175 x 1,200 counts matrix. 41 | set.seed(1) 42 | dat <- simulate_count_data(175,1200,k = 3) 43 | 44 | # Split the data into training and test sets. 45 | train <- dat$X[1:100,] 46 | test <- dat$X[101:175,] 47 | 48 | # Fit a Poisson non-negative matrix factorization using the 49 | # training data. 50 | fit <- init_poisson_nmf(train,F = dat$F,init.method = "random") 51 | fit <- fit_poisson_nmf(train,fit0 = fit) 52 | 53 | # Compare the estimated loadings in the training data against the 54 | # loadings used to simulate these data. 55 | Ltrain <- predict(fit) 56 | plot(dat$L[1:100,],Ltrain,pch = 20,col = "darkblue") 57 | abline(a = 0,b = 1,col = "magenta",lty = "dotted", 58 | xlab = "true",ylab = "estimated") 59 | 60 | # Next, predict loadings in unseen (test) data points, and compare 61 | # these predictions against the loadings that were used to simulate 62 | # the test data. 63 | Ltest <- predict(fit,test) 64 | plot(dat$L[101:175,],Ltest,pch = 20,col = "darkblue", 65 | xlab = "true",ylab = "estimated") 66 | abline(a = 0,b = 1,col = "magenta",lty = "dotted") 67 | 68 | # Simulate a 175 x 1,200 counts matrix. 69 | set.seed(1) 70 | dat <- simulate_multinom_gene_data(175,1200,k = 3) 71 | 72 | # Split the data into training and test sets. 73 | train <- dat$X[1:100,] 74 | test <- dat$X[101:175,] 75 | 76 | # Fit a topic model using the training data. 
77 | fit <- init_poisson_nmf(train,F = dat$F,init.method = "random") 78 | fit <- fit_poisson_nmf(train,fit0 = fit) 79 | fit <- poisson2multinom(fit) 80 | 81 | # Compare the estimated topic proportions in the training data against 82 | # the topic proportions used to simulate these data. 83 | Ltrain <- predict(fit) 84 | plot(dat$L[1:100,],Ltrain,pch = 20,col = "darkblue") 85 | abline(a = 0,b = 1,col = "magenta",lty = "dotted", 86 | xlab = "true",ylab = "estimated") 87 | 88 | # Next, predict loadings in unseen (test) data points, and compare 89 | # these predictions against the loadings that were used to simulate 90 | # the test data. 91 | Ltest <- predict(fit,test) 92 | plot(dat$L[101:175,],Ltest,pch = 20,col = "darkblue", 93 | xlab = "true",ylab = "estimated") 94 | abline(a = 0,b = 1,col = "magenta",lty = "dotted") 95 | } 96 | 97 | } 98 | \seealso{ 99 | \code{\link{fit_poisson_nmf}} 100 | } 101 | -------------------------------------------------------------------------------- /man/run_homer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/homer.R 3 | \name{run_homer} 4 | \alias{run_homer} 5 | \title{Perform HOMER Motif Enrichment Analysis using DE Genomic Positions} 6 | \usage{ 7 | run_homer( 8 | de, 9 | k, 10 | positions, 11 | genome = "hg19", 12 | subset = function(postmean, lpval, lfsr, rank, quantile) lfsr < 0.05, 13 | homer.exec = "findMotifsGenome.pl", 14 | out.dir = tempdir(), 15 | homer.options = "-len 8,10,12 -size 200 -mis 2 -S 25 -p 1 -h", 16 | verbose = TRUE 17 | ) 18 | } 19 | \arguments{ 20 | \item{de}{An object of class \dQuote{topic_model_de_analysis}, 21 | usually the result of running \code{\link{de_analysis}}.} 22 | 23 | \item{k}{Use the DE analysis results for this topic.} 24 | 25 | \item{positions}{A table of genomic positions corresponding to rows 26 | of the \code{de_analysis} results. 
Specifically, it should be a data 27 | frame with four columns: \dQuote{chr}, chromosome name or number; 28 | \dQuote{start}, start position of genomic feature; \dQuote{end}, 29 | end position of genomic feature; and \dQuote{name}, the name of the 30 | genomic feature. If not specified, the genomic positions will be 31 | extracted from the row names of \code{de$postmean}, in which the 32 | row names are expected to be of the form \code{chr_start_end}. The 33 | genomic positions will be written to a BED file (see 34 | \url{https://genome.ucsc.edu/FAQ/FAQformat.html} for more 35 | information about BED files).} 36 | 37 | \item{genome}{The genome parameter passed to 38 | \code{findMotifsGenome.pl}.} 39 | 40 | \item{subset}{A function of the DE results (\code{postmean}, \code{lpval}, \code{lfsr}, \code{rank}, \code{quantile}) returning \code{TRUE} for the genomic positions to include in the motif enrichment analysis. The default selects positions with \code{lfsr < 0.05}.} 41 | 42 | \item{homer.exec}{The name or file path of the HOMER 43 | \code{findMotifsGenome.pl} executable.} 44 | 45 | \item{out.dir}{The positions BED file and HOMER results are written 46 | to this directory.} 47 | 48 | \item{homer.options}{Character string used to override default 49 | \code{findMotifsGenome.pl} options.} 50 | 51 | \item{verbose}{When \code{verbose = TRUE}, progress information is 52 | printed to the console.} 53 | } 54 | \value{ 55 | A data frame containing the motif enrichment results. It 56 | is created from the \code{knownResults.txt} HOMER output. 57 | } 58 | \description{ 59 | Run HOMER motif finding algorithm 60 | (\code{findMotifsGenome.pl}) to identify motifs enriched for 61 | differentially expressed (DE) genomic positions. See 62 | \url{http://homer.ucsd.edu} for more information. 63 | } 64 | \references{ 65 | Heinz, S., Benner, C., Spann, N., Bertolino, E., Lin, Y. C., Laslo, 66 | P., Cheng, J. X., Murre, C., Singh, H. and Glass, C. K. (2010). 67 | Simple combinations of lineage-determining transcription factors 68 | prime cis-regulatory elements required for macrophage and B cell 69 | identities. \emph{Molecular Cell} \bold{38}, 576-589.
70 | } 71 | -------------------------------------------------------------------------------- /man/select_loadings.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/select.R 3 | \name{select.poisson_nmf_fit} 4 | \alias{select.poisson_nmf_fit} 5 | \alias{select} 6 | \alias{select.multinom_topic_model_fit} 7 | \alias{select_loadings} 8 | \title{Extract or Re-order Data Rows in Poisson NMF or Multinomial Topic Model Fit} 9 | \usage{ 10 | \method{select}{poisson_nmf_fit}(.data, loadings, ...) 11 | 12 | \method{select}{multinom_topic_model_fit}(.data, loadings, ...) 13 | 14 | select_loadings(.data, loadings, ...) 15 | } 16 | \arguments{ 17 | \item{.data}{Poisson NMF or Multinomial Topic Model fit; that is, 18 | an object of class \dQuote{poisson_nmf_fit} or 19 | \dQuote{multinom_topic_model_fit}, such as an output from 20 | \code{\link{fit_poisson_nmf}} or \code{\link{fit_topic_model}}.} 21 | 22 | \item{loadings}{Indices (names or numbers) giving data rows to 23 | keep. If not specified, all rows are kept.} 24 | 25 | \item{\dots}{Other arguments passed to the generic select function.} 26 | } 27 | \value{ 28 | A Poisson NMF or multinomial topic model fit containing the 29 | selected data rows only. 30 | } 31 | \description{ 32 | This function can be used to extract estimates for a 33 | subset of the count data, or to re-order the rows of the loadings 34 | matrix. 
35 | } 36 | -------------------------------------------------------------------------------- /man/simulate_count_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/datasim.R 3 | \name{simulate_count_data} 4 | \alias{simulate_count_data} 5 | \title{Simulate Count Data from Poisson NMF Model} 6 | \usage{ 7 | simulate_count_data(n, m, k, fmax = 1, lmax = 1, sparse = FALSE) 8 | } 9 | \arguments{ 10 | \item{n}{Number of rows in simulated count matrix. The number of 11 | rows should be at least 2.} 12 | 13 | \item{m}{Number of columns in simulated count matrix. The number of 14 | columns should be at least 2.} 15 | 16 | \item{k}{Number of factors, or \dQuote{topics}, used to determine 17 | Poisson rates. The number of topics should be 1 or more.} 18 | 19 | \item{fmax}{Factors are drawn uniformly at random between zero and 20 | \code{fmax}.} 21 | 22 | \item{lmax}{Loadings are drawn uniformly at random between zero and 23 | \code{lmax}.} 24 | 25 | \item{sparse}{If \code{sparse = TRUE}, convert the counts matrix to 26 | a sparse matrix in compressed, column-oriented format; see 27 | \code{\link[Matrix]{sparseMatrix}}.} 28 | } 29 | \value{ 30 | The return value is a list containing the counts matrix 31 | \code{X} and the factorization, \code{F} and \code{L}, used to 32 | generate the counts. 33 | } 34 | \description{ 35 | Simulate a counts matrix \code{X} such that 36 | \code{X[i,j]} is Poisson with rate (mean) \code{Y[i,j]}, where 37 | \code{Y = tcrossprod(L,F)}, \code{L} is an n x k loadings 38 | (\dQuote{activations}) matrix, and \code{F} is an m x k factors 39 | (\dQuote{basis vectors}) matrix. The entries of matrix \code{L} are 40 | drawn uniformly at random between zero and \code{lmax}, and the 41 | entries of matrix \code{F} are drawn uniformly at random between 0 42 | and \code{fmax}. 
43 | } 44 | \details{ 45 | Note that only minimal argument checking is performed. This 46 | function is mainly used to simulate small data sets for the examples 47 | and package tests. 48 | } 49 | -------------------------------------------------------------------------------- /man/simulate_gene_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/datasim.R 3 | \name{simulate_poisson_gene_data} 4 | \alias{simulate_poisson_gene_data} 5 | \alias{simulate_multinom_gene_data} 6 | \title{Simulate Gene Expression Data from Poisson NMF or Multinomial 7 | Topic Model} 8 | \usage{ 9 | simulate_poisson_gene_data(n, m, k, s, p = 1, sparse = FALSE) 10 | 11 | simulate_multinom_gene_data(n, m, k, sparse = FALSE) 12 | } 13 | \arguments{ 14 | \item{n}{Number of rows in the simulated count matrix. Should be at 15 | least 2.} 16 | 17 | \item{m}{Number of columns in the simulated count matrix. Should be 18 | at least 2.} 19 | 20 | \item{k}{Number of factors, or \dQuote{topics}, used to generate 21 | the data. Should be 2 or more.} 22 | 23 | \item{s}{Vector of \dQuote{size factors}; each row of the loadings 24 | matrix \code{L} is scaled by the entries of \code{s} before 25 | generating the counts. This should be a vector of length n 26 | containing only positive values.} 27 | 28 | \item{p}{Probability that \code{F[i,j]} is equal to the mean rate. 29 | Smaller values of \code{p} will result in more factors that are the 30 | same across topics.} 31 | 32 | \item{sparse}{If \code{sparse = TRUE}, convert the counts matrix to 33 | a sparse matrix in compressed, column-oriented format; see 34 | \code{\link[Matrix]{sparseMatrix}}.} 35 | } 36 | \value{ 37 | \code{simulate_poisson_gene_data} returns a list containing 38 | the counts matrix \code{X}, and the size factors \code{s} and 39 | factorization, \code{F}, \code{L}, used to generate the counts. 
40 | \code{simulate_multinom_gene_data} returns a list containing the 41 | counts matrix \code{X}, and the mixture proportions \code{L} and 42 | factors (gene probabilities, or relative gene expression levels) 43 | \code{F} used to generate the counts. 44 | } 45 | \description{ 46 | Simulate count data from a Poisson NMF model or 47 | multinomial topic model, in which topics represent \dQuote{gene 48 | expression programs}, and gene expression programs are 49 | characterized by different rates of expression. The way in which 50 | the counts are simulated is modeled after gene expression studies 51 | in which expression is measured by single-cell RNA sequencing 52 | (\dQuote{RNA-seq}) techniques: each row of the counts matrix 53 | corresponds to a gene expression profile, each column corresponds to a 54 | gene, and each matrix element is a \dQuote{read count}, or 55 | \dQuote{UMI count}, measuring expression level. Factors are 56 | simulated so as to capture realistic changes in gene expression 57 | across different cell types. See \dQuote{Details} for the procedure 58 | used to simulate factors, loadings and counts. 59 | } 60 | \details{ 61 | Here we describe the process for generating the n x k 62 | loadings matrix \code{L} and the m x k factors matrix \code{F}. 63 | 64 | Each row of the \code{L} matrix is generated in the following 65 | manner: (1) the number of nonzero mixture proportions is \eqn{1 66 | \le n \le k}, with probability proportional to \eqn{2^{-n}}; 67 | (2) the indices of the nonzero mixture proportions are sampled 68 | uniformly at random; and (3) the nonzero mixture proportions are 69 | sampled from the Dirichlet distribution with \eqn{\alpha = 1} (so 70 | that all topics are equally likely).
71 | 72 | Each row of the factors matrix is generated according to the 73 | following procedure: (1) generate \eqn{u = |r| - 5}, where \eqn{r ~ 74 | N(0,2)}; (2) for each topic \eqn{k}, generate the Poisson rates as 75 | \eqn{exp(max(t,-5))}, where \eqn{t ~ 0.95 * N(u,s/10) + 0.05 * 76 | N(u,s)}, and \eqn{s = exp(-u/8)}. Factors can be interpreted as 77 | Poisson rates or multinomial probabilities, so that individual 78 | counts can be viewed as being generated from a weighted mixture 79 | of \dQuote{topics} with different rates or probabilities. 80 | 81 | Once the loadings and factors have been generated, the counts are 82 | simulated from either the Poisson NMF or multinomial topic model: 83 | for the former, \code{X[i,j]} is Poisson with rate \code{Y[i,j]}, 84 | where \code{Y = tcrossprod(L,F)}; for the latter, \code{X[i,]} is 85 | multinomial with size \code{s[i]} and with class probabilities 86 | \code{P[i,]}, where \code{P = tcrossprod(L,F)}. For the multinomial 87 | model only, the sizes \code{s} are randomly generated as \code{s = 88 | 10^rnorm(n,3,0.2)}. 89 | 90 | Note that only minimal argument checking is performed; 91 | the function is mainly used to test implementation of the 92 | topic-model-based differential count analysis.
93 | } 94 | -------------------------------------------------------------------------------- /man/simulate_toy_gene_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/datasim.R 3 | \name{simulate_toy_gene_data} 4 | \alias{simulate_toy_gene_data} 5 | \title{Simulate Toy Gene Expression Data} 6 | \usage{ 7 | simulate_toy_gene_data(n, m, k, s) 8 | } 9 | \arguments{ 10 | \item{n}{The number of samples (gene expression profiles) to 11 | simulate.} 12 | 13 | \item{m}{The number of counts (genes) to simulate.} 14 | 15 | \item{k}{The number of topics ("gene programs") used to simulate 16 | the data.} 17 | 18 | \item{s}{A scalar specifying the total expression of each sample; 19 | it specifies the "size" parameter in the calls to 20 | \code{\link[stats]{rmultinom}}.} 21 | } 22 | \value{ 23 | The return value is a list containing the counts matrix 24 | \code{X}, and the gene frequencies \code{F} and mixture proportions 25 | \code{L} used to generate the counts. 26 | } 27 | \description{ 28 | Simulate gene expression data (UMI counts) under a 29 | toy expression model. Samples (expression profiles) are drawn 30 | from a multinomial topic model in which topics are "gene programs". 31 | } 32 | \details{ 33 | The mixture proportions are generated as follows. With 34 | probability 0.9, one proportion is one, or close to one, and the 35 | remaining are zero, or close to zero; that is, the counts are 36 | primarily generated from a single gene program. Otherwise (with 37 | probability 0.1), the mixture proportions are roughly equal. 38 | 39 | Gene frequencies are drawn uniformly at random from [0,1]. 
40 | } 41 | -------------------------------------------------------------------------------- /man/summary.poisson_nmf_fit.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/summary.R 3 | \name{summary.poisson_nmf_fit} 4 | \alias{summary.poisson_nmf_fit} 5 | \alias{summary.multinom_topic_model_fit} 6 | \alias{print.summary.poisson_nmf_fit} 7 | \alias{print.summary.multinom_topic_model_fit} 8 | \title{Summarize Poisson NMF or Multinomial Topic Model Fit} 9 | \usage{ 10 | \method{summary}{poisson_nmf_fit}(object, ...) 11 | 12 | \method{summary}{multinom_topic_model_fit}(object, ...) 13 | 14 | \method{print}{summary.poisson_nmf_fit}(x, show.mixprops = FALSE, show.topic.reps = FALSE, ...) 15 | 16 | \method{print}{summary.multinom_topic_model_fit}( 17 | x, 18 | show.size.factors = FALSE, 19 | show.mixprops = FALSE, 20 | show.topic.reps = FALSE, 21 | ... 22 | ) 23 | } 24 | \arguments{ 25 | \item{object}{An object of class \dQuote{poisson_nmf_fit} or 26 | \dQuote{multinom_topic_model_fit}. 
The former is usually the result 27 | of calling \code{\link{fit_poisson_nmf}}; the latter is usually the 28 | result of calling \code{\link{fit_topic_model}} or 29 | \code{\link{poisson2multinom}}.} 30 | 31 | \item{\dots}{Additional arguments passed to the generic \code{summary} 32 | or \code{print.summary} method.} 33 | 34 | \item{x}{An object of class \dQuote{summary.poisson_nmf_fit}, 35 | usually a result of a call to \code{summary.poisson_nmf_fit}.} 36 | 37 | \item{show.mixprops}{If \code{TRUE}, print a summary of the mixture 38 | proportions.} 39 | 40 | \item{show.topic.reps}{If \code{TRUE}, print a summary of the topic 41 | representatives.} 42 | 43 | \item{show.size.factors}{If \code{TRUE}, print a summary of the 44 | size factors.} 45 | } 46 | \value{ 47 | The functions \code{summary.poisson_nmf_fit} and 48 | \code{summary.multinom_topic_model_fit} compute and return a list 49 | of statistics summarizing the model fit. The returned list 50 | includes some or all of the following elements: 51 | 52 | \item{n}{The number of rows in the counts matrix, typically the 53 | number of samples.} 54 | 55 | \item{m}{The number of columns in the counts matrix, typically the 56 | number of observed counts per sample.} 57 | 58 | \item{k}{The rank of the Poisson NMF or the number of topics.} 59 | 60 | \item{s}{A vector of length n giving the "size factor" estimates; 61 | these estimates should be equal, or close to, the total counts in 62 | each row of the counts matrix.} 63 | 64 | \item{numiter}{The number of loadings and/or factor updates 65 | performed.} 66 | 67 | \item{loglik}{The Poisson NMF log-likelihood.} 68 | 69 | \item{loglik.multinom}{The multinomial topic model log-likelihood.} 70 | 71 | \item{dev}{The Poisson NMF deviance.} 72 | 73 | \item{res}{The maximum residual of the Karush-Kuhn-Tucker (KKT) 74 | first-order optimality conditions. 
This can be used to assess 75 | convergence of the updates to a (local) solution.} 76 | 77 | \item{mixprops}{Matrix giving a high-level summary of the 78 | mixture proportions, in which rows correspond to topics, and 79 | columns are ranges of mixture proportions.} 80 | 81 | \item{topic.reps}{A matrix in which the ith row gives the mixture 82 | proportions for the sample "most representative" of topic i; by 83 | "most representative", we mean the row (or sample) with the highest 84 | proportion of counts drawn from the topic i.} 85 | } 86 | \description{ 87 | \code{summary} method for the \dQuote{poisson_nmf_fit} 88 | and \dQuote{multinom_topic_model_fit} classes. 89 | } 90 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) -DARMA_DONT_PRINT_ERRORS \ 2 | -DARMA_NO_DEBUG -DARMA_USE_BLAS -DARMA_DONT_USE_OPENMP \ 3 | -DARMA_USE_TBB_ALLOC -DRCPP_PARALLEL_USE_TBB=1 \ 4 | -DARMA_WARN_LEVEL=1 -DARMA_64BIT_WORD 5 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) \ 6 | $(shell ${R_HOME}/bin/Rscript -e "RcppParallel::RcppParallelLibs()") 7 | 8 | -------------------------------------------------------------------------------- /src/Makevars.win: -------------------------------------------------------------------------------- 1 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) -DARMA_DONT_PRINT_ERRORS \ 2 | -DARMA_NO_DEBUG -DARMA_USE_BLAS -DARMA_DONT_USE_OPENMP \ 3 | -DARMA_USE_TBB_ALLOC -DRCPP_PARALLEL_USE_TBB=1 \ 4 | -DARMA_WARN_LEVEL=1 -DARMA_64BIT_WORD 5 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) \ 6 | $(shell "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e "RcppParallel::RcppParallelLibs()") 7 | -------------------------------------------------------------------------------- /src/cost.cpp: 
-------------------------------------------------------------------------------- 1 | #include "misc.h" 2 | #include "cost.h" 3 | 4 | using namespace arma; 5 | 6 | // FUNCTION DEFINITIONS 7 | // -------------------- 8 | // Compute negative log-likelihoods for assessing a topic model fit or 9 | // quality of a non-negative matrix factorization, in which matrix X 10 | // is approximated by matrix product A * B. 11 | // 12 | // [[Rcpp::depends(RcppArmadillo)]] 13 | // [[Rcpp::export]] 14 | arma::vec cost_rcpp (const arma::mat& X, const arma::mat& A, 15 | const arma::mat& B, double e, bool poisson) { 16 | return cost(X,A,B,e,poisson); 17 | } 18 | 19 | // This is the same as cost_rcpp, except that X must be sparse. 20 | // 21 | // [[Rcpp::export]] 22 | arma::vec cost_sparse_rcpp (const arma::sp_mat& X, const arma::mat& A, 23 | const arma::mat& B, double e, bool poisson) { 24 | return cost_sparse(X,A,B,e,poisson); 25 | } 26 | 27 | // This is the helper function for cost_rcpp. 28 | arma::vec cost (const mat& X, const mat& A, const mat& B, double e, 29 | bool poisson) { 30 | unsigned int n = X.n_rows; 31 | unsigned int m = X.n_cols; 32 | vec f(n,fill::zeros); 33 | vec y(n); 34 | 35 | // Repeat for each column of X. 36 | for (unsigned int j = 0; j < m; j++) { 37 | 38 | // This is equivalent to the following R code: 39 | // 40 | // f = f + poisson*y - X[,j]*log(y + e) 41 | // 42 | // where 43 | // 44 | // y = A %*% B[,j] 45 | // 46 | y = A * B.col(j); 47 | f -= X.col(j) % log(y + e); 48 | if (poisson) 49 | f += y; 50 | } 51 | 52 | return f; 53 | } 54 | 55 | // Helper function for cost_sparse_rcpp. 56 | arma::vec cost_sparse (const sp_mat& X, const mat& A, const mat& B, 57 | double e, bool poisson) { 58 | unsigned int n = X.n_rows; 59 | unsigned int m = X.n_cols; 60 | unsigned int i; 61 | vec f(n,fill::zeros); 62 | vec y(n); 63 | 64 | // Repeat for each column of X. 
65 | for (unsigned int j = 0; j < m; j++) { 66 | 67 | // Initialize an iterator for the nonzero elements in the jth 68 | // column of X. 69 | sp_mat::const_col_iterator xj = X.begin_col(j); 70 | sp_mat::const_col_iterator xm = X.end_col(j); 71 | 72 | // This is equivalent to the following R code: 73 | // 74 | // f = f + poisson*y - X[,j]*log(y + e) 75 | // 76 | // where 77 | // 78 | // y = A %*% B[,j] 79 | // 80 | y = A * B.col(j); 81 | for(; xj != xm; ++xj) { 82 | i = xj.row(); 83 | f(i) -= (*xj) * log(y(i) + e); 84 | } 85 | if (poisson) 86 | f += y; 87 | } 88 | 89 | return f; 90 | } 91 | -------------------------------------------------------------------------------- /src/cost.h: -------------------------------------------------------------------------------- 1 | #ifndef INCLUDE_COST 2 | #define INCLUDE_COST 3 | 4 | #include 5 | 6 | // FUNCTION DECLARATIONS 7 | // --------------------- 8 | arma::vec cost (const arma::mat& X, const arma::mat& A, 9 | const arma::mat& B, double e, bool poisson); 10 | arma::vec cost_sparse (const arma::sp_mat& X, const arma::mat& A, 11 | const arma::mat& B, double e, bool poisson); 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /src/misc.cpp: -------------------------------------------------------------------------------- 1 | #include "misc.h" 2 | 3 | using namespace arma; 4 | 5 | // FUNCTION DECLARATIONS 6 | // --------------------- 7 | void le_diff (const vec& x, vec& y); 8 | 9 | // FUNCTION DEFINITIONS 10 | // -------------------- 11 | // Compute, for each row of X, the "least extreme" differences. This 12 | // should output the same result as t(apply(X,1,le.diff)), but faster. 
13 | // 14 | // [[Rcpp::depends(RcppArmadillo)]] 15 | // [[Rcpp::export]] 16 | arma::mat le_diff_rcpp (const arma::mat& X) { 17 | unsigned int n = X.n_rows; 18 | unsigned int m = X.n_cols; 19 | mat Y(n,m); 20 | vec x(m); 21 | vec y(m); 22 | for (unsigned int i = 0; i < n; i++) { 23 | x = trans(X.row(i)); 24 | le_diff(x,y); 25 | Y.row(i) = trans(y); 26 | } 27 | return Y; 28 | } 29 | 30 | // This is used to implement x_over_tcrossprod. 31 | // 32 | // [[Rcpp::export]] 33 | arma::vec x_over_crossprod_rcpp (const arma::vec& i, const arma::vec& j, 34 | const arma::vec& x, const arma::mat& A, 35 | const arma::mat& B, double e) { 36 | unsigned int n = x.n_elem; 37 | vec y = x; 38 | for (unsigned int t = 0; t < n; t++) 39 | y(t) /= (dot(A.col(i(t)),B.col(j(t))) + e); 40 | return y; 41 | } 42 | 43 | // For vector x, return a vector of the same length y containing the 44 | // "least extreme" differences y(i) = x(i) - x(j), in which j is the 45 | // index not equal to i such that abs(x(i) - x(j)) is the smallest 46 | // possible. 47 | void le_diff (const vec& x, vec& y) { 48 | unsigned int n = x.n_elem; 49 | if (n == 2) { 50 | y(0) = x(0) - x(1); 51 | y(1) = -y(0); 52 | } else { 53 | uvec indices = sort_index(x); 54 | unsigned int i, j, k; 55 | double a, b; 56 | i = indices(0); 57 | j = indices(1); 58 | y(i) = x(i) - x(j); 59 | i = indices(n-1); 60 | j = indices(n-2); 61 | y(i) = x(i) - x(j); 62 | for (unsigned int t = 1; t < n-1; t++) { 63 | i = indices(t-1); 64 | j = indices(t); 65 | k = indices(t+1); 66 | a = x(j) - x(i); 67 | b = x(k) - x(j); 68 | if (a <= b) 69 | y(j) = x(j) - x(i); 70 | else 71 | y(j) = x(j) - x(k); 72 | } 73 | } 74 | } 75 | 76 | // Return the row indices of the nonzeros in the jth column of sparse 77 | // matrix A. This is the same as 78 | // 79 | // i = find(A.col(j)) 80 | // 81 | // but this code does not compile in some versions of gcc, so I 82 | // re-implemented this code here. 
Vector i must already have been 83 | // initialized with the proper length, e.g., by doing 84 | // 85 | // vec a = nonzeros(A.col(j)); 86 | // unsigned int n = a.n_elem; 87 | // uvec i(n); 88 | // getcolnonzeros(A,i,j); 89 | // 90 | void getcolnonzeros (const sp_mat& A, uvec& i, unsigned int j) { 91 | sp_mat::const_col_iterator ai = A.begin_col(j); 92 | sp_mat::const_col_iterator an = A.end_col(j); 93 | for (unsigned int t = 0; ai != an; ++ai, ++t) 94 | i(t) = ai.row(); 95 | } 96 | 97 | // Scale each column A[,i] by b[i]. 98 | void scalecols (mat& A, const vec& b) { 99 | rowvec c = trans(b); 100 | A.each_row() %= c; 101 | } 102 | 103 | // Normalize each row of A so that the entries in each row sum to 1. 104 | void normalizerows (mat& A) { 105 | vec b = conv_to::from(sum(A,1)); 106 | A.each_col() /= b; 107 | } 108 | 109 | // Normalize each column of A so that the entries in each column sum to 1. 110 | void normalizecols (mat& A) { 111 | rowvec b = sum(A,0); 112 | A.each_row() /= b; 113 | } 114 | 115 | // Scale each row of A so that the largest entry in each row is 1. 116 | void normalizerowsbymax (mat& A) { 117 | vec b = conv_to::from(max(A,1)); 118 | A.each_col() /= b; 119 | } 120 | -------------------------------------------------------------------------------- /src/misc.h: -------------------------------------------------------------------------------- 1 | #ifndef INCLUDE_MISC 2 | #define INCLUDE_MISC 3 | 4 | #include 5 | 6 | #define maximum(a,b) ((a) > (b) ? (a) : (b)) 7 | #define minimum(a,b) ((a) < (b) ? 
(a) : (b)) 8 | 9 | // FUNCTION DECLARATIONS 10 | // --------------------- 11 | void getcolnonzeros (const arma::sp_mat& A, arma::uvec& i, unsigned int j); 12 | void scalecols (arma::mat& A, const arma::vec& b); 13 | void normalizerows (arma::mat& A); 14 | void normalizecols (arma::mat& A); 15 | void normalizerowsbymax (arma::mat& A); 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /src/mixem.cpp: -------------------------------------------------------------------------------- 1 | #include "mixem.h" 2 | #include "misc.h" 3 | 4 | using namespace arma; 5 | 6 | // FUNCTION DECLARATIONS 7 | // --------------------- 8 | void mixem_update (const arma::mat& L1, const arma::vec& w, 9 | arma::vec& x, arma::mat& P); 10 | 11 | // FUNCTION DEFINITIONS 12 | // -------------------- 13 | // This is mainly used for testing the mixem C++ function. 14 | // 15 | // [[Rcpp::depends(RcppArmadillo)]] 16 | // [[Rcpp::export]] 17 | arma::vec mixem_rcpp (const arma::mat& L, const arma::vec& w, 18 | const arma::vec& x0, unsigned int numiter) { 19 | return mixem(L,w,x0,numiter); 20 | } 21 | 22 | // Compute a maximum-likelihood estimate (MLE) of the mixture 23 | // proportions in the multinomial mixture model by iterating the EM 24 | // updates for a fixed number of iterations. 25 | // 26 | // Input argument L is an n x m matrix with non-negative entries; 27 | // input w is a vector of length n containing a non-negative "weight" 28 | // associated with each row of L; input argument x0 is the initial 29 | // estimate of the mixture proportions; input P is a matrix of the 30 | // same dimension as L, and is used to store the posterior mixture 31 | // assignment probabilities; and input "numiter" specifies the number 32 | // of EM updates to perform. 33 | // 34 | // The return value is a vector of length m containing the updated 35 | // mixture proportions. 
36 | // 37 | // Note that x0 and L need not be normalized; they will automatically 38 | // be normalized inside this function. 39 | // 40 | // Also note that it does not make sense to compute a MLE of the 41 | // mixture proportions when n < 2 and/or when m < 2; mixem will supply 42 | // a result in such cases, but the result will not be valid. 43 | vec mixem (const mat& L, const vec& w, const vec& x0, unsigned int numiter) { 44 | mat L1 = L; 45 | mat P = L; 46 | vec x = x0; 47 | normalizecols(L1); 48 | mixem(L1,w,x,P,numiter); 49 | return x; 50 | } 51 | 52 | // Use this variant of mixem if you plan on using the same L matrix 53 | // multiple times, or for calling mixem multiple times with matrices 54 | // of the same dimension. In the first case, you can reuse the L1 and 55 | // P matrices; in the latter case, you can reuse the P matrix. 56 | // 57 | // For the result to be valid, the matrix L1 should be normalized 58 | // beforehand so that each column sums to 1. P should be a matrix of 59 | // the same size as L1. 60 | // 61 | // Note that x need not be normalized; it will automatically be 62 | // normalized inside this function. 63 | // 64 | // Also note that in this mixem variant, L1 and w do not need to 65 | // contain all the data; any rows of L1 associated with zero weights 66 | // have no effect, so only the vector of nonzero weights w, and the 67 | // rows of L1 associated with those weights, need to be supplied. 68 | void mixem (const mat& L1, const vec& w, vec& x, mat& P, 69 | unsigned int numiter) { 70 | for (unsigned int i = 0; i < numiter; i++) 71 | mixem_update(L1,w,x,P); 72 | } 73 | 74 | // Perform a single EM update. For this update to be valid, the matrix 75 | // L1 should be normalized beforehand so that each column sums to 1. 76 | // Note that x need not be normalized; it will automatically be 77 | // normalized inside this function. 
78 | void mixem_update (const mat& L1, const vec& w, vec& x, mat& P) { 79 | double e = 1e-15; 80 | 81 | // Normalize the "weights". 82 | vec w1 = w/sum(w); 83 | 84 | // Normalize the mixture proportions. 85 | x /= sum(x); 86 | 87 | // Compute the posterior mixture assignment probabilities. A small 88 | // number is added to the posterior probabilities to prevent any 89 | // divisions by zero. This is the "E step". 90 | P = L1; 91 | scalecols(P,x); 92 | normalizerowsbymax(P); 93 | P += e; 94 | normalizerows(P); 95 | 96 | // Update the mixture weights. This is the "M step". 97 | x = trans(P) * w1; 98 | } 99 | 100 | // Find the maximum-likelihood estimate (MLE) for the special case 101 | // when only one of the weights (w) is positive. Here, L1 should be 102 | // the column-normalized matrix, and i should be the index of the 103 | // nonzero weight. 104 | void mixture_one_nonzero (const mat& L1, unsigned int i, vec& x) { 105 | unsigned int j = index_max(L1.row(i)); 106 | x.fill(0); 107 | x(j) = 1; 108 | } 109 | -------------------------------------------------------------------------------- /src/mixem.h: -------------------------------------------------------------------------------- 1 | #ifndef INCLUDE_MIXEM 2 | #define INCLUDE_MIXEM 3 | 4 | #include 5 | 6 | // FUNCTION DECLARATIONS 7 | // --------------------- 8 | arma::vec mixem (const arma::mat& L, const arma::vec& w, const arma::vec& x0, 9 | unsigned int numiter); 10 | 11 | void mixem (const arma::mat& L1, const arma::vec& w, arma::vec& x, 12 | arma::mat& P, unsigned int numiter); 13 | 14 | void mixture_one_nonzero (const arma::mat& L1, unsigned int i, arma::vec& x); 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /src/poismix.h: -------------------------------------------------------------------------------- 1 | #ifndef INCLUDE_POISMIXEM 2 | #define INCLUDE_POISMIXEM 3 | 4 | #include 5 | 6 | // FUNCTION DECLARATIONS 7 | // --------------------- 8 | arma::vec 
poismixem (const arma::mat& L, const arma::vec& w, 9 | const arma::vec& x0, unsigned int numiter); 10 | 11 | void poismixem (const arma::mat& L1, const arma::vec& u, const arma::vec& w, 12 | arma::vec& x, arma::mat& P, unsigned int numiter); 13 | 14 | void poismixem (const arma::mat& L1, const arma::vec& u, const arma::vec& w, 15 | const arma::uvec& i, arma::vec& x, unsigned int numiter); 16 | 17 | arma::vec scd_kl_update (const arma::mat& L, const arma::vec& w, 18 | const arma::vec& x0, unsigned int numiter, 19 | double e); 20 | 21 | arma::vec scd_kl_update (const arma::mat& L, const arma::vec& u, 22 | const arma::vec& w, const arma::vec& x0, 23 | unsigned int numiter, double e); 24 | 25 | arma::vec ccd_kl_update (const arma::mat& L, const arma::vec& w, 26 | const arma::vec& x0, double e); 27 | 28 | arma::vec ccd_kl_update (const arma::mat& L, const arma::vec& u, 29 | const arma::vec& w, const arma::vec& x0, 30 | double e); 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(fastTopics) 3 | test_check("fastTopics") 4 | -------------------------------------------------------------------------------- /tests/testthat/test_fit_multinom_model.R: -------------------------------------------------------------------------------- 1 | context("fit_multinom_model") 2 | 3 | test_that("fit_multinom_model gives correct factor estimates",{ 4 | 5 | # Simulate a "toy" gene expression data set. 6 | set.seed(1) 7 | n <- 400 8 | m <- 40 9 | k <- 3 10 | out <- simulate_toy_gene_data(n,m,k,s = 1000) 11 | X <- out$X 12 | Y <- as(X,"CsparseMatrix") 13 | 14 | # Force "hard" topic assignments. 15 | cluster <- factor(apply(force_hard_topic_assignments(out$L),1,which.max)) 16 | levels(cluster) <- paste0("k",1:k) 17 | 18 | # Fit the simple multinomial model. 
19 | fit1 <- fit_multinom_model(cluster,X) 20 | fit2 <- fit_multinom_model(cluster,Y) 21 | 22 | # Both calls to fit_multinom_model should result in nearly the same 23 | # loadings. 24 | expect_equal(fit1$L,fit2$L,scale = 1,tolerance = 1e-15) 25 | 26 | # Check that both calls to fit_multinom_model recover the 27 | # maximum-likelihood estimates (MLEs) of the factors (F) and "size 28 | # factors" (s). 29 | s <- rowSums(X) 30 | F <- matrix(0,m,k) 31 | for (j in 1:k) { 32 | i <- which(cluster == levels(cluster)[j]) 33 | F[,j] <- colSums(X[i,])/sum(X[i,]) 34 | } 35 | expect_equal(fit1$s,s,scale = 1,tolerance = 1e-6) 36 | expect_equal(fit2$s,s,scale = 1,tolerance = 1e-6) 37 | expect_equivalent(fit1$F,F,scale = 1,tolerance = 1e-15) 38 | expect_equivalent(fit2$F,F,scale = 1,tolerance = 1e-15) 39 | }) 40 | 41 | -------------------------------------------------------------------------------- /tests/testthat/test_fit_topic_model.R: -------------------------------------------------------------------------------- 1 | context("fit_poisson_nmf") 2 | 3 | test_that("fit_topic_model successfully fits a multinomial topic model",{ 4 | 5 | # Generate a 80 x 100 sparse count matrix to factorize. 6 | set.seed(1) 7 | out <- simulate_count_data(80,100,k = 3,sparse = TRUE) 8 | X <- out$X 9 | 10 | # Fit a multinomial topic model to these data. 11 | capture.output(fit <- fit_topic_model(X,k = 3)) 12 | expect_s3_class(fit,"multinom_topic_model_fit") 13 | expect_s3_class(summary(fit),"summary.multinom_topic_model_fit") 14 | }) 15 | -------------------------------------------------------------------------------- /tests/testthat/test_mixem.R: -------------------------------------------------------------------------------- 1 | context("mixem") 2 | 3 | test_that("mixem and mixem_rcpp produce same result",{ 4 | 5 | # Generate small data set. 
6 | set.seed(1) 7 | out <- generate_poismix_data(100,c(1,2,0,0,0,4,0,0)) 8 | L <- out$L 9 | w <- out$w 10 | 11 | # Run 100 EM updates for the multinomial mixture model. The R and 12 | # C++ implementations should give nearly the same result. 13 | m <- ncol(L) 14 | x0 <- runif(m) 15 | x1 <- mixem(L,w,x0,100) 16 | x2 <- drop(mixem_rcpp(L,w,x0,100)) 17 | expect_equal(x1,x2,tolerance = 1e-12,scale = 1) 18 | }) 19 | 20 | test_that("mixem and mixem_rcpp produce correct result when sum(w > 0) = 1",{ 21 | 22 | # Generate the data set. 23 | set.seed(1) 24 | n <- 10 25 | out <- generate_poismix_data(n,c(1,2,0,0)) 26 | L <- out$L 27 | w <- rep(0,n) 28 | w[8] <- 2 29 | 30 | # Get the exact solution. 31 | x0 <- mixture.one.nonzero(L,w) 32 | 33 | # Run 20 EM updates for the multinomial mixture model. 34 | x1 <- mixem(L,w,x0,20) 35 | x2 <- drop(mixem_rcpp(L,w,x0,20)) 36 | 37 | # The solution should not change much after running the EM updates. 38 | expect_equal(x0,x1,tolerance = 1e-12,scale = 1) 39 | expect_equal(x0,x2,tolerance = 1e-12,scale = 1) 40 | }) 41 | -------------------------------------------------------------------------------- /tests/testthat/test_plots.R: -------------------------------------------------------------------------------- 1 | context("plots") 2 | 3 | test_that("Test that plot_loglik_vs_rank works",{ 4 | set.seed(1) 5 | dat <- generate_test_data(80,100,3) 6 | X <- dat$X 7 | 8 | # Fit matrix factorizations with rank k = 2, 3, 5, 10. 9 | capture.output(fit2 <- fit_poisson_nmf(X,k = 2,numiter = 100)) 10 | capture.output(fit3 <- fit_poisson_nmf(X,k = 3,numiter = 100)) 11 | capture.output(fit5 <- fit_poisson_nmf(X,k = 5,numiter = 100)) 12 | capture.output(fit10 <- fit_poisson_nmf(X,k = 10,numiter = 100)) 13 | 14 | # Plot log-likelihood vs. rank. 
15 | p1 <- plot_loglik_vs_rank(list(fit2,fit3,fit5,fit10)) 16 | p2 <- plot_loglik_vs_rank(lapply(list(fit2,fit3,fit5,fit10), 17 | poisson2multinom)) 18 | expect_s3_class(p1,"ggplot") 19 | expect_s3_class(p2,"ggplot") 20 | }) 21 | 22 | test_that("Test that pca_plot and pca_hexbin_plot work",{ 23 | set.seed(1) 24 | k <- 3 25 | X <- simulate_toy_gene_data(n = 400,m = 40,k = k,s = 1000)$X 26 | capture.output(fit1 <- fit_poisson_nmf(X,k = k,numiter = 100, 27 | control = list(extrapolate = TRUE))) 28 | fit2 <- poisson2multinom(fit1) 29 | 30 | # Test pca_plot. 31 | p1 <- pca_plot(fit1) 32 | p2 <- pca_plot(fit2) 33 | p3 <- pca_hexbin_plot(fit2) 34 | p4 <- pca_hexbin_plot(fit2) 35 | expect_s3_class(p1,"ggplot") 36 | expect_s3_class(p2,"ggplot") 37 | expect_s3_class(p3,"ggplot") 38 | expect_s3_class(p4,"ggplot") 39 | 40 | # Test the other variants of pca_plot. 41 | y <- factor(apply(fit2$L,1,which.max)) 42 | levels(y) <- paste0("k",1:k) 43 | p5 <- pca_plot(fit1,fill = "none") 44 | p6 <- pca_plot(fit1,fill = fit2$L[,1]) 45 | p7 <- pca_plot(fit1,fill = y) 46 | expect_s3_class(p5,"ggplot") 47 | expect_s3_class(p6,"ggplot") 48 | expect_s3_class(p7,"ggplot") 49 | }) 50 | 51 | test_that("Test that other plotting functions work",{ 52 | set.seed(1) 53 | dat <- generate_test_data(200,100,3) 54 | X <- dat$X 55 | capture.output(fit0 <- init_poisson_nmf(X,k = 3)) 56 | capture.output( 57 | fit1 <- fit_poisson_nmf(X,fit0 = fit0,numiter = 50,method = "scd", 58 | control = list(extrapolate = TRUE))) 59 | capture.output( 60 | fit2 <- fit_poisson_nmf(X,fit0 = fit0,numiter = 50,method = "em", 61 | control = list(extrapolate = TRUE))) 62 | 63 | # Test plot_progress. 64 | plot_progress(list(scd = fit1,em = fit2),y = "loglik") 65 | plot_progress(list(scd = fit1,em = fit2),y = "dev") 66 | plot_progress(list(scd = fit1,em = fit2),y = "res") 67 | plot_progress(list(scd = poisson2multinom(fit1),em = poisson2multinom(fit2))) 68 | 69 | # Test loadings_plot. 
70 | x <- factor(sample(1:4,200,replace = TRUE)) 71 | p1 <- loadings_plot(fit1,x) 72 | p2 <- loadings_plot(poisson2multinom(fit1),x) 73 | expect_s3_class(p1,"ggplot") 74 | expect_s3_class(p2,"ggplot") 75 | 76 | # Test structure_plot. 77 | grouping <- factor(apply(poisson2multinom(fit1)$L,1,which.max)) 78 | capture.output(y <- drop(tsne_from_topics(poisson2multinom(fit1),dims = 1))) 79 | capture.output(p1 <- structure_plot(fit1)) 80 | capture.output(p2 <- structure_plot(fit1,grouping = grouping,gap = 5)) 81 | capture.output(p3 <- structure_plot(poisson2multinom(fit1)$L)) 82 | capture.output(p4 <- structure_plot(fit1$L)) 83 | capture.output(p5 <- structure_plot(fit1,loadings_order = order(y))) 84 | expect_s3_class(p1,"ggplot") 85 | expect_s3_class(p2,"ggplot") 86 | expect_s3_class(p3,"ggplot") 87 | expect_s3_class(p4,"ggplot") 88 | expect_s3_class(p5,"ggplot") 89 | 90 | # Test the "plot" S3 method (which creates a Structure plot). 91 | fit2 <- poisson2multinom(fit1) 92 | capture.output(p1 <- plot(fit1)) 93 | capture.output(p2 <- plot(fit2)) 94 | expect_s3_class(p1,"ggplot") 95 | expect_s3_class(p2,"ggplot") 96 | 97 | skip_if(on_cran) 98 | 99 | # Test tsne_plot and umap_plot. 100 | capture.output(p1 <- tsne_plot(fit1,fill = "loading")) 101 | capture.output(p2 <- tsne_plot(fit2,fill = "loading")) 102 | capture.output(p3 <- umap_plot(fit1,fill = "loading",verbose = FALSE)) 103 | capture.output(p4 <- umap_plot(fit2,fill = "loading",verbose = FALSE)) 104 | expect_s3_class(p1,"ggplot") 105 | expect_s3_class(p2,"ggplot") 106 | expect_s3_class(p3,"ggplot") 107 | expect_s3_class(p4,"ggplot") 108 | }) 109 | -------------------------------------------------------------------------------- /tests/testthat/test_poismix.R: -------------------------------------------------------------------------------- 1 | context("poismix") 2 | 3 | test_that("poismixem and poismixem_rcpp produce same result",{ 4 | 5 | # Generate small data set. 
6 | set.seed(1) 7 | out <- generate_poismix_data(100,c(1,2,0,0,0,4,0,0)) 8 | L <- out$L 9 | w <- out$w 10 | 11 | # Run 100 EM updates for the Poisson mixture model. The R 12 | # implementation, and all variations of the C++ implementation, 13 | # should give nearly the same result. 14 | numiter <- 100 15 | m <- ncol(L) 16 | L1 <- normalize.cols(L) 17 | u <- colSums(L) 18 | i <- which(w > 0) 19 | x0 <- runif(m) 20 | x1 <- poismixem(L,w,x0,numiter) 21 | x2 <- drop(poismixem_rcpp(L,w,x0,numiter)) 22 | x3 <- drop(poismixem2_rcpp(L1,w,u,x0,numiter)) 23 | x4 <- drop(poismixem3_rcpp(L1,w[i],u,i-1,x0,numiter)) 24 | expect_equal(x1,x2,tolerance = 1e-14,scale = 1) 25 | expect_equal(x1,x3,tolerance = 1e-14,scale = 1) 26 | expect_equal(x1,x4,tolerance = 1e-14,scale = 1) 27 | }) 28 | 29 | test_that(paste("poismixem, scd_kl_update and ccd_kl_update give nearly the", 30 | "same solution"),{ 31 | 32 | # Generate small data set. 33 | set.seed(1) 34 | out <- generate_poismix_data(100,c(1,2,0,0,0,4,0,0)) 35 | L <- out$L 36 | w <- out$w 37 | 38 | # Run 10,000 EM updates. 39 | m <- ncol(L) 40 | x0 <- runif(m) 41 | x1 <- drop(poismixem_rcpp(L,w,x0,1e4)) 42 | 43 | # Run 100 sequential coordinate descent (SCD) updates, using both 44 | # C++ interfaces. 45 | numiter <- 100 46 | L1 <- normalize.cols(L) 47 | u <- colSums(L) 48 | i <- which(w > 0) 49 | x2 <- drop(scd_kl_update_rcpp(L,w,x0,numiter,1e-15)) 50 | x3 <- drop(scd_kl_update2_rcpp(L[i,],u,w[i],x0,numiter,1e-15)) 51 | 52 | # Run 100 cyclic coordinate descent (CCD) updates, using both C++ 53 | # interfaces. 54 | x4 <- drop(ccd_kl_update_rcpp(L,w,x0,numiter,1e-15)) 55 | x5 <- drop(ccd_kl_update2_rcpp(L[i,],u,w[i],x0,numiter,1e-15)) 56 | 57 | # The coordinatewise updates should recover nearly the same solution 58 | # as mix-SQP, and should give the same results whether the "dense" 59 | # or "sparse" updates are used. 
60 | expect_equal(x1,x2,tolerance = 1e-5,scale = 1) 61 | expect_equal(x1,x4,tolerance = 1e-5,scale = 1) 62 | expect_equal(x2,x3,tolerance = 1e-14,scale = 1) 63 | expect_equal(x4,x5,tolerance = 1e-14,scale = 1) 64 | }) 65 | 66 | test_that(paste("poismixem and poismixem_rcpp produce correct result", 67 | "when sum(w > 0) = 1"),{ 68 | 69 | # Generate the data set. 70 | set.seed(1) 71 | n <- 10 72 | out <- generate_poismix_data(n,c(1,2,0,0)) 73 | L <- out$L 74 | w <- rep(0,n) 75 | i <- 8 76 | w[i] <- 2 77 | 78 | # Run 100 EM updates for the Poisson mixture model. 79 | numiter <- 100 80 | m <- ncol(L) 81 | x0 <- runif(m) 82 | x1 <- poismixem(L,w,x0,numiter) 83 | 84 | # Run the 100 EM updates a few more times, using the different C++ 85 | # interfaces. 86 | L1 <- normalize.cols(L) 87 | u <- colSums(L) 88 | x2 <- drop(poismixem_rcpp(L,w,x0,numiter)) 89 | x3 <- drop(poismixem2_rcpp(L1,w,u,x0,numiter)) 90 | x4 <- drop(poismixem3_rcpp(L1,w[i],u,i-1,x0,numiter)) 91 | 92 | # The R and C++ implementations should give nearly the same result, 93 | # and should be very close to the exact solution obtained by calling 94 | # poismix.one.nonzero. 
95 | x5 <- poismix.one.nonzero(L,w) 96 | expect_equal(x1,x2,tolerance = 1e-12,scale = 1) 97 | expect_equal(x1,x3,tolerance = 1e-12,scale = 1) 98 | expect_equal(x1,x4,tolerance = 1e-12,scale = 1) 99 | expect_equal(x1,x5,tolerance = 1e-12,scale = 1) 100 | }) 101 | -------------------------------------------------------------------------------- /tests/testthat/test_poisson2multinom.R: -------------------------------------------------------------------------------- 1 | context("poisson2multinom") 2 | 3 | test_that("poisson2multinom gives error when k = 1",{ 4 | L <- matrix(0:3,4,1) 5 | F <- matrix(0:4,5,1) 6 | fit <- list(F = F,L = L) 7 | expect_error(poisson2multinom(fit)) 8 | }) 9 | 10 | test_that("poisson2multinom correctly scales factors and loadings",{ 11 | L <- matrix(0:7,4,2) 12 | F <- matrix(0:9,5,2) 13 | rownames(L) <- paste0("i",1:4) 14 | rownames(F) <- paste0("j",1:5) 15 | colnames(L) <- paste0("k",1:2) 16 | colnames(F) <- paste0("k",1:2) 17 | fit <- list(F = F,L = L) 18 | class(fit) <- c("poisson_nmf_fit","list") 19 | fit <- poisson2multinom(fit) 20 | expect_equivalent(colSums(fit$F),c(1,1)) 21 | expect_equivalent(rowSums(fit$L),c(1,1,1,1)) 22 | }) 23 | 24 | test_that("multinom2poisson recovers original Poisson NMF model fit",{ 25 | set.seed(1) 26 | out <- simulate_count_data(10,20,3) 27 | X <- out$X 28 | fit1 <- iterate_updates(X,out$F,out$L,100, 29 | function (X,F,L) t(betanmf_update_factors(X,L,t(F))), 30 | function (X,F,L) betanmf_update_loadings(X,L,t(F))) 31 | class(fit1) <- c("poisson_nmf_fit","list") 32 | fit2 <- poisson2multinom(fit1) 33 | fit2a <- fit2 34 | fit2a$s <- NULL 35 | fit3 <- multinom2poisson(fit2) 36 | fit4 <- multinom2poisson(fit2a,X) 37 | fit5 <- multinom2poisson(fit2a,as(X,"CsparseMatrix")) 38 | Y1 <- with(fit1,tcrossprod(L,F)) 39 | Y3 <- with(fit3,tcrossprod(L,F)) 40 | Y4 <- with(fit4,tcrossprod(L,F)) 41 | Y5 <- with(fit5,tcrossprod(L,F)) 42 | f1 <- loglik_poisson_nmf(X,fit1) 43 | f3 <- loglik_poisson_nmf(X,fit3) 44 | f4 <- 
loglik_poisson_nmf(X,fit4) 45 | f5 <- loglik_poisson_nmf(X,fit5) 46 | expect_equal(Y1,Y3,tolerance = 1e-15,scale = 1) 47 | expect_equal(Y1,Y4,tolerance = 1e-15,scale = 1) 48 | expect_equal(Y1,Y5,tolerance = 1e-15,scale = 1) 49 | expect_equal(f1,f3,tolerance = 1e-14,scale = 1) 50 | expect_equal(f1,f4,tolerance = 1e-14,scale = 1) 51 | expect_equal(f1,f5,tolerance = 1e-14,scale = 1) 52 | }) 53 | -------------------------------------------------------------------------------- /tests/testthat/test_select.R: -------------------------------------------------------------------------------- 1 | context("select") 2 | 3 | test_that(paste("Select S3 method correctly subsets and re-orders the", 4 | "factors and loadings in a small example; also check ", 5 | "merge_topics"),{ 6 | 7 | # Generate a 80 x 100 data matrix to factorize. 8 | set.seed(1) 9 | n <- 80 10 | m <- 100 11 | k <- 3 12 | X <- generate_test_data(n,m,k)$X 13 | 14 | # Run 20 EM updates. 15 | capture.output( 16 | fit <- poisson2multinom(fit_poisson_nmf(X,k = k,numiter = 20, 17 | method = "em"))) 18 | 19 | # Select and re-order factors and loadings by number (here, we use 20 | # the "select_loadings" function). 21 | n0 <- 40 22 | rows <- sample(n,n0) 23 | fit1 <- select_loadings(fit,rows) 24 | 25 | # Select and re-order factors and loadings by name (here, we use the 26 | # "select" S3 method). 27 | rows <- rownames(X)[rows] 28 | fit2 <- select(fit,rows) 29 | 30 | # Check the outputted Poisson NMF fits. 31 | expect_equal(dim(fit1$L),c(n0,k)) 32 | expect_equal(dim(fit2$L),c(n0,k)) 33 | expect_equal(length(fit1$s),n0) 34 | expect_equal(length(fit2$s),n0) 35 | expect_equal(rownames(fit2$L),rows) 36 | expect_equal(names(fit2$s),rows) 37 | 38 | # An error is thrown when the selected loadings do not exist. 39 | expect_error(select(fit,loadings = n + 1)) 40 | 41 | # Check that merge_topics does the right thing. 
42 | fit3 <- merge_topics(fit2,k = 1:2) 43 | fit4 <- merge_topics(fit2,k = c("k1","k2")) 44 | expect_equal(dim(fit3$F),c(100,2)) 45 | expect_equal(dim(fit3$L),c(40,2)) 46 | expect_equal(colnames(fit3$F),c("k3","k1+k2")) 47 | expect_equal(colnames(fit3$L),c("k3","k1+k2")) 48 | expect_equal(fit3,fit4) 49 | }) 50 | 51 | test_that(paste("select followed by poisson2multinom gives the same result", 52 | "as poisson2multinom followed by select"),{ 53 | 54 | # Generate a 80 x 100 data matrix to factorize. 55 | set.seed(1) 56 | n <- 80 57 | m <- 100 58 | k <- 3 59 | X <- generate_test_data(n,m,k)$X 60 | 61 | # Run 20 EM updates. 62 | capture.output(fit <- fit_poisson_nmf(X,k = k,numiter = 20,method = "em")) 63 | rows <- sample(n,20) 64 | 65 | # Check that select followed by poisson2multinom gives the same 66 | # result as poisson2multinom followed by select. 67 | fit1 <- poisson2multinom(select(fit,rows)) 68 | fit2 <- select(poisson2multinom(fit),rows) 69 | expect_equal(fit1,fit2,scale = 1,tolerance = 1e-15) 70 | }) 71 | -------------------------------------------------------------------------------- /tests/testthat/test_summary.R: -------------------------------------------------------------------------------- 1 | context("summary") 2 | 3 | test_that("summary method and print.summary methods produce output",{ 4 | 5 | # Generate a 80 x 100 data matrix to factorize. 6 | set.seed(1) 7 | out <- generate_test_data(80,100,3) 8 | X <- out$X 9 | 10 | # Fit a Poisson non-negative factorization. 11 | capture.output(fit <- fit_poisson_nmf(X,k = 3,numiter = 100)) 12 | 13 | # Produce summaries of the model fit. 
14 | expect_output(print(summary(fit))) 15 | expect_output(print(summary(poisson2multinom(fit)))) 16 | expect_output(print(summary(fit),show.mixprops = TRUE, 17 | show.topic.reps = TRUE)) 18 | expect_output(print(summary(poisson2multinom(fit)), 19 | show.size.factors = TRUE, 20 | show.mixprops = TRUE, 21 | show.topic.reps = TRUE)) 22 | }) 23 | --------------------------------------------------------------------------------