├── .gitignore ├── Alphadiversity.Rmd ├── Alphadiversity.html ├── Atlas.Rmd ├── Atlas.html ├── Betadiversity.Rmd ├── Betadiversity.html ├── Bimodality.Rmd ├── Bimodality.html ├── Clustering.Rmd ├── Clustering.html ├── CoAbundantTaxa.Rmd ├── CoAbundantTaxa.html ├── Comparisons.Rmd ├── Comparisons.html ├── Composition.Rmd ├── Composition.html ├── CompositionAmplicondata.html ├── CompositionAmplicondata2.Rmd ├── CompositionAmplicondata2.html ├── Contributing.Rmd ├── Contributing.html ├── Core.Rmd ├── Core.html ├── CoremicrobiotaAmplicon.Rmd ├── CoremicrobiotaAmplicon.html ├── DMM.Rmd ├── DMM.html ├── DOC.Rmd ├── DOC.html ├── Data.R ├── Data.Rmd ├── Data.html ├── Diversity.html ├── Edu.html ├── Edu ├── .gitignore ├── all.Rmd ├── all.html ├── bibliography.bib ├── extra.Rmd ├── extra.html ├── lm.R ├── lm.Rmd ├── lm.html ├── main.R ├── misc.R ├── pooled.R ├── pooled.Rmd ├── pooled.html ├── univariate.Rmd └── univariate.html ├── Experimental.html ├── HITChip.html ├── Heatmap.Rmd ├── Heatmap.html ├── Installation.Rmd ├── Installation.html ├── Interactive.Rmd ├── Interactive.html ├── LICENSE ├── Landscaping.Rmd ├── Landscaping.html ├── Misc ├── CompositionAmplicondata.Rmd ├── CompositionAmplicondata.html ├── CompositionAmplicondata2.Rmd ├── CompositionAmplicondata2.html ├── Experimental.Rmd ├── HITChip.Rmd ├── Probelevel.Rmd ├── Visualization.Rmd └── info.Rmd ├── Mixedmodels.Rmd ├── Mixedmodels.html ├── MovingPictures_MicrobiomeExperiment.rds ├── Negativebinomial.Rmd ├── Negativebinomial.html ├── Networks.Rmd ├── Networks.html ├── Ordination.Rmd ├── Ordination.html ├── Output.Rmd ├── Output.html ├── PERMANOVA.Rmd ├── PERMANOVA.html ├── PlotDiversity.Rmd ├── PlotDiversity.html ├── Preprocessing.Rmd ├── Preprocessing.html ├── Probelevel.html ├── RDA.Rmd ├── RDA.html ├── README.Rmd ├── README.html ├── README.md ├── Regression.Rmd ├── Regression.html ├── Rplotvio1.jpeg ├── Rplotvio2.jpeg ├── Stability.Rmd ├── Stability.html ├── TODO.Rmd ├── TODO.html ├── Template.html ├── TemporalMicrobiotaTrajectory.Rmd ├── TemporalMicrobiotaTrajectory.html ├── Themes.Rmd ├── Themes.html ├── Tutorial.Rmd ├── Tutorial.html ├── Visualization.Rmd ├── Visualization.html ├── _config.yml ├── all.Rmd ├── all.html ├── ancom ├── .gitignore ├── README.md ├── ancom.R ├── ancom.Rmd ├── ancom.md ├── figure │ ├── tops-1.png │ ├── tops-2.png │ └── tops-3.png ├── functions.R └── main.R ├── archive ├── Baxter_FITs_Microbiome_2016_fit.final.tx.1.cons.taxonomy ├── Baxter_FITs_Microbiome_2016_fit.final.tx.1.subsample.shared ├── Baxter_FITs_Microbiome_2016_mapping.csv ├── Crosshyb.Rmd ├── DiversityTable.tab ├── DynamicsIBD.rda ├── F1000 │ └── MyArticle │ │ ├── F1000header.png │ │ ├── MyArticle.Rmd │ │ ├── f1000_styles.sty │ │ ├── frog.jpg │ │ └── sample.bib ├── HITChipAtlas.Rmd ├── Maturity.Rmd ├── Motionchart.Rmd ├── PhylotypeRatios.R ├── ProvasIdata.R ├── RDA │ ├── firstlib2.R │ ├── plot_rda_bagged.R │ └── rda_bagged.R ├── ROC.Rmd ├── affybatch.R ├── anticorrelation.old.R ├── betadiversity.R ├── bicor.R ├── bimodality_sarle.R ├── core_abundance.R ├── core_bootstrap.R ├── core_bootstrap2.R ├── core_members.R ├── core_richness.R ├── crash-handler.conf ├── crosshyb.tab ├── data.R ├── data_old.R ├── debug.R ├── densityplot.R ├── deprecated.R ├── devtools.R ├── get_ordination.R ├── gini.R ├── gini_index.R ├── index.Rmd ├── index.html ├── internal.R ├── legacy.R ├── limma.Rmd ├── linearmodel.R ├── lm_phyloseq.R ├── low_abundance.R ├── lower.triangle.R ├── make.abundancy.table.R ├── maturity.R ├── metadata.xls ├── misc.R ├── misc.Rmd ├── multimodality_phyloseq.R ├── myfile.tab ├── neatsort.R ├── nets.R ├── nets.Rmd ├── old.R ├── peerj32_lipids.csv ├── peerj32_meta.csv ├── peerj32_microbes.csv ├── pet.R ├── pet2.R ├── plot-methods.R ├── plot_diversity.R ├── plot_matrix.R ├── plot_potential.R ├── plot_taxa_prevalence.Rmd ├── potential_analysis.R ├── project.data.R ├── qiita1629.biom ├── qiita1629_mapping.csv ├── qiita1629_mapping_subset.csv ├── qiita1629_otu_table.csv ├── qiita1629_taxonomy_table.csv ├── rare_abundance.R ├── rare_members.R ├── rare_richness.R ├── rda.R ├── rda_bagged.R ├── read.profiling.R ├── read_profiling.R ├── reading.R ├── save.phylogeny.R ├── subject_tables.R ├── summarize.rpa.R ├── summarize.sum.R ├── test.R ├── top_abundance.R ├── validate.R └── vm.R ├── bibliography.bib ├── bioc.css ├── cleaning_taxonomy_table.Rmd ├── cleaning_taxonomy_table.html ├── core_venn.html ├── core_venn.rmd ├── crash-handler-permission ├── crash-handler.conf ├── debug.log ├── deseq2.Rmd ├── deseq2.html ├── figure ├── pooled3-1.png ├── pooled_overdispersion-1.png ├── pooled_pcomp-1.png ├── tail-1.png ├── univariate5-1.png ├── univariate6-1.png ├── univariate_boxplot-1.png └── univariate_densityplot-1.png ├── ieee.csl ├── index.html ├── index.rmd ├── info.html ├── init.R ├── limma.Rmd ├── limma.html ├── main.R ├── misc.Rmd ├── plot_sample_trajectory.R ├── plot_time_trajectory.R ├── post_hoc.Rmd ├── post_hoc.html ├── rstanarm.Rmd └── rstanarm.html /.gitignore: -------------------------------------------------------------------------------- 1 | .RData 2 | Preprocessing.md 3 | R 4 | vignettes 5 | inst 6 | vignettes 7 | R 8 | inst 9 | OAtoken 10 | *_cache 11 | *_files 12 | cache/ 13 | figure/ 14 | *~ 15 | *.md 16 | *.Rproj 17 | docs/* 18 | Temp/* 19 | .Rproj.user 20 | me 21 | wis 22 | .Rhistory -------------------------------------------------------------------------------- /Atlas.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "HITChip Atlas" 3 | author: "Leo Lahti, Sudarshan Shetty et al." 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | BiocStyle::html_document: 8 | number_sections: no 9 | toc: yes 10 | toc_depth: 4 11 | toc_float: true 12 | self_contained: true 13 | thumbnails: true 14 | lightbox: true 15 | gallery: true 16 | use_bookdown: false 17 | highlight: haddock 18 | --- 19 | 24 | 25 | 26 | ## Intestinal microbiota diversity in 1006 western adults 27 | 28 | The data set from [Lahti et al. Nat. Comm. 5:4344, 2014](http://www.nature.com/ncomms/2014/140708/ncomms5344/full/ncomms5344.html) has microbiota profiling of 130 genus-like taxa across 1006 normal western adults from [Data Dryad](http://doi.org/10.5061/dryad.pk75d). Load the data in R: 29 | 30 | ```{r data2, warning=FALSE, message=FALSE} 31 | # Download the required R packages and then the HITChip Atlas data set 32 | library(microbiome) 33 | data(atlas1006) 34 | ``` 35 | 36 | Estimate ecosystem alpha diversity and related indicators for this data set: 37 | 38 | ```{r div-example, warning=FALSE, message=FALSE} 39 | tab <- microbiome::alpha(atlas1006, index = c("shannon", "invsimpson")) 40 | library(knitr) 41 | kable(head(tab)) 42 | ``` 43 | 44 | 45 | -------------------------------------------------------------------------------- /Bimodality.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Bimodality analysis" 3 | author: "Leo Lahti, Sudarshan Shetty et al." 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | BiocStyle::html_document: 8 | number_sections: no 9 | toc: yes 10 | toc_depth: 4 11 | toc_float: true 12 | self_contained: true 13 | thumbnails: true 14 | lightbox: true 15 | gallery: true 16 | use_bookdown: false 17 | highlight: haddock 18 | --- 19 | 25 | 26 | 27 | Get example data - [HITChip Atlas of 130 genus-like taxa across 1006 healthy western adults](http://www.nature.com/ncomms/2014/140708/ncomms5344/full/ncomms5344.html). 28 | 29 | ```{r bistability, message=FALSE} 30 | # Load the example data 31 | library(microbiome) 32 | library(dplyr) 33 | data(atlas1006) 34 | 35 | # Rename the example data 36 | pseq <- atlas1006 37 | 38 | # Focus on specific DNA extraction method 39 | pseq <- pseq %>% subset_samples(DNA_extraction_method == "r") 40 | 41 | # Use relative abundances 42 | pseq <- microbiome::transform(pseq, "compositional") 43 | 44 | # Merge rare taxa to speed up examples 45 | pseq <- aggregate_rare(pseq, level = "Genus", detection = .1/100, prevalence = 10/100) 46 | 47 | # For cross-sectional analysis, include 48 | # only the zero time point: 49 | pseq0 <- subset_samples(pseq, time == 0) 50 | ``` 51 | 52 | 53 | # Bimodality indicators 54 | 55 | Bimodality of the abundance distribution provides an indirect 56 | indicator of bistability, although other explanations such as sampling 57 | biases etc. should be controlled. Multiple bimodality scores are 58 | available. 59 | 60 | 61 | Multimodality score using [potential analysis with 62 | bootstrap](http://www.nature.com/ncomms/2014/140708/ncomms5344/full/ncomms5344.html). Sarle's bimodality coefficient is available as well; and for classical test of unimodality, see the DIP test. 63 | 64 | ```{r bimodality2, message=FALSE, warning=FALSE} 65 | # Bimodality is better estimated from log10 abundances 66 | pseq0.clr <- microbiome::transform(pseq0, "clr") 67 | bimodality <- bimodality(pseq0.clr, method = "potential_analysis", bs.iter = 20) 68 | ``` 69 | 70 | # Visualization 71 | 72 | **Visualize population densities for unimodal and bimodal groups** 73 | 74 | ```{r stability2, message=FALSE, warning=FALSE, fig.width=12, fig.height=5, out.width="500px"} 75 | # Pick the most and least bimodal taxa as examples 76 | unimodal <- names(sort(bimodality))[[1]] 77 | bimodal <- rev(names(sort(bimodality)))[[1]] 78 | 79 | # Visualize population frequencies at the baseline time point 80 | library(ggplot2) 81 | theme_set(theme_bw(20)) 82 | p1 <- plot_density(pseq0.clr, variable = unimodal) 83 | p2 <- plot_density(pseq0.clr, variable = bimodal) 84 | library(gridExtra) 85 | library(ggplot2) 86 | grid.arrange(p1, p2, nrow = 1) 87 | ``` 88 | 89 | 90 | ## Variation lineplot and bimodality hotplot 91 | 92 | Pick subset of the [HITChip Atlas data set](http://doi.org/10.5061/dryad.pk75d) and plot the subject abundance variation lineplot (**Variation tip plot**) and **Bimodality hotplot** for a given taxon as in [Lahti et al. 2014](http://www.nature.com/ncomms/2014/140708/ncomms5344/full/ncomms5344.html). The Dialister has bimodal population distribution and reduced temporal stability within subjects at intermediate abundances. 93 | 94 | For examples on tipping point detection, see 95 | [Stability](Stability). We set the tipping point manually in the 96 | following example. 97 | 98 | ```{r stability-variationplot, message=FALSE, warning=FALSE, fig.show='hold', out.width="430px", eval=FALSE} 99 | # Bimodality hotplot: 100 | # Consider a unique sample from each subject: the baseline time point 101 | p <- hotplot(pseq0, tax, tipping.point = 0.005) 102 | print(p) 103 | 104 | # Visualize bimodality 105 | pv <- plot_tipping(pseq, tax, tipping.point = 0.005) 106 | print(pv) 107 | ``` 108 | 109 | -------------------------------------------------------------------------------- /Clustering.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Cluster analysis" 3 | author: "Leo Lahti, Sudarshan Shetty et al." 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | BiocStyle::html_document: 8 | number_sections: no 9 | toc: yes 10 | toc_depth: 4 11 | toc_float: true 12 | self_contained: true 13 | thumbnails: true 14 | lightbox: true 15 | gallery: true 16 | use_bookdown: false 17 | highlight: haddock 18 | --- 19 | 25 | 26 | 27 | ## Multivariate (infinite) Gaussian mixture model 28 | 29 | Fit and visualize Variational Dirichlet process multivariate infinite Gaussian mixture. This variational version has been partially written in C and it is relatively fast. Kindly cite [this article](http://bioinformatics.oxfordjournals.org/content/26/21/2713.short). Note that the implementation uses diagonal covariances on the Gaussian modes. The C code was partially derived from [Honkela et al. 2008](http://www.sciencedirect.com/science/article/pii/S0925231208000659). 30 | 31 | ```{r LCA2, fig.width=6, fig.height=5, warning=FALSE, message=FALSE, eval=FALSE} 32 | library(netresponse) 33 | 34 | # Generate simulated data 35 | res <- generate.toydata(Dim = 2) 36 | D <- res$data 37 | component.means <- res$means 38 | component.sds <- res$sds 39 | 40 | # Fit the mixture 41 | m <- mixture.model(D, mixture.method = "vdp", pca.basis = FALSE) 42 | 43 | # Plot the data, and indicate estimated modes with colors. 44 | # If data dimensionality exceeds 2, 45 | # the results are visualized on PCA projection 46 | # (with pca.basis = TRUE the data is projected on PCA coordinates; 47 | # without loss of information. This trick can help to avoid overlearning 48 | # as the variational mixture relies 49 | # on diagonal covariance matrices, so the ellipsoidal axes of the 50 | # Gaussian modes are parallel to the coordinate axes.) 51 | p <- PlotMixtureMultivariate(D, means = m$mu, sds = m$sd, ws = m$w, modes = apply(m$qofz,1,which.max)) 52 | plot(p) 53 | ``` 54 | 55 | ## Univariate (infinite) Gaussian mixture model 56 | 57 | Fit and visualize Variational Dirichlet process univariate infinite Gaussian mixture. Kindly cite [this article](http://bioinformatics.oxfordjournals.org/content/26/21/2713.short) for the code. 58 | 59 | ```{r LCA1, fig.width=7, fig.height=5, warning=FALSE, message=FALSE, eval=FALSE} 60 | # Generate simulated bimodal univariate data 61 | x <- c(rnorm(200), rnorm(200, mean = 5)) 62 | 63 | # Variational Dirichlet process univariate Gaussian mixture 64 | m <- mixture.model(x, mixture.method = "vdp", max.responses = 10) 65 | 66 | # Plot the data and estimated modes 67 | p <- PlotMixtureUnivariate(x, means = m$mu, sds = m$sd, ws = m$w, binwidth = 0.1, qofz = m$qofz) 68 | 69 | plot(p) 70 | ``` 71 | 72 | ## Clustering samples with mixed variables 73 | 74 | Gower distance is useful for samples with mixed-type variables (binary, factor, numeric)): 75 | 76 | ```{r clustering-gower, fig.width=10, fig.height=4, warning=FALSE, message=FALSE, eval=FALSE} 77 | # Example data 78 | data("dietswap") 79 | 80 | library(FD) 81 | d <- gowdis(as(sample_data(dietswap), "data.frame")) 82 | 83 | plot(hclust(d)) 84 | ``` 85 | 86 | 87 | -------------------------------------------------------------------------------- /CoAbundantTaxa.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Identify Co-abundant taxa groups" 3 | author: "Leo Lahti, Sudarshan Shetty et al." 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | BiocStyle::html_document: 8 | number_sections: no 9 | toc: yes 10 | toc_depth: 4 11 | toc_float: true 12 | self_contained: true 13 | thumbnails: true 14 | lightbox: true 15 | gallery: true 16 | use_bookdown: false 17 | highlight: haddock 18 | --- 19 | 25 | 26 | **Co-abundant groups of taxa** 27 | 28 | Get data 29 | ```{r cag-1} 30 | library(microbiome) 31 | pseq <- atlas1006 32 | pseq.rel <- microbiome::transform(pseq, "compositional") 33 | ``` 34 | 35 | Identify co-abundant taxa. 36 | ```{r cag-2} 37 | # get median abundances 38 | median_abund = apply(abundances(pseq.rel), MARGIN = 1, FUN = median) 39 | 40 | # select taxa with median abundance of more than or equal to 0.01 41 | abundant_taxa = abundances(pseq)[median_abund >= 0.01, ] 42 | # check how many taxa 43 | nrow(abundant_taxa) 44 | # check which taxa 45 | rownames(abundant_taxa) 46 | ``` 47 | 48 | 49 | Use Spearman's correlation to identify which of the co-abundant groups show correlation. 50 | ```{r cag-1b} 51 | # cor() is from `stats` package 52 | spearman_matrix = cor(t(abundant_taxa), method = "spearman") 53 | # hclust() is from `stats` package 54 | spearman_tree = hclust(dist(spearman_matrix), method = "ward") 55 | plot(spearman_tree) 56 | 57 | ``` 58 | 59 | Using the `heat` function in `microbiome pkg` now compare the correlation with the tree above. 60 | ```{r plot-heat} 61 | spearman_long_df <- reshape2::melt(spearman_matrix) 62 | head(spearman_long_df) 63 | 64 | heat(spearman_long_df, "Var1","Var2", 65 | order.rows = TRUE, order.cols = TRUE) + 66 | theme_bw() + theme(axis.text.x = element_text(angle = 90)) 67 | ``` 68 | 69 | Validation of CAGs 70 | ```{r} 71 | # Validation of CAGs 72 | # Randomly split the dataset, compute a correlation matrix and run a Mantel test 73 | subsample = sample(c(1:300), size = 220, replace = F) 74 | train_data = abundant_taxa[,subsample] 75 | test_data = abundant_taxa[,-subsample] 76 | train_spearman = cor(t(train_data), method = "spearman") 77 | test_spearman = cor(t(test_data), method = "spearman") 78 | vegan::mantel(as.dist(train_spearman), as.dist(test_spearman), permutations=999) 79 | ``` 80 | 81 | Further reading: 82 | 83 | [de la Cuesta-Zuluaga, J., Corrales-Agudelo, V., Velásquez-Mejía, E.P. et al. Gut microbiota is associated with obesity and cardiometabolic disease in a population in the midst of Westernization. Sci Rep 8, 11356 (2018).](https://doi.org/10.1038/s41598-018-29687-x) 84 | 85 | [Claesson, M., Jeffery, I., Conde, S. et al. Gut microbiota composition correlates with diet and health in the elderly. Nature 488, 178–184 (2012).](https://doi.org/10.1038/nature11319) 86 | -------------------------------------------------------------------------------- /Comparisons.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Community comparisons" 3 | author: "Leo Lahti, Sudarshan Shetty et al." 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | BiocStyle::html_document: 8 | number_sections: no 9 | toc: yes 10 | toc_depth: 4 11 | toc_float: true 12 | self_contained: true 13 | thumbnails: true 14 | lightbox: true 15 | gallery: true 16 | use_bookdown: false 17 | highlight: haddock 18 | --- 19 | 25 | 26 | 27 | ## Group-wise comparisons 28 | 29 | A number of methods for microbiota community comparisons have been proposed. For a recent benchmarking study, see [Weiss et al. (2017)](http://doi.org/10.1186/s40168-017-0237-y). For a comprehensive example workflow, see [Callahan et al. F1000 (2017)](https://f1000research.com/articles/5-1492/v2). 30 | 31 | ### Univariate comparisons 32 | 33 | For individual taxa, diversity indicators etc. 34 | 35 | * [Linear mixed effect models](Mixedmodels.html) 36 | * [Negative binomial test](Negativebinomial.html) 37 | * [post-hoc testing For Kruskal-Wallis](post_hoc.html) 38 | 39 | 40 | Other methods, not implemented here (see [Weiss et al. (2017)](http://microbiomejournal.biomedcentral.com/articles/10.1186/s40168-017-0237-y) for a recent survey): 41 | 42 | * [Zero-inflated Gaussians (ZIGs)](https://www.ncbi.nlm.nih.gov/pubmed/24076764/) (see [metagenomeSeq](https://bioconductor.org/packages/release/bioc/html/metagenomeSeq.html) Bioconductor package) 43 | * [DESeq2](deseq2.html) and other advanced methods based on negative binomial 44 | 45 | 46 | ### Multivariate comparisons 47 | 48 | For community-level multivariate comparisons 49 | 50 | * [Multivariate linear models (limma)](limma.html) 51 | * [PERMANOVA](PERMANOVA.html) 52 | 53 | -------------------------------------------------------------------------------- /Contributing.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "How to contribute?" 3 | author: "Leo Lahti, Sudarshan Shetty et al." 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | BiocStyle::html_document: 8 | number_sections: no 9 | toc: yes 10 | toc_depth: 4 11 | toc_float: true 12 | self_contained: true 13 | thumbnails: true 14 | lightbox: true 15 | gallery: true 16 | use_bookdown: false 17 | highlight: haddock 18 | --- 19 | 25 | 26 | 27 | ### Get in touch 28 | 29 | * [Research homepage](http://www.iki.fi/Leo.Lahti) 30 | * [Twitter](https://twitter.com/openreslabs) 31 | * [Gitter](https://gitter.im/microbiome) 32 | * [Mailing list](https://groups.google.com/forum/#!forum/microbiome-devel) (microbiome-devel@googlegroups.com) 33 | 34 | ### Contribute 35 | 36 | * [Issue Tracker](https://github.com/microbiome/microbiome/issues) 37 | * [Pull requests](https://github.com/microbiome/microbiome/) 38 | * [Star us on the Github page](https://github.com/microbiome/microbiome) 39 | 40 | We welcome new examples and tutorials to be added in this collection, 41 | and will fully acknowledge such contributions. 42 | 43 | You can modify one of the existing examples (.html) and make a pull request. Or, you can propose a new tutorial page. The Rmarkdown source code (..html) for all [tutorials](http://microbiome.github.io/tutorials/) is available in the Github [index.page](https://github.com/microbiome/tutorials). Fork the repository, clone it, modify the tutorials, and make a pull request. 44 | 45 | 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /DMM.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Dirichlet Multinomial Mixtures" 3 | author: "Leo Lahti, Sudarshan Shetty et al." 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | BiocStyle::html_document: 8 | number_sections: no 9 | toc: yes 10 | toc_depth: 4 11 | toc_float: true 12 | self_contained: true 13 | thumbnails: true 14 | lightbox: true 15 | gallery: true 16 | use_bookdown: false 17 | highlight: haddock 18 | --- 19 | 25 | 26 | 27 | ## Community typing with Dirichlet Multinomial Mixtures 28 | 29 | [Dirichlet Multinomial Mixtures (DMM)](https://doi.org/10.1371/journal.pone.0030126) (Quince et al. 2012) is a probabilistic method for community typing (or clustering) of microbial community profiling data. It is an infinite mixture model, which means that the method can infer the optimal number of community types. Note that the number of community types is likely to grow with data size. 30 | 31 | 32 | Let us load example data. 33 | 34 | ```{r DMM0, fig.width=6, fig.height=5, warning=FALSE, message=FALSE} 35 | library(microbiome) 36 | library(DirichletMultinomial) 37 | library(reshape2) 38 | library(magrittr) 39 | library(dplyr) 40 | # Load example data 41 | data(dietswap) 42 | pseq <- dietswap 43 | 44 | # To speed up, only consider the core taxa 45 | # that are prevalent at 0.1% relative abundance in 50% of the samples 46 | # (note that this is not strictly correct as information is 47 | # being discarded; one alternative would be to aggregate rare taxa) 48 | pseq.comp <- microbiome::transform(pseq, "compositional") 49 | taxa <- core_members(pseq.comp, detection = 0.1/100, prevalence = 50/100) 50 | pseq <- prune_taxa(taxa, pseq) 51 | 52 | # Pick the OTU count matrix 53 | # and convert it into samples x taxa format 54 | dat <- abundances(pseq) 55 | count <- as.matrix(t(dat)) 56 | ``` 57 | 58 | 59 | Fit the DMM model. Let us set the maximum allowed number of community types to 3 to speed up the example. 60 | 61 | ```{r DMM, fig.width=6, fig.height=5, warning=FALSE, message=FALSE, eval=TRUE} 62 | fit <- lapply(1:3, dmn, count = count, verbose=TRUE) 63 | ``` 64 | 65 | 66 | Check model fit with different number of mixture components using standard information criteria 67 | 68 | ```{r DMMplot, fig.width=6, fig.height=5, warning=FALSE, message=FALSE, eval=TRUE} 69 | lplc <- base::sapply(fit, DirichletMultinomial::laplace) # AIC / BIC / Laplace 70 | aic <- base::sapply(fit, DirichletMultinomial::AIC) # AIC / BIC / Laplace 71 | bic <- base::sapply(fit, DirichletMultinomial::BIC) # AIC / BIC / Laplace 72 | #plot(lplc, type="b", xlab="Number of Dirichlet Components", ylab="Model Fit") 73 | #lines(aic, type="b", lty = 2) 74 | #lines(bic, type="b", lty = 3) 75 | ``` 76 | 77 | Pick the optimal model 78 | 79 | ```{r DMM3, fig.width=6, fig.height=5, warning=FALSE, message=FALSE,error=FALSE, eval=TRUE} 80 | best <- fit[[which.min(unlist(lplc))]] 81 | ``` 82 | 83 | Mixture parameters pi and theta 84 | 85 | ```{r DMM4, fig.width=6, fig.height=5, warning=FALSE, message=FALSE, eval=TRUE} 86 | mixturewt(best) 87 | ``` 88 | 89 | Sample-component assignments 90 | 91 | ```{r DMM5, fig.width=6, fig.height=5, warning=FALSE, message=FALSE, eval=TRUE} 92 | ass <- apply(mixture(best), 1, which.max) 93 | ``` 94 | 95 | Contribution of each taxonomic group to each component 96 | 97 | ```{r DMM6, fig.width=9, fig.heigth=6, out.width="400px", warning=FALSE, message=FALSE} 98 | for (k in seq(ncol(fitted(best)))) { 99 | d <- melt(fitted(best)) 100 | colnames(d) <- c("OTU", "cluster", "value") 101 | d <- subset(d, cluster == k) %>% 102 | # Arrange OTUs by assignment strength 103 | arrange(value) %>% 104 | mutate(OTU = factor(OTU, levels = unique(OTU))) %>% 105 | # Only show the most important drivers 106 | filter(abs(value) > quantile(abs(value), 0.8)) 107 | 108 | p <- ggplot(d, aes(x = OTU, y = value)) + 109 | geom_bar(stat = "identity") + 110 | coord_flip() + 111 | labs(title = paste("Top drivers: community type", k)) 112 | print(p) 113 | } 114 | ``` 115 | 116 | -------------------------------------------------------------------------------- /DOC.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Dissimilarity-Overlap analysis" 3 | author: "Leo Lahti " 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | BiocStyle::html_document: 8 | number_sections: no 9 | toc: yes 10 | toc_depth: 4 11 | toc_float: true 12 | self_contained: true 13 | thumbnails: true 14 | lightbox: true 15 | gallery: true 16 | use_bookdown: false 17 | highlight: haddock 18 | 19 | --- 20 | 26 | 27 | ```{r setup, message = FALSE, warning = FALSE, results = 'hide'} 28 | #library("devtools") 29 | #install_github("microbiome/microbiome") 30 | ``` 31 | 32 | # Dissimilarity-Overlap analysis 33 | 34 | This page provides an example of running the dissimilarity-overlap analysis as proposed in Bashan, A., Gibson, T., Friedman, J. et al. Universality of human microbial dynamics. Nature 534, 259–262 (2016). [DOI:10.1038/nature18301](https://doi.org/10.1038/nature18301). 35 | 36 | 37 | Load example data: 38 | 39 | ```{r doc_data, warning=FALSE, message=FALSE} 40 | # Dissimilarity-Overlap analysis 41 | library(microbiome) 42 | data(atlas1006) 43 | ``` 44 | 45 | Estimate the overlap and dissimilarity quantities: 46 | 47 | ```{r doc_quantities, warning=FALSE, message=FALSE} 48 | # Dissimilarity 49 | d <- phyloseq::distance(microbiome::transform(atlas1006, "compositional"), "jsd", parallel=TRUE) 50 | 51 | # Overlap 52 | o <- overlap(atlas1006, detection = 0.2/100) 53 | ``` 54 | 55 | ```{r doc_compare, warning=FALSE, message=FALSE} 56 | # Compare 57 | dvec <- d[lower.tri(d)] 58 | ovec <- o[lower.tri(o)] 59 | 60 | # Assess rough correlation 61 | cc <- cor(dvec, ovec, method = "spearman", use = "pairwise.complete.obs") 62 | 63 | # Scatterplot 64 | plot(dvec, ovec, pch = ".", main = paste("Spearman rho", round(cc, 2)), las = 1, xlab = "Dissimilarity (Jensen-Shannon)", ylab = "Overlap") 65 | 66 | #p <- data.frame(D = dvec, O = ovec) %>% 67 | # ggplot(aes(x = D, y = O)) + 68 | # geom_hex() 69 | ``` 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /Data.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microbiome/tutorials/4c9c5eb321358c97369b1d2325ff81906a14ac0b/Data.R -------------------------------------------------------------------------------- /Data.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Data" 3 | author: "Leo Lahti, Sudarshan Shetty et al." 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | BiocStyle::html_document: 8 | number_sections: no 9 | toc: yes 10 | toc_depth: 4 11 | toc_float: true 12 | self_contained: true 13 | thumbnails: true 14 | lightbox: true 15 | gallery: true 16 | use_bookdown: false 17 | highlight: haddock 18 | --- 19 | 25 | 26 | 27 | ## Importing microbiome data in R 28 | 29 | 30 | ### Importing standard formats (CSV, Mothur, BIOM) 31 | 32 | The microbiome package has import functions for certain standard data formats for 16S profiling (Simple CSV, Mothur, biom). For details, see the function help. To import these, use: 33 | 34 | ```{r read_phyloseq, eval=FALSE} 35 | # Import output CSV files generated by write_phyloseq 36 | pseq1 <- read_phyloseq(otu.file, taxonomy.file, metadata.file, type = "simple") 37 | 38 | # Import mother .shared and .taxonomy and metadata files 39 | pseq2 <- read_phyloseq(otu.file, taxonomy.file, metadata.file, type = "mothur") 40 | 41 | # Import BIOM files 42 | pseq3 <- read_phyloseq(otu.file, taxonomy.file, metadata.file, type = "biom") 43 | ``` 44 | 45 | You can also use additional [import functions](http://joey711.github.io/phyloseq/import-data) from the independent phyloseq R package. 46 | 47 | 48 | ### Converting you own data to phyloseq format in R 49 | 50 | Alternatively, you can read your data in R (read.table, read.csv or other standard functions) and convert into phyloseq format. The procedure is well explained in the [phyloseq tutorial](http://joey711.github.io/phyloseq/import-data) from the independent phyloseq R package. See also examples on [manipulating](Preprocessing.html) for phyloseq objects. 51 | 52 | 53 | 54 | ## Microbiome example data sets 55 | 56 | ### Intestinal microbiota profiling of 1006 Western adults 57 | 58 | [The HITChip Atlas](Atlas.html) data set is available via the microbiome R package in phyloseq format, and via [Data Dryad](http://doi.org/10.5061/dryad.pk75d) in tabular format. This data set from [Lahti et al. Nat. Comm. 5:4344, 2014](http://www.nature.com/ncomms/2014/140708/ncomms5344/full/ncomms5344.html) comes with 130 genus-like taxonomic groups across 1006 western adults with no reported health complications. Some subjects have also short time series. Load the data in R with: 59 | 60 | ```{r atlasdata, warning=FALSE, message=FALSE} 61 | # Data citation doi: 10.1038/ncomms5344 62 | library(microbiome) 63 | data(atlas1006) 64 | print(atlas1006) 65 | ``` 66 | 67 | 68 | ### Diet swap between Rural and Western populations 69 | 70 | A two-week diet swap study between western (USA) and traditional 71 | (rural Africa) diets, reported in [O'Keefe et al. Nat. Comm. 6:6342, 72 | 2015](http://dx.doi.org/10.1038/ncomms7342). The data is also 73 | available for download from [Data 74 | Dryad](http://dx.doi.org/10.5061/dryad.1mn1n). Load in R with: 75 | 76 | ```{r dietswap2} 77 | # Data citation doi: 10.1038/ncomms7342 78 | data(dietswap) 79 | print(dietswap) 80 | ``` 81 | 82 | 83 | ### Intestinal microbiota versus blood metabolites 84 | 85 | Data set from [Lahti et al. PeerJ 1:e32, 86 | 2013](https://doi.org/10.7717/peerj.32) characterizes associations 87 | between human intestinal microbiota and blood serum lipids. Note that 88 | this data set contains an additional data matrix of lipid 89 | species. Load the data in R with: 90 | 91 | ```{r peerj2} 92 | # Data citation doi: 10.7717/peerj.32 93 | data(peerj32) 94 | print(names(peerj32)) 95 | ``` 96 | -------------------------------------------------------------------------------- /Edu/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | -------------------------------------------------------------------------------- /Edu/extra.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Linear models" 3 | author: "`r Sys.Date()`" 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | rmdformats::readthedown: 8 | self_contained: true 9 | thumbnails: true 10 | lightbox: true 11 | gallery: true 12 | use_bookdown: false 13 | highlight: haddock 14 | --- 15 | 20 | 21 | 22 | ```{r, echo=FALSE, message=FALSE, warning=FALSE} 23 | # Handle citations 24 | require(knitcitations) 25 | require(bookdown) 26 | # cleanbib() 27 | # options("citation_format" = "pandoc") 28 | bib <- read.bibtex("bibliography.bib") 29 | #opts_chunk$set(fig.width=4, fig.height=3, par=TRUE, out.width='2in', fig.pos='H') 30 | library(knitr) 31 | knitr::opts_chunk$set(fig.path = "figure/", dev="CairoPNG") 32 | ``` 33 | 34 | 35 | # Other usefulf models and algorithms 36 | 37 | - envfit / bioenvfit 38 | - ordistep 39 | - mixed models 40 | 41 | 42 | # Unimodality vs. bi/multimodality 43 | 44 | - How this may affect the results in the above tests 45 | - How to quantify 46 | 47 | 48 | # Dirichlet Mixture Model (DMM) 49 | 50 | - Clustering 51 | - Nonparametric models 52 | 53 | 54 | -------------------------------------------------------------------------------- /Edu/lm.R: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Linear models" 3 | author: "`r Sys.Date()`" 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | rmdformats::readthedown: 8 | self_contained: true 9 | thumbnails: true 10 | lightbox: true 11 | gallery: true 12 | use_bookdown: false 13 | highlight: haddock 14 | --- 15 | 20 | 21 | 22 | ```{r, echo=FALSE, message=FALSE, warning=FALSE} 23 | # Handle citations 24 | require(knitcitations) 25 | require(bookdown) 26 | # cleanbib() 27 | # options("citation_format" = "pandoc") 28 | bib <- read.bibtex("bibliography.bib") 29 | #opts_chunk$set(fig.width=4, fig.height=3, par=TRUE, out.width='2in', fig.pos='H') 30 | library(knitr) 31 | knitr::opts_chunk$set(fig.path = "figure/", dev="CairoPNG") 32 | ``` 33 | 34 | 35 | # Linear models 36 | 37 | This section provides a hands-on introduction to linear and 38 | generalized linear models, including covariates. 39 | 40 | 41 | Load example data. 42 | 43 | ```{r lm1, warning=FALSE, message=FALSE} 44 | library(microbiome) 45 | data(dietswap) 46 | d <- dietswap 47 | 48 | # Pick microbial abundances for a given taxonomic group 49 | taxa <- "Dialister" 50 | ``` 51 | 52 | 53 | Construct a data.frame with the selected taxonomic group and grouping. 54 | 55 | ```{r lm2, warning=FALSE, message=FALSE} 56 | df <- data.frame(Abundance = abundances(d)[taxa,], 57 | Group = meta(d)$nationality, 58 | Log10_Abundance = log10(1 + df$Abundance) 59 | ) 60 | 61 | # Compare the groups with a linear model. 62 | # Use Log10 abundances 63 | res <- glm(Log10_Abundance ~ Group, data = df, family = "gaussian") 64 | ``` 65 | 66 | 67 | Investigate model coefficients 68 | 69 | ```{r lm_coefs, warning=FALSE, message=FALSE} 70 | print(summary(res)$coefficients) 71 | 72 | # The intercept equals to the mean in the first group 73 | # The group term equals to the difference between group means 74 | print(mean(subset(df, Group == "AAM")$Log10_Abundance)) 75 | print(mean(subset(df, Group == "AFR")$Log10_Abundance) - mean(subset(df, Group == "AAM")$Log10_Abundance)) 76 | ``` 77 | 78 | 79 | Significance with t-test assuming equal variances. 80 | 81 | ```{r lm_signif, warning=FALSE, message=FALSE} 82 | print(t.test(Log10_Abundance ~ Group, data = df, var.equal=TRUE)$p.value) 83 | ``` 84 | 85 | 86 | Now, using the linear model allows incorporation of additional variables, 87 | for instance potential confounders. 88 | 89 | ```{r lm_glm, warning=FALSE, message=FALSE} 90 | df$sex <- meta(d)$sex 91 | res <- glm(Log10_Abundance ~ Group + sex, data = df, family = "gaussian") 92 | ``` 93 | 94 | 95 | Generalized linear models 96 | 97 | The GLM consists of three elements: 98 | 1. A probability distribution (from exponential family) 99 | 1. A linear predictor η = Xβ . 100 | 1. A link function g such that E(Y) = μ = g−1(η). 101 | 102 | We use Poisson with (its natural) log-link. 103 | 104 | ```{r lm_glm2, warning=FALSE, message=FALSE} 105 | res <- glm(Abundance ~ Group, data = df, family = "poisson") 106 | print(summary(res)) 107 | ``` 108 | 109 | -------------------------------------------------------------------------------- /Edu/lm.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Linear models" 3 | author: "`r Sys.Date()`" 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | rmdformats::readthedown: 8 | self_contained: true 9 | thumbnails: true 10 | lightbox: true 11 | gallery: true 12 | use_bookdown: false 13 | highlight: haddock 14 | --- 15 | 20 | 21 | 22 | # Linear models 23 | 24 | This section provides a hands-on introduction to linear and 25 | generalized linear models, including covariates. 26 | 27 | 28 | Load example data. 29 | 30 | ```{r lm1, warning=FALSE, message=FALSE} 31 | library(microbiome) 32 | data(dietswap) 33 | d <- dietswap 34 | 35 | # Pick microbial abundances for a given taxonomic group 36 | taxa <- "Dialister" 37 | ``` 38 | 39 | 40 | Construct a data.frame with the selected taxonomic group and grouping. 41 | 42 | ```{r lm2, warning=FALSE, message=FALSE} 43 | df <- data.frame(Abundance = abundances(d)[taxa,], 44 | Group = meta(d)$nationality, 45 | Log10_Abundance = log10(1 + df$Abundance) 46 | ) 47 | 48 | # Compare the groups with a linear model. 49 | # Use Log10 abundances 50 | res <- glm(Log10_Abundance ~ Group, data = df, family = "gaussian") 51 | ``` 52 | 53 | 54 | Investigate model coefficients 55 | 56 | ```{r lm_coefs, warning=FALSE, message=FALSE} 57 | print(summary(res)$coefficients) 58 | 59 | # The intercept equals to the mean in the first group 60 | # The group term equals to the difference between group means 61 | print(mean(subset(df, Group == "AAM")$Log10_Abundance)) 62 | print(mean(subset(df, Group == "AFR")$Log10_Abundance) - mean(subset(df, Group == "AAM")$Log10_Abundance)) 63 | ``` 64 | 65 | 66 | Significance with t-test assuming equal variances. 67 | 68 | ```{r lm_signif, warning=FALSE, message=FALSE} 69 | print(t.test(Log10_Abundance ~ Group, data = df, var.equal=TRUE)$p.value) 70 | ``` 71 | 72 | 73 | Now, using the linear model allows incorporation of additional variables, 74 | for instance potential confounders. 75 | 76 | ```{r lm_glm, warning=FALSE, message=FALSE} 77 | df$sex <- meta(d)$sex 78 | res <- glm(Log10_Abundance ~ Group + sex, data = df, family = "gaussian") 79 | ``` 80 | 81 | 82 | Generalized linear models 83 | 84 | The GLM consists of three elements: 85 | 1. A probability distribution (from exponential family) 86 | 1. A linear predictor η = Xβ . 87 | 1. A link function g such that E(Y) = μ = g−1(η). 88 | 89 | We use Poisson with (its natural) log-link. 90 | 91 | ```{r lm_glm2, warning=FALSE, message=FALSE} 92 | res <- glm(Abundance ~ Group, data = df, family = "poisson") 93 | print(summary(res)) 94 | ``` 95 | 96 | -------------------------------------------------------------------------------- /Edu/main.R: -------------------------------------------------------------------------------- 1 | library(rmarkdown) 2 | render("all.Rmd") 3 | 4 | -------------------------------------------------------------------------------- /Edu/misc.R: -------------------------------------------------------------------------------- 1 | # Regression plot and model fit 2 | 3 | # Residuals of the fitted model 4 | 5 | # Offset terms 6 | -------------------------------------------------------------------------------- /Installation.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Installation of microbiome R tools" 3 | author: "Leo Lahti, Sudarshan Shetty et al." 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | BiocStyle::html_document: 8 | number_sections: no 9 | toc: yes 10 | toc_depth: 4 11 | toc_float: true 12 | self_contained: true 13 | thumbnails: true 14 | lightbox: true 15 | gallery: true 16 | use_bookdown: false 17 | highlight: haddock 18 | --- 19 | 25 | 26 | 27 | ## Installing R/RStudio 28 | 29 | **If you do not already have R/RStudio installed**, do as follows. 30 | 31 | 1. Install [R](http://www.r-project.org/) 32 | 1. Install [RStudio](http://rstudio.org) 33 | 1. With Windows, install also [RTools](http://cran.r-project.org/bin/windows/Rtools/) (version corresponding to your R version) 34 | 35 | 36 | ## Installing microbiome R package 37 | 38 | Open R and install the package. If the installation fails, ensure from 39 | the RStudio tools panel that you have access to the Bioconductor 40 | repository. 41 | 42 | ```{r installation, message=FALSE, warning=FALSE, eval=FALSE} 43 | library(BiocManager) 44 | BiocManager::install("microbiome") 45 | ``` 46 | 47 | Alternatively, to install the bleeding edge (potentially unstable) 48 | development version, run in R: 49 | 50 | ```{r microbiomeinstall, message=FALSE, warning=FALSE, eval=FALSE} 51 | library(devtools) # Load the devtools package 52 | install_github("microbiome/microbiome") # Install the package 53 | ``` 54 | 55 | ## Using the tools 56 | 57 | Once the package has been installed, load it in R 58 | 59 | ```{r loading, eval=TRUE, message=FALSE} 60 | library(microbiome) 61 | ``` 62 | 63 | For a brief overview, see the [package vignette](https://bioconductor.org/packages/devel/bioc/vignettes/microbiome/inst/doc/vignette.html) and the more extensive [on-line tutorial](http://microbiome.github.io/tutorials/). 64 | 65 | 66 | 67 | ## Further reading 68 | 69 | * [Rmarkdown tips](http://rmarkdown.rstudio.com/) 70 | * [R cheat sheets](http://devcheatsheet.com/tag/r/) 71 | * [Using Github with R and RStudio](http://www.molecularecologist.com/2013/11/using-github-with-r-and-rstudio/) 72 | * [Molecular ecologist's view on code sharing](http://www.molecularecologist.com/2013/08/want-to-share-your-code/) 73 | 74 | 75 | ## General instructions to install R packages 76 | 77 | Most R packages are maintained in CRAN, Bioconductor or Github. To 78 | install the package from each, use: 79 | 80 | ```{r generalinstall, message=FALSE, warning=FALSE, eval=FALSE} 81 | # Installing from Bioconductor 82 | #source("http://www.bioconductor.org/biocLite.R") 83 | BiocManager::install("MASS") 84 | 85 | # Installing from CRAN 86 | install.packages("sorvi") 87 | 88 | # Installing from Github 89 | library(devtools) 90 | install_github("antagomir/netresponse") 91 | ``` 92 | 93 | 94 | -------------------------------------------------------------------------------- /Interactive.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Interactive examples" 3 | author: "Leo Lahti, Sudarshan Shetty et al." 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | BiocStyle::html_document: 8 | number_sections: no 9 | toc: yes 10 | toc_depth: 4 11 | toc_float: true 12 | self_contained: true 13 | thumbnails: true 14 | lightbox: true 15 | gallery: true 16 | use_bookdown: false 17 | highlight: haddock 18 | --- 19 | 25 | 26 | 27 | ## Interactive plots 28 | 29 | ```{r interactive-example, warning=FALSE, message=FALSE, fig.height=5, eval = TRUE} 30 | library(ggplot2) 31 | library(rvg) 32 | library(ggiraph) 33 | library(microbiome) 34 | data("atlas1006") 35 | pseq <- atlas1006 36 | x <- microbiome::transform(atlas1006, "compositional") 37 | 38 | mytheme_main <- theme( panel.background = element_blank(), 39 | panel.grid.major = element_line(colour = "#dddddd"), 40 | axis.ticks = element_line(colour = "#dddddd") ) 41 | 42 | mytheme_map <- theme( 43 | panel.background = element_blank(), axis.title.x = element_blank(), 44 | axis.text = element_blank(), axis.line.x = element_blank(), 45 | axis.line.y = element_blank(), axis.title.y = element_blank(), 46 | axis.ticks.x = element_blank(), axis.ticks.y = element_blank() ) 47 | 48 | df <- as(sample_data(x), "data.frame") 49 | df$Dialister <- get_sample(x, "Dialister") 50 | df$Prevotella <- get_sample(x, "Prevotella melaninogenica et rel.") 51 | df$sample <- row.names(df) 52 | 53 | # geom_point_interactive example 54 | gg_point_1 <- ggplot(df, aes(x = Prevotella, y = Dialister, 55 | color = age, tooltip = sample) ) + 56 | geom_point_interactive(size=3) 57 | 58 | # htmlwidget call 59 | ggiraph(code = {print(gg_point_1 + mytheme_main)}) 60 | ``` 61 | 62 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2011-2014 2 | COPYRIGHT HOLDER: Leo Lahti, Jarkko Salojarvi 3 | -------------------------------------------------------------------------------- /Misc/CompositionAmplicondata.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Microbiome composition" 3 | author: "Leo Lahti, Sudarshan Shetty et al." 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | BiocStyle::html_document: 8 | number_sections: no 9 | toc: yes 10 | toc_depth: 4 11 | toc_float: true 12 | self_contained: true 13 | thumbnails: true 14 | lightbox: true 15 | gallery: true 16 | use_bookdown: false 17 | highlight: haddock 18 | --- 19 | 25 | 26 | 27 | Also see [phyloseq barplot examples](http://joey711.github.io/phyloseq/plot_bar-examples.html). 28 | 29 | Read example data from a [diet swap study](http://dx.doi.org/10.1038/ncomms7342): 30 | 31 | ```{r composition-example1} 32 | # Example data 33 | library(microbiome) 34 | library(dplyr) 35 | data(dietswap) 36 | 37 | # Just use prevalent taxa to speed up examples 38 | # (not absolute counts used in this example) 39 | pseq <- core(dietswap, detection = 8^2, prevalence = 90/100) 40 | 41 | # Pick sample subset 42 | library(phyloseq) 43 | pseq2 <- subset_samples(pseq, group == "DI" & nationality == "AFR" & timepoint.within.group == 1) 44 | ``` 45 | 46 | ### Composition barplots 47 | 48 | Same with compositional (relative) abundances; for each sample (left), or averafged by group (right). 49 | 50 | ```{r composition-example4b, fig.width=12, fig.height=5, out.width="400px", fig.show="hold", warning=FALSE, message=FALSE} 51 | # Try another theme 52 | # from https://github.com/hrbrmstr/hrbrthemes 53 | library(hrbrthemes) 54 | library(gcookbook) 55 | library(tidyverse) 56 | 57 | # Limit the analysis on core taxa and specific sample group 58 | p <- plot_composition(pseq2, 59 | taxonomic.level = "OTU", 60 | sample.sort = "nationality", 61 | x.label = "nationality") + 62 | guides(fill = guide_legend(ncol = 1)) + 63 | scale_y_percent() + 64 | labs(x = "Samples", y = "Relative abundance (%)", 65 | title = "Relative abundance data", 66 | subtitle = "Subtitle", 67 | caption = "Caption text.") + 68 | theme_ipsum(grid="Y") 69 | print(p) 70 | 71 | # Averaged by group 72 | p <- plot_composition(pseq2, 73 | average_by = "bmi_group", transform = "compositional") 74 | print(p) 75 | ``` 76 | 77 | 78 | 79 | ### Composition heatmaps 80 | 81 | 82 | Heatmap for CLR-transformed abundances, with samples and OTUs sorted 83 | with the neatmap method: 84 | 85 | ```{r composition-example7, fig.width=10, fig.height=4, eval=FALSE} 86 | tmp <- plot_composition(pseq2, plot.type = "heatmap", transform = "compositional", 87 | sample.sort = "neatmap", otu.sort = "neatmap", mar = c(6, 13, 1, 1)) 88 | ``` 89 | 90 | 91 | 92 | ### Plot taxa prevalence 93 | 94 | This function 95 | allows you to have an overview of OTU prevalences alongwith their 96 | taxonomic affiliations. This will aid in checking if you filter OTUs 97 | based on prevalence, then what taxonomic affliations will be lost. 98 | 99 | ```{r plot_prev, fig.height=6, fig.width=8, dev="CairoPNG"} 100 | data(atlas1006) 101 | 102 | # Use sample and taxa subset to speed up example 103 | p0 <- subset_samples(atlas1006, DNA_extraction_method == "r") 104 | 105 | # Define detection and prevalence thresholds to filter out rare taxa 106 | p0 <- core(p0, detection = 10, prevalence = 0) 107 | 108 | # For the available taxonomic levels 109 | plot_taxa_prevalence(p0, "Phylum", detection = 10) 110 | ``` 111 | 112 | ### Amplicon data 113 | 114 | See further examples on [community composition for amplicon data](CompositionAmplicondata2.html). 115 | 116 | -------------------------------------------------------------------------------- /Misc/Probelevel.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Probe-level analysis for phylogenetic microarrays" 3 | author: "Leo Lahti, Sudarshan Shetty et al. `r Sys.Date()`" 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | BiocStyle::html_document: 8 | number_sections: no 9 | toc: yes 10 | toc_depth: 4 11 | toc_float: true 12 | self_contained: true 13 | thumbnails: true 14 | lightbox: true 15 | gallery: true 16 | use_bookdown: false 17 | highlight: haddock 18 | --- 19 | 25 | 26 | 27 | ## Probe summarization 28 | 29 | Summarize (preprocessed) oligo-level data into phylotype level; examples with simulated data; see [read_hitchip](reading) to use your own data. We use and recommend the [Robust Probabilistic Averaging (RPA)](https://github.com/antagomir/RPA/wiki) for probe summarization. 30 | 31 | 32 | ```{r probelevel-ex, eval=FALSE} 33 | library(microbiome) 34 | 35 | library(HITChipDB) 36 | data.directory <- system.file("extdata", package = "microbiome") 37 | 38 | # Read oligo-level data (here: simulated example data) 39 | probedata <- HITChipDB::read_hitchip(data.directory, method = "frpa")$probedata 40 | 41 | # Read phylogeny map 42 | # NOTE: use phylogeny.filtered for species/L1/L2 summarization 43 | # Load taxonomy from output directory 44 | f <- system.file("inst/extdata/get_hitchip_taxonomy.R", package = "microbiome") 45 | source(f) 46 | taxonomy <- get_hitchip_taxonomy("HITChip", "filtered") 47 | 48 | # Summarize oligos into higher level phylotypes 49 | dat <- RPA::summarize_probedata( 50 | probedata = probedata, 51 | taxonomy = taxonomy, 52 | method = "rpa", 53 | level = "species") 54 | ``` 55 | 56 | 57 | ## Retrieve probe-level data 58 | 59 | Get probes for each probeset: 60 | 61 | ```{r probelevel-ex2, eval=FALSE} 62 | sets <- RPA::retrieve.probesets(taxonomy, level = "species", name = NULL) 63 | ``` 64 | 65 | 66 | Get probeset data matrix/matrices: 67 | 68 | ```{r probelevel-ex3, eval=FALSE} 69 | set <- RPA::get.probeset("Actinomyces naeslundii", "species", 70 | taxonomy, probedata, log10 = TRUE) 71 | ``` 72 | 73 | 74 | -------------------------------------------------------------------------------- /Misc/Visualization.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Visualization" 3 | author: "Leo Lahti, Sudarshan Shetty et al. `r Sys.Date()`" 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | BiocStyle::html_document: 8 | number_sections: no 9 | toc: yes 10 | toc_depth: 4 11 | toc_float: true 12 | self_contained: true 13 | thumbnails: true 14 | lightbox: true 15 | gallery: true 16 | use_bookdown: false 17 | highlight: haddock 18 | --- 19 | 25 | 26 | 27 | # Experimental functions 28 | 29 | 30 | **Time series for individual subjects** 31 | 32 | ```{r homogeneity-timeseries, message=FALSE, warning=FALSE, fig.height=5, fig.height=5, eval=FALSE} 33 | source(system.file("extdata/plot_longitudinal.R", package = "microbiome")) 34 | p <- plot_longitudinal(pseq, "Dialister", subject = "831", tipping.point = 0.5) 35 | print(p) 36 | ``` 37 | -------------------------------------------------------------------------------- /Misc/info.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Rendering statistics" 3 | author: "`r Sys.Date()`" 4 | output: 5 | BiocStyle::html_document: 6 | number_sections: no 7 | toc: yes 8 | toc_depth: 4 9 | toc_float: true 10 | self_contained: true 11 | thumbnails: true 12 | lightbox: true 13 | gallery: true 14 | use_bookdown: false 15 | highlight: haddock 16 | --- 17 | 23 | 24 | 25 | ## Execution times 26 | 27 | This visualization shows the rendering times for the tutorial pages for possible optimization purposes. 28 | 29 | ```{r exectime, echo=FALSE, fig.width=20, fig.height=20, warning=FALSE, message=FALSE, out.width="100%"} 30 | library(knitr) 31 | theme_set(theme_bw(40)) 32 | df <- as.data.frame(t(sapply(times, identity))) 33 | df$file <- rownames(df) 34 | dfm <- as_tibble(melt(df, id = "file")) %>% 35 | filter(variable %in% c("user.self", "elapsed")) %>% 36 | arrange(value) %>% 37 | mutate(file = factor(file, levels = unique(file))) 38 | p <- ggplot(dfm, aes(x = file, fill = variable, y = value)) + 39 | geom_bar(stat = "identity", position = "dodge", color = "black") + 40 | coord_flip() + labs(x = "", y = "Time (s)") 41 | print(p) 42 | ``` 43 | 44 | 45 | -------------------------------------------------------------------------------- /Mixedmodels.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Comparisons of microbiome community composition" 3 | author: "Leo Lahti, Sudarshan Shetty et al." 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | BiocStyle::html_document: 8 | number_sections: no 9 | toc: yes 10 | toc_depth: 4 11 | toc_float: true 12 | self_contained: true 13 | thumbnails: true 14 | lightbox: true 15 | gallery: true 16 | use_bookdown: false 17 | highlight: haddock 18 | --- 19 | 25 | 26 | 27 | ## Mixed models for univariate comparisons 28 | 29 | 30 | Load example data: 31 | 32 | ```{r boxplot-example_w, warning=FALSE, message=FALSE} 33 | # Load libraries 34 | library(microbiome) 35 | library(ggplot2) 36 | library(dplyr) 37 | library(IRanges) 38 | 39 | # Probiotics intervention example data 40 | data(peerj32) # Source: https://peerj.com/articles/32/ 41 | pseq <- peerj32$phyloseq # Rename the example data 42 | ``` 43 | 44 | 45 | Abundance boxplot 46 | 47 | ```{r boxplot2_w, warning=FALSE, message=FALSE} 48 | p <- boxplot_abundance(pseq, x = "time", y = "Akkermansia", line = "subject") + 49 | scale_y_log10() 50 | print(p) 51 | ``` 52 | 53 | 54 | ## Linear model comparison with random effect subject term 55 | 56 | Test individual taxonomic group 57 | 58 | ```{r comparisons-lmer_w, message=FALSE, warning=FALSE} 59 | # Get sample metadata 60 | dfs <- meta(pseq) 61 | 62 | # Add abundance as the signal to model 63 | dfs$signal <- abundances(pseq)["Akkermansia", rownames(dfs)] 64 | 65 | # Paired comparison 66 | # with fixed group effect and random subject effect 67 | library(lme4) 68 | out <- lmer(signal ~ group + (1|subject), data = dfs) 69 | out0 <- lmer(signal ~ (1|subject), data = dfs) 70 | comp <- anova(out0, out) 71 | pv <- comp[["Pr(>Chisq)"]][[2]] 72 | print(pv) 73 | ``` 74 | 75 | -------------------------------------------------------------------------------- /MovingPictures_MicrobiomeExperiment.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microbiome/tutorials/4c9c5eb321358c97369b1d2325ff81906a14ac0b/MovingPictures_MicrobiomeExperiment.rds -------------------------------------------------------------------------------- /Negativebinomial.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Negative binomial" 3 | author: "Leo Lahti, Sudarshan Shetty et al." 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | BiocStyle::html_document: 8 | number_sections: no 9 | toc: yes 10 | toc_depth: 4 11 | toc_float: true 12 | self_contained: true 13 | thumbnails: true 14 | lightbox: true 15 | gallery: true 16 | use_bookdown: false 17 | highlight: haddock 18 | --- 19 | 25 | 26 | 27 | ## Group-wise comparisons with negative binomial 28 | 29 | [Read more on negative binomials](http://www.ats.ucla.edu/stat/r/dae/nbreg.htm) 30 | 31 | Load example data: 32 | 33 | ```{r boxplot-example, warning=FALSE, message=FALSE} 34 | # Load libraries 35 | library(microbiome) 36 | library(ggplot2) 37 | library(dplyr) 38 | 39 | # Probiotics intervention example data 40 | data(peerj32) # Source: https://peerj.com/articles/32/ 41 | pseq <- peerj32$phyloseq # Rename the example data 42 | ``` 43 | 44 | 45 | Visually compare Akkermansia abundance between time point 1 and 2 46 | 47 | ```{r boxplot2, warning=FALSE, message=FALSE} 48 | p <- boxplot_abundance(pseq, x = "time", y = "Akkermansia", line = "subject") + scale_y_log10() 49 | print(p) 50 | ``` 51 | 52 | Test statistical significance with negative binomial: 53 | 54 | ```{r comparisons2b, message=FALSE, error=FALSE, warning=FALSE} 55 | library(MASS) 56 | library(tidyr) 57 | 58 | # Analyse specific taxa 59 | tax <- "Akkermansia" 60 | 61 | # Pick the signal (abundance) for this tax 62 | sample_data(pseq)$signal <- get_sample(pseq, tax) 63 | 64 | # Negative binomial test with group and gender included 65 | res <- glm.nb(signal ~ group + sex, data = meta(pseq)) 66 | 67 | # Show the results 68 | print(coef(summary(res))) 69 | ``` 70 | 71 | 72 | -------------------------------------------------------------------------------- /Networks.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Taxonomic network visualization" 3 | author: "Leo Lahti, Sudarshan Shetty et al." 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | BiocStyle::html_document: 8 | number_sections: no 9 | toc: yes 10 | toc_depth: 4 11 | toc_float: true 12 | self_contained: true 13 | thumbnails: true 14 | lightbox: true 15 | gallery: true 16 | use_bookdown: false 17 | highlight: haddock 18 | --- 19 | 25 | 26 | 27 | Load [example data](Data.html): 28 | 29 | ```{r networks1, message=FALSE, warning=FALSE} 30 | library(microbiome) 31 | data(dietswap) 32 | pseq <- dietswap 33 | 34 | # Keep only the prevalent taxa to speed up examples 35 | pseq <- core(pseq, detection = 5^2, prevalence = 80/100) 36 | pseq <- subset_samples(pseq, nationality == "AFR" & group == "DI" & bmi_group == "lean") 37 | ``` 38 | 39 | 40 | ### Taxonomic network reconstruction 41 | 42 | See the [phyloseq tutorial](http://joey711.github.io/phyloseq/plot_network-examples) for 43 | additional network visualization tools. 44 | 45 | The widely reported compositionality bias in similarity measures can 46 | be fixed with SpiecEasi or SparCC; the implementations are available 47 | via the [SpiecEasi package](https://github.com/zdk123/SpiecEasi). Note 48 | that the execution is slow. 49 | 50 | ```{r networks4, warning=FALSE, message=FALSE, fig.width=10, fig.height=10} 51 | # Pick the OTU table 52 | library(phyloseq) 53 | otu <- abundances(pseq) 54 | ``` 55 | 56 | 57 | ```{r spieceasi, warning=FALSE, message=FALSE, fig.width=10, fig.height=10, eval=TRUE} 58 | # SPIEC-EASI network reconstruction 59 | # In practice, use more repetitions 60 | library(devtools) 61 | #install_github("zdk123/SpiecEasi") 62 | library(SpiecEasi) #install_github("zdk123/SpiecEasi") 63 | net <- spiec.easi(t(otu), method='mb', lambda.min.ratio=1e-2, nlambda=5, icov.select.params=list(rep.num=1)) 64 | 65 | ## Create graph object 66 | n <- net$refit 67 | #colnames(n) <- rownames(n) <- rownames(otu) 68 | 69 | # Network format 70 | library(network) 71 | #netw <- network(as.matrix(n), directed = FALSE) 72 | 73 | # igraph format 74 | library(igraph) 75 | # ig <- graph.adjacency(n, mode='undirected', add.rownames = TRUE) 76 | 77 | # Network layout 78 | # coord <- layout.fruchterman.reingold(ig) 79 | 80 | ## set size of vertex to log2 mean abundance 81 | # vsize <- log2(rowMeans(otu)) 82 | 83 | # Visualize the network 84 | # print(plot(ig, layout = coord, vertex.size = vsize, vertex.label = names(vsize))) 85 | ``` 86 | 87 | 88 | Investigate degree distribution with the following: 89 | 90 | ```{r degree, warning=FALSE, message=FALSE, fig.width=10, fig.height=7, eval=FALSE} 91 | #dd <- degree.distribution(ig) 92 | #plot(0:(length(dd)-1), dd, ylim = c(0,.35), type = 'b', ylab = "Frequency", xlab = "Degree", main = "Degree Distributions") 93 | ``` 94 | 95 | 96 | Visualize the network with [ggnet2](https://briatte.github.io/ggnet): 97 | 98 | ```{r networks5, warning=FALSE, message=FALSE, fig.width=12, fig.height=7, eval = TRUE} 99 | library(GGally) 100 | #library(ggnet) 101 | library(network) 102 | library(sna) 103 | library(ggplot2) 104 | library(intergraph) # ggnet2 works also with igraph with this 105 | 106 | phyla <- map_levels(rownames(otu), 107 | from = "Genus", to = "Phylum", 108 | tax_table(pseq)) 109 | 110 | #netw %v% "Phylum" <- phyla 111 | #p <- ggnet2(netw, color = "Phylum", label = TRUE, label.size = 2) 112 | ``` 113 | 114 | 115 | ```{r networks5_plot, warning=FALSE, message=FALSE, fig.width=12, fig.height=7, eval = TRUE} 116 | print(p) 117 | ``` 118 | 119 | 120 | -------------------------------------------------------------------------------- /Ordination.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Ordination analysis" 3 | author: "Leo Lahti, Sudarshan Shetty et al." 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | BiocStyle::html_document: 8 | number_sections: no 9 | toc: yes 10 | toc_depth: 4 11 | toc_float: true 12 | self_contained: true 13 | thumbnails: true 14 | lightbox: true 15 | gallery: true 16 | use_bookdown: false 17 | highlight: haddock 18 | --- 19 | 25 | 26 | 27 | 28 | 29 | Full examples for standard ordination techniques applied to phyloseq data, based on the [phyloseq ordination tutorial](http://joey711.github.io/phyloseq/plot_ordination-examples.html). For handy wrappers for some common ordination tasks in microbiome analysis, see [landscaping examples](Landscaping.html) 30 | 31 | 32 | Load example data: 33 | 34 | ```{r ordination1, message=FALSE, warning=FALSE, eval=TRUE} 35 | library(microbiome) 36 | library(phyloseq) 37 | library(ggplot2) 38 | data(dietswap) 39 | pseq <- dietswap 40 | 41 | # Convert to compositional data 42 | pseq.rel <- microbiome::transform(pseq, "compositional") 43 | 44 | # Pick core taxa with with the given prevalence and detection limits 45 | pseq.core <- core(pseq.rel, detection = .1/100, prevalence = 90/100) 46 | 47 | # Use relative abundances for the core 48 | pseq.core <- microbiome::transform(pseq.core, "compositional") 49 | ``` 50 | 51 | 52 | ## Sample ordination 53 | 54 | Project the samples with the given method and dissimilarity measure. 55 | 56 | ```{r ordination2, message=FALSE, warning=FALSE, results="hide"} 57 | # Ordinate the data 58 | set.seed(4235421) 59 | # proj <- get_ordination(pseq, "MDS", "bray") 60 | ord <- ordinate(pseq, "MDS", "bray") 61 | ``` 62 | 63 | 64 | ## Multidimensional scaling (MDS / PCoA) 65 | 66 | ```{r ordination-ordinate23, warning=FALSE, message=FALSE, fig.width=8, fig.height=6, fig.show="hold", out.width="400px"} 67 | plot_ordination(pseq, ord, color = "nationality") + 68 | geom_point(size = 5) 69 | ``` 70 | 71 | 72 | ## Canonical correspondence analysis (CCA) 73 | 74 | ```{r ordination-ordinate24a, warning=FALSE, message=FALSE, fig.width=8, fig.height=6, fig.show="hold", out.width="400px"} 75 | # With samples 76 | pseq.cca <- ordinate(pseq, "CCA") 77 | p <- plot_ordination(pseq, pseq.cca, 78 | type = "samples", color = "nationality") 79 | p <- p + geom_point(size = 4) 80 | print(p) 81 | 82 | # With taxa: 83 | p <- plot_ordination(pseq, pseq.cca, 84 | type = "taxa", color = "Phylum") 85 | p <- p + geom_point(size = 4) 86 | print(p) 87 | ``` 88 | 89 | 90 | ## Split plot 91 | 92 | ```{r ordination-ordinate25, warning=FALSE, message=FALSE, fig.width=14, fig.height=5} 93 | plot_ordination(pseq, pseq.cca, 94 | type = "split", shape = "nationality", 95 | color = "Phylum", label = "nationality") 96 | ``` 97 | 98 | 99 | 100 | ## t-SNE 101 | 102 | t-SNE is a popular new ordination technique. 103 | 104 | ```{r tsne, warning=FALSE, message=FALSE, fig.width=14, fig.height=5} 105 | library(vegan) 106 | library(microbiome) 107 | library(Rtsne) # Load package 108 | set.seed(423542) 109 | 110 | method <- "tsne" 111 | trans <- "hellinger" 112 | distance <- "euclidean" 113 | 114 | # Distance matrix for samples 115 | ps <- microbiome::transform(pseq, trans) 116 | 117 | # Calculate sample similarities 118 | dm <- vegdist(otu_table(ps), distance) 119 | 120 | # Run TSNE 121 | tsne_out <- Rtsne(dm, dims = 2) 122 | proj <- tsne_out$Y 123 | rownames(proj) <- rownames(otu_table(ps)) 124 | 125 | library(ggplot2) 126 | p <- plot_landscape(proj, legend = T, size = 1) 127 | print(p) 128 | ``` 129 | 130 | -------------------------------------------------------------------------------- /Output.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Generating output files" 3 | author: "Leo Lahti, Sudarshan Shetty et al." 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | BiocStyle::html_document: 8 | number_sections: no 9 | toc: yes 10 | toc_depth: 4 11 | toc_float: true 12 | self_contained: true 13 | thumbnails: true 14 | lightbox: true 15 | gallery: true 16 | use_bookdown: false 17 | highlight: haddock 18 | 19 | --- 20 | 26 | 27 | 28 | ## Writing diversity table into file 29 | 30 | ```{r diversity-example4, warning=FALSE, eval = FALSE} 31 | output.dir <- "./" 32 | write.table(div.table, file = "DiversityTable.tab", sep = "\t") 33 | ``` 34 | 35 | ## Save clustering image to a file 36 | 37 | Save in PDF: 38 | 39 | ```{r clustering-saving, message=FALSE, warning=FALSE, eval=FALSE} 40 | pdf("myplot.pdf", width = 7, height = 7 * length(hc$order)/20) 41 | plot(hc, hang=-1, main = "Hierarchical clustering") 42 | dev.off() 43 | ``` 44 | 45 | Save in TIFF: 46 | 47 | ```{r clustering-saving2, message=FALSE, warning=FALSE, eval=FALSE} 48 | tiff("myplot.tif", width = 480, height = 480 * length(hc$order)/20) 49 | plot(hc, hang=-1, main = "Hierarchical clustering") 50 | dev.off() 51 | ``` 52 | 53 | To save in Microsoft EMF format, try the following. If you find a 54 | way to tune figure width for emf files kindly let the admins know. 55 | 56 | ```{r clustering-saving3, message=FALSE, warning=FALSE, eval=FALSE} 57 | plot(hc, hang=-1, main = "Hierarchical clustering") 58 | savePlot("myplot.emf", type = "emf") 59 | dev.off() 60 | ``` 61 | -------------------------------------------------------------------------------- /PERMANOVA.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Group-wise comparisons of microbiome composition" 3 | author: "Leo Lahti, Sudarshan Shetty et al." 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | BiocStyle::html_document: 8 | number_sections: no 9 | toc: yes 10 | toc_depth: 4 11 | toc_float: true 12 | self_contained: true 13 | thumbnails: true 14 | lightbox: true 15 | gallery: true 16 | use_bookdown: false 17 | highlight: haddock 18 | --- 19 | 25 | 26 | 27 | ## PERMANOVA for community-level multivariate comparisons 28 | 29 | PERMANOVA quantifies multivariate community-level differences between 30 | groups. 31 | 32 | 33 | Load example data: 34 | 35 | ```{r boxplot-example, warning=FALSE, message=FALSE} 36 | # Load libraries 37 | library(microbiome) 38 | library(ggplot2) 39 | library(dplyr) 40 | 41 | # Probiotics intervention example data 42 | data(peerj32) # Source: https://peerj.com/articles/32/ 43 | pseq <- peerj32$phyloseq # Rename the example data 44 | 45 | # Pick relative abundances (compositional) and sample metadata 46 | pseq.rel <- microbiome::transform(pseq, "compositional") 47 | otu <- abundances(pseq.rel) 48 | meta <- meta(pseq.rel) 49 | ``` 50 | 51 | 52 | ## Visualize microbiome variation 53 | 54 | Visualize the population density and highlight sample groups (probiotic treatment LGG vs Placebo): 55 | 56 | ```{r comparisons_permanova_visu, error=FALSE, message=FALSE, warnings=FALSE} 57 | p <- plot_landscape(pseq.rel, method = "NMDS", distance = "bray", col = "group", size = 3) 58 | print(p) 59 | ``` 60 | 61 | 62 | ## PERMANOVA significance test for group-level differences 63 | 64 | Now let us evaluate whether the group (probiotics vs. placebo) has a 65 | significant effect on overall gut microbiota composition. Perform PERMANOVA: 66 | 67 | ```{r comparisons_permanova_analyse, message=FALSE, warnings=FALSE} 68 | # samples x species as input 69 | library(vegan) 70 | permanova <- adonis(t(otu) ~ group, 71 | data = meta, permutations=99, method = "bray") 72 | 73 | # P-value 74 | print(as.data.frame(permanova$aov.tab)["group", "Pr(>F)"]) 75 | ``` 76 | 77 | 78 | ## Checking the homogeneity condition 79 | 80 | Check that variance homogeneity assumptions hold (to ensure the reliability of the results): 81 | 82 | 83 | ```{r comparisons-permanova2, message=FALSE, warnings=FALSE} 84 | # Note the assumption of similar multivariate spread among the groups 85 | # ie. analogous to variance homogeneity 86 | # Here the groups have signif. different spreads and 87 | # permanova result may be potentially explained by that. 88 | dist <- vegdist(t(otu)) 89 | anova(betadisper(dist, meta$group)) 90 | ``` 91 | 92 | ## Investigate the top factors 93 | 94 | Show coefficients for the top taxa separating the groups 95 | 96 | ```{r permanova_top, fig.width=5, fig.height=5, message=FALSE, error=FALSE, warnings=FALSE} 97 | coef <- coefficients(permanova)["group1",] 98 | top.coef <- coef[rev(order(abs(coef)))[1:20]] 99 | par(mar = c(3, 14, 2, 1)) 100 | barplot(sort(top.coef), horiz = T, las = 1, main = "Top taxa") 101 | ``` 102 | 103 | -------------------------------------------------------------------------------- /PlotDiversity.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Plot alpha diversity" 3 | author: "Leo Lahti, Sudarshan Shetty et al." 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | BiocStyle::html_document: 8 | number_sections: no 9 | toc: yes 10 | toc_depth: 4 11 | toc_float: true 12 | self_contained: true 13 | thumbnails: true 14 | lightbox: true 15 | gallery: true 16 | use_bookdown: false 17 | highlight: haddock 18 | --- 19 | 25 | 26 | ## Diversity plots 27 | 28 | Using the `alpha` function in `microbiome R packge` you can calculate a wide variaty of diversity indices. Comparison and visualising group based differecences or similarities is also important. Here, we show steps from calculating diversity indices using microbiome R package and visualising the differences and/or similarities between groups. A useful R package is [ggpubr](http://www.sthda.com/english/rpkgs/ggpubr/). If you have not installed it, please install it. 29 | 30 | Load libraries and data. 31 | 32 | ```{r Ad-plot, warning=FALSE, message=FALSE} 33 | 34 | library(microbiome) 35 | library(ggpubr) 36 | library(knitr) 37 | library(dplyr) 38 | 39 | data(dietswap) 40 | pseq <- dietswap 41 | ``` 42 | 43 | 44 | ## Alpha diversity 45 | 46 | This returns a table with selected diversity indicators. Check a separate page on [Alpha](Diversity.html) for other functions. 47 | 48 | ```{r Ad-plot1, warning=FALSE, message=FALSE, results="asis"} 49 | 50 | ps1 <- prune_taxa(taxa_sums(pseq) > 0, pseq) 51 | 52 | tab <- microbiome::alpha(ps1, index = "all") 53 | kable(head(tab)) 54 | 55 | ``` 56 | 57 | 58 | ## Prepare data for vizualisation 59 | Now, get the metadata (sample_data) from the `phyloseq` object 60 | 61 | ```{r Ad-plot2, warning=FALSE, message=FALSE, results="asis"} 62 | 63 | ps1.meta <- meta(ps1) 64 | kable(head(ps1.meta)) 65 | 66 | ``` 67 | 68 | Add the diversity table to metadata 69 | 70 | ```{r Ad-plot3, warning=FALSE, message=FALSE} 71 | 72 | ps1.meta$Shannon <- tab$diversity_shannon 73 | ps1.meta$InverseSimpson <- tab$diversity_inverse_simpson 74 | ``` 75 | 76 | Let's say we want to compare differences in Shannon index between bmi group of the study subjects. 77 | 78 | ```{r Ad-plot4, warning=FALSE, message=FALSE} 79 | 80 | # create a list of pairwise comaprisons 81 | bmi <- levels(ps1.meta$bmi_group) # get the variables 82 | 83 | # make a pairwise list that we want to compare. 84 | bmi.pairs <- combn(seq_along(bmi), 2, simplify = FALSE, FUN = function(i)bmi[i]) 85 | 86 | print(bmi.pairs) 87 | ``` 88 | 89 | Sometimes that variables can be stored as characters and sometime as factors. In such a senario `levels()` may return an empty vector. A work around for this can be found [here](https://github.com/microbiome/microbiome/issues/143). 90 | 91 | 92 | ## Violin plot 93 | 94 | Using `ggpubr` a violin plot will be created 95 | 96 | ```{r Ad-plot5, warning=FALSE, message=FALSE, fig.width=8, fig.height=6, eval=FALSE} 97 | #ps1.meta$'' <- alpha(ps1, index = 'shannon') 98 | p1 <- ggviolin(ps1.meta, x = "bmi_group", y = "Shannon", 99 | add = "boxplot", fill = "bmi_group", palette = c("#a6cee3", "#b2df8a", "#fdbf6f")) 100 | print(p1) 101 | ``` 102 | ![violin plot](Rplotvio1.jpeg) 103 | 104 | ## Statistics 105 | 106 | Pairwise comparision using non-parametric test (Wilcoxon test). 107 | 108 | ```{r Ad-plot6, warning=FALSE, message=FALSE, eval=FALSE} 109 | p1 <- p1 + stat_compare_means(comparisons = bmi.pairs) 110 | print(p1) 111 | ``` 112 | ![violin for comparison](Rplotvio2.jpeg) 113 | 114 | 115 | For more information and useful tips and suggestions check the [Statistical tools for high-throughput data analysis](http://www.sthda.com/english/rpkgs/ggpubr/). 116 | 117 | -------------------------------------------------------------------------------- /RDA.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "RDA" 3 | author: "Leo Lahti, Sudarshan Shetty et al." 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | BiocStyle::html_document: 8 | number_sections: no 9 | toc: yes 10 | toc_depth: 4 11 | toc_float: true 12 | self_contained: true 13 | thumbnails: true 14 | lightbox: true 15 | gallery: true 16 | use_bookdown: false 17 | highlight: haddock 18 | --- 19 | 25 | 26 | 27 | ## RDA analysis and visualization. 28 | 29 | NOTE: These functions have unresolved issues and many 30 | dependencies. They will require thorough revision before inclusion to 31 | the package is possible. 32 | 33 | Load the package and example data: 34 | 35 | ```{r rda, warning=FALSE, message=FALSE, eval=FALSE} 36 | library(microbiome) 37 | data(peerj32) # Data from https://peerj.com/articles/32/ 38 | pseq <- peerj32$phyloseq # phyloseq data 39 | 40 | # Only check the core taxa to speed up examples 41 | pseq <- core(pseq, detection = 10^2, prevalence = 95/100) 42 | 43 | pseq.trans <- transform(pseq, "hell") # Hellinger transform 44 | ``` 45 | 46 | 47 | ## Bagged RDA 48 | 49 | Bagged RDA provides added robustness in the analysis compared to the standard RDA. Fit bagged (bootstrap aggregated) RDA on a phyloseq object (alternatively you could apply it to the abundance matrix and covariates directly): 50 | 51 | ```{r rda5, warning=FALSE, message=FALSE, eval=FALSE} 52 | # In any real study, use bs.iter = 100 or higher 53 | # to achieve meaningful benefits from the bagged version. 54 | # In this example we use bs.iter = 2 just to speed up the 55 | # example code for educational purposes 56 | res <- rda_bagged(pseq.trans, "group", bs.iter=2) 57 | ``` 58 | 59 | Visualizing bagged RDA: 60 | 61 | ```{r rda6, warning=FALSE, message=FALSE, fig.width=8, fig.height=8, eval=FALSE} 62 | plot_rda_bagged(res) 63 | ``` 64 | 65 | 66 | 67 | ## Standard RDA 68 | 69 | Standard RDA for microbiota profiles versus the given (here 'time') 70 | variable from sample metadata (see also the RDA method in 71 | phyloseq::ordinate) 72 | 73 | ```{r rda2, warning=FALSE, message=FALSE, eval=FALSE} 74 | x <- pseq.trans 75 | otu <- abundances(x) 76 | metadata <- meta(x) 77 | 78 | library(vegan) 79 | rda.result <- vegan::rda(t(otu) ~ factor(metadata$time), 80 | na.action = na.fail, scale = TRUE) 81 | ``` 82 | 83 | Proportion explained by the given factor 84 | 85 | ```{r rda2b, warning=FALSE, message=FALSE, eval=FALSE} 86 | summary(rda.result)$constr.chi/summary(rda.result)$tot.chi 87 | ``` 88 | 89 | 90 | ## RDA visualization 91 | 92 | Visualize the standard RDA output. 93 | 94 | ```{r rda4, warning=FALSE, message=FALSE, fig.width=8, fig.height=8, eval=FALSE} 95 | plot(rda.result, choices = c(1,2), type = "points", pch = 15, scaling = 3, cex = 0.7, col = metadata$time) 96 | points(rda.result, choices = c(1,2), pch = 15, scaling = 3, cex = 0.7, col = metadata$time) 97 | pl <- ordihull(rda.result, metadata$time, scaling = 3, label = TRUE) 98 | ``` 99 | 100 | 101 | ## RDA significance test 102 | 103 | ```{r rda2bc, warning=FALSE, message=FALSE, eval=FALSE} 104 | permutest(rda.result) 105 | ``` 106 | 107 | 108 | ## RDA with confounding variables 109 | 110 | For more complex RDA scenarios, use the standard RDA available via the 111 | vegan R package. 112 | 113 | ```{r rda3, warning=FALSE, message=FALSE, fig.width=8, fig.height=8, eval=FALSE} 114 | # Pick microbiota profiling data from the phyloseq object 115 | otu <- abundances(pseq.trans) 116 | 117 | # Sample annotations 118 | metadata <- meta(pseq.trans) 119 | 120 | # RDA with confounders using the vegan function 121 | rda.result2 <- vegan::rda(t(otu) ~ metadata$time + Condition(metadata$subject + metadata$gender)) 122 | ``` 123 | 124 | 125 | -------------------------------------------------------------------------------- /Regression.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Regression" 3 | author: "Leo Lahti, Sudarshan Shetty et al." 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | BiocStyle::html_document: 8 | number_sections: no 9 | toc: yes 10 | toc_depth: 4 11 | toc_float: true 12 | self_contained: true 13 | thumbnails: true 14 | lightbox: true 15 | gallery: true 16 | use_bookdown: false 17 | highlight: haddock 18 | --- 19 | 25 | 26 | 27 | ## Regression plots 28 | 29 | Regression curve with smoothed error bars based on 30 | the [Visually-Weighted Regression](http://www.fight-entropy.com/2012/07/visually-weighted-regression.html) by Solomon M. Hsiang and Felix Schonbrodt's original code](http://www.nicebread.de/visually-weighted-watercolor-plots-new-variants-please-vote/). 31 | 32 | ```{r variability-regression, message=FALSE, fig.width=10, fig.height=5, warning=FALSE, dev="CairoPNG"} 33 | library(microbiome) 34 | data(atlas1006) 35 | plot_regression(diversity ~ age, meta(atlas1006)) 36 | ``` 37 | 38 | -------------------------------------------------------------------------------- /Rplotvio1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microbiome/tutorials/4c9c5eb321358c97369b1d2325ff81906a14ac0b/Rplotvio1.jpeg -------------------------------------------------------------------------------- /Rplotvio2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microbiome/tutorials/4c9c5eb321358c97369b1d2325ff81906a14ac0b/Rplotvio2.jpeg -------------------------------------------------------------------------------- /TODO.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "TODO" 3 | prettydoc::html_pretty: 4 | theme: cayman 5 | highlight: github 6 | --- 7 | 13 | 14 | List of issues to improve the package and tutorials: 15 | 16 | * Merge with [microbiomeSeq]() R package 17 | * Add a list of related resources ([pathoStat]()) 18 | * Lay out overall project roadmap 19 | * Consider package/tutorial collection instead of a single package 20 | 21 | ### Missing examples 22 | 23 | * is_compositional 24 | 25 | ### Related work 26 | 27 | * [metagenomeSeq](https://bioconductor.org/packages/release/bioc/html/metagenomeSeq.html) 28 | * [microbiomeSeq]() 29 | * [pathoStat]() -------------------------------------------------------------------------------- /Themes.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Themes" 3 | author: "Leo Lahti, Sudarshan Shetty et al." 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | BiocStyle::html_document: 8 | number_sections: no 9 | toc: yes 10 | toc_depth: 4 11 | toc_float: true 12 | self_contained: true 13 | thumbnails: true 14 | lightbox: true 15 | gallery: true 16 | use_bookdown: false 17 | highlight: haddock 18 | --- 19 | 25 | 26 | 27 | ## Cross-plot themes 28 | 29 | 30 | Load example data: 31 | 32 | ```{r core-prevalence, message=FALSE, warning=FALSE} 33 | # Load data 34 | library(microbiome) 35 | data(atlas1006) 36 | 37 | # Rename the data 38 | pseq <- subset_samples(atlas1006, DNA_extraction_method == "r") 39 | ``` 40 | 41 | 42 | ```{r crossplot1, fig.width=6, fig.height=5, warning=FALSE, message=FALSE, fig.show="hold", out.width="300px"} 43 | library(hrbrthemes) 44 | library(gcookbook) 45 | library(tidyverse) 46 | # Themes from https://github.com/hrbrmstr/hrbrthemse 47 | 48 | p <- ggplot(meta(pseq), aes(age, diversity)) + 49 | geom_point() + 50 | labs(x="Age (y)", y="Diversity (Shannon)", 51 | title="HITChip Atlas", 52 | subtitle="Age-diversity relation", 53 | caption="Caption shown here 'g'") 54 | 55 | 56 | print(p + theme_ipsum() + ggtitle("HITChip - theme_ipsum")) 57 | print(p + theme_ipsum_rc() + ggtitle("HITChip - theme_ipsum_rc")) 58 | ``` 59 | 60 | 61 | ## Color fill example 62 | 63 | ```{r crossplot2, fig.width=6, fig.height=5, warning=FALSE, message=FALSE, fig.show="hold", out.width="300px"} 64 | # Theme from https://github.com/hrbrmstr/hrbrthemse 65 | p <- ggplot(uspopage, aes(x=Year, y=Thousands, fill=AgeGroup)) + 66 | geom_area() + 67 | scale_fill_ipsum() + 68 | scale_x_continuous(expand=c(0,0)) + 69 | scale_y_comma() + 70 | labs(title="Age distribution of population in the U.S., 1900-2002", 71 | subtitle="Example data from the R Graphics Cookbook", 72 | caption="Source: R Graphics Cookbook") + 73 | theme_ipsum_rc(grid="XY") + 74 | theme(axis.text.x=element_text(hjust=c(0, 0.5, 0.5, 0.5, 1))) + 75 | theme(legend.position="bottom") 76 | print(p) 77 | ``` 78 | 79 | ## Barplot example 80 | 81 | ```{r crossplot3, fig.width=6, fig.height=5, warning=FALSE, message=FALSE, fig.show="hold", out.width="300px", eval=FALSE} 82 | update_geom_font_defaults(font_rc_light) 83 | df <- count(mpg, class) %>% 84 | mutate(n = n*2000) %>% 85 | arrange(n) %>% 86 | mutate(class=factor(class, levels=class)) 87 | 88 | p <- ggplot(df, aes(class, n)) + 89 | geom_col() + 90 | geom_text(aes(label=scales::comma(n)), hjust=0, nudge_y=2000) + 91 | scale_y_comma(limits=c(0,150000)) + 92 | coord_flip() + 93 | labs(x="Fuel effiiency (mpg)", y="Weight (tons)", 94 | title="Seminal ggplot2 column chart example with commas", 95 | subtitle="A plot that is only useful for demonstration purposes, esp since you'd never\nreally want direct labels and axis labels", 96 | caption="Caption goes here") + 97 | theme_ipsum_rc(grid="X") 98 | 99 | print(p) 100 | ``` 101 | 102 | 103 | ## Viridis scale 104 | 105 | 106 | ```{r core2, fig.width=9, fig.heigth=6, out.width="400px", warning=FALSE} 107 | # Load data 108 | library(microbiome) 109 | data(peerj32) 110 | 111 | # Rename the data 112 | pseq <- peerj32$phyloseq 113 | 114 | # Calculate compositional version of the data 115 | # (relative abundances) 116 | pseq.rel <- microbiome::transform(pseq, "compositional") 117 | 118 | # With compositional (relative) abundances 119 | det <- c(0, 0.1, 0.5, 2, 5, 20)/100 120 | prevalences <- seq(.05, 1, .05) 121 | p <- plot_core(pseq.rel, prevalences = prevalences, detections = det, plot.type = "lineplot") + xlab("Relative Abundance (%)") 122 | 123 | # Same with the viridis color palette 124 | # color-blind friendly and uniform 125 | # options: viridis, magma, plasma, inferno 126 | # https://cran.r-project.org/web/packages/viridis/vignettes/intro-to-viridis.html 127 | # Also discrete=TRUE versions available 128 | library(viridis) 129 | print(p + scale_fill_viridis()) 130 | ``` 131 | -------------------------------------------------------------------------------- /Visualization.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Visualization" 3 | author: "Leo Lahti, Sudarshan Shetty et al." 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | BiocStyle::html_document: 8 | number_sections: no 9 | toc: yes 10 | toc_depth: 4 11 | toc_float: true 12 | self_contained: true 13 | thumbnails: true 14 | lightbox: true 15 | gallery: true 16 | use_bookdown: false 17 | highlight: haddock 18 | --- 19 | 25 | 26 | 27 | # Experimental functions 28 | 29 | 30 | **Time series for individual subjects** 31 | 32 | ```{r homogeneity-timeseries, message=FALSE, warning=FALSE, fig.height=5, fig.height=5, eval=FALSE} 33 | source(system.file("extdata/plot_longitudinal.R", package = "microbiome")) 34 | p <- plot_longitudinal(pseq, "Dialister", subject = "831", tipping.point = 0.5) 35 | print(p) 36 | ``` 37 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-dinky -------------------------------------------------------------------------------- /ancom/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | *~ 3 | -------------------------------------------------------------------------------- /ancom/README.md: -------------------------------------------------------------------------------- 1 | # ANCOM 2 | 3 | This minimal example on [ANCOM](https://www.ncbi.nlm.nih.gov/pubmed/26028277) (Mandal et al. 205) can be run with main.R (in R). This generates the summary in [ancom.md](ancom.md) 4 | 5 | -------------------------------------------------------------------------------- /ancom/ancom.R: -------------------------------------------------------------------------------- 1 | library(MASS) 2 | library(dplyr) 3 | 4 | # Load functions 5 | source("functions.R") 6 | 7 | # Load example data 8 | library(microbiome) 9 | data(dietswap) 10 | pseq <- dietswap 11 | 12 | # Use CLR abundances 13 | pseq <- transform(pseq, "compositional") 14 | 15 | # Initialize results 16 | groups <- c("sex", "nationality") 17 | pvals.ancom <- matrix(NA, nrow = ntaxa(pseq), 18 | ncol = length(groups)) 19 | rownames(pvals.ancom) <- taxa(pseq) 20 | colnames(pvals.ancom) <- groups 21 | 22 | 23 | for (gr in groups) { 24 | 25 | # Run ANCOM for comparisons within this group 26 | print(gr) 27 | 28 | ps <- pseq 29 | sample_data(ps)$group <- unlist(meta(ps)[, gr]) 30 | ps <- subset_samples(ps, !is.na(group)) 31 | 32 | v <- ancom(ps, "group") 33 | 34 | pvals.ancom[names(v), gr] <- v 35 | 36 | } 37 | 38 | print("ANCOM for loop ok") 39 | 40 | # Note: multiple testing is done per group by ANCOM 41 | # We do not apply further correction 42 | pvals.ancom <- pvals.ancom[which(rowMeans(pvals.ancom < 0.05) > 0),] 43 | 44 | # Add fold change information 45 | library(reshape) 46 | dfm <- melt(pvals.ancom) 47 | names(dfm) <- c("OTU", "Grouping", "pval.ancom") 48 | dfm$relab.cond <- rep(NA, nrow(dfm)) 49 | dfm$relab.control <- rep(NA, nrow(dfm)) 50 | dfm$log10FC <- rep(NA, nrow(dfm)) 51 | 52 | # Provide a table with relative abundances and log10 fold changes 53 | # for each group 54 | # If log10FC is infinite it means that all controls for this tax are 0 55 | check <- c() 56 | 57 | pseq.clr <- transform(pseq, "clr") 58 | for (i in 1:nrow(dfm)) { 59 | 60 | tax <- as.character(dfm[i, 1]) 61 | loc <- as.character(dfm[i, 2]) 62 | x <- get_sample(pseq, tax) 63 | x.clr <- get_sample(pseq.clr, tax) 64 | 65 | gr <- meta(pseq)[, loc] 66 | N <- as.numeric(gr) 67 | N <- N - min(N, na.rm = TRUE) 68 | 69 | cond <- mean(x[which(N == 1)], na.rm = TRUE) 70 | control <- mean(x[which(N == 0)], na.rm = TRUE) 71 | 72 | cond.clr <- mean(x.clr[which(N == 1)], na.rm = TRUE) 73 | control.clr <- mean(x.clr[which(N == 0)], na.rm = TRUE) 74 | 75 | dfm[i,"relab.cond"] <- 100 * cond 76 | dfm[i,"relab.control"] <- 100 * control 77 | 78 | # Check fold-changes with CLR transformed data 79 | dfm[i,"log10FC"] <- log10(exp(cond.clr - control.clr)) 80 | 81 | # Verify that comp and clr abundance FC are consistent 82 | check <- rbind(check, 83 | c( 84 | cond.clr = cond.clr, 85 | control.clr = control.clr, 86 | cond = cond, 87 | control = control, 88 | clr.fc = log10(exp(cond.clr - control.clr)), 89 | comp.fc = log10(cond/control) 90 | 91 | )) 92 | 93 | } 94 | 95 | 96 | res.ancom <- dfm %>% arrange(pval.ancom) %>% filter(log10FC < 1) 97 | 98 | 99 | 100 | print("ANCOM ok") 101 | 102 | -------------------------------------------------------------------------------- /ancom/ancom.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Group comparisons at the level of individual taxonomic groups" 3 | author: "Author" 4 | date: "`r date()`" 5 | output: md_document 6 | --- 7 | 8 | ```{r init, echo=FALSE, warning=FALSE, message=FALSE} 9 | library(knitr) 10 | theme_set(theme_bw(20)) 11 | opts_chunk$set(fig.width=10, fig.height=10, message=FALSE, warning=FALSE) 12 | opts_chunk$set(dev="CairoPNG") 13 | padj.th <- 0.25 14 | ``` 15 | 16 | # ANCOM analysis 17 | 18 | Significant taxa (regarding the grouping). Only the hits with adjusted p<`r padj.th` are shown. Adjustment is done per group based on ANCOM defaults. 19 | 20 | The log10 fold changes have been calculated from CLR-transformed abundances. 21 | 22 | ```{r deseq2, echo=FALSE, warning=FALSE, message=FALSE} 23 | tab <- subset(res.ancom, pval.ancom < 0.05) %>% arrange(desc(abs(log10FC))) 24 | tab$relab.cond <- round(tab$relab.cond, 2) 25 | tab$relab.control <- round(tab$relab.control, 2) 26 | kable(tab) 27 | ``` 28 | 29 | 30 | Illustration of the significant genera. CLR transformed abundances (just first 3 shown). 31 | 32 | ```{r tops, echo=FALSE, warning=FALSE, message=FALSE, fig.show="hold", out.width = "20%", fig.height=6, fig.width=5} 33 | library(ggbeeswarm) 34 | theme_set(theme_bw(20)) 35 | df <- meta(pseq) 36 | ps <- transform(pseq, "clr") 37 | tab <- subset(res.ancom, pval.ancom < 0.05) %>% arrange(desc(abs(log10FC))) 38 | for (i in 1:min(nrow(tab), 3)) { 39 | tax <- as.character(res.ancom[i, "OTU"]) 40 | group <- as.character(res.ancom[i, "Grouping"] ) 41 | df <- meta(ps) 42 | df$Group <- as.factor(df[, gsub("/", "\\.", group)]) 43 | df$Abundance <- abundances(ps)[tax, ] 44 | df <- subset(df, !is.na(Group)) 45 | p <- ggplot(df, aes(x = Group, y = Abundance)) + 46 | #geom_boxplot() + 47 | geom_beeswarm() + 48 | #geom_jitter(width = 0.25, alpha = 0.5) + 49 | # scale_y_log10() + 50 | #coord_flip() + 51 | theme(axis.text.x = element_text(angle = 45, vjust = 0.5)) + 52 | labs(x = "", 53 | y = "Abundance (CLR)", 54 | title = paste(tax), 55 | subtitle = group) 56 | print(p) 57 | } 58 | ``` 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /ancom/figure/tops-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microbiome/tutorials/4c9c5eb321358c97369b1d2325ff81906a14ac0b/ancom/figure/tops-1.png -------------------------------------------------------------------------------- /ancom/figure/tops-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microbiome/tutorials/4c9c5eb321358c97369b1d2325ff81906a14ac0b/ancom/figure/tops-2.png -------------------------------------------------------------------------------- /ancom/figure/tops-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microbiome/tutorials/4c9c5eb321358c97369b1d2325ff81906a14ac0b/ancom/figure/tops-3.png -------------------------------------------------------------------------------- /ancom/main.R: -------------------------------------------------------------------------------- 1 | # Run analysis with example data 2 | source("ancom.R") 3 | 4 | # Generate summaries 5 | library(knitr) 6 | knitr::knit("ancom.Rmd") 7 | -------------------------------------------------------------------------------- /archive/Crosshyb.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Crosshyb" 3 | author: "Leo Lahti" 4 | date: "`r Sys.Date()`" 5 | bibliography: 6 | - bibliography.bib 7 | 8 | output: 9 | rmarkdown::html_vignette 10 | --- 11 | 17 | 18 | ## Visualizing cross-hybridization 19 | 20 | To visualize cross-hybridization between selected taxa on HITChip (or 21 | other chips), use the following scripts. By default the groups with no 22 | cross-hyb are filtered out for clarity. Rows and columns are ordered 23 | by hierarchical clustering. The cross-hyb is shown in percentages, 24 | rounded as indicated by the rounding argument. The percentage 25 | indicates which fraction of probes for a row taxon overlaps with 26 | probes of a column taxon. This is not symmetric if the row and col 27 | taxon have a different total number of probes. For details, see 28 | help(plot_crosshyb). 29 | 30 | ```{r chyb, warning=FALSE, fig.width=20, fig.height=20} 31 | library(microbiome, quietly = TRUE) 32 | library(dplyr) 33 | 34 | # Pick the phylogeny which was used to summarize probes to species level 35 | tax.table <- get_hitchip_taxonomy("HITChip", "full") 36 | 37 | # Check cross-hyb between all L2 groups 38 | res <- plot_crosshyb(tax.level = "L2", rounding = 1, show.plot = FALSE, tax.table = tax.table) 39 | 40 | # Pick the crosshyb table and figure 41 | crosshyb.table <- res$data 42 | p <- res$plot 43 | 44 | # Plot the figure 45 | print(p) 46 | 47 | # Organize the Crosshyb table 48 | suppressMessages(library(dplyr)) 49 | s <- filter(res$data, crosshyb > 0) 50 | s <- s[rev(order(s$crosshyb)),] 51 | head(s) 52 | ``` 53 | 54 | 55 | ### Further examples 56 | 57 | Investigate species-species cross-hybridization within the Dialister L2 group 58 | 59 | ```{r chyb2, warning=FALSE} 60 | # Select species belonging to Dialister L2 group 61 | mytaxa <- map_levels("Dialister", from = "L2", to = "species", tax.table)[[1]] 62 | 63 | # Check cross-hyb between Dialister species 64 | res <- plot_crosshyb(tax.level = "species", selected.taxa = mytaxa, rounding = 0, tax.table = tax.table) 65 | 66 | # Check the cross-hyb data as well 67 | library(knitr) 68 | kable(head(res$data)) 69 | ``` 70 | 71 | -------------------------------------------------------------------------------- /archive/DiversityTable.tab: -------------------------------------------------------------------------------- 1 | "evenness" "richness" "diversity" 2 | "Sample.1" 0.839046123101901 918 6.01587586771419 3 | "Sample.2" 0.810739894076034 728 5.75936452810286 4 | "Sample.3" 0.85349658142741 990 6.13728643981587 5 | "Sample.4" 0.80880377585622 698 5.63006638760482 6 | "Sample.5" 0.804770162759795 661 5.59598750340119 7 | "Sample.6" 0.818618485682415 334 5.32788428995849 8 | "Sample.7" 0.828834270972343 849 5.81993383442343 9 | "Sample.8" 0.813447510065929 403 5.21496684971292 10 | "Sample.9" 0.827701107735919 757 5.75403174217837 11 | "Sample.10" 0.85652952898239 868 6.01197293131539 12 | "Sample.11" 0.84061172939295 727 5.92423984049287 13 | "Sample.12" 0.852296304462099 864 5.98017494112037 14 | "Sample.13" 0.776055940729176 307 4.92336081081147 15 | "Sample.14" 0.762121306552714 166 4.68350349000648 16 | "Sample.15" 0.818048201679392 314 5.32965300142074 17 | "Sample.16" 0.711330612392343 249 6.64434948926102 18 | "Sample.17" 0.812135444880361 849 5.92982179142138 19 | "Sample.18" 0.792626929117248 516 5.48986635888394 20 | "Sample.19" 0.804225756552546 414 5.28372396308021 21 | "Sample.20" 0.851672544379265 1112 6.20427213572866 22 | -------------------------------------------------------------------------------- /archive/DynamicsIBD.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microbiome/tutorials/4c9c5eb321358c97369b1d2325ff81906a14ac0b/archive/DynamicsIBD.rda -------------------------------------------------------------------------------- /archive/F1000/MyArticle/F1000header.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microbiome/tutorials/4c9c5eb321358c97369b1d2325ff81906a14ac0b/archive/F1000/MyArticle/F1000header.png -------------------------------------------------------------------------------- /archive/F1000/MyArticle/f1000_styles.sty: -------------------------------------------------------------------------------- 1 | \ProvidesPackage{f1000_styles} 2 | % Packages 3 | \usepackage{authblk} 4 | \usepackage[english]{babel} 5 | \usepackage[utf8]{inputenc} 6 | \usepackage[T1]{fontenc} 7 | \usepackage[bitstream-charter]{mathdesign} 8 | \usepackage{colortbl} 9 | \usepackage[usenames,dvipsnames,table]{xcolor} 10 | \usepackage{amsmath} 11 | \usepackage{graphicx} 12 | \usepackage{fancyhdr} 13 | \usepackage{setspace} 14 | \usepackage[labelsep=period,justification=justified]{caption} 15 | \usepackage{lastpage} 16 | \usepackage{xifthen} 17 | \usepackage{todonotes} 18 | \usepackage[hmargin=1.8cm,vmargin=2.2cm]{geometry} 19 | 20 | % Define the versionDate and flogo commands 21 | \newcommand{\versionDate}{\footnotesize \textit{F1000Research} 2016 - DRAFT ARTICLE (PRE-SUBMISSION)} 22 | \newcommand{\flogo}{\includegraphics[height=12pt]{F1000header.png}} 23 | 24 | % Section heading styles 25 | \usepackage[compact]{titlesec} 26 | 27 | \titleformat*{\section}{\Large\usefont{OT1}{phv}{b}{n}\color{darkgray}} 28 | \titleformat*{\subsection}{\large\usefont{OT1}{phv}{b}{n}\color{MidnightBlue}} 29 | \titleformat*{\subsubsection}{\large\usefont{OT1}{phv}{b}{n}\color{MidnightBlue}} 30 | 31 | % Section heading spacing (reduced space below headings) 32 | \titlespacing\section{0pt}{3.5ex plus 1.2ex minus .2ex}{0ex} 33 | \titlespacing\subsection{0pt}{3.25ex plus 1.2ex minus .2ex}{0ex} 34 | \titlespacing\subsubsection{0pt}{3.25ex plus 1.2ex minus .2ex}{0ex} 35 | 36 | % Caption style 37 | \captionsetup{labelfont={color=red,bf},textfont={color=black,bf}} 38 | 39 | % Table style 40 | \definecolor{DarkKhaki}{rgb}{0.74,0.72,0.42} 41 | \colorlet{tableheadcolor}{DarkKhaki} % Table header colour = 25% gray 42 | \newcommand{\header}{\rowcolor{tableheadcolor}} % 43 | \colorlet{tablerowcolor}{DarkKhaki!50} % Table row separator colour = 10% gray 44 | \newcommand{\row}{\rowcolor{tablerowcolor}} % 45 | \newenvironment{tabledata}[1][1]{% 46 | \renewcommand*{\extrarowheight}{0.1cm}% 47 | \tabular% 48 | }{% 49 | \endtabular 50 | } 51 | 52 | % Define a title note command 53 | \newcommand\titlenote[1]{\renewcommand\@titlenote{#1}} 54 | \newcommand\@titlenote{} 55 | 56 | % Column separation 57 | \setlength{\columnsep}{0.25in} 58 | 59 | % Page margins, headers and footers 60 | % 1 For the main body of the article 61 | \fancypagestyle{main}{ 62 | \newgeometry{hmargin=3.3cm,vmargin=2.2cm} 63 | \fancyheadoffset{1.5cm} % Set header width to match front pages 64 | \fancyfootoffset{1.5cm} % Set footer width to match front pages 65 | \setlength{\parindent}{0pt} 66 | \setlength{\headheight}{16.5pt} 67 | \renewcommand{\headrulewidth}{0pt} 68 | \renewcommand{\footrulewidth}{0pt} 69 | \lhead{} 70 | \chead{} 71 | \rhead{\versionDate} 72 | \lfoot{} 73 | \cfoot{} 74 | \rfoot{\footnotesize Page \thepage\ of \pageref{LastPage}} 75 | } 76 | 77 | % 2 For the front page(s) 78 | \fancypagestyle{front}{ 79 | \newgeometry{hmargin=1.8cm,vmargin=2.2cm} 80 | \setlength{\parindent}{0pt} 81 | \setlength{\headheight}{16.5pt} 82 | \renewcommand{\footrulewidth}{1pt} 83 | \lhead{\textsc{\flogo}} 84 | \chead{} 85 | \rhead{\versionDate} 86 | \lfoot{} 87 | \cfoot{} 88 | \rfoot{\footnotesize Page \thepage\ of \pageref{LastPage}} 89 | } 90 | 91 | % 92 | \renewenvironment{abstract}{% 93 | \usefont{OT1}{phv}{m}{n} 94 | \begin{minipage}{0.65\textwidth} 95 | \rule{\textwidth}{1pt}\\ \textcolor{Orange}{\textbf{\abstractname}}} 96 | {\par\noindent\rule{\textwidth}{1pt}\end{minipage}} 97 | % 98 | \makeatletter 99 | \renewcommand\@maketitle{% 100 | \begin{minipage}{0.95\textwidth} 101 | \vskip 2em 102 | \let\footnote\thanks 103 | {\fontsize{20}{22}\usefont{OT1}{phv}{b}{n} \@title \par {\small{\@titlenote}} } 104 | \vskip 1.5em 105 | {\@author \par} 106 | \end{minipage} 107 | \vskip 1em \par 108 | } 109 | \makeatother 110 | \renewcommand\Authfont{\fontsize{16}{18}\usefont{OT1}{phv}{b}{n}} 111 | \renewcommand\Affilfont{\fontsize{10}{12}\usefont{OT1}{phv}{b}{n}} -------------------------------------------------------------------------------- /archive/F1000/MyArticle/frog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microbiome/tutorials/4c9c5eb321358c97369b1d2325ff81906a14ac0b/archive/F1000/MyArticle/frog.jpg -------------------------------------------------------------------------------- /archive/F1000/MyArticle/sample.bib: -------------------------------------------------------------------------------- 1 | @BOOK{Smith:2012qr, 2 | title = {{B}ook {T}itle}, 3 | publisher = {Publisher}, 4 | author = {Smith, J.~M. and Jones, A.~B.}, 5 | year = {2012}, 6 | edition = {7th}, 7 | } 8 | 9 | @ARTICLE{Smith:2013jd, 10 | author = {Jones, A.~B. and Smith, J.~M.}, 11 | title = {{A}rticle {T}itle}, 12 | journal = {Journal title}, 13 | year = {2013}, 14 | volume = {13}, 15 | pages = {123-456}, 16 | number = {52}, 17 | month = {March}, 18 | publisher = {Publisher} 19 | } -------------------------------------------------------------------------------- /archive/Maturity.Rmd: -------------------------------------------------------------------------------- 1 | ### Other community indices: Microbiome maturity index (experimental) 2 | 3 | The microbiota maturity index has been adapted from [Subramanian, 4 | S. et 5 | al. (2014)](http://www.nature.com/nature/journal/v510/n7505/abs/nature13421.html) 6 | and [Dogra, S. et 7 | al. (2015)](http://mbio.asm.org/content/6/1/e02419-14.short), where 8 | microbiota maturity index has been shown to differentiate healthy 9 | children. In [Korpela et 10 | al. (2016)](www.nature.com/ncomms/2016/160126/ncomms10410/full/ncomms10410.html) 11 | this was calculated as the first principal coordinate from a PCoA 12 | (MDS) using only significantly age-associated genus-level taxa (all 13 | groups included). In this function NMDS is used instead. The maturity 14 | index is also adjusted for age. This artificial example is based on 15 | readily available adult data set whereas the maturity index is 16 | typically calculated from and used for babies/children: 17 | 18 | ```{r div-example, warning=FALSE, message=FALSE, fig.path = "figure/"} 19 | library(phyloseq) 20 | data("atlas1006") 21 | pseq <- atlas1006 22 | pseq <- subset_samples(pseq, DNA_extraction_method == "r" & time == 0) 23 | pseq <- transform_phyloseq(pseq, "relative.abundance") 24 | maturity.index <- maturity(pseq) 25 | ``` 26 | 27 | -------------------------------------------------------------------------------- /archive/Motionchart.Rmd: -------------------------------------------------------------------------------- 1 | ### Interactive motioncharts with googleVis 2 | 3 | This provides interactive charts where you can change the variables at 4 | X/Y axis, point size and color as you like. With time series data 5 | moving animations are also possible. 6 | 7 | 8 | Prepare the data: 9 | 10 | ```{r motionchart-example, fig.width=8, fig.height=8, message=FALSE} 11 | library(microbiome) 12 | 13 | data(peerj32) 14 | 15 | # Species-level data in phyloseq format 16 | pseq <- peerj32$phyloseq 17 | 18 | # Retrieve L2 (genus-like) data in phyloseq format 19 | pseq.L2 <- aggregate_taxa(pseq, level = "L2") 20 | 21 | # Convert L2 to matrix format 22 | genus.matrix.log10.simulated <- log10(otu_table(pseq.L2)@.Data) 23 | 24 | # Combine phylotype profiling data and sample metadata 25 | df <- cbind(metadata.simulated, t(genus.matrix.log10.simulated)) 26 | ``` 27 | 28 | Plot a Motion Chart using googleVis - package. Note: this requires 29 | flash and internet connection. 30 | 31 | Form a motion chart from example data NOTE: the data set must be given 32 | as data.frame which can contain NUMERIC and CHARACTER fields (NO 33 | FACTORS, NOR LOGICAL variables!). The FIRST FOUR FIELDS must be 34 | provided in the following order: idvar, timevar, two numeric fields, 35 | then any number of numeric and character fields. 36 | 37 | The plot shows only the first time point. Replace the time field with 38 | a constant to plot all in one figure using df$time <- rep(1, nrow(df)) 39 | 40 | ```{r motionchart-example-2, fig.width=8, fig.height=8, message=FALSE} 41 | library(googleVis) 42 | 43 | # See help(gvisMotionChart) for further details 44 | mchart <- gvisMotionChart(df, idvar = "sample", timevar = "time") 45 | 46 | # Plot immediately (opens in browser, requires flash) 47 | plot(mchart) 48 | ``` 49 | 50 | Save as html (needs javascript to open!). NOTE: html file viewing does not work locally - store the html file on server and view through internet: 51 | 52 | ```{r motionchart-example2, fig.width=8, fig.height=8, eval=FALSE} 53 | print(mchart, file="~/MotionChart.html") 54 | ``` 55 | -------------------------------------------------------------------------------- /archive/PhylotypeRatios.R: -------------------------------------------------------------------------------- 1 | #' @title Phylotype ratios 2 | #' @description Calculate phylotype ratios (eg. Bacteroides vs. 3 | #' Prevotella etc.) for a given phylotypes vs. samples matrix 4 | #' @param dat phylotypes vs. samples data matrix in log10 scale 5 | #' @return phylotype pairs x samples matrix indicating the ratio 6 | #' (in log10 domain) between each unique pair 7 | #' @export 8 | #' @examples 9 | #' data(peerj32) 10 | #' ratios <- PhylotypeRatios(peerj32$microbes) 11 | #' @references See citation('microbiome') 12 | #' @author Leo Lahti \email{microbiome-admin@@googlegroups.com} 13 | #' @keywords utilities 14 | PhylotypeRatios <- function(dat) { 15 | 16 | phylogroups <- rownames(dat) 17 | Nratios <- (length(phylogroups)^2 - length(phylogroups))/2 18 | Nsamples <- ncol(dat) 19 | ratios <- list() 20 | for (i in 1:(length(phylogroups) - 1)) { 21 | for (j in (i + 1):length(phylogroups)) { 22 | pt1 <- phylogroups[[i]] 23 | pt2 <- phylogroups[[j]] 24 | ratios[[paste(pt1, pt2, sep = "-")]] <- dat[pt1, ] - dat[pt2, ] 25 | } 26 | } 27 | ratios <- do.call(cbind, ratios) 28 | 29 | t(ratios) 30 | } 31 | 32 | -------------------------------------------------------------------------------- /archive/ProvasIdata.R: -------------------------------------------------------------------------------- 1 | # This script retrieves the data from 2 | # Lahti et al. PeerJ 1:e32, 2013 https://peerj.com/articles/32/ 3 | # and saves it into a more compact matrix format. 4 | 5 | # Download and extract the tarball (data stored in Data/ directory):: 6 | system("wget https://dfzljdn9uc3pi.cloudfront.net/2013/32/1/SupplementalDataS2.tar.gz") 7 | system("tar -zxvf SupplementalDataS2.tar.gz") 8 | 9 | # Read HITChip genus-level intestinal microbiota-profiling data 10 | hit <- read.csv("Data/SupplementalDataHITChipS3.tab", sep = " ", skip = 1, header = FALSE, row.names = 1) 11 | 12 | # The taxon names are unfortunately space-separated, so they need to be 13 | # specified and read in manually from the file: 14 | h <- unlist(strsplit(readLines("Data/SupplementalDataHITChipS3.tab", n = 1), " ")) 15 | inds <- list(1,2,3,4,5:8,9:11,12,13,14:17,18:21,22:25,26,27,28:30,31,32,33:36,37:40,41:44,45:48,49:52,53:56,57:60,61:64,65,66:68,69,70:73,74:77,78,79:82,83,84:87,88,89:92,93:96,97:100,101:104,105:108,109:112,113:116,117:120,121:124,125:128,129:132,133:136,137,138:141,142:145,146,147:149,150,151:154,155:158,159:162,163,164:167,168:171,172:175,176:179,180:183,184:187,188:191,192:195,196:199,200,201,202,203,204,205:208,209:212,213:216,217:220,221:224,225:228,229:232,233,234,235:238,239:242,243,244,245:248,249,250,251,252:255,256:259,260:263,264:267,268:271,272:275,276:279,280:283,284:287,288:291,292:295,296:299,300:303,304,305:307,308,309:312,313:316,317:320,321:324,325:328,329:332,333,334:337,338,339:342,343:346,347:350,351:354,355:358,359:361,362:363,364:365,366:368,369:371,372:373,374:375,376,377:379,380,381:383,384,385:387) 16 | nams <- sapply(inds, function (i) {paste(h[i], collapse = " ")}) 17 | 18 | # Add the retrieved taxon names to HITChip data columns 19 | colnames(hit) <- nams 20 | 21 | # Read Lipidomic profiling data 22 | lip <- read.csv("Data/SupplementalDataLipidsS2.tab", sep = " ", row.names = 1) 23 | lip2 <- apply(lip, 2, function (x) as.numeric(gsub("\\,", "\\.", x))) 24 | dimnames(lip2) <- dimnames(lip) 25 | 26 | # Write to file 27 | write.table(hit, file = "~/Rpackages/microbiome/microbiome/inst/extdata/ProvasI-L2.tab", sep = ";", quote = FALSE) 28 | write.table(lip2, file = "~/Rpackages/microbiome/microbiome/inst/extdata/ProvasI-Lipid.tab", sep = ";", quote = FALSE) 29 | 30 | # ........................ 31 | 32 | data(peerj32) 33 | # lipids microbes meta 34 | # Taxonomy 35 | taxonomy <- GetPhylogeny("HITChip", "filtered") 36 | taxonomy <- unique(as.data.frame(taxonomy[, c("L1", "L2")])) 37 | rownames(taxonomy) <- as.vector(taxonomy[, "L2"]) 38 | 39 | # Merging data matrices into phyloseq format: 40 | pseq <- hitchip2physeq(t(otu), meta, taxonomy) 41 | 42 | round(peerj32$microbes - min(peerj32$microbes)) -------------------------------------------------------------------------------- /archive/RDA/firstlib2.R: -------------------------------------------------------------------------------- 1 | #' @importFrom ade4 s.class 2 | #' @importFrom grDevices chull 3 | 4 | -------------------------------------------------------------------------------- /archive/RDA/plot_rda_bagged.R: -------------------------------------------------------------------------------- 1 | #' @title Plot RDA 2 | #' @description Visualize rda_bagged output. 3 | #' @param x Output from rda_bagged 4 | #' @param which.bac TBA 5 | #' @param ptype Plot type. 'spider' or 'hull' 6 | #' @param comp TBA 7 | #' @param cex.bac Plot size. 8 | #' @param plot.names Plot names 9 | #' @param group.cols Group colors. 10 | #' @param ... Other arguments to be passed 11 | #' @return TBA 12 | #' @examples 13 | #' data(peerj32) 14 | #' x <- t(peerj32$microbes) 15 | #' y <- factor(peerj32$meta$time); names(y) <- rownames(peerj32$meta) 16 | #' # Use more iterations in practice 17 | #' res <- rda_bagged(x, y, bs.iter=5) 18 | #' tmp <- plot_rda_bagged(res) 19 | #' @export 20 | #' @references See citation('microbiome') 21 | #' @author Contact: Jarkko Salojarvi \email{microbiome-admin@@googlegroups.com} 22 | #' @keywords utilities 23 | plot_rda_bagged <- function(x, which.bac=1:nrow(x$loadings), 24 | ptype="spider", comp=1:2, cex.bac=0.5, plot.names=TRUE, 25 | group.cols=as.numeric(unique(Y)), ...) { 26 | 27 | # TODO: can we speed up this function ? 28 | # For instance by switching from for loops to vectorization etc 29 | 30 | y <- cluster <- x.centroid <- y.centroid <- NULL 31 | 32 | bag <- x #[which(!names(x) == 'variable')] #$bagged.rda 33 | Y <- x$variable 34 | 35 | scaled.loadings <- (bag$loadings/max(abs(bag$loadings)))[, comp] 36 | scaled.scores <- (bag$scores/max(abs(bag$scores)))[, comp] 37 | 38 | plot(rbind(scaled.scores, scaled.loadings), type="n", 39 | xlab=paste(names(bag$R2)[1], 40 | " (", format(100 * bag$R2[1], digits=2), "%)", sep=""), 41 | ylab=paste(names(bag$R2)[2], 42 | " (", format(100 * bag$R2[2], digits=2), "%)", sep="")) 43 | 44 | if (ptype == "spider") 45 | s.class(scaled.scores, factor(Y), grid=FALSE, col=group.cols, 46 | cellipse=0.5, cpoint=0, add.plot=TRUE) 47 | 48 | # TODO: Same with ggplot (not ready) 49 | skip <- TRUE 50 | if (!skip) { 51 | # build ggplot dataframe with points (x,y) and 52 | # corresponding groups (cluster) 53 | gg <- as.data.frame(scaled.scores) 54 | names(gg) <- c("x", "y") 55 | gg$cluster <- factor(Y) 56 | # calculate group centroid locations 57 | centroids <- aggregate(cbind(x, y) ~ cluster, data=gg, mean) 58 | # merge centroid locations into ggplot dataframe 59 | gg <- merge(gg, centroids, by="cluster", 60 | suffixes=c("", ".centroid")) 61 | # generate star plot... 62 | ggplot(gg) + 63 | geom_point(aes(x=x, y=y, color=cluster), size=3) + 64 | geom_point(data=centroids, 65 | aes(x=x, y=y, color=cluster), size=4) + 66 | geom_segment(aes(x=x.centroid, 67 | y=y.centroid, xend=x, yend=y, color=cluster)) 68 | } 69 | 70 | if (ptype == "hull") { 71 | 72 | ll <- split(rownames(scaled.scores), Y) 73 | hulls <- lapply(ll, function(ii) ii[chull(scaled.scores[ii, ])]) 74 | for (i in 1:length(hulls)) { 75 | polygon(scaled.scores[hulls[[i]], ], border=group.cols[i]) 76 | } 77 | } 78 | if (plot.names) { 79 | text(scaled.scores, rownames(scaled.scores), cex=0.5, ...) 80 | } else { 81 | points(scaled.scores, ...) 82 | } 83 | text(scaled.loadings[which.bac, ], 84 | rownames(scaled.loadings)[which.bac], cex=cex.bac) 85 | } 86 | 87 | -------------------------------------------------------------------------------- /archive/ROC.Rmd: -------------------------------------------------------------------------------- 1 | ### ROC analysis 2 | 3 | A basic example of ROC/AUC analysis. 4 | 5 | 6 | ### Load example data 7 | 8 | ```{r roc-example, warning=FALSE, fig.path = "figure/"} 9 | library(microbiome) 10 | #pseq <- download_microbiome("dietswap") 11 | data("dietswap") 12 | pseq <- dietswap 13 | 14 | # Pick two groups of samples 15 | # African, DI group, time points 1 and 2 16 | # See the original publication for details: 17 | # references provided in help(dietswap) 18 | pseq1 <- subset_samples(pseq, nationality == "AFR" & 19 | timepoint.within.group == 1 & 20 | group == "DI") 21 | pseq2 <- subset_samples(pseq, nationality == "AFR" & 22 | timepoint.within.group == 2 & 23 | group == "DI") 24 | 25 | # Pick OTU matrix 26 | otu <- otu_table(pseq)@.Data 27 | 28 | # Pick sample metadata 29 | meta <- sample_data(pseq) 30 | 31 | # Define two sample groups for demonstration purpose 32 | g1 <- sample_names(pseq1) 33 | g2 <- sample_names(pseq2) 34 | 35 | # Compare the two groups with Wilcoxon test 36 | pvalues <- c() 37 | for (tax in rownames(otu)) { 38 | pvalues[[tax]] <- wilcox.test(otu[tax, g1], otu[tax, g2])$p.value 39 | } 40 | 41 | # Assume there are some known true positives 42 | # Here for instance Bacteroidetes 43 | bacteroidetes <- levelmap("Bacteroidetes", from = "L1", to = "L2", GetPhylogeny("HITChip", "filtered"))$Bacteroidetes 44 | ``` 45 | 46 | 47 | ### Overall ROC analysis 48 | 49 | Based on the [xrobin/pROC](https://github.com/xrobin/pROC) package 50 | (see that page for more examples with confidence limits etc): 51 | 52 | ```{r roc-example2, warning=FALSE, fig.path = "figure/"} 53 | library(pROC) 54 | res <- roc(names(pvalues) %in% bacteroidetes, pvalues) 55 | ``` 56 | 57 | 58 | ### ROC/AUC value 59 | 60 | ```{r roc-example3, warning=FALSE, fig.path = "figure/"} 61 | res$auc 62 | ``` 63 | 64 | 65 | ### Plot ROC curve 66 | 67 | ```{r roc-example4, warning=FALSE, fig.path = "figure/"} 68 | plot(res) 69 | ``` 70 | -------------------------------------------------------------------------------- /archive/affybatch.R: -------------------------------------------------------------------------------- 1 | data.dir <- system.file("extdata", package = "microbiome") 2 | 3 | # Read probe-level data 4 | f <- paste(data.dir, "/oligoprofile.tab", sep = "") 5 | tab <- read.csv(f, header = TRUE, sep = "\t", row.names = 1, as.is = TRUE) 6 | colnames(tab) <- unlist(strsplit(readLines(f, 1), "\t"))[-1] 7 | probedata <- as.matrix(tab) 8 | 9 | # Read taxonomy table 10 | f <- paste(data.dir, "/taxonomy.tab", sep = "") 11 | tab <- read.csv(f, header = TRUE, sep = "\t", as.is = TRUE) 12 | # Convert into phyloseq taxonomyTable format 13 | taxonomy <- tax_table(as.matrix(tab)) 14 | 15 | require(affy) 16 | 17 | # probesets 18 | map <- unique(taxonomy[, c("species", "oligoID")]@.Data) 19 | colnames(map) <- c("set", "probe") 20 | rownames(map) <- map[, "probe"] 21 | 22 | # Align the probe mappings with the data 23 | # only use the probes that have mapping 24 | coms <- intersect(rownames(probedata), rownames(map)) 25 | map <- map[coms,] 26 | probedata <- probedata[coms,] 27 | 28 | # Custom CDF 29 | nsamples <- ncol(probedata) 30 | nrow <- nrow(probedata) 31 | ncol <- 1 32 | sets <- unique(map[, "set"]) 33 | cdf <- lapply(sets, function(set) { 34 | tmp <- matrix(which(map[, "set"] == set), ncol = 1) 35 | colnames(tmp) <- "pm" 36 | return(tmp) 37 | }) 38 | names(cdf) <- sets 39 | cdfname <- list2env(cdf) 40 | 41 | ab <- new("AffyBatch", exprs = probedata, 42 | cdfName = "cdfname", 43 | #phenoData = phenoData, 44 | nrow = nrow, 45 | ncol = ncol 46 | #annotation = cleancdfname(cdfname, addcdf = FALSE), 47 | #protocolData = protocol, 48 | #description = description, 49 | #notes = notes 50 | ) 51 | 52 | eset <- exprs(rma(ab, background = FALSE, normalize = FALSE)) 53 | 54 | # Not sure if this takes normalization out - check 55 | eset <- exprs(rpa(ab, bg.method = "none", normalization.method = NULL)) 56 | 57 | ## Check given species 58 | #set <- sample(sets, 1) 59 | ## Summary 60 | #x <- eset[set,] 61 | ## Probedata 62 | #this.probes <- as.vector(map[map[, "set"] == set, "probe"]) 63 | #x2 <- probedata[this.probes,] 64 | #cors <- cor(x, t(probedata), method = "spearman"); names(cors) <- rownames(probedata) 65 | # Ranks wrt correct data 66 | #sort(match(this.probes, rev(names(sort(cors))))) 67 | # Ranks wrt random data 68 | #sort(match(this.probes, rev(names(sample(cors))))) 69 | 70 | # ----------------------------------------------- 71 | 72 | 73 | # Toydata example 74 | 75 | nsamples <- 20 76 | nrow <- 3 77 | ncol <- 4 78 | 79 | exprs <- matrix(runif(nrow * ncol * nsamples), nrow = nrow * ncol, ncol = nsamples) 80 | colnames(exprs) <- paste("sample", 1:ncol(exprs), sep = "-") 81 | 82 | # Custom CDF 83 | maxIndx <- nrow * ncol 84 | nsets <- 3 85 | cdf <- lapply(seq(1, nsets), function(x) { 86 | tmp <- matrix(sample(maxIndx, 2), nrow = 2, ncol = 1) 87 | colnames(tmp) <- "pm" 88 | return(tmp) 89 | }) 90 | names(cdf) <- paste("set", seq(1, nsets), sep = "") 91 | cdf[["set1"]][, "pm"] <- c(2, 7) 92 | 93 | v <- runif(nsamples) 94 | exprs[2, ] <- v 95 | exprs[7, ] <- v + runif(nsamples) * 0.01 96 | 97 | cdfname <- list2env(cdf) 98 | 99 | # OK ! 100 | ab <- new("AffyBatch", exprs = exprs, 101 | cdfName = "cdfname", 102 | #phenoData = phenoData, 103 | nrow = nrow, 104 | ncol = ncol 105 | #annotation = cleancdfname(cdfname, addcdf = FALSE), 106 | #protocolData = protocol, 107 | #description = description, 108 | #notes = notes 109 | ) 110 | 111 | indices2xy(30, nc = ncol) 112 | xy2indices(x = 2, y = 3, nc = ncol) 113 | exprs(ab) 114 | 115 | # Summarize 116 | # Jess ! 117 | exprs(rma(ab, background = FALSE, normalize = FALSE)) 118 | 119 | ------------------------------------------ 120 | 121 | -------------------------------------------------------------------------------- /archive/anticorrelation.old.R: -------------------------------------------------------------------------------- 1 | anticorrelation.old <- function(x, type = "interindividual", group_by = "group", method = "spearman") { 2 | 3 | # Pick metadata 4 | meta <- sample_data(x) 5 | 6 | if (!"sample" %in% names(meta)) { 7 | warning("Using the metadata rownames as the sample ID") 8 | meta$sample = rownames(sample_data(x)) 9 | } 10 | 11 | # OTU data Log10 12 | otu <- log10(t(abundances(x)@.Data)) 13 | 14 | # Ensure compatiblity 15 | if (!nrow(otu) == nrow(meta)) { 16 | otu <- t(otu) 17 | } 18 | 19 | if (!all(rownames(otu) == rownames(meta))) { 20 | stop("OTU table and metadata do not match.") 21 | } 22 | 23 | correlation <- NULL 24 | 25 | # Split the data by group 26 | group <- NULL 27 | if (!group_by %in% names(meta)) { 28 | stop(paste("The group_by variable", group_by, "is not included in sample_data(x).")) 29 | } 30 | datasets <- split(as.data.frame(otu), meta[[group_by]], drop = TRUE) 31 | 32 | if (type == "interindividual") { 33 | 34 | tmp <- setdiff(c("sample", group_by), names(meta)) 35 | if (length(tmp) > 0) { 36 | stop(paste("The following variables needed by betadiversity function type=interindividual are 37 | missing from sample metadata:", paste(tmp, collapse = ","))) 38 | } 39 | 40 | # Within-matrix stability NOTE: earlier this was calculated as 41 | # the average of upper triangular correlation matrix This is 42 | # heavily biased since the values are dependent Now replaced 43 | # by calculating correlations against the mean of the whole 44 | # sample set cors <- lower.triangle(cor(dat1)) 45 | dfs <- NULL 46 | for (ds in names(datasets)) { 47 | dat1 <- datasets[[ds]] 48 | cors <- as.vector(cor(t(dat1), matrix(colMeans(dat1)), method = method, use = "pairwise.complete.obs")) 49 | dfs <- rbind(dfs, data.frame(group = rep(ds, length(cors)), 50 | sample = rownames(dat1), 51 | anticorrelation = 1 - cors)) 52 | } 53 | 54 | pval <- anova(lm(anticorrelation ~ group, data = dfs))[["Pr(>F)"]][[1]] 55 | stats <- dfs %>% group_by(group) %>% 56 | summarize(mean = mean(anticorrelation), 57 | sd = sd(anticorrelation)) 58 | homogeneity <- list(data = dfs, statistics = stats, p.value = pval) 59 | 60 | } else if (type == "intraindividual") { 61 | 62 | tmp <- setdiff(c("time", "subject", "sample", group_by), names(meta)) 63 | if (length(tmp) > 0) { 64 | stop(paste("The following variables needed by betadiversity function 65 | type=intraindividual are missing from sample metadata:", paste(tmp, collapse = ","))) 66 | } 67 | 68 | if (!all(sapply(split(meta$time, meta[[group_by]]), function (x) {length(unique(x))}) == 2)) { 69 | stop("Two time points needed for each group for the intraindividual type. 70 | Some groups are having a different number of time points.") 71 | } 72 | 73 | homogeneity <- list() 74 | dfs <- NULL 75 | for (ds in names(datasets)) { 76 | 77 | # Pick the data and metadata for this group 78 | xsub <- datasets[[ds]] 79 | msub <- meta[rownames(xsub),] 80 | 81 | # Use interindividual functionality to assess correlations 82 | # within subjects. Subjects are used as groups 83 | datasets2 <- split(xsub, droplevels(msub[["subject"]])) 84 | cors <- c() 85 | for (subj in names(datasets2)) { 86 | dats <- datasets2[[subj]] 87 | cors[[subj]] <- cor(unlist(dats[1,]), unlist(dats[2,]), 88 | method = method, use = "pairwise.complete.obs") 89 | } 90 | 91 | dfs <- rbind(dfs, data.frame(group = rep(ds, length(cors)), 92 | subject = names(cors), 93 | anticorrelation = 1 - cors)) 94 | 95 | } 96 | 97 | # Between time point correlations within subjects 98 | # and the mean over those correlations 99 | pval <- anova(lm(anticorrelation ~ group, data = dfs))[["Pr(>F)"]][[1]] 100 | stats <- dfs %>% 101 | group_by(group) %>% 102 | summarize(mean = mean(anticorrelation, na.rm = T), 103 | sd = sd(anticorrelation, na.rm = T)) 104 | 105 | anticor <- list(data = dfs, statistics = stats, p.value = pval) 106 | 107 | } 108 | 109 | anticor 110 | 111 | } 112 | 113 | 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /archive/betadiversity.R: -------------------------------------------------------------------------------- 1 | #' @title Beta Diversity 2 | #' @description Quantification of beta diversity. 3 | #' @details 4 | #' 5 | #' Beta diversity quantifies community similarity between 6 | #' two samples. At the group level, it can be used to quantify group 7 | #' variability (or spread). This function provides a wrapper that is handy 8 | #' for group-level comparisons of beta diversity. 9 | #' 10 | #' When the sample size n=2, this function calculates the beta diversity 11 | #' between the two samples. 12 | #' 13 | #' When the sample size n>2, it calculates beta 14 | #' diversity of each sample against the group mean. This can be 15 | #' compared between groups in order to compare differences in group 16 | #' homogeneity. 17 | #' 18 | #' Note that this homogeneity measure is affected by sample size. 19 | #' Subsampling or bootstrapping can be applied to equalize sample sizes 20 | #' between comparisons. 21 | #' 22 | #' More advanced beta correlation measures are available but not implemented 23 | #' here yet. 24 | #' 25 | #' The anticorrelation mode is a simple educational indicator that returns 26 | #' average spearman correlation between samples of the input data and 27 | #' the overall group-wise average. 28 | #' 29 | #' @param x phyloseq object 30 | #' @param mode Beta diversity method 31 | #' @return Vector with beta diversities; one for each sample, quantifying the 32 | #' dissimilarity of the sample from the group-level mean 33 | #' @export 34 | #' @examples 35 | #' # Example data 36 | #' library(microbiome) 37 | #' data(peerj32) 38 | #' # Assess beta diversity among the African samples 39 | #' # in a diet swap study 40 | #' b <- group_diversity(subset_samples(dietswap, group == "AFR")) 41 | #' @references 42 | #' The inter- and intra-individual homogeneity in 43 | #' Salonen et al. ISME J. 8:2218-30, 2014 are obtained as 44 | #' 1 - beta where beta is the "anticorrelation" beta diversity. 45 | #' To cite this R package, see citation('microbiome') 46 | #' @seealso the vegdist function from the \pkg{vegan} package provides many 47 | #' standard beta diversity measures 48 | #' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com} 49 | #' @keywords utilities 50 | group_diversity <- function(x, mode = "anticorrelation") { 51 | 52 | # Abundance matrix (taxa x samples) 53 | if (is.phyloseq(x)) { 54 | x <- abundances(x) 55 | } 56 | 57 | if (ncol(x) == 2) { 58 | if (mode == "anticorrelation") { 59 | ret <- 1 - cor(x[,1], x[,2], method = "spearman", use = "pairwise.complete.obs") 60 | } 61 | } else if (ncol(x) > 2 & mode == "anticorrelation") { 62 | ret <- anticorrelation(x, "spearman") 63 | } 64 | 65 | ret 66 | 67 | } 68 | 69 | 70 | anticorrelation <- function(x, method = "spearman") { 71 | 72 | # Correlations calculated against the mean of the sample set 73 | cors <- as.vector(cor( 74 | x, matrix(rowMeans(x)), 75 | method = method, 76 | use = "pairwise.complete.obs")) 77 | 78 | 1 - cors 79 | 80 | } 81 | -------------------------------------------------------------------------------- /archive/bimodality_sarle.R: -------------------------------------------------------------------------------- 1 | #' @title Sarle's Bimodality Coefficient 2 | #' @description Sarle's bimodality coefficient. 3 | #' @param x Data vector for which bimodality will be quantified 4 | #' @param bs.iter Bootstrap iterations 5 | #' @param na.rm Remove NAs 6 | #' @param type Score type ("Sarle.finite.sample" or "Sarle.asymptotic") 7 | #' @return Bimodality score 8 | #' @export 9 | #' @examples 10 | #' b <- bimodality_sarle(rnorm(100), type = "Sarle.finite.sample") 11 | #' @details The coefficient lies in (0, 1). 12 | #' 13 | #' The 'Sarle.asymptotic' version is defined as 14 | #' \deqn{b = (g^2 + 1) / k}. 15 | #' This is coefficient of bimodality from Ellison AM Am. J. Bot. 1987, 16 | #' for microbiome analysis it has been used for instance in 17 | #' Shade et al. 2014. 18 | #' 19 | #' The formula for 'Sarle.finite.sample' (SAS 2012): 20 | #' 21 | #' \deqn{b = \frac{g^2 + 1}{k + (3(n-1)^2)/((n-2)(n-3))}} 22 | #' where n is sample size and 23 | #' 24 | #' In both formulas, \eqn{g} is sample skewness and \eqn{k} is the kth 25 | #' standardized moment (also called the sample kurtosis, or 26 | #' excess kurtosis). 27 | #' 28 | #' @references 29 | #' \itemize{ 30 | #' \item{}{Shade et al. mBio 5(4):e01371-14, 2014.} 31 | #' \item{}{Ellison AM (1987) Am J Botany 74(8):1280-1288.} 32 | #' \item{}{SAS Institute Inc. (2012). SAS/STAT 12.1 user's guide. Cary, NC.} 33 | #' \item{}{To cite the microbiome R package, see citation('microbiome')} 34 | #' } 35 | #' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com} 36 | #' @seealso Check the dip.test from the \pkg{DIP} package for a 37 | #' classical test of multimodality. 38 | #' @keywords utilities 39 | bimodality_sarle <- function(x, bs.iter = 1, na.rm = TRUE, type = "Sarle.finite.sample") { 40 | 41 | g <- skewness(x, na.rm) 42 | k <- kurtosis(x, na.rm) 43 | 44 | if (type == "Sarle.asymptotic") { 45 | 46 | s <- (1 + g^2)/(k + 3) 47 | 48 | } else if (type == "Sarle.finite.sample") { 49 | 50 | n <- length(x) 51 | s <- (g^2 + 1) / (k + (3*(n-1)^2)/((n-2)*(n-3))) 52 | 53 | } 54 | 55 | if (bs.iter > 1) { 56 | s <- c() 57 | for (i in 1:bs.iter) { 58 | xbs <- sample(x, replace = TRUE) 59 | s[[i]] <- bimodality_sarle(xbs, type = type) 60 | } 61 | s <- mean(s) 62 | } 63 | 64 | s 65 | 66 | } 67 | -------------------------------------------------------------------------------- /archive/core_abundance.R: -------------------------------------------------------------------------------- 1 | #' @title Core Abundance 2 | #' @description Calculate total core abundance. 3 | #' @param x \code{\link{phyloseq-class}} object 4 | #' @param detection Detection threshold (non-negative real) 5 | #' @param prevalence Prevalence threshold (in [0, 100]) 6 | #' @return Total core abunance vector. 7 | #' @examples 8 | #' data(dietswap) 9 | #' a <- core_abundance(dietswap) 10 | #' @export 11 | #' @references See citation("microbiome") 12 | #' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com} 13 | #' @keywords utilities 14 | core_abundance <- function(x, detection = 1, prevalence = 95) { 15 | 16 | core.taxa <- core_members(x, detection = detection, 17 | prevalence = prevalence) 18 | 19 | # Core matrix 20 | # FIXME: directly use the core function 21 | xx <- abundances(prune_taxa(core.taxa, x)) 22 | xx <- matrix(xx, nrow = length(core.taxa)) 23 | 24 | # Total sum of core abundances 25 | ab <- colSums(xx, na.rm = TRUE) 26 | 27 | ab 28 | 29 | } 30 | -------------------------------------------------------------------------------- /archive/core_members.R: -------------------------------------------------------------------------------- 1 | #' @title Core Taxa 2 | #' @description Determine members of the core microbiota with given abundance 3 | #'' and prevalences. 4 | #' @param x \code{\link{phyloseq-class}} object 5 | #' @param detection Detection threshold (non-negative real) 6 | #' @param prevalence Prevalence threshold (in [0, 100]) 7 | #' @param sort Logical. Sort the taxa. 8 | #' @return Vector of core members 9 | #' @details For phyloseq object, lists taxa that are more prevalent with the 10 | #' given detection. For matrix, lists columns that satisfy 11 | #' these criteria. 12 | #' @examples 13 | #' data(dietswap) 14 | #' a <- core_members(dietswap, 1, 95) 15 | #' @export 16 | #' @aliases prevalent_taxa 17 | #' @references 18 | #' A Salonen et al. The adult intestinal core microbiota is determined by 19 | #' analysis depth and health status. Clinical Microbiology and Infection 20 | #' 18(S4):16 20, 2012. 21 | #' To cite the microbiome R package, see citation('microbiome') 22 | #' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com} 23 | #' @keywords utilities 24 | core_members <- function(x, detection = 1, prevalence = 95, 25 | sort = TRUE) { 26 | 27 | if (class(x) == "phyloseq") { 28 | x <- abundances(x) 29 | } 30 | taxa <- names(which(prevalence(x, detection) > prevalence)) 31 | 32 | if (sort) { 33 | taxa <- sort(taxa) 34 | } 35 | 36 | taxa 37 | 38 | } 39 | -------------------------------------------------------------------------------- /archive/core_richness.R: -------------------------------------------------------------------------------- 1 | #' @title Core Richness 2 | #' @description Calculate total core richness 3 | #' @param x \code{\link{phyloseq-class}} object 4 | #' @param detection Detection threshold (non-negative real) 5 | #' @param prevalence Prevalence threshold (in [0, 100]) 6 | #' @return Total core abundance vector. 7 | #' @examples 8 | #' data(dietswap) 9 | #' a <- core_richness(dietswap) 10 | #' @export 11 | #' @references See citation("microbiome") 12 | #' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com} 13 | #' @keywords utilities 14 | core_richness <- function(x, detection = 1, prevalence = 95) { 15 | 16 | core.taxa <- core_members(x, detection = detection, 17 | prevalence = prevalence) 18 | 19 | # Core matrix 20 | # FIXME: directly use the core function 21 | xx <- abundances(prune_taxa(core.taxa, x)) 22 | xx <- matrix(xx, nrow = length(core.taxa)) 23 | 24 | # Core taxa richness in each sample 25 | ab <- colSums(xx > detection, na.rm = TRUE) 26 | 27 | ab 28 | 29 | } 30 | -------------------------------------------------------------------------------- /archive/crash-handler.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microbiome/tutorials/4c9c5eb321358c97369b1d2325ff81906a14ac0b/archive/crash-handler.conf -------------------------------------------------------------------------------- /archive/data.R: -------------------------------------------------------------------------------- 1 | # Load the package 2 | library(microbiome) 3 | 4 | # Define data path (here we retrieve data from R package itself) 5 | data.directory <- system.file("extdata", package = "microbiome") 6 | 7 | # Read HITChip data matrix (log10 values) 8 | level <- "L2" 9 | method <- "frpa" 10 | hitchip.data <- read.profiling(level = level, 11 | method = method, 12 | data.dir = data.directory, 13 | log10 = TRUE) 14 | 15 | # Oligo level data (absolute values - no log10) 16 | oligo.data <- read.profiling(level = "oligo", 17 | data.dir = data.directory, 18 | log10 = FALSE) 19 | 20 | # Oligo-phylogeny mapping table 21 | phylogeny.info <- read.profiling(level = "phylogeny.full", 22 | data.dir = data.directory) 23 | 24 | # -------------------------------------------------------------- 25 | 26 | 27 | -------------------------------------------------------------------------------- /archive/data_old.R: -------------------------------------------------------------------------------- 1 | #' @title Dynamics of the Human Gut Microbiome in Inflammatory Bowel Disease 2 | #' @description This data set contains OTU level microbiota profiling with 3 | #' Illumina HiSeq 2000 for the V4 region with 515F/806RBC for an 4 | #' Inflammatory bowel disease (IBD) cohort reported in 5 | #' reported in Halfvarson J., et al. Nature Microbiology (2017). 6 | #' \url{http://www.nature.com/articles/nmicrobiol20174}. 7 | #' @name DynamicsIBD 8 | #' @details The data is also available for download from Qiita (ID 1629). 9 | #' \url{https://qiita.ucsd.edu/study/description/1629}. 10 | #' @docType data 11 | #' @return Loads the data set in R. 12 | #' @author Sudarshan A. Shetty \email{sudarshanshetty9@@gmail.com} 13 | #' @references 14 | #' Halfvarson et al. Dynamics of the human gut microbiome in inflammatory 15 | #' bowel disease. Nat Microbiology. 2:17004, 2017. 16 | #' To cite the microbiome R package, see citation('microbiome') 17 | #' @usage data(DynamicsIBD) 18 | #' @format The data set in \code{\link{phyloseq-class}} format. 19 | #' @keywords data 20 | NULL 21 | -------------------------------------------------------------------------------- /archive/debug.R: -------------------------------------------------------------------------------- 1 | library(microbiome) 2 | data(peerj32); 3 | x <- as.matrix(peerj32$lipids) 4 | y <- as.matrix(peerj32$microbes[, 1]) 5 | cc <- cross.correlate(x, y); 6 | 7 | 8 | 9 | p <- correlation.heatmap(cc, "X1", "X2", "Correlation") 10 | -------------------------------------------------------------------------------- /archive/densityplot.R: -------------------------------------------------------------------------------- 1 | #' @title Density Plot 2 | #' @description Density visualization for data points overlaid on cross-plot. 3 | #' @param x Data matrix to plot. The first two columns will be visualized as a cross-plot. 4 | #' @param main title text 5 | #' @param x.ticks Number of ticks on the X axis 6 | #' @param rounding Rounding for X axis tick values 7 | #' @param add.points Plot the data points as well 8 | #' @param col Color of the data points. NAs are marked with darkgray. 9 | #' @param adjust Kernel width adjustment 10 | #' @param size point size 11 | #' @param legend plot legend TRUE/FALSE 12 | #' @return ggplot2 object 13 | #' @examples p <- densityplot(cbind(rnorm(100), rnorm(100))) 14 | #' @export 15 | #' @references See citation('microbiome') 16 | #' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com} 17 | #' @keywords utilities 18 | densityplot <- function(x, 19 | main = NULL, 20 | x.ticks = 10, 21 | rounding = 0, 22 | add.points = TRUE, 23 | col = "black", 24 | adjust = 1, 25 | size = 1, 26 | legend = FALSE) { 27 | 28 | df <- x 29 | if (!is.data.frame(df)) { 30 | df <- as.data.frame(as.matrix(df)) 31 | } 32 | 33 | # Avoid warnings 34 | x <- y <- ..density.. <- color <- NULL 35 | 36 | # If colors are NA: 37 | col <- as.character(col) 38 | col[unname(which(is.na(col)))] <- "darkgray" 39 | 40 | theme_set(theme_bw(20)) 41 | xvar <- colnames(df)[[1]] 42 | yvar <- colnames(df)[[2]] 43 | df[["x"]] <- df[, 1] 44 | df[["y"]] <- df[, 2] 45 | df[["color"]] <- col 46 | df[["size"]] <- size 47 | 48 | # Remove NAs 49 | df <- df[!(is.na(df[["x"]]) | is.na(df[["y"]])), ] 50 | 51 | # Determine bandwidth for density estimation 52 | bw <- adjust * c(bandwidth.nrd(df[["x"]]), bandwidth.nrd(df[["y"]])) 53 | if (any(bw == 0)) { 54 | warning("Zero bandwidths (possibly due to small number of observations). Using minimal bandwidth.") 55 | bw[bw == 0] = bw[bw == 0] + min(bw[!bw == 0]) 56 | } 57 | 58 | # Construct the figure 59 | p <- ggplot(df) 60 | p <- p + stat_density2d(aes(x, y, fill = ..density..), geom = "raster", h = bw, contour = FALSE) 61 | p <- p + scale_fill_gradient(low = "white", high = "black") 62 | 63 | 64 | if (add.points) { 65 | if (length(unique(df$color)) == 1 && length(unique(df$size)) == 1) { 66 | 67 | p <- p + geom_point(aes(x = x, y = y), col = unique(df$color), size = unique(df$size)) 68 | } else if (length(unique(df$color)) == 1 && length(unique(df$size)) > 1) { 69 | p <- p + geom_point(aes(x = x, y = y, size = size), col = unique(df$color)) 70 | } else if (length(unique(df$color)) > 1 && length(unique(df$size)) == 1) { 71 | p <- p + geom_point(aes(x = x, y = y, col = color), size = unique(df$size)) 72 | } else { 73 | p <- p + geom_point(aes(x = x, y = y, col = color, size = size)) 74 | } 75 | } 76 | 77 | p <- p + xlab(xvar) + ylab(yvar) 78 | 79 | if (!legend) { 80 | p <- p + theme(legend.position = "none") 81 | } 82 | 83 | 84 | p <- p + scale_x_continuous(breaks = round(seq(floor(min(df[["x"]])), 85 | ceiling(max(df[["x"]])), 86 | length = x.ticks), rounding)) 87 | 88 | if (!is.null(main)) { 89 | p <- p + ggtitle(main) 90 | } 91 | 92 | p 93 | 94 | } 95 | -------------------------------------------------------------------------------- /archive/deprecated.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | #' read.profiling.old 6 | #' 7 | #' Read run.profiling.script output into R 8 | #' 9 | #' @param method ('frpa' / 'rpa' / 'sum' / 'ave') 10 | #' @param data.dir Profiling script output directory for reading the data. 11 | #' If not given, GUI will ask to specify the file and 12 | #' overruns the possible level / method arguments in the 13 | #' function call. 14 | #' 15 | #' @return data matrix (phylo x samples) 16 | #' 17 | #' @examples 18 | #' data.dir <- system.file("extdata", package = "microbiome") 19 | #' dat <- read.profiling('frpa', data.dir = data.dir) 20 | #' 21 | #' @references See citation('microbiome') 22 | #' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com} 23 | #' @keywords utilities 24 | 25 | read.profiling.old <- function(method = "frpa", data.dir) { 26 | 27 | results <- list() 28 | 29 | message(paste("Reading Chip data from", data.dir)) 30 | 31 | for (level in c("L1", "L2", "species")) { 32 | 33 | if (method == "frpa" && length(grep(method, dir(data.dir))) == 0) { 34 | warning("frpa method not available; using rpa instead") 35 | method <- "rpa" 36 | } 37 | 38 | f <- paste(data.dir, "/", level, "-", method, ".tab", sep = "") 39 | 40 | if (file.exists(f)) { 41 | tab <- read.csv(f, header = TRUE, sep = "\t", row.names = 1, as.is = TRUE) 42 | colnames(tab) <- unlist(strsplit(readLines(f, 1), "\t"))[-1] 43 | 44 | results[[level]] <- tab 45 | } 46 | } 47 | 48 | # oligo data 49 | f <- paste(data.dir, "/oligoprofile.tab", sep = "") 50 | tab <- read.csv(f, header = TRUE, sep = "\t", row.names = 1, as.is = TRUE) 51 | colnames(tab) <- unlist(strsplit(readLines(f, 1), "\t"))[-1] 52 | results[["oligo"]] <- tab 53 | 54 | # Phylogeny 55 | f <- paste(data.dir, "/phylogeny.full.tab", sep = "") 56 | #f <- paste(data.dir, "/phylogeny.filtered.tab", sep = "") 57 | tab <- read.csv(f, header = TRUE, sep = "\t", as.is = TRUE) 58 | tab <- polish.tax.table(tab) 59 | results[["taxonomy"]] <- tab 60 | 61 | # Metadata 62 | # Read simulated example metadata 63 | #library(gdata) 64 | #metadata.file <- paste(data.dir, "/metadata.xls", sep = "") 65 | #metadata <- read.xls(metadata.file, as.is = TRUE) 66 | #rownames(metadata) <- metadata$sampleID 67 | f <- paste(data.dir, "/meta.tab", sep = "") 68 | tab <- read.csv(f, header = TRUE, sep = "\t", as.is = TRUE) 69 | rownames(tab) <- tab$sample 70 | results[["meta"]] <- tab 71 | 72 | # Convert to numeric 73 | for (level in c("oligo", "species", "L1", "L2")) { 74 | 75 | tab <- results[[level]] 76 | rnams <- rownames(tab) 77 | cnams <- colnames(tab) 78 | 79 | tab <- apply(tab, 2, as.numeric) 80 | rownames(tab) <- rnams 81 | colnames(tab) <- cnams 82 | 83 | if (any(is.na(tab))) { 84 | warning(paste("The", level, " matrix has ", sum(is.na(tab)), 85 | " missing values \n 86 | - imputing..")) 87 | 88 | for (i in 1:ncol(tab)) { 89 | inds <- which(is.na(tab[,i])) 90 | if (length(inds) > 0) { 91 | tab[inds, i] <- sample(tab[-inds, i], length(inds)) 92 | } 93 | } 94 | 95 | } 96 | 97 | results[[level]] <- tab 98 | 99 | } 100 | 101 | results 102 | 103 | } 104 | 105 | -------------------------------------------------------------------------------- /archive/devtools.R: -------------------------------------------------------------------------------- 1 | # Package release instructions: http://r-pkgs.had.co.nz/release.html 2 | 3 | # Documentation, Build and Check 4 | library(devtools) 5 | document("../../") 6 | build("../../") 7 | check("../../") 8 | 9 | # Submissions: 10 | # 11 | # release() # Submit to CRAN 12 | # submit_cran() # Submit to CRAN without all release() questions 13 | 14 | # Utilities: 15 | # 16 | # build_win("../../") # Windows check 17 | # revdep_check("../../") 18 | # add_rstudio_project("../../") 19 | # use_build_ignore("../NEWS.md", pkg = "../../") # NEWS.md not supported by CRAN 20 | # use_package("dplyr") # add package to imports 21 | # load_all(".") # Reload the package 22 | # test() # Run tests 23 | # run_examples() 24 | 25 | # Vignettes: 26 | # 27 | # library(knitr) 28 | # knit("../../vignettes/eurostat_tutorial.Rmd", "../../vignettes/eurostat_tutorial.md") 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /archive/get_ordination.R: -------------------------------------------------------------------------------- 1 | #' @title Get Ordination 2 | #' @description Ordinate phyloseq data and merge it with sample metadata 3 | #' @param x \code{\link{phyloseq-class}} object or a data matrix 4 | #' (features x samples; eg. HITChip taxa vs. samples) 5 | #' @param method Ordination method, see phyloseq::plot_ordination 6 | #' @param distance Ordination distance, see phyloseq::plot_ordination 7 | #' @return data.frame with ordination coordinates and metadata 8 | #' @examples 9 | #' data(dietswap) 10 | #' fc <- get_ordination(dietswap) 11 | #' @seealso phyloseq::plot_ordination 12 | #' @export 13 | #' @details This is a wrapper for phyloseq ordination functions, providing 14 | #' smooth access to ordinated data.frame with full info on the projection 15 | #' and metadata necessary for further visualizations. 16 | #' @references See citation('microbiome') 17 | #' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com} 18 | #' @keywords utilities 19 | get_ordination <- function(x, method="NMDS", distance="bray") { 20 | 21 | x.ord <- ordinate(x, method, distance) 22 | 23 | # Pick the projected data (first two columns + metadata) 24 | proj <- phyloseq::plot_ordination(x, x.ord, justDF=TRUE) 25 | 26 | # Rename the projection axes 27 | names(proj)[1:2] <- paste("Comp", 1:2, sep=".") 28 | 29 | proj 30 | 31 | } 32 | 33 | -------------------------------------------------------------------------------- /archive/gini.R: -------------------------------------------------------------------------------- 1 | #' @title Top Abundance Index 2 | #' @description Calculates the community top_abundance index. 3 | #' @param rank Optional. The rank of the dominant taxa to consider. 4 | #' @param aggregate Aggregate (TRUE; default) the top members or not. If aggregate=TRUE, then the sum of relative abundances is returned. Otherwise the relative abundance is returned for the single taxa with the indicated rank. 5 | #' @inheritParams diversity 6 | #' @return A vector of top_abundance indices 7 | #' @export 8 | #' @examples 9 | #' data(dietswap) 10 | #' d <- top_abundance(dietswap) 11 | #' @details The top_abundance index gives the abundance of the most abundant species in [0,1]. This simple diversity index is occasionally used in ecological literature, and sometimes also called dominance. However, note that the microbiome::dominance function uses a different definition. With rank = 2, the sum of abundances for the two most abundant taxa are returned etc. However, if aggregate=FALSE, the abundance for the single n'th most dominant taxa (n = rank) is returnde instead the sum of abundances up to that rank. 12 | #' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com} 13 | #' @seealso dominance, diversity 14 | #' @keywords utilities 15 | top_ab <- function(x, rank = 1, aggregate = TRUE, split = TRUE) { 16 | 17 | # Pick the OTU data 18 | otu <- abundances(x) 19 | 20 | if (!split) { 21 | otu <- as.matrix(rowSums(otu), nrow = nrow(otu)) 22 | } 23 | 24 | if (!aggregate) { 25 | do <- apply(otu, 2, function (x) {rev(sort(x/sum(x, na.rm = TRUE)))[[rank]]}) 26 | } else { 27 | do <- apply(otu, 2, function (x) {sum(rev(sort(x/sum(x, na.rm = TRUE)))[1:rank])}) 28 | } 29 | names(do) <- sample_names(x) 30 | 31 | do 32 | 33 | } 34 | 35 | 36 | -------------------------------------------------------------------------------- /archive/gini_index.R: -------------------------------------------------------------------------------- 1 | 2 | gini_index <- function() { 3 | 4 | 5 | 6 | } 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /archive/internal.R: -------------------------------------------------------------------------------- 1 | 2 | get.file.method <- function( f ) { 3 | 4 | method <- NULL 5 | 6 | if (f == "rpa") { 7 | method <- "rpa" 8 | } 9 | if (f == "frpa") { 10 | method <- "frpa" 11 | } 12 | if (!length(grep("sum", f)) == 0) { 13 | method <- "sum" 14 | } 15 | if (!length(grep("ave", f)) == 0) { 16 | method <- "ave" 17 | } 18 | if (!length(grep("nmf", f)) == 0) { 19 | method <- "nmf" 20 | } 21 | 22 | method 23 | 24 | } 25 | 26 | get.file.level <- function(f) { 27 | 28 | level <- NULL 29 | 30 | if (!length(grep("oligo", f)) == 0) { 31 | level <- "oligo" 32 | } 33 | if (!length(grep("species", f)) == 0) { 34 | level <- "species" 35 | } 36 | if (!length(grep("L0", f)) == 0) { 37 | level <- "L0" 38 | } 39 | if (!length(grep("L1", f)) == 0) { 40 | level <- "L1" 41 | } 42 | if (!length(grep("L2", f)) == 0) { 43 | level <- "L2" 44 | } 45 | if (!length(grep("phylogeny", f)) == 0) { 46 | level <- "phylogeny.info" 47 | } 48 | 49 | level 50 | 51 | } 52 | 53 | -------------------------------------------------------------------------------- /archive/limma.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Limma" 3 | author: "Leo Lahti" 4 | date: "`r Sys.Date()`" 5 | bibliography: 6 | - bibliography.bib 7 | 8 | output: 9 | rmarkdown::html_vignette 10 | --- 11 | 17 | 18 | 19 | ## Linear models with limma 20 | 21 | ### Discrete variables: sex 22 | 23 | Identify most significantly different taxa between males and females. 24 | 25 | For further details, see [limma 26 | homepage](http://bioinf.wehi.edu.au/limma/) and [limma User's 27 | guide](http://www.lcg.unam.mx/~lcollado/R/resources/limma-usersguide.pdf). For 28 | discussion on why limma is preferred over t-test, see [this 29 | article](http://www.plosone.org/article/info:doi/10.1371/journal.pone.0012336). 30 | 31 | ```{r limma-example, warning=FALSE, fig.path = "figure/"} 32 | # Get example data 33 | library(microbiome) 34 | data("peerj32") 35 | pseq <- peerj32$phyloseq 36 | otu <- taxa_abundances(transform_phyloseq(pseq, "log10")) 37 | meta <- sample_data(pseq) 38 | groups <- meta$gender 39 | 40 | # Compare the two groups with limma 41 | library(limma) 42 | 43 | # Prepare the design matrix which states the groups for each sample 44 | # in the otu 45 | design <- cbind(intercept = 1, Grp2vs1 = groups) 46 | rownames(design) <- rownames(meta) 47 | design <- design[colnames(otu), ] 48 | 49 | # NOTE: results and p-values are given for all groupings in the design matrix 50 | # Now focus on the second grouping ie. pairwise comparison 51 | coef.index <- 2 52 | 53 | # Fit the limma model 54 | fit <- lmFit(otu, design) 55 | fit <- eBayes(fit) 56 | pvalues.limma = fit$p.value[, 2] 57 | 58 | # Summarise 59 | kable(topTable(fit, coef = coef.index, p.value=0.05), digits = 2) 60 | ``` 61 | 62 | 63 | ### Q-Q plot 64 | 65 | 66 | ```{r limma-qq, warning=FALSE, fig.path = "figure/"} 67 | qqt(fit$t[, coef.index], df = fit$df.residual + fit$df.prior) 68 | abline(0,1) 69 | ``` 70 | 71 | ### Volcano plot 72 | 73 | ```{r limma-volcano, warning=FALSE, fig.path = "figure/"} 74 | volcanoplot(fit, coef = coef.index, highlight = coef.index) 75 | ``` 76 | 77 | 78 | 79 | ### Comparison between limma and t-test 80 | 81 | Order the taxa with t-test for comparison and validation purposes. The 82 | differences are small in this simulated example, but [can be 83 | considerable in real 84 | data](http://www.plosone.org/article/info:doi/10.1371/journal.pone.0012336). 85 | 86 | ```{r limma-compairson, warning=FALSE, fig.path = "figure/"} 87 | # Compare the two groups with t-test 88 | library(dplyr) 89 | pvalues.ttest <- c() 90 | male.samples <- dplyr::filter(meta, gender == "male")$sample 91 | female.samples <- dplyr::filter(meta, gender == "female")$sample 92 | for (tax in rownames(otu)) { 93 | pvalues.ttest[[tax]] <- t.test(otu[tax, male.samples], otu[tax, female.samples])$p.value 94 | } 95 | # Multiple testing correction 96 | pvalues.ttest <- p.adjust(pvalues.ttest, method = "fdr") 97 | 98 | # Compare p-values between limma and t-test 99 | taxa <- rownames(otu) 100 | plot(pvalues.ttest[taxa], pvalues.limma[taxa]) 101 | abline(0,1,lty = 2) 102 | ``` 103 | 104 | ### Continuous variables 105 | 106 | Quantify continuous associations with lm_phyloseq function. This uses 107 | the limma model to generate a table of P-values and effect sizes. Note 108 | that no confounding variables taken into account in this wrapper. See 109 | the [limma homepage](http://bioinf.wehi.edu.au/limma/) for more 110 | detailed analyses. 111 | 112 | ```{r limma-lm-phyloseq, warning=FALSE, fig.path = "figure/"} 113 | data("atlas1006") 114 | source(system.file("extdata/lm_phyloseq.R", package = "microbiome")) 115 | tab <- lm_phyloseq(atlas1006, "age") 116 | kable(head(tab), digits = 3) 117 | ``` 118 | 119 | 120 | -------------------------------------------------------------------------------- /archive/linearmodel.R: -------------------------------------------------------------------------------- 1 | #' @title Linear model with random subject effects for phyloseq 2 | #' @description Linear model test for paired comparisons for phyloseq objects with random effect subject term. 3 | #' @param x \code{\link{phyloseq-class}} object 4 | #' @param group Metadata field specifying the groups. 5 | #' @param p.adjust.method p-value correction method for p.adjust function 6 | #' (default 'BH'). For other options, see ?p.adjust 7 | #' @return Corrected p-values. 8 | #' @examples 9 | #' data(peerj32) 10 | #' pval <- check_lmer(peerj32$phyloseq, "group") 11 | #' @export 12 | #' @importFrom lme4 lmer 13 | #' @references See citation('microbiome') 14 | #' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com} 15 | #' @keywords utilities 16 | check_lmer <- function (x, group, p.adjust.method = "BH") { 17 | 18 | # We need taxa x samples matrix 19 | mydata <- otu_table(x)@.Data 20 | if (!taxa_are_rows(x)) {mydata <- t(mydata)} 21 | 22 | metadata <- sample_data(x) 23 | metadata$group <- droplevels(factor(metadata[[group]])) 24 | 25 | # Pvalues 26 | pv <- c() 27 | for (tax in rownames(mydata)) { 28 | 29 | dfs <- metadata 30 | dfs$signal <- mydata[tax, rownames(dfs)] 31 | 32 | # Paired comparison 33 | out <- lmer(signal ~ group + (1|subject), data = dfs) 34 | out0 <- lmer(signal ~ (1|subject), data = dfs) 35 | comp <- anova(out0, out) 36 | pv[[tax]] <- comp[["Pr(>Chisq)"]][[2]] 37 | 38 | } 39 | 40 | # Adjust ANOVA group-level p-values 41 | pv <- p.adjust(pv[, "p.anova"], method = p.adjust.method) 42 | 43 | # Order by p 44 | pv <- sort(pv) 45 | 46 | } 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /archive/lm_phyloseq.R: -------------------------------------------------------------------------------- 1 | #' @title Limma test for phyloseq 2 | #' @description Limma for a single continuous variable vs. OTUs with a phyloseq object. 3 | #' @param x \code{\link{phyloseq-class}} object 4 | #' @param varname Metadata field specifying the investigated metadata variable. 5 | #' @param transformation Transformation for the original phyloseq values. By default a log10 transformation is done to better approximate the requirements of a linear model. 6 | #' @param p.adj.method p-value correction method for p.adjust function 7 | #' (default 'BH'). For other options, see ?p.adjust 8 | #' @return Limma output table. 9 | #' @examples 10 | #' data("atlas1006") 11 | #' tab <- lm_phyloseq(atlas1006, "age") 12 | #' @export 13 | #' @importFrom phyloseq get_variable 14 | #' @references See citation('microbiome') 15 | #' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com} 16 | #' @keywords utilities 17 | lm_phyloseq <- function (x, varname, transformation = "log10", p.adj.method = "BH") { 18 | 19 | # Transformation useful for linear models 20 | x <- transform_phyloseq(x, transformation) 21 | 22 | # Limma significance analysis 23 | # Prepare the design matrix which states the variable values for each sample 24 | design <- cbind(intercept = 1, var = get_variable(x, varname)) 25 | rownames(design) <- rownames(sample_data(x)) 26 | 27 | # OTU matrix 28 | otu <- otu_table(x)@.Data 29 | if (!taxa_are_rows(x)) {otu <- t(otu)} 30 | 31 | # Remove missing vals 32 | keep = which(rowSums(is.na(design)) == 0) 33 | 34 | if (length(keep) > 0) { 35 | otu = otu[,keep] 36 | design = design[keep,] 37 | } else { 38 | warning("All samples have missing values in the metadata design matrix. Halting the computation.") 39 | return(NULL) 40 | } 41 | 42 | # Fit the limma model 43 | fit <- lmFit(otu, design) 44 | fit2 <- eBayes(fit) 45 | 46 | # NOTE: results and p-values are given for all groupings in the design matrix 47 | # Now focus on the second grouping ie. pairwise comparison 48 | topTable(fit2, coef = 2) 49 | 50 | } -------------------------------------------------------------------------------- /archive/low_abundance.R: -------------------------------------------------------------------------------- 1 | #' @title Low Abundance Index 2 | #' @description Calculates the community low_abundance index. 3 | #' @inheritParams core 4 | #' @param split (Optional). Logical. Pool all samples and estimate index for the entire set. 5 | #' @return A vector of low_abundance indices 6 | #' @export 7 | #' @examples 8 | #' data(dietswap) 9 | #' d <- low_abundance(dietswap, detection = 0.2/100) 10 | #' @details The low_abundance index gives the relative proportion of rare species in [0,1]. The species that are below the indicated detection threshold are considered rare. Note that population prevalence is not considered. 11 | #' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com} 12 | #' @keywords utilities 13 | #' @seealso core_abundance, rarity, diversity 14 | low_abundance <- function(x, detection = 0.2/100, split = TRUE) { 15 | 16 | # Ensure compositional data 17 | xc <- transform(x, "compositional") 18 | 19 | if (!split) { 20 | 21 | otu <- rowSums(abundances(xc)) 22 | x <- otu/sum(otu) 23 | do <- sum(x[x < detection]) 24 | 25 | } else { 26 | 27 | otu <- abundances(xc) 28 | do <- apply(otu, 2, function (x) {sum(x[x < detection])}) 29 | names(do) <- sample_names(x) 30 | 31 | } 32 | 33 | do 34 | 35 | } 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /archive/lower.triangle.R: -------------------------------------------------------------------------------- 1 | #' @title Lower triangle of a matrix 2 | #' @description Get lower triangle of a square matrix. 3 | #' as a numeric vector such that 4 | #' row-by-row, picking elements in the order 5 | #' 2,1;3,1;3,2;4,1,... 6 | #' 7 | #' @param mat data matrix 8 | #' 9 | #' @return lower triangle as vector 10 | #' 11 | #' @export 12 | #' @examples 13 | #' mat <- rbind(c(1,2,3), c(4,5,6), c(7,8,9)) 14 | #' vec <- lower.triangle(mat) 15 | #' @references See citation('microbiome') 16 | #' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com} 17 | #' @keywords utilities 18 | lower.triangle <- function(mat) { 19 | 20 | # TODO is this easy replace with standard R functions ? 21 | 22 | elements <- c() 23 | nr <- dim(mat)[[1]] 24 | nc <- dim(mat)[[2]] 25 | 26 | for (i in 2:nr) { 27 | for (j in 1:(i - 1)) { 28 | elements <- c(elements, mat[i, j]) 29 | } 30 | } 31 | elements 32 | } 33 | -------------------------------------------------------------------------------- /archive/make.abundancy.table.R: -------------------------------------------------------------------------------- 1 | #' @title Abundance table 2 | #' @description Calculate abundancies. 3 | #' Discretize Hitchip matrix to form abundancy table 4 | #' of form j, nj where j is number of counts and nj is number 5 | #' of phylotypes with the corresponding counts 6 | #' this format is often required by richness estimation 7 | #' @param dat data matrix 8 | #' @param det.th detection threshold 9 | #' @param discretization.resolution discretization resolution 10 | #' @return abundancy table 11 | #' @examples 12 | #' data(peerj32) 13 | #' abtab <- make.abundancy.table(10^t(peerj32$microbes), det.th = 0) 14 | #' @export 15 | #' @references 16 | #' To cite the microbiome R package, see citation('microbiome') 17 | #' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com} 18 | #' @keywords utilities 19 | make.abundancy.table <- function(dat, det.th, discretization.resolution = 1) { 20 | 21 | di <- 10^dat - 10^det.th 22 | di[di < 0] <- 0 23 | di <- discretization.resolution * (di) 24 | di[di > 0 & di < 1] <- 1 25 | di <- round(di) 26 | 27 | ab <- t(di) 28 | 29 | ab 30 | } 31 | 32 | -------------------------------------------------------------------------------- /archive/maturity.R: -------------------------------------------------------------------------------- 1 | #' @title Maturity index 2 | #' @description Calculate maturity index for a phyloseq object 3 | #' @param x \code{\link{phyloseq-class}} object 4 | #' @return Maturity index 5 | #' @examples 6 | #' #This is an artificial example based on readily available 7 | #' #adult data set whereas 8 | #' #the maturity index is typically calculated from and 9 | #' #used for babies/children: 10 | #' data("atlas1006") 11 | #' pseq <- atlas1006 12 | #' pseq <- subset_samples(pseq, DNA_extraction_method == "r" & time == 0) 13 | #' pseq <- transform_phyloseq(pseq, "relative.abundance") 14 | #' maturity(pseq) 15 | #' @export 16 | #' @references To cite this R package, see citation('microbiome'). 17 | #' The microbiota maturity index has been adapted from the following papers: 18 | #' Subramanian S et al. Nature 510:417-421, 2014. 19 | #' Dogra S et al. mBio 6:e02419-14, 2015. 20 | #' Korpela K et al. Nat. Comm. 7:10410, 2016. 21 | #' @details Microbiota maturity index has been shown to differentiate healthy children (see the references). In Korpela et al. (2016) this was calculated as the first principal coordinate from a PCoA (MDS) using only significantly age-associated genus-level taxa (all groups included). In this function NMDS is used instead. The maturity index is also adjusted for age. 22 | #' @importFrom phyloseq get_variable 23 | #' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com} 24 | #' @keywords utilities 25 | maturity <- function (x) { 26 | 27 | # Pick significant age-associated taxa 28 | age.taxa <- rownames(lm_phyloseq(x, "age")) 29 | 30 | # Taxa x samples matrix for age-associated taxa 31 | #otu <- get_sample(x)[age.taxa,] 32 | pf <- prune_taxa(age.taxa, x) 33 | 34 | # NMDS ordination 35 | ord <- ordinate(pf, "NMDS", "bray") 36 | 37 | # Maturity index 38 | index <- scores(ord)[,1] 39 | 40 | # Adjust for age 41 | ages <- get_variable(x, "age") 42 | res <- lm(index ~ age) 43 | 44 | # Residuals provide the adjusted index 45 | index.adjusted <- residuals(res) 46 | 47 | index.adjusted 48 | 49 | } 50 | 51 | -------------------------------------------------------------------------------- /archive/metadata.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microbiome/tutorials/4c9c5eb321358c97369b1d2325ff81906a14ac0b/archive/metadata.xls -------------------------------------------------------------------------------- /archive/misc.R: -------------------------------------------------------------------------------- 1 | # Just apply variance stabilizing transformation 2 | # varianceStabilizingTransformation(ds2, blind = TRUE, fitType = "parametric") 3 | # ds2 <- estimateSizeFactors(ds2) 4 | # ds2 <- estimateDispersions(ds2) 5 | # abund <- getVarianceStabilizedData(ds2) 6 | 7 | #dfm <- as.data.frame(otu) 8 | #dfm$ID <- rownames(otu) 9 | #dfm <- gather(dfm, ID) 10 | #colnames(dfm) <- c("Taxon", "Sample", "Abundance") 11 | #dfm$Abundance <- as.numeric(as.character(dfm$Abundance)) 12 | #dfm$Sample <- factor(as.character(dfm$Sample), levels = sort.samples) 13 | #dfm$ID <- factor(as.character(dfm$ID), levels = sort.taxa) 14 | -------------------------------------------------------------------------------- /archive/misc.Rmd: -------------------------------------------------------------------------------- 1 | biocLite("devtools") 2 | biocLite("phyloseq") 3 | biocLite("ggplot2") 4 | biocLite("dplyr") 5 | biocLite("reshape2") 6 | biocLite("tidyr") 7 | biocLite("vegan") 8 | biocLite("knitr") 9 | biocLite("knitcitations") 10 | biocLite("compositions") 11 | biocLite("qvalue") 12 | biocLite("rmarkdown") 13 | 14 | -------------------------------------------------------------------------------- /archive/multimodality_phyloseq.R: -------------------------------------------------------------------------------- 1 | #' @title Phyloseq Multimodality Test 2 | #' @description Calculate multimodality score based on bootstrapped 3 | #' potential analysis. 4 | #' @param x Phyloseq object 5 | #' @param detection Mode detection 6 | #' @param bw.adjust Bandwidth adjustment 7 | #' @param bs.iterations Bootstrap iterations 8 | #' @param detection.limit minimum accepted density for a maximum; 9 | #' as a multiple of kernel height 10 | #' @param verbose Verbose 11 | #' @return A list with following elements 12 | #' \item{score}{Fraction of bootstrap samples where multiple modes 13 | #' are observed} 14 | #' \item{nmodes}{The most frequently observed number of modes in bootstrap 15 | #' sampling results} 16 | #' \item{results}{Full results of potential_analysis_bootstrap for each 17 | #' row of the input matrix} 18 | #' @details Repeats potential analysis (Livina et al. 2010) multiple times 19 | #' with bootstrap sampling for each row of the input data 20 | #' (as in Lahti et al. 2014) and returns the specified results. 21 | #' @export 22 | #' @author Leo Lahti \email{leo.lahti@@iki.fi} 23 | #' @examples 24 | #' data(peerj32) 25 | #' s <- multimodality_score( 26 | #' t(peerj32$microbes[, c("Akkermansia", "Dialister")])) 27 | #' @references 28 | #' \itemize{ 29 | #' \item{}{Livina et al. (2010). Potential analysis reveals changing number 30 | #' of climate states during the last 60 kyr. 31 | #' \emph{Climate of the Past}, 6, 77-82.} 32 | #' \item{}{Lahti et al. (2014). Tipping elements of the human intestinal 33 | #' ecosystem. \emph{Nature Communications} 5:4344.} 34 | #' } 35 | #' @keywords utilities 36 | multimodality_phyloseq <- function (x, detection = 1, bw.adjust = 1, 37 | bs.iterations = 100, detection.limit = 1, 38 | verbose = TRUE) { 39 | x <- abundances(transform_phyloseq(x, "log10")) 40 | msc <- multimodality_score(x, detection, 41 | bw.adjust, bs.iterations, detection.limit, verbose) 42 | msc 43 | 44 | } 45 | 46 | -------------------------------------------------------------------------------- /archive/neatsort.R: -------------------------------------------------------------------------------- 1 | #' @title Neatmap Ordering for Matrix Features or Samples 2 | #' @description Order features or samples based on the neatmap approach. 3 | #' @param x A matrix. 4 | #' @param arrange Order "features" or "samples". For matrices, it is assumed that the samples are on the columns and features are on the rows. For phyloseq objects, features are the taxa of the OTU table. 5 | #' @param method Ordination method. Only NMDS implemented for now. 6 | #' @param distance Distance method. See \code{\link{vegdist}} function from the \pkg{vegan} package. 7 | #' @param first Optionally provide the name of the first sample (or feature) to start the ordering (the ordering is cyclic so we can start at any point). The choice of the first sample/feature may somewhat affect the overall ordering. 8 | #' @param ... Arguments to pass. 9 | #' @return Vector of ordered elements 10 | #' @export 11 | #' @examples \dontrun{ 12 | #' data(peerj32) 13 | #' x <- peerj32$microbes 14 | #' x <- neatsort(x, "features", method = "NMDS", distance = "bray") 15 | #' } 16 | #' @references This function is partially based on code derived from the \pkg{phyloseq} package. However for the original 17 | #' neatmap approach for heatmap sorting, see (and cite): 18 | #' Rajaram, S., & Oono, Y. (2010). NeatMap--non-clustering heat map alternatives in R. BMC Bioinformatics, 11, 45. 19 | #' 20 | #' @details This function borrows elements from the heatmap implementation in the \pkg{phyloseq} package. The row/column sorting is there 21 | #' not available as a separate function at present, however, hindering reuse in other tools. This function provides an independent 22 | #' method for easy row/column reordering for matrices. This a quick hack and using the ordination could be expanded further 23 | #' (now only NMDS is available, and the sorting is done independently for features and samples). 24 | #' @keywords utilities 25 | neatsort <- function (x, arrange, method = "NMDS", distance = "bray", first = NULL, ...) { 26 | 27 | if (arrange == "samples") { 28 | x <- t(x) 29 | } 30 | 31 | # Neatmap sorting for matrices with NMDS 32 | d <- vegdist(x, distance) 33 | 34 | # Order 35 | # Capture the output to keep the screen clean 36 | junk <- capture.output( 37 | ord <- metaMDS(d, wascores = FALSE, autotransform = FALSE, noshare = FALSE), file=NULL) 38 | 39 | 40 | # Order items with the NeatMap procedure 41 | # Reorder by the angle in radial coordinates on the 2-axis plane. 42 | DF <- NULL 43 | 44 | # Define new sample ordering based on the ordination 45 | tmp <- try({ 46 | DF <- scores(ord, choices = c(1, 2), display = "sites")}, silent = TRUE) 47 | 48 | if(inherits(tmp, "try-error")){ 49 | warning(paste("Order failed. Using default ordering.", sep = "")) 50 | } 51 | 52 | if(!is.null(DF)){ 53 | # If the score accession worked, replace order 54 | ordering <- rownames(x)[order(radial_theta(DF))] 55 | } 56 | # Determine the starting item (OTU or sample) 57 | if( !is.null(first) ){ 58 | ordering <- chunk_reorder(ordering, first) 59 | } 60 | 61 | ordering 62 | 63 | } 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /archive/nets.R: -------------------------------------------------------------------------------- 1 | ```{r networks3, warning=FALSE, message=FALSE} 2 | theme_set(theme_bw(20)) 3 | p <- plot_net(pseq, maxdist = 0.2, 4 | shape = "group", color = "nationality", 5 | distance = "bray", laymeth = "auto") + 6 | scale_colour_brewer(palette = "Accent") 7 | print(p) 8 | ``` 9 | -------------------------------------------------------------------------------- /archive/nets.Rmd: -------------------------------------------------------------------------------- 1 | ### Taxonomic network reconstruction 2 | 3 | ```{r networks1, message=FALSE, warning=FALSE} 4 | pseq <- transform(atlas1006, "compositional") 5 | 6 | # Keep only the prevalent taxa 7 | pseq <- core(pseq, detection = 0.2/100, prevalence = 80/100) 8 | pseq <- subset_samples(pseq, DNA_extraction_method == "r") 9 | ``` 10 | 11 | The compositionality bias can be fixed with SpiecEasi or SparCC; the 12 | implementations are available via the [SpiecEasi 13 | package](https://github.com/zdk123/SpiecEasi). Note that the execution 14 | is slow. 15 | 16 | 17 | ```{r spieceasi, warning=FALSE, message=FALSE, fig.width=10, fig.height=10, eval=TRUE} 18 | library(SpiecEasi) #install_github("zdk123/SpiecEasi") 19 | library(phyloseq) 20 | 21 | # Pick the OTU table 22 | otu <- abundances(pseq) 23 | net <- spiec.easi(t(otu), method='mb', icov.select.params=list(rep.num=100)) 24 | 25 | ## Create graph object 26 | n <- net$refit 27 | colnames(n) <- rownames(n) <- rownames(otu) 28 | 29 | # Network Format 30 | library(network) 31 | netw <- network(as.matrix(n), directed = FALSE) 32 | ``` 33 | 34 | 35 | Visualize the network with [ggnet2](https://briatte.github.io/ggnet): 36 | 37 | ```{r networks5, warning=FALSE, message=FALSE, fig.width=12, fig.height=7} 38 | library(GGally) 39 | library(ggnet) 40 | library(network) 41 | library(sna) 42 | library(ggplot2) 43 | library(intergraph) # ggnet2 works also with igraph with this 44 | 45 | phyla <- map_levels(rownames(otu), 46 | from = "Genus", to = "Phylum", 47 | tax_table(pseq)) 48 | 49 | netw %v% "Phylum" <- phyla 50 | p <- ggnet2(netw, color = "Phylum", label = TRUE, label.size = 2) 51 | print(p) 52 | ``` 53 | 54 | 55 | 56 | See the [phyloseq tutorial](http://joey711.github.io/phyloseq/plot_network-examples) for 57 | additional network visualization tools. 58 | 59 | -------------------------------------------------------------------------------- /archive/peerj32_meta.csv: -------------------------------------------------------------------------------- 1 | time gender subject sample group 2 | sample-1 1 F S1 sample-1 Placebo 3 | sample-2 2 F S1 sample-2 Placebo 4 | sample-3 1 F S2 sample-3 Placebo 5 | sample-4 2 F S2 sample-4 Placebo 6 | sample-5 1 F S3 sample-5 LGG 7 | sample-6 2 F S3 sample-6 LGG 8 | sample-7 1 M S4 sample-7 Placebo 9 | sample-8 2 M S4 sample-8 Placebo 10 | sample-9 1 F S5 sample-9 Placebo 11 | sample-10 2 F S5 sample-10 Placebo 12 | sample-11 1 F S6 sample-11 LGG 13 | sample-12 2 F S6 sample-12 LGG 14 | sample-13 1 F S7 sample-13 Placebo 15 | sample-14 2 F S7 sample-14 Placebo 16 | sample-15 1 F S8 sample-15 LGG 17 | sample-16 2 F S8 sample-16 LGG 18 | sample-17 1 M S9 sample-17 Placebo 19 | sample-18 2 M S9 sample-18 Placebo 20 | sample-19 1 F S10 sample-19 LGG 21 | sample-20 2 F S10 sample-20 LGG 22 | sample-21 1 F S11 sample-21 LGG 23 | sample-22 2 F S11 sample-22 LGG 24 | sample-23 1 F S12 sample-23 Placebo 25 | sample-24 2 F S12 sample-24 Placebo 26 | sample-25 1 M S13 sample-25 LGG 27 | sample-26 2 M S13 sample-26 LGG 28 | sample-27 1 F S14 sample-27 Placebo 29 | sample-28 2 F S14 sample-28 Placebo 30 | sample-29 1 F S15 sample-29 Placebo 31 | sample-30 2 F S15 sample-30 Placebo 32 | sample-31 1 F S16 sample-31 Placebo 33 | sample-32 2 F S16 sample-32 Placebo 34 | sample-33 1 M S17 sample-33 Placebo 35 | sample-34 2 M S17 sample-34 Placebo 36 | sample-35 1 M S18 sample-35 Placebo 37 | sample-36 2 M S18 sample-36 Placebo 38 | sample-37 1 M S19 sample-37 LGG 39 | sample-38 2 M S19 sample-38 LGG 40 | sample-39 1 F S20 sample-39 Placebo 41 | sample-40 2 F S20 sample-40 Placebo 42 | sample-41 1 M S21 sample-41 LGG 43 | sample-42 2 M S21 sample-42 LGG 44 | sample-43 1 F S22 sample-43 Placebo 45 | sample-44 2 F S22 sample-44 Placebo 46 | -------------------------------------------------------------------------------- /archive/pet.R: -------------------------------------------------------------------------------- 1 | library(microbiome) 2 | 3 | # New pipeline 4 | frpa1 <- read.profiling(data.dir = "PET", level = "species", method = "frpa") 5 | frpa2 <- read.profiling(data.dir = "PET", level = "species", method = "sum") 6 | 7 | # New pipeline 2 8 | sum.new2 <- read_hitchip("PET", method = "sum") 9 | frpa.new2 <- read_hitchip("PET", method = "frpa") 10 | rpa.new2 <- read_hitchip("PET", method = "rpa") 11 | 12 | # Old pipeline 13 | frpa.old <- read.profiling("~/tmp/pet14/data", method = "frpa") 14 | sum.old <- read.profiling("~/tmp/pet14/data", method = "sum") 15 | 16 | # Old and New match perfect at species level 17 | level <- "species" 18 | s <- colnames(sum.old[[level]]) 19 | taxa <- intersect(rownames(sum.new[[level]]), rownames(sum.old[[level]])) 20 | plot(log10(unlist(sum.new[[level]][taxa,s])), log10(unlist(sum.old[[level]][taxa,s]))) 21 | plot(log10(unlist(frpa.new[[level]][taxa,s])), log10(unlist(frpa.old[[level]][taxa,s]))) 22 | 23 | level <- "species" 24 | par(mfrow = c(2,2)) 25 | plot(log10(unlist(sum.new[[level]])), log10(unlist(sum.new2$abundance.table))) 26 | plot(log10(unlist(frpa.new[[level]])), log10(unlist(frpa.new2$abundance.table))) 27 | plot(log10(unlist(frpa.new[[level]])), log10(unlist(sum.new2$abundance.table))) 28 | 29 | # FIXME: make this work also with old output 30 | # Read and recalculate 31 | res <- read_hitchip("PET", method = "sum") 32 | pseq <- res$pseq 33 | otu <- res$abundance.table 34 | 35 | # pseq@otu_table 36 | dim(tax_glom(pseq, "L2")@otu_table) 37 | dim(tax_glom(pseq, "L1")@otu_table) 38 | 39 | plot(unlist(log10(tax_glom(pseq, "species")@otu_table)), unlist(log10(pseq@otu_table))) 40 | sum.new <- read.profiling("PET", method = "sum") 41 | 42 | 43 | #--------------------------------------------------------------- 44 | 45 | sum.old <- read.profiling("~/tmp/pet14/data", method = "sum") 46 | sum.new <- read.profiling("PET", method = "sum") 47 | pseq0 <- read_hitchip("PET", method = "sum", detection.threshold = 0)$pseq 48 | pseq <- read_hitchip("PET", method = "sum", detection.threshold = 10^1.8)$pseq 49 | pseq.L20 <- aggregate_taxa(pseq0, level = "L2") 50 | pseq.L2 <- aggregate_taxa(pseq, level = "L2") 51 | 52 | taxa <- intersect(rownames(L2), rownames(sum.new$L2)) 53 | 54 | coms <- colnames(sum.old$L2[taxa,]) 55 | 56 | old <- log10(sum.old$L2[taxa,coms]) 57 | new.pre <- log10(sum.new$L2[taxa,coms]) 58 | new0 <- log10(otu_table(pseq.L20)@.Data[taxa,coms]) 59 | new.th <- log10(otu_table(pseq.L2)@.Data[taxa,coms]) 60 | 61 | par(mfrow = c(2,2)) 62 | plot(unlist(old[, coms]), unlist(new0[, coms])) 63 | plot(unlist(old[, coms]), unlist(new.th[, coms])) 64 | plot(unlist(new.th[, coms]), unlist(new.pre[, coms])) 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /archive/pet2.R: -------------------------------------------------------------------------------- 1 | library(microbiome) 2 | 3 | # Old PET data, pre-calculated 4 | d1 <- read.profiling(data.dir = "~/antagomir/pet14/data", level = "species", method = "frpa") 5 | 6 | # New PET data, calculated on the fly 7 | d2 <- read_hitchip(data.dir = "~/antagomir/pet14b/", method = "frpa") 8 | d2 <- otu_table(d2$pseq)@.Data 9 | 10 | cr <- intersect(rownames(d1), rownames(d2)) 11 | cc <- intersect(colnames(d1), colnames(d2)) 12 | 13 | plot(unlist(d1[cr, cc]), log10(unlist(d2[cr, cc])), pch = "."); abline(0,1) 14 | 15 | cors <- c() 16 | for (i in 1:nrow(frpa2)) { 17 | cors[[i]] <- cor(as.vector(log10(frpa2[cr, cc][i,])), as.vector(log10(frpa2b[cr, cc][i,]))) 18 | } 19 | hist(cors) 20 | 21 | 22 | -------------------------------------------------------------------------------- /archive/plot_diversity.R: -------------------------------------------------------------------------------- 1 | #' @title Plot Indicators 2 | #' @description Plot global indicators. 3 | #' This function estimates a number of alpha-diversity metrics using the 4 | #' \code{\link{estimate_richness}} function, 5 | #' and returns a \code{ggplot} object. 6 | #' The plot generated by this function will include every sample 7 | #' in \code{physeq}, but they can be further grouped on the horizontal axis 8 | #' through the argument to \code{x}, 9 | #' and shaded according to the argument to \code{color} (see below). 10 | #' You must use untrimmed, non-normalized count data for meaningful results. 11 | #' @param x \code{\link{phyloseq-class}} object 12 | #' @param y A variable to map to the horizontal axis. The vertical 13 | #' axis will be mapped to the alpha diversity index/estimate 14 | #' and have units of total taxa, and/or index value (dimensionless). 15 | #' This parameter (\code{x}) is a character string indicating a 16 | #' in the dataset (nsamples(x)). 17 | #' @param index Default is \code{NULL}. In this case 18 | #' all available alpha-diversity index will be included. 19 | #' Alternatively, you can specify one or more index 20 | #' as a character vector. Values must be among those supported: 21 | #' \code{c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson", "Fisher")}. 22 | #' @param nrow Number of rows for plot faceting. 23 | #' @param scales scales for the plot 24 | #' @param indicate.subjects Indicate subjects by lines. The sample_data(x) must have 'subject' field. 25 | #' @param na.rm Remove samples with missing metadata (NA) 26 | #' @return A \code{\link{ggplot}} plot object summarizing 27 | #' the richness estimates, and their standard error. 28 | #' @details If subject is among the metadata variables, the matched subjects across groups are indicated by lines. 29 | #' @seealso 30 | #' \code{\link{estimate_richness}} 31 | #' \code{\link{global}} 32 | #' \code{\link{plot_richness}} 33 | #' \code{\link[vegan]{estimateR}} 34 | #' \code{\link[vegan]{diversity}} 35 | #' @export 36 | #' @examples 37 | #' # Load gut microbiota data on 1006 western adults 38 | #' # (see help(atlas1006) for references and details) 39 | #' data(atlas1006) 40 | #' # Visualize Shannon diversity across bmi groups; remove cases with no bmi info 41 | #' x <- global(atlas1006)$diversities_shannon 42 | #' y <- meta(atlas1006)$bmi_group 43 | #' p <- plot_global(x, variable = y, na.rm = TRUE) 44 | #' @keywords utilities 45 | plot_global <- function(x, y, nrow = 1, scales = "free_y", indicate.subjects = FALSE, na.rm = FALSE){ 46 | 47 | horiz <- subject <- NULL 48 | 49 | # Coerce to data.frame 50 | DF <- data.frame(x, y) 51 | 52 | mdf <- gather(DF, "key", "value", dplyr::ends_with(".index")) 53 | mdf$key <- gsub("\\.index$", "", mdf$key) 54 | mdf$horiz <- mdf[[variable]] 55 | 56 | if (na.rm) { 57 | mdf <- dplyr::filter(mdf, !is.na(horiz)) 58 | if (nrow(mdf) == 0) { 59 | warning(paste("All values in", variable, "are NA. Returning NULL.")) 60 | NULL 61 | } 62 | } 63 | 64 | # Make the ggplot 65 | theme_set(theme_bw(20)) 66 | 67 | if (is.factor(mdf$horiz)) { 68 | 69 | p <- ggplot(mdf, aes(x = horiz, y = value)) 70 | 71 | p <- p + geom_boxplot(na.rm=TRUE) 72 | 73 | if (indicate.subjects) { 74 | p <- p + geom_point() 75 | p <- p + geom_line(aes(group = subject)) 76 | } 77 | 78 | # Rotate horizontal axis labels, and adjust 79 | p <- p + theme(axis.text.x=element_text(angle=-90, vjust=0.5, hjust=0)) 80 | 81 | # Facet wrap using user-options 82 | p <- p + facet_wrap(~key, nrow = nrow, scales = scales) 83 | 84 | } else if (is.vector(mdf$horiz)) { 85 | 86 | if (length(measures) > 1) { 87 | stop(paste("The horizontal variable", variable, "is numeric. Provide a single diversity measure.")) 88 | } 89 | 90 | p <- plot_regression(value~horiz, mdf) 91 | p <- p + xlab(variable) 92 | 93 | } 94 | 95 | # Add y-label and title 96 | p <- p + ylab('Diversity') 97 | 98 | p 99 | 100 | } 101 | -------------------------------------------------------------------------------- /archive/plot_taxa_prevalence.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Plot Taxa Prevalence" 3 | output: 4 | rmarkdown::html_vignette 5 | --- 6 | 12 | 13 | 14 | ## Load data 15 | 16 | Load [example data](Data.md): 17 | For this example we will use data from [Halfvarson J., et al. Nature Microbiology, 2017](http://www.nature.com/articles/nmicrobiol20174). It was downloaded from [Qitta](https://qiita.ucsd.edu/study/description/1629). 18 | 19 | 20 | ```{r} 21 | library(microbiome) 22 | data(DynamicsIBD) 23 | p0 <- DynamicsIBD 24 | ``` 25 | 26 | ## Plot Taxa Prevalence 27 | 28 | Phylum level: 29 | 30 | ```{r, fig.height=6, fig.width=8} 31 | p0.f <- format_phyloseq(p0) 32 | p <- plot_taxa_prevalence(p0.f, 'Phylum') 33 | print(p) 34 | ``` 35 | 36 | You can also plot the prevalence at Order/Class/Family level. 37 | 38 | -------------------------------------------------------------------------------- /archive/project.data.R: -------------------------------------------------------------------------------- 1 | #' @title project.data 2 | #' @description Project high-dimensional data on two-dimensional plane 3 | #' by various methods 4 | #' 5 | #' @param amat data matrix (samples x features) 6 | #' @param type projection type 7 | #' (options: PCA, MDS.classical, MDS.nonmetric, Sammon) 8 | #' 9 | #' @return projected data matrix 10 | #' 11 | #' @export 12 | #' @importFrom MASS isoMDS 13 | #' @importFrom MASS sammon 14 | #' @importFrom mixOmics spca 15 | #' 16 | #' @examples 17 | #' data(peerj32) 18 | #' xy <- project.data(peerj32$microbes[,1:3]) 19 | #' 20 | #' @references 21 | #' 22 | #' D. Hand and H. Mannila and P. Smyth: 23 | #' Principles of Data Mining. MIT Press. Cambridge, MA, US (2001). 24 | #' 25 | #' To cite microbiome R package, see citation('microbiome') 26 | #' 27 | #' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com} 28 | #' @keywords utilities 29 | 30 | project.data <- function(amat, type = "PCA") { 31 | 32 | if (type == "PCA") { 33 | if (nrow(amat) < ncol(amat)) { 34 | 35 | message("More samples than features, using sparse PCA") 36 | 37 | ## Spca example: we are selecting 50 variables on each of the PCs 38 | result <- spca(amat, ncomp = 2, center = TRUE, scale = TRUE, 39 | keepX = rep(50, 2)) 40 | scores <- result$x 41 | } else { 42 | message("PCA") 43 | pca <- princomp(amat) # Classical PCA 44 | scores <- pca$scores 45 | } 46 | tab <- data.frame(scores[, 1:2]) 47 | rownames(tab) <- rownames(amat) 48 | } else if (type == "Sammon") { 49 | 50 | d <- as.dist(1 - cor(t(amat))) 51 | # This gave the clearest visualization. 52 | # Tuning magic parameter could still 53 | # improve. Try for instance magic = 0.05. 54 | fit <- sammon(d, k = 2) 55 | # Plot solution 56 | tab <- data.frame(list(Comp.1 = fit$points[, 1], 57 | Comp.2 = fit$points[, 2])) 58 | rownames(tab) <- rownames(amat) 59 | } else if (type == "MDS.classical") { 60 | d <- as.dist(1 - cor(t(amat))) 61 | fit <- cmdscale(d, eig = TRUE, k = 2) # classical MDS 62 | tab <- data.frame(list(Comp.1 = fit$points[, 1], 63 | Comp.2 = fit$points[, 2])) 64 | } else if (type == "MDS.nonmetric") { 65 | d <- as.dist(1 - cor(t(amat))) 66 | fit <- isoMDS(d, k = 2) # nonmetric MDS 67 | tab <- data.frame(list(Comp.1 = fit$points[, 1], 68 | Comp.2 = fit$points[, 2])) 69 | } 70 | 71 | # TODO Kernel-PCA kpc <- kpca(~., data=as.data.frame(x.train), 72 | # kernel='rbfdot', features = 2) Print the principal component 73 | # vectors pcv(kpc) Plot the data projection on the components 74 | # par(mfrow=c(2,2)) plot(rotated(kpc), col = 75 | # as.integer(as.factor(ann[rownames(x.train),'time'])), xlab='1st 76 | # Principal Component', ylab='2nd Principal Comp onent') 77 | # plot(rotated(kpc), col = 78 | # as.integer(as.factor(ann[rownames(x.train),'lipids.group'])), 79 | # xlab='1st Principal Component', ylab='2nd Principal Component') 80 | # embed remaining points emb <- predict(kpc, x.test) 81 | # plot(rotated(kpc), col = 82 | # as.integer(as.factor(ann[rownames(x.train),'lipids.group'])), 83 | # xlab='1st Principal Component', ylab='2nd Principal Component') 84 | # points(emb, col = 85 | # as.integer(as.factor(ann[rownames(x.train),'lipids.group']))) 86 | 87 | colnames(tab) <- c("Comp.1", "Comp.2") 88 | 89 | tab 90 | } -------------------------------------------------------------------------------- /archive/qiita1629.biom: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microbiome/tutorials/4c9c5eb321358c97369b1d2325ff81906a14ac0b/archive/qiita1629.biom -------------------------------------------------------------------------------- /archive/rare_abundance.R: -------------------------------------------------------------------------------- 1 | #' @title Rare Abundance 2 | #' @description Calculate total rare abundance 3 | #' @param x \code{\link{phyloseq-class}} object 4 | #' @param detection Detection threshold (non-negative real) 5 | #' @param prevalence Prevalence threshold (in [0, 100]) 6 | #' @return Total rare abunance vector. 7 | #' @examples 8 | #' data(dietswap) 9 | #' a <- rare_abundance(dietswap) 10 | #' @export 11 | #' @references See citation("microbiome") 12 | #' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com} 13 | #' @keywords utilities 14 | rare_abundance <- function(x, detection = 1, prevalence = 95) { 15 | 16 | rare.taxa <- rare_members(x, detection = detection, 17 | prevalence = prevalence) 18 | 19 | # Rare matrix 20 | # FIXME: directly use the rare function 21 | xx <- abundances(prune_taxa(rare.taxa, x)) 22 | xx <- matrix(xx, nrow = length(rare.taxa)) 23 | 24 | # Total sum of rare abundances 25 | ab <- colSums(xx, na.rm = TRUE) 26 | 27 | ab 28 | 29 | } 30 | -------------------------------------------------------------------------------- /archive/rare_members.R: -------------------------------------------------------------------------------- 1 | #' @title Rare Taxa 2 | #' @description Determine members of the rare microbiota with given abundance 3 | #'' and prevalences. 4 | #' @param x \code{\link{phyloseq-class}} object 5 | #' @param detection Detection threshold (non-negative real) 6 | #' @param prevalence Prevalence threshold (in [0, 100]) 7 | #' @param sort Logical. Sort the taxa. 8 | #' @return Vector of rare members 9 | #' @details For phyloseq object, lists taxa that are more prevalent with the 10 | #' given detection. For matrix, lists columns that satisfy 11 | #' these criteria. 12 | #' @examples 13 | #' data(dietswap) 14 | #' a <- rare_members(dietswap, 1, 95) 15 | #' @export 16 | #' @references 17 | #' To cite the microbiome R package, see citation('microbiome') 18 | #' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com} 19 | #' @keywords utilities 20 | rare_members <- function(x, detection = 1, prevalence = 95, 21 | sort = TRUE) { 22 | 23 | # First identify core based on these criteria 24 | cm <- core_members(x, detection, prevalence, sort) 25 | 26 | # Then pick rare taxa as those that do not belong to core 27 | # ie have prevalence below the given threshold at the given 28 | # detection limit 29 | taxa <- setdiff(taxa(x), cm) 30 | 31 | if (sort) { 32 | taxa <- sort(taxa) 33 | } 34 | 35 | taxa 36 | 37 | } 38 | -------------------------------------------------------------------------------- /archive/rare_richness.R: -------------------------------------------------------------------------------- 1 | #' @title Rare Richness 2 | #' @description Calculate total rare richness 3 | #' @param x \code{\link{phyloseq-class}} object 4 | #' @param detection Detection threshold (non-negative real) 5 | #' @param prevalence Prevalence threshold (in [0, 100]) 6 | #' @return Total rare abundance vector. 7 | #' @examples 8 | #' data(dietswap) 9 | #' a <- rare_richness(dietswap) 10 | #' @export 11 | #' @references See citation("microbiome") 12 | #' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com} 13 | #' @keywords utilities 14 | rare_richness <- function(x, detection = 1, prevalence = 95) { 15 | 16 | rare.taxa <- rare_members(x, detection = detection, 17 | prevalence = prevalence) 18 | 19 | # Rare matrix 20 | # FIXME: directly use the rare function 21 | xx <- abundances(prune_taxa(rare.taxa, x)) 22 | xx <- matrix(xx, nrow = length(rare.taxa)) 23 | 24 | # Rare taxa richness in each sample 25 | ab <- colSums(xx > detection, na.rm = TRUE) 26 | 27 | ab 28 | 29 | } 30 | -------------------------------------------------------------------------------- /archive/rda.R: -------------------------------------------------------------------------------- 1 | #' @title RDA for phyloseq objects 2 | #' @description RDA for phyloseq objects based on the \code{\link{rda}} 3 | #' function from the \pkg{vegan} package. 4 | #' @param x \code{\link{phyloseq-class}} object 5 | #' @param y Variable to apply in RDA visualization. 6 | #' @param scale See help(rda) 7 | #' @param na.action See help(rda) 8 | #' @param ... Other arguments to be passed 9 | #' @return rda result. See help(vegan::rda) 10 | #' @export 11 | #' @examples 12 | #' data(peerj32) # Data from https://peerj.com/articles/32/ 13 | #' pseq <- peerj32$phyloseq 14 | #' pseq.trans <- transform(pseq, "hell") # Hellinger transform 15 | #' # rda.result <- rda_pseq(pseq.trans, "time", scale = TRUE) 16 | #' @references See citation("microbiome") 17 | #' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com} 18 | #' @keywords utilities 19 | rda_pseq <- function (x, y, scale = FALSE, na.action = na.fail, ...) { 20 | 21 | # Microbiota profiling data (44 samples x 130 bacteria) 22 | otu <- t(abundances(x)) 23 | 24 | # Pick the indicated annotation field 25 | if (!y %in% sample_variables(x)) { 26 | stop(paste("The variable y ('", y, "') is not available in the phyloseq object i.e. sample_data(x). Only use variables listed in sample_variables(x) ie. o 27 | ne of the following: ", paste(names(sample_data(x)), collapse = " / "), sep = "")) 28 | } 29 | 30 | if (!"sample" %in% sample_variables(x)) { 31 | sample_data(x)$sample <- rownames(sample_data(x)) 32 | } 33 | 34 | annot <- factor(sample_data(x)[[y]]) 35 | names(annot) <- sample_data(x)$sample 36 | 37 | r <- vegan::rda(otu ~ annot, scale = scale, na.action = na.action) 38 | 39 | r 40 | 41 | } 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /archive/read.profiling.R: -------------------------------------------------------------------------------- 1 | #' @title Read profiling 2 | #' @description Read run.profiling.script output into R 3 | #' @param level phylogenetic level ('oligo' / 'species' / 'L1' / 'L2' / 'L0') 4 | #' or 'phylogeny.full', 'phylogeny.filtered' 5 | #' @param method ('frpa' / 'rpa' / 'sum' / 'ave') 6 | #' @param data.dir Profiling script output directory for reading the data. 7 | #' If not given, GUI will ask to specify the file and 8 | #' overruns the possible level / method arguments in the 9 | #' function call. 10 | #' @param log10 Logical. Logarithmize the data TRUE/FALSE. 11 | #' By default, the data is in original non-log scale. 12 | #' @param impute impute missing oligo signals 13 | #' @return data matrix (phylo x samples) 14 | #' @export 15 | #' @examples 16 | #' \dontrun{ 17 | #' data.dir <- system.file("extdata", package = "microbiome") 18 | #' dat <- read.profiling('L1', 'frpa', data.dir = data.dir) 19 | #' } 20 | #' @references See citation('microbiome') 21 | #' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com} 22 | #' @keywords utilities 23 | 24 | read.profiling <- function(level = NULL, method = "frpa", data.dir, 25 | log10 = TRUE, impute = TRUE) { 26 | 27 | .Deprecated("read_hitchip") 28 | 29 | # level <- 'oligo'; method = 'sum'; data.dir = 'test/'; log10 = TRUE 30 | if (level %in% c("L0", "L1", "L2", "species")) { 31 | if (method == "frpa" && length(grep(method, dir(data.dir))) == 0) { 32 | warning("frpa method not available; using rpa instead") 33 | method <- "rpa" 34 | } 35 | f <- paste(data.dir, "/", level, "-", method, ".tab", sep = "") 36 | 37 | } else if (level == "oligo") { 38 | f <- paste(data.dir, "/oligoprofile.tab", sep = "") 39 | } else if (level == "phylogeny.full") { 40 | f <- paste(data.dir, "/phylogeny.full.tab", sep = "") 41 | } else if (level %in% c("phylogeny.filtered")) { 42 | f <- paste(data.dir, "/phylogeny.filtered.tab", sep = "") 43 | } else if (level %in% c("phylogeny.info", "phylogeny.full", "phylogeny")) { 44 | f <- paste(data.dir, "/phylogeny.full.tab", sep = "") 45 | } 46 | 47 | 48 | message(paste("Reading", f)) 49 | 50 | if (level %in% c("L0", "L1", "L2", "species")) { 51 | 52 | tab <- read.csv(f, header = TRUE, sep = "\t", row.names = 1, 53 | as.is = TRUE) 54 | colnames(tab) <- unlist(strsplit(readLines(f, 1), "\t"))[-1] 55 | 56 | } else if (level == "oligo") { 57 | 58 | tab <- read.csv(f, header = TRUE, sep = "\t", row.names = 1, 59 | as.is = TRUE) 60 | colnames(tab) <- unlist(strsplit(readLines(f, 1), "\t"))[-1] 61 | 62 | } else if (length(grep("phylogeny", level)) > 0) { 63 | 64 | tab <- read.csv(f, header = TRUE, sep = "\t", as.is = TRUE) 65 | 66 | } 67 | 68 | # Convert to numeric 69 | if (level %in% c("oligo", "species", "L0", "L1", "L2")) { 70 | 71 | rnams <- rownames(tab) 72 | cnams <- colnames(tab) 73 | tab <- apply(tab, 2, as.numeric) 74 | rownames(tab) <- rnams 75 | colnames(tab) <- cnams 76 | 77 | } 78 | 79 | if (impute && any(is.na(tab))) { 80 | warning(paste("The matrix has ", sum(is.na(tab)), 81 | " missing values \n 82 | - imputing.")) 83 | tab <- 10^t(impute(t(log10(tab)))) 84 | } 85 | 86 | if (log10 && (level %in% c("oligo", "species", "L0", "L1", "L2"))) { 87 | message("Logarithmizing the data") 88 | tab <- log10(tab) 89 | } 90 | 91 | tab 92 | 93 | } 94 | 95 | -------------------------------------------------------------------------------- /archive/read_profiling.R: -------------------------------------------------------------------------------- 1 | #' Read HITChip run.profiling.script output into R 2 | #' 3 | #' @param data.dir Profiling script output directory for reading the data. 4 | #' If not given, GUI will ask to specify the file and 5 | #' overruns the possible level / method arguments in the 6 | #' function call. 7 | #' @param method Select the preprocessing method that you like to check 8 | #' @param verbose verbose 9 | #' 10 | #' @return data matrix (phylo x samples) 11 | #' 12 | #' @examples 13 | #' # data.dir <- system.file("extdata", package = "microbiome") 14 | #' # dat <- read_profiling(data.dir) 15 | #' 16 | #' @references See citation('microbiome') 17 | #' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com} 18 | #' @keywords utilities 19 | read_profiling <- function(data.dir, method, verbose = TRUE) { 20 | 21 | if (verbose) {message(paste("Reading Chip data from", data.dir))} 22 | 23 | results <- list() 24 | 25 | # Read probe-level data 26 | f <- paste(data.dir, "/oligoprofile.tab", sep = "") 27 | tab <- read.csv(f, header = TRUE, sep = "\t", row.names = 1, as.is = TRUE) 28 | colnames(tab) <- unlist(strsplit(readLines(f, 1), "\t"))[-1] 29 | results[["probedata"]] <- tab 30 | 31 | # Read abundance tables 32 | #for (s in c("L0", "L1", "L2", "species")) { 33 | # f <- paste(data.dir, "/", s, "-", method, ".tab", sep = "") 34 | # if (file.exists(f)) { 35 | # tab <- read.csv(f, header = TRUE, sep = "\t", row.names = 1, as.is = TRUE) 36 | # colnames(tab) <- unlist(strsplit(readLines(f, 1), "\t"))[-1] 37 | # results[[s]] <- tab 38 | # } 39 | #} 40 | 41 | # Read taxonomy table 42 | f <- paste(data.dir, "/taxonomy.tab", sep = "") 43 | if (!file.exists(f)) { 44 | # Old outputs had this name 45 | f <- paste(data.dir, "/phylogeny.filtered.tab", sep = "") 46 | } 47 | taxonomy <- read.csv(f, header = TRUE, sep = "\t", as.is = TRUE) 48 | results[["taxonomy"]] <- taxonomy 49 | 50 | # Read unfiltered taxonomy table 51 | f <- paste(data.dir, "/taxonomy.full.tab", sep = "") 52 | if (!file.exists(f)) { 53 | # Old outputs had this name 54 | f <- paste(data.dir, "/phylogeny.full.tab", sep = "") 55 | } 56 | taxonomy.full <- read.csv(f, header = TRUE, sep = "\t", as.is = TRUE) 57 | results[["taxonomy.full"]] <- taxonomy.full 58 | 59 | # Read sample metadata 60 | f <- paste(data.dir, "/meta.tab", sep = "") 61 | if (file.exists(f)) { 62 | tab <- read.csv(f, header = TRUE, sep = "\t", as.is = TRUE) 63 | rownames(tab) <- tab$sample 64 | meta <- tab 65 | results[["meta"]] <- meta 66 | } 67 | 68 | results 69 | 70 | } 71 | 72 | 73 | -------------------------------------------------------------------------------- /archive/reading.R: -------------------------------------------------------------------------------- 1 | library(microbiome) 2 | #library(phyloseq) 3 | #fs <- list.files("~/microbiome/R/", full.names = TRUE) 4 | #for (f in fs) { source(f) } 5 | 6 | ressum <- read_hitchip("output", method = "sum") 7 | resrpa <- read_hitchip("output", method = "rpa") 8 | resfrpa <- read_hitchip("output", method = "frpa") 9 | 10 | # Read precalculated 11 | res.frpa <- read_profiling("output", method = "frpa") 12 | res.rpa <- read_profiling("output", method = "rpa") 13 | res.sum <- read_profiling("output", method = "sum") 14 | 15 | # Check compatibility with old function 16 | # frpa <- read.profiling(level = "L2", method = "frpa", data.dir = "output", log10 = TRUE, impute = TRUE) 17 | 18 | # Read and recalculate 19 | res <- read_hitchip("output", method = "sum") 20 | 21 | res1 <- res.sum$species 22 | res2 <- res$abundance.table 23 | res3 <- res.rpa$species 24 | plot(log10(unlist(res1)), log10(unlist(res2))) 25 | plot(log10(unlist(res1)), log10(unlist(res3))) 26 | 27 | 28 | res1 <- res.rpa 29 | res2 <- resrpa 30 | plot(log10(unlist(res1$species)), log10(unlist(res2$abundance.table))) 31 | 32 | -------------------------------------------------------------------------------- /archive/save.phylogeny.R: -------------------------------------------------------------------------------- 1 | message("Fetching Phylogeny from the database") 2 | library(microbiome) 3 | library(HITChipDB) 4 | dbuser = "root"; dbpwd = "fidipro"; dbname = "phyloarray"; host = '127.0.0.1'; port = 3307 5 | 6 | message("Fetching Phylogeny from the database") 7 | phylogeny.full <- get.phylogeny.info("16S", 8 | dbuser = dbuser, 9 | dbpwd = dbpwd, 10 | dbname = dbname, 11 | host = host, 12 | port = port, 13 | chip = "HITChip") 14 | 15 | phylogeny.filtered.orig <- GetPhylogeny("HITChip", "filtered") 16 | 17 | # Fix taxon name 18 | phylogeny.full$L2 <- gsub("^Clostridia$", "Clostridium (sensu stricto)", as.character(phylogeny.full$L2)) 19 | 20 | # This handles also pmTm, complement and mismatch filtering 21 | # This is the phylogeny used in probe summarization into taxonomic levels 22 | rm.phylotypes <- phylotype.rm.list("HITChip") 23 | rm.oligos <- sync.rm.phylotypes(rm.phylotypes, phylogeny.full)$oligos 24 | 25 | phylogeny.filtered <- prune16S(phylogeny.full, pmTm.margin = 2.5, complement = 1, mismatch = 0, rmoligos = rm.oligos, remove.nonspecific.oligos = FALSE) # remove.nonspecific.oligos refers to probes with multiple L2 groups 26 | 27 | # Remove probes that target multiple L1 groups 28 | lmap <- levelmap(NULL, level.from = "oligoID", level.to = "L1", phylogeny.info = phylogeny.filtered) 29 | hits <- sapply(lmap, length) 30 | ambiguous.l1.probes <- unique(names(which(hits > 1))) 31 | 32 | phylogeny.filtered <- subset(phylogeny.filtered, !oligoID %in% ambiguous.l1.probes) 33 | 34 | # Keep only relevant cols 35 | phylogeny.full <- phylogeny.full[, 1:6]; 36 | phylogeny.filtered <- phylogeny.filtered[, 1:6]; 37 | 38 | # Remove duplicate rows 39 | phylogeny.full <- phylogeny.full[!duplicated(phylogeny.full),] 40 | phylogeny.filtered <- phylogeny.filtered[!duplicated(phylogeny.filtered),] 41 | 42 | # Write to file 43 | write.table(phylogeny.filtered, file = "../extdata/phylogeny.filtered.tab", quote = F, row.names = F, sep = "\t") 44 | 45 | -------------------------------------------------------------------------------- /archive/subject_tables.R: -------------------------------------------------------------------------------- 1 | subject_tables <- function (x, meta) { 2 | 3 | stop("TODO: replaced now with timesort_subjects.R") 4 | 5 | # Focus on the signal from specific taxon 6 | meta$signal <- x 7 | 8 | # Pick data for each subject separately 9 | spl <- split(meta, as.character(meta$subject)) 10 | 11 | tabs <- list() 12 | cnt <- 0 13 | for (subj in names(spl)) { 14 | 15 | times <- as.numeric(spl[[subj]]$time) 16 | signal <- as.numeric(spl[[subj]]$signal) 17 | mintime <- which.min(times) 18 | 19 | # Shift in time from first time point 20 | spl[[subj]]$time <- (times - times[[mintime]]) 21 | 22 | # Shift in signal from first time point 23 | spl[[subj]]$shift <- (signal - signal[[mintime]]) 24 | 25 | # Store 26 | tabs[[subj]] <- spl[[subj]] 27 | } 28 | 29 | tabs 30 | 31 | } 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /archive/summarize.rpa.R: -------------------------------------------------------------------------------- 1 | #' Description: Probeset summarization with RPA 2 | #' 3 | #' Arguments: 4 | #' @param taxonomy oligo - phylotype matching data.frame 5 | #' @param level taxonomic level for the summarization. 6 | #' @param probedata preprocessed probes x samples data matrix in absolute domain 7 | #' @param verbose print intermediate messages 8 | #' @param probe.parameters Optional. If probe.parameters are given, 9 | #' the summarization is based on these and model parameters are not 10 | #' estimated. A list. One element for each probeset with the following probe vectors: 11 | #' affinities, variances 12 | #' Returns: 13 | #' @return List with two elements: abundance.table (summarized data matrix in absolute scale) and probe.parameters (RPA probe level parameter estimates) 14 | #' 15 | #' @export 16 | #' @importFrom RPA d.update.fast 17 | #' @importFrom RPA rpa.fit 18 | #' 19 | #' @references See citation("microbiome") 20 | #' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com} 21 | #' @keywords utilities 22 | summarize.rpa <- function (taxonomy, level, probedata, verbose = TRUE, probe.parameters = NULL) { 23 | 24 | # Convert to log10 domain 25 | oligo.data <- log10(probedata) 26 | probeinfo <- list() 27 | 28 | probesets <- retrieve.probesets(taxonomy, level = level) 29 | # probesets <- probesets[setdiff(names(probesets), rm.species)] 30 | nPhylotypesPerOligo <- n.phylotypes.per.oligo(taxonomy, level) 31 | 32 | # initialize 33 | summarized.matrix <- matrix(NA, nrow = length(probesets), 34 | ncol = ncol(oligo.data)) 35 | rownames(summarized.matrix) <- names(probesets) 36 | colnames(summarized.matrix) <- colnames(oligo.data) 37 | 38 | for (set in names(probesets)) { 39 | 40 | # Pick expression for particular probes 41 | probes <- probesets[[set]] 42 | 43 | # Pick probe data for the probeset: probes x samples 44 | # oligo.data assumed to be already in log10 45 | dat <- as.matrix(oligo.data[probes,]) 46 | if (length(probes) == 1) { 47 | dat <- as.matrix(oligo.data[probes,], nrow = length(probes)) 48 | } 49 | rownames(dat) <- probes 50 | colnames(dat) <- colnames(oligo.data) 51 | 52 | if (length(probe.parameters) > 0) { 53 | 54 | # Summarize with pre-calculated variances 55 | vec <- d.update.fast(dat, probe.parameters[[set]]) 56 | 57 | } else { 58 | 59 | # RPA is calculated in log domain 60 | # Downweigh non-specific probes with priors with 10% of virtual data and 61 | # variances set according to number of matching probes 62 | # This will provide slight emphasis to downweigh potentially 63 | # cross-hybridizing probes 64 | alpha <- 1 + 0.1*ncol(dat)/2 65 | beta <- 1 + 0.1*ncol(dat)*nPhylotypesPerOligo[probes]^2 66 | res <- rpa.fit(dat, alpha = alpha, beta = beta) 67 | vec <- res$mu 68 | probeinfo[[set]] <- res$tau2 69 | 70 | } 71 | 72 | summarized.matrix[set, ] <- vec 73 | 74 | } 75 | 76 | if (!is.null(probe.parameters)) { 77 | probeinfo <- probe.parameters 78 | } 79 | 80 | # Return the data in absolute scale 81 | summarized.matrix <- 10^summarized.matrix 82 | 83 | list(abundance.table = summarized.matrix, probeinfo = probeinfo) 84 | 85 | } 86 | 87 | -------------------------------------------------------------------------------- /archive/summarize.sum.R: -------------------------------------------------------------------------------- 1 | #' Probeset summarization with SUM 2 | #' 3 | #' Arguments: 4 | #' @param taxonomy oligo - phylotype matching data.frame 5 | #' @param level taxonomic level for the summarization. 6 | #' @param probedata preprocessed probes x samples data matrix in absolute domain 7 | #' @param verbose print intermediate messages 8 | #' @param downweight.ambiguous.probes Downweight probes with multiple targets 9 | #' 10 | #' Returns: 11 | #' @return List with two elements: abundance.table (summarized data matrix in absolute scale) and probe.parameters used in the calculations 12 | #' 13 | #' @export 14 | #' 15 | #' @references See citation("microbiome") 16 | #' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com} 17 | #' @keywords utilities 18 | summarize.sum <- function (taxonomy, level, probedata, verbose = TRUE, downweight.ambiguous.probes = TRUE) { 19 | 20 | # Convert to log10 domain 21 | oligo.data <- probedata 22 | probe.parameters <- list() 23 | 24 | probesets <- retrieve.probesets(taxonomy, level = level) 25 | 26 | if (downweight.ambiguous.probes) { 27 | nPhylotypesPerOligo <- n.phylotypes.per.oligo(taxonomy, level) 28 | probe.weights <- 1/nPhylotypesPerOligo 29 | } else { 30 | probe.weights <- rep(1, nrow(taxonomy)) 31 | names(probe.weights) <- rownames(taxonomy) 32 | } 33 | 34 | # initialize 35 | summarized.matrix <- matrix(NA, nrow = length(probesets), 36 | ncol = ncol(oligo.data)) 37 | rownames(summarized.matrix) <- names(probesets) 38 | colnames(summarized.matrix) <- colnames(oligo.data) 39 | 40 | for (set in names(probesets)) { 41 | 42 | # print(set) 43 | 44 | # Pick expression for particular probes 45 | probes <- probesets[[set]] 46 | 47 | # Pick probe data for the probeset: probes x samples 48 | # oligo.data assumed to be already in log10 49 | dat <- as.matrix(oligo.data[probes,]) 50 | if (length(probes) == 1) { 51 | dat <- as.matrix(oligo.data[probes,], nrow = length(probes)) 52 | } 53 | rownames(dat) <- probes 54 | colnames(dat) <- colnames(oligo.data) 55 | 56 | # Weight each probe by the inverse of the number of matching phylotypes 57 | # Then calculate sum -> less specific probes are downweighted 58 | # However, set the minimum signal to 0 in log10 scale (1 in original scale)! 59 | if (nrow(dat) > 1) { 60 | dat <- dat * probe.weights[rownames(dat)] 61 | vec <- colSums(dat, na.rm = T) 62 | } else { 63 | vec <- as.vector(unlist(dat)) 64 | } 65 | 66 | summarized.matrix[set, ] <- vec 67 | 68 | } 69 | 70 | list(abundance.table = summarized.matrix, probe.parameters = probe.weights) 71 | 72 | } 73 | 74 | -------------------------------------------------------------------------------- /archive/test.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | library(microbiome) 4 | data.directory <- system.file("extdata", package = "microbiome") 5 | 6 | method <- "sum" 7 | 8 | so <- read.csv(paste("~/Rpackages/microbiome/microbiome/inst/extdata/species-", method, ".tab", sep = ""), sep = "\t", row.names = 1) 9 | 10 | pseq <- read_hitchip(data.directory, method = method)$pseq 11 | sn <- otu_table(pseq)@.Data 12 | 13 | cro <- intersect(rownames(sn), rownames(so)) 14 | cco <- intersect(colnames(sn), colnames(so)) 15 | plot(log10(unlist(so[cro, cco])), log10(unlist(sn[cro, cco]))) 16 | 17 | cors <- c() 18 | for (i in 1:length(cro)) { 19 | tax <- cro[[i]] 20 | cors[[tax]] <- cor(log10(unlist(so[tax, ])), log10(unlist(sn[tax, ]))) 21 | } 22 | hist(cors) 23 | 24 | # -------------------------------- 25 | 26 | # The poorest correlate 27 | tax <- names(sort(cors))[[1]] 28 | 29 | 30 | -------------------------------------------------------------------------------- /archive/top_abundance.R: -------------------------------------------------------------------------------- 1 | #' @title Top Abundance Index 2 | #' @description Calculates the community top_abundance index. 3 | #' @param rank Optional. The rank of the dominant taxa to consider. 4 | #' @param aggregate Aggregate (TRUE; default) the top members or not. If aggregate=TRUE, then the sum of relative abundances is returned. Otherwise the relative abundance is returned for the single taxa with the indicated rank. 5 | #' @inheritParams diversity 6 | #' @return A vector of top_abundance indices 7 | #' @export 8 | #' @examples 9 | #' data(dietswap) 10 | #' d <- top_abundance(dietswap) 11 | #' @details The top_abundance index gives the abundance of the most abundant species in [0,1]. This simple diversity index is occasionally used in ecological literature, and sometimes also called dominance. However, note that the microbiome::dominance function uses a different definition. With rank = 2, the sum of abundances for the two most abundant taxa are returned etc. However, if aggregate=FALSE, the abundance for the single n'th most dominant taxa (n = rank) is returnde instead the sum of abundances up to that rank. 12 | #' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com} 13 | #' @seealso dominance, diversity 14 | #' @keywords utilities 15 | top_abundance <- function(x, rank = 1, aggregate = TRUE, split = TRUE) { 16 | 17 | # Pick the OTU data 18 | otu <- abundances(x) 19 | 20 | if (!split) { 21 | otu <- as.matrix(rowSums(otu), nrow = nrow(otu)) 22 | } 23 | 24 | if (!aggregate) { 25 | do <- apply(otu, 2, function (x) {rev(sort(x/sum(x, na.rm = TRUE)))[[rank]]}) 26 | } else { 27 | do <- apply(otu, 2, function (x) {sum(rev(sort(x/sum(x, na.rm = TRUE)))[1:rank])}) 28 | } 29 | names(do) <- sample_names(x) 30 | 31 | do 32 | 33 | } 34 | 35 | 36 | -------------------------------------------------------------------------------- /archive/validate.R: -------------------------------------------------------------------------------- 1 | #' @title Validate Phyloseq 2 | #' @description Validate phyloseq object. 3 | #' @param x phyloseq object 4 | #' @details Checks that the abundances and sample_data have exactly same samples. 5 | #' @return A validated and polished phyloseq object 6 | #' @export 7 | #' @examples 8 | #' data(dietswap) 9 | #' validate(dietswap) 10 | validate <- function (x) { 11 | 12 | validated <- TRUE 13 | 14 | dat <- t(abundances(x)) 15 | meta <- sample_data(x) 16 | coms <- intersect(rownames(dat), rownames(meta)) 17 | 18 | if (length(coms) < 2) { 19 | validated <- FALSE 20 | warning("Check that the abundances and sample_data have more than 1 samples 21 | in common") 22 | return(validated) 23 | } else { 24 | # Include only the common samples 25 | x@sam_data <- meta[coms,] 26 | otu_table(x) <- otu_table(t(dat[coms,]), taxa_are_rows = TRUE) 27 | } 28 | 29 | # Return validated and polished phyloseq object 30 | if (validated) { 31 | x 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /archive/vm.R: -------------------------------------------------------------------------------- 1 | #fs <- list.files("~/HITChipDB/R/", full.names = TRUE) 2 | #for (f in fs) { source(f) } 3 | #fs <- list.files("~/microbiome/R/", full.names = TRUE) 4 | #for (f in fs) { source(f) } 5 | 6 | # ------------------------------------------- 7 | library(devtools) 8 | install_github("microbiome/microbiome") 9 | install_github("microbiome/HITChipDB") 10 | 11 | res <- run.profiling.script(dbuser, dbpwd, dbname, verbose = FALSE, host = host, port = port, summarization.methods = summarization.methods, which.projects = which.projects) 12 | 13 | projs <- list.mysql.projects(dbuser, dbpwd, dbname, host = host, port = port) 14 | 15 | res <- run.profiling.script(dbuser, dbpwd, dbname, verbose = FALSE, host = host, port = port, summarization.methods = summarization.methods, which.projects = "TURKU PET STUDY") 16 | 17 | # ------------------------------- 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /crash-handler-permission: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microbiome/tutorials/4c9c5eb321358c97369b1d2325ff81906a14ac0b/crash-handler-permission -------------------------------------------------------------------------------- /crash-handler.conf: -------------------------------------------------------------------------------- 1 | crash-handling-enabled="1" 2 | -------------------------------------------------------------------------------- /debug.log: -------------------------------------------------------------------------------- 1 | [1019/142843.961:ERROR:directory_reader_win.cc(43)] FindFirstFile: The system cannot find the path specified. (0x3) 2 | -------------------------------------------------------------------------------- /deseq2.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "DESeq2" 3 | author: "Leo Lahti, Sudarshan Shetty et al." 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | BiocStyle::html_document: 8 | number_sections: no 9 | toc: yes 10 | toc_depth: 4 11 | toc_float: true 12 | self_contained: true 13 | thumbnails: true 14 | lightbox: true 15 | gallery: true 16 | use_bookdown: false 17 | highlight: haddock 18 | --- 19 | 25 | 26 | 27 | ## Normalization and group-wise comparisons with DESeq2 28 | 29 | Examples adapted from [Callahan et al. F1000 (2017)](https://f1000research.com/articles/5-1492/v2). 30 | 31 | Load example data: 32 | 33 | ```{r deseq2-example, warning=FALSE, message=FALSE, fig.width=8, fig.height=5} 34 | # Load libraries 35 | library(microbiome) 36 | library(ggplot2) 37 | library(magrittr) 38 | library(dplyr) 39 | # Probiotics intervention example data 40 | data(dietswap) 41 | 42 | # Only check the core taxa to speed up examples 43 | pseq <- core(dietswap, detection = 50, prevalence = 80/100) 44 | ``` 45 | 46 | 47 | Toy example, to be polished: 48 | 49 | ```{r deseq2-example2, warning=FALSE, message=FALSE, fig.width=8, fig.height=5} 50 | library(phyloseq) 51 | library(reshape2) 52 | library(DESeq2) 53 | library(knitr) 54 | library(magrittr) 55 | # Running the DESeq2 analysis 56 | ds2 <- phyloseq_to_deseq2(pseq, ~ nationality) 57 | dds <- DESeq(ds2) 58 | res <- results(dds) 59 | df <- as.data.frame(res) 60 | df$taxon <- rownames(df) 61 | df <- df %>% arrange(log2FoldChange, padj) 62 | 63 | library(knitr) 64 | print(head(kable((df)))) 65 | ``` 66 | 67 | 68 | ## Validating DESeq2 results 69 | 70 | ```{r deseq2-validate, warning=FALSE, message=FALSE, fig.width=8, fig.height=5, eval=TRUE} 71 | # Identify top taxa based on standard ANOVA 72 | source(system.file("extdata/check_anova.R", package = "microbiome")) 73 | ano <- check_anova(pseq, "nationality"); 74 | ano$log2FC <- log2(ano$ave.AFR) - log2(ano$ave.AAM) 75 | taxa.anova <- as.character(subset(ano, padj < 0.01 & abs(log2FC) > log2(2))$taxa) 76 | 77 | # lowPick the top taxa based on DESEq2 78 | taxa.deseq <- subset(res, padj < 0.01 & abs(log2FoldChange) > log2(2))$taxon 79 | 80 | # Check overlap 81 | # Most DESEq2 taxa are confirmed with ANOVA 82 | library(gplots) 83 | 84 | # Also the est p-values are well correlated (higher not so) 85 | mf <- data.frame(df$padj, ano$padj) 86 | p <- ggplot(mf, aes(x = log10(df$padj), y = log10(ano$padj))) + 87 | labs(x = 'DESeq2 adjusted p-value', y = 'ANOVA adjusted p-value') + 88 | geom_point() 89 | print(p) 90 | ``` 91 | 92 | 93 | ```{r plot_for_ven, echo=TRUE, message=F, error=F, eval=TRUE} 94 | library(venn) # Check UpSet plot instead 95 | venn( list(ANOVA = taxa.anova,DESeq2 = taxa.deseq)) 96 | ``` 97 | 98 | 99 | -------------------------------------------------------------------------------- /figure/pooled3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microbiome/tutorials/4c9c5eb321358c97369b1d2325ff81906a14ac0b/figure/pooled3-1.png -------------------------------------------------------------------------------- /figure/pooled_overdispersion-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microbiome/tutorials/4c9c5eb321358c97369b1d2325ff81906a14ac0b/figure/pooled_overdispersion-1.png -------------------------------------------------------------------------------- /figure/pooled_pcomp-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microbiome/tutorials/4c9c5eb321358c97369b1d2325ff81906a14ac0b/figure/pooled_pcomp-1.png -------------------------------------------------------------------------------- /figure/tail-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microbiome/tutorials/4c9c5eb321358c97369b1d2325ff81906a14ac0b/figure/tail-1.png -------------------------------------------------------------------------------- /figure/univariate5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microbiome/tutorials/4c9c5eb321358c97369b1d2325ff81906a14ac0b/figure/univariate5-1.png -------------------------------------------------------------------------------- /figure/univariate6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microbiome/tutorials/4c9c5eb321358c97369b1d2325ff81906a14ac0b/figure/univariate6-1.png -------------------------------------------------------------------------------- /figure/univariate_boxplot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microbiome/tutorials/4c9c5eb321358c97369b1d2325ff81906a14ac0b/figure/univariate_boxplot-1.png -------------------------------------------------------------------------------- /figure/univariate_densityplot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microbiome/tutorials/4c9c5eb321358c97369b1d2325ff81906a14ac0b/figure/univariate_densityplot-1.png -------------------------------------------------------------------------------- /init.R: -------------------------------------------------------------------------------- 1 | if (!requireNamespace("BiocManager", quietly = TRUE)) 2 | install.packages("BiocManager") 3 | library(BiocManager) 4 | 5 | 6 | .cran_packages <- c("ggplot2", "gridExtra",'ape', 'dplyr','ggpubr', 'knitr','Cairo','devtools','data.table', 7 | 'splitstackshape', 'rmarkdown', 'tidyverse', 'readxl', 'vegan','knitcitations', 'reshape', 'reshape2', 8 | 'magrittr', 'vegan', 'glue','stringr','devtools','captioner', 'rstan','rstanarm', 9 | 'hrbrthemes', 'gcookbook','GGally', 'rvg','ggiraph','network', 'gplots', 10 | 'intergraph', 'rmdformats', 'FD', 'hrbrthemes', 'GGally', 'lattice', 'venn', 'PMCMRplus') 11 | 12 | .bioc_packages <- c("dada2", "phyloseq",'SummarizedExperiment','Biobase', 13 | "DECIPHER",'IRanges','BiocGenerics', "phangorn", 14 | 'BiocStyle', "microbiome", "DESeq2", 'DirichletMultinomial') 15 | 16 | # package.version("microbiome") 17 | #[1] "1.5.31" 18 | .inst <- .cran_packages %in% installed.packages() 19 | if(any(!.inst)) { 20 | install.packages(.cran_packages[!.inst]) 21 | } 22 | sapply(.cran_packages, require, character.only = TRUE) 23 | 24 | 25 | library('devtools') 26 | install_github('zdk123/SpiecEasi') 27 | install_github('briatte/ggnet') 28 | 29 | .inst <- .bioc_packages %in% installed.packages() 30 | if(any(!.inst)) { 31 | #source("http://bioconductor.org/biocLite.R") 32 | 33 | BiocManager::install(.bioc_packages[!.inst], ask = F) 34 | } 35 | install_github('antagomir/netresponse') 36 | install_github('microsud/microbiomeutilities') 37 | ### 38 | if (!requireNamespace("BiocManager", quietly = TRUE)) 39 | install.packages("BiocManager") 40 | 41 | BiocManager::install(c(.bioc_packages,"SummarizedExperiment", 'DelayedArray')) 42 | # Load packages into session, and print package version 43 | sapply(c(.cran_packages, .bioc_packages), require, character.only = TRUE) 44 | 45 | -------------------------------------------------------------------------------- /limma.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Limma - linear models for microarrays" 3 | author: "Leo Lahti, Sudarshan Shetty et al." 4 | bibliography: 5 | - bibliography.bib 6 | output: 7 | BiocStyle::html_document: 8 | number_sections: no 9 | toc: yes 10 | toc_depth: 4 11 | toc_float: true 12 | self_contained: true 13 | thumbnails: true 14 | lightbox: true 15 | gallery: true 16 | use_bookdown: false 17 | highlight: haddock 18 | --- 19 | 25 | 26 | 27 | ## Two-group comparison at community level with limma 28 | 29 | Load example data: 30 | 31 | ```{r boxplot-example, warning=FALSE, message=FALSE, fig.width=8, fig.height=5} 32 | # Load libraries 33 | library(microbiome) 34 | library(ggplot2) 35 | library(dplyr) 36 | 37 | # Probiotics intervention example data 38 | data(peerj32) # Source: https://peerj.com/articles/32/ 39 | pseq <- peerj32$phyloseq # Rename the example data 40 | 41 | # Get OTU abundances and sample metadata 42 | otu <- abundances(microbiome::transform(pseq, "log10")) 43 | meta <- meta(pseq) 44 | ``` 45 | 46 | 47 | ## Linear models with limma 48 | 49 | Identify most significantly different taxa between males and females using the limma method. See [limma homepage](http://bioinf.wehi.edu.au/limma/) and [limma User's guide](https://www.bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf) for details. For discussion on why limma is preferred over t-test, see [this 50 | article](http://www.plosone.org/article/info:doi/10.1371/journal.pone.0012336). 51 | 52 | ```{r limma-example, warning=FALSE} 53 | # Compare the two groups with limma 54 | library(limma) 55 | 56 | # Prepare the design matrix which states the groups for each sample 57 | # in the otu 58 | design <- cbind(intercept = 1, Grp2vs1 = meta[["sex"]]) 59 | rownames(design) <- rownames(meta) 60 | design <- design[colnames(otu), ] 61 | 62 | # NOTE: results and p-values are given for all groupings in the design matrix 63 | # Now focus on the second grouping ie. pairwise comparison 64 | coef.index <- 2 65 | 66 | # Fit the limma model 67 | fit <- lmFit(otu, design) 68 | fit <- eBayes(fit) 69 | 70 | # Limma P-values 71 | pvalues.limma = fit$p.value[, 2] 72 | 73 | # Limma effect sizes 74 | efs.limma <- fit$coefficients[, "Grp2vs1"] 75 | 76 | # Summarise 77 | library(knitr) 78 | kable(topTable(fit, coef = coef.index, p.value=0.1), digits = 2) 79 | ``` 80 | 81 | Quantile-Quantile plot and volcano plot for limma 82 | 83 | ```{r limma-qq, warning=FALSE} 84 | # QQ 85 | qqt(fit$t[, coef.index], df = fit$df.residual + fit$df.prior); abline(0,1) 86 | 87 | # Volcano 88 | volcanoplot(fit, coef = coef.index, highlight = coef.index) 89 | ``` 90 | 91 | 92 | ## Comparison between limma and t-test 93 | 94 | Order the taxa with t-test for comparison and validation purposes. The 95 | differences are small in this simulated example, but can be 96 | considerable in real 97 | data. For discussion on why limma is preferred over t-test, see [this 98 | article](http://www.plosone.org/article/info:doi/10.1371/journal.pone.0012336). 99 | 100 | 101 | ```{r limma-compairson, warning=FALSE} 102 | # Compare the two groups with t-test 103 | library(dplyr) 104 | pvalues.ttest <- c() 105 | male.samples <- dplyr::filter(meta, sex == "male")$sample 106 | female.samples <- dplyr::filter(meta, sex == "female")$sample 107 | for (tax in rownames(otu)) { 108 | pvalues.ttest[[tax]] <- t.test(otu[tax, male.samples], otu[tax, female.samples])$p.value 109 | } 110 | # Multiple testing correction 111 | pvalues.ttest <- p.adjust(pvalues.ttest, method = "fdr") 112 | 113 | # Compare p-values between limma and t-test 114 | taxa <- rownames(otu) 115 | plot(pvalues.ttest[taxa], pvalues.limma[taxa]) 116 | abline(0,1,lty = 2) 117 | ``` 118 | 119 | 120 | ## Continuous variables 121 | 122 | Rapid quantification of continuous associations can be done with the 123 | lm_phyloseq wrapper function. 124 | 125 | This uses the limma model to generate a table of P-values and effect 126 | sizes. Note that no confounding variables taken into account in this 127 | wrapper. See the [limma homepage](http://bioinf.wehi.edu.au/limma/) 128 | for more detailed analyses. 129 | 130 | ```{r limma-lm-phyloseq2, warning=FALSE, eval=FALSE} 131 | data(atlas1006) 132 | source(system.file("extdata/lm_phyloseq.R", package = "microbiome")) 133 | tab <- lm_phyloseq(atlas1006, "age") 134 | kable(head(tab), digits = 3) 135 | ``` 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /main.R: -------------------------------------------------------------------------------- 1 | library(rmarkdown) 2 | fs <- list.files(pattern = ".Rmd$") 3 | fs <- sample(fs) 4 | fs <- setdiff(fs, c("info.Rmd", "misc.Rmd", "all.Rmd")) 5 | #fs <- setdiff(fs, c("Betadiversity.Rmd")) 6 | #fs <- setdiff(fs, c("CompositionAmplicondata.Rmd")) 7 | #fs <- setdiff(fs, c("Composition.Rmd")) 8 | #fs <- setdiff(fs, c("Diversity.Rmd")) 9 | fs0 <- fs 10 | rem <- fs 11 | for (myrmdfile in rem) { 12 | print(myrmdfile) 13 | render(myrmdfile, output_format = "html_document") 14 | print(paste(myrmdfile, "has been rendered")) 15 | rem <- setdiff(rem, myrmdfile) # Remaining files to handle 16 | } 17 | -------------------------------------------------------------------------------- /misc.Rmd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | As it can be seen, we see only OTu IDs and this may not be useful to interpret the data. We need to reprccess this figure to include taxonomic information. We can do this as follows: 6 | 7 | ```{r,core-example4, fig.width=14, fig.heigth=18, fig.show='hold', out.width = '200px', warning=FALSE, eval=FALSE} 8 | library(RColorBrewer) 9 | library(knitr) 10 | # Core with absolute counts and vertical view: 11 | # and minimum population prevalence (given as percentage) 12 | detections <- 10^seq(log10(1), log10(max(abundances(pseq.2))/10), length = 10) 13 | 14 | healthycore <- plot_core(pseq.2, plot.type = "heatmap", 15 | prevalences = prevalences, 16 | detections = detections, 17 | colours = rev(brewer.pal(5, "Spectral")), 18 | min.prevalence = .9) 19 | # get the data used for plotting 20 | df <- healthycore$data 21 | 22 | # get the list of OTUs 23 | list <- df$Taxa 24 | 25 | # check the OTU ids 26 | # print(list) 27 | 28 | # get the taxonomy data 29 | tax <- tax_table(pseq.2) 30 | tax <- as.data.frame(tax) 31 | 32 | # add the OTus to last column 33 | tax$OTU <- rownames(tax) 34 | 35 | # select taxonomy of only 36 | # those OTUs that are used in the plot 37 | tax2 <- dplyr::filter(tax, rownames(tax) %in% list) 38 | 39 | # head(tax2) 40 | 41 | # We will merege all the column into one except the Doamin as all is bacteria in this case 42 | tax.unit <- tidyr::unite(tax2, Taxa_level,c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species", "OTU"), sep = "_;", remove = TRUE) 43 | 44 | tax.unit$Taxa_level <- gsub(pattern="[a-z]__",replacement="", tax.unit$Taxa_level) 45 | 46 | # add this new information into the plot data df 47 | 48 | df$Taxa <- tax.unit$Taxa_level 49 | 50 | # you can see now we have the taxonomic information 51 | knitr::kable(head(df)) 52 | 53 | # replace the data in the plot object 54 | healthycore$data <- df 55 | 56 | plot(healthycore + theme(axis.text.y = element_text(face="italic"))) 57 | ``` 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /plot_sample_trajectory.R: -------------------------------------------------------------------------------- 1 | # In this we demonstrate how to plot trajactory of a community over time 2 | # We use the human microbiome time series data from Caporaso et al., 2011 Genome Biol. 3 | 4 | library(microbiome) 5 | library(jeevanuDB) # external database pkg for microbiome pkg with test data 6 | library(dplyr) 7 | library(ggplot2) 8 | library(viridis) 9 | # Example data 10 | data("moving_pictures") 11 | # Rename 12 | ps <- moving_pictures 13 | 14 | ps.gut <- subset_samples(ps, sample_type == "stool") 15 | taxa_names(ps.gut) <- paste0("ASV-", seq(ntaxa(ps.gut))) 16 | # remove asvs which are zero in all of these samples 17 | ps.gut <- prune_taxa(taxa_sums(ps.gut) > 0, ps.gut) 18 | 19 | # remove samples with less than 500 reads 20 | ps.gut <- prune_samples(sample_sums(ps.gut) > 500, ps.gut) 21 | 22 | # Covnert to relative abundances 23 | ps.gut.rel <- microbiome::transform(ps.gut, "compositional") 24 | 25 | ps.ord <- ordinate(ps.gut.rel, "PCoA") 26 | 27 | ordip <- plot_ordination(ps.gut.rel, ps.ord, justDF = T) 28 | 29 | # Get axis 1 and 2 variation 30 | evals1 <- round(ps.ord$values$Eigenvalues[1] / sum(ps.ord$values$Eigenvalues) * 100, 2) 31 | evals2 <- round(ps.ord$values$Eigenvalues[2] / sum(ps.ord$values$Eigenvalues) * 100, 2) 32 | 33 | # Visualize 34 | # theme_set(theme_bw(14)) 35 | # set colors 36 | subject_cols <- c(F4 = "#457b9d", M3 = "#e63946") 37 | 38 | # Add trajectory for given subject 39 | dfs <- subset(ordip, host_subject_id == "F4") 40 | # arrange according to sampling time 41 | dfs <- dfs %>% 42 | arrange(days_since_experiment_start) 43 | 44 | p <- ggplot(ordip, aes(x = Axis.1, y = Axis.2)) 45 | p2 <- p + 46 | geom_path( 47 | data = dfs, alpha = 0.5, 48 | arrow = arrow( 49 | angle = 15, length = unit(0.1, "inches"), 50 | ends = "last", type = "closed" 51 | ) 52 | ) + 53 | geom_point(aes(color = host_subject_id), alpha = 0.6, size = 3) + 54 | scale_color_manual("Subject", values = subject_cols) + 55 | xlab(paste("PCoA 1 (", evals1, "%)", sep = "")) + 56 | ylab(paste("PCoA 2 (", evals2, "%)", sep = "")) + 57 | theme( 58 | panel.grid.major = element_blank(), 59 | panel.grid.minor = element_blank() 60 | ) 61 | # coord_fixed(sqrt(evals[2] / evals[1])) 62 | 63 | # Print figure 64 | print(p2) 65 | 66 | # Alternatively we can just focus on one subject 67 | 68 | ps.gut.rel.m3 <- subset_samples(ps.gut.rel, host_subject_id == "M3") 69 | ps.ord.m3 <- ordinate(ps.gut.rel.m3, "PCoA") 70 | 71 | ordip.m3 <- plot_ordination(ps.gut.rel.m3, ps.ord.m3, justDF = T) 72 | 73 | # Get axis 1 and 2 variation 74 | evals1 <- round(ordip.m3$values$Eigenvalues[1] / sum(ordip.m3$values$Eigenvalues) * 100, 2) 75 | evals2 <- round(ordip.m3$values$Eigenvalues[2] / sum(ordip.m3$values$Eigenvalues) * 100, 2) 76 | 77 | 78 | # arrange according to sampling time 79 | ordip.m3 <- ordip.m3 %>% 80 | arrange(days_since_experiment_start) # important to arrange the time 81 | 82 | # Visualize 83 | # blank plot initiate 84 | p1 <- ggplot(ordip.m3, aes(x = Axis.1, y = Axis.2)) 85 | # add layers 86 | p3 <- p1 + 87 | # add arrows with geom_path 88 | geom_path(alpha = 0.5, arrow = arrow( 89 | angle = 30, length = unit(0.1, "inches"), 90 | ends = "last", type = "closed" 91 | )) + 92 | # add points 93 | geom_point(aes(color = days_since_experiment_start), size = 3) + 94 | # add gradient colors 95 | scale_color_viridis("Days from first sampling") + 96 | # add x and y labels 97 | xlab(paste("PCoA 1 (", evals1, "%)", sep = "")) + 98 | ylab(paste("PCoA 2 (", evals2, "%)", sep = "")) + 99 | # remove grids in the plot 100 | theme( 101 | panel.grid.major = element_blank(), 102 | panel.grid.minor = element_blank() 103 | ) 104 | 105 | print(p3) 106 | -------------------------------------------------------------------------------- /plot_time_trajectory.R: -------------------------------------------------------------------------------- 1 | library(microbiome) 2 | 3 | # Example data 4 | data(dietswap) 5 | # Rename 6 | pseq <- dietswap 7 | 8 | # CLR transformation to remove compositionality biases 9 | # Add small constant on the data to avoid problems from zero counts 10 | pseq <- transform(pseq, "clr") 11 | 12 | # Samples x taxa abundance matrix 13 | otu <- t(abundances(pseq)) 14 | 15 | # PCA 16 | proj <- princomp(otu)$scores[, 1:2] 17 | 18 | # Pick the metadata 19 | df <- meta(pseq) 20 | 21 | # Add projection 22 | df <- cbind(df, proj[rownames(df),]) 23 | 24 | # Visualize 25 | library(ggplot2) 26 | theme_set(theme_bw(20)) 27 | p <- ggplot(df, aes(x = Comp.1, y = Comp.2)) + geom_point(aes(color = group)) 28 | 29 | # Add trajectory for given subject 30 | dfs <- subset(df, subject == "zaq") 31 | p2 <- p + geom_path(data = dfs) 32 | 33 | # Print figure 34 | print(p2) 35 | 36 | 37 | --------------------------------------------------------------------------------