├── LICENSE ├── README.md ├── R_tips_12_save_vs_saveRDS.Rmd ├── compbio_tutorials.Rproj ├── data ├── .DS_Store ├── dirty_data.xlsx ├── gene_counts.xlsx ├── lm_fit.rda └── lm_fit.rds ├── images ├── Bioc.png ├── Cheng_question.png ├── GATK_best_practise_somatic.png ├── HiPerGator.png ├── Howie_question.png ├── Mikhael_question.png ├── R_and_python.png ├── Sajad_question.png ├── UCSC_format.png ├── age_pvalue.jpeg ├── aging.png ├── barchart.png ├── bashrc.png ├── bedtools.png ├── books.jpeg ├── boxplot_pvalue.png ├── cellline2commandline.png ├── cheatsheet1.png ├── cheatsheet2.png ├── cloud.png ├── cluster_species.png ├── cluster_tissue.png ├── clustered_dotplot.png ├── clustertissue.png ├── colormap.png ├── confusion.png ├── cron.jpeg ├── cross_validation_wrong.jpeg ├── data-life-cycle.png ├── data-science-explore.png ├── datatoviz.png ├── dates.jpeg ├── deeplearning.jpeg ├── dendrogram.png ├── dimredu.png ├── dist.png ├── distance_measures.jpeg ├── doc_version.png ├── experiment_design.png ├── fair_data.jpeg ├── filenames.png ├── filenames2.png ├── filenames3.png ├── folder.png ├── gct.png ├── geek.png ├── genomic_coordinate.png ├── ggplot2_cheatsheet.png ├── git_cheatsheet.jpeg ├── github-flow.png ├── google_arg.png ├── google_more.png ├── google_rotate.png ├── google_tar.png ├── gorilla.png ├── gsea.png ├── gsea_out1.png ├── gsea_out2.png ├── hclust.png ├── hpcscheduler.png ├── logistic_deep.png ├── logistic_rnn.png ├── mac_terminal.png ├── machine_learning_map.jpeg ├── matrix_factorization.png ├── merfish.jpeg ├── meta_excel.png ├── multinomial.png ├── notebook1.png ├── notebook2.png ├── out2.png ├── out3.png ├── phenotype_label.png ├── protein_mRNA_cor.jpeg ├── protein_vs_rna.png ├── protein_vs_rna_ccle.png ├── pval_cry.jpeg ├── qcfail.png ├── regular_1.png ├── regular_2.png ├── rnaseq_workflow.png ├── scatter_pvalue.png ├── scientific_method.jpeg ├── shell_terminal.png ├── stackover_rotate.png ├── statquest.png ├── superman.jpeg ├── survival1.png ├── 
survival2.png ├── survival3.png ├── tcga.png ├── terminal.jpeg ├── terminal.png ├── tidy_data.png ├── tidyverse.png ├── twin.png ├── ucsc_example.jpeg ├── umap_vs_tsne.png ├── unix_vs_linux.png └── workflow.png └── scripts ├── 01_how_to_make_a_heatmap.Rmd ├── 02_differential_expression_heatmap.Rmd ├── 03_volcano_plot_with_ggrepel.Rmd ├── 04_create_seurat_object_from_GEO.Rmd ├── 05_find_tissue_specific_genes_human_protein_atlas.Rmd ├── 06_scRNseq_two_lines_from_fastq_to_count_matrix.Rmd ├── 07_gene_set_enrichment_RNAseq.Rmd ├── 08_intro_to_singleCellExperiment.Rmd ├── 09_intro_to_seurat_V5.Rmd ├── 10_analyze_my_tweets.Rmd ├── 11_change_rownames_ENSEMBL_to_symbol_RNAseq.Rmd ├── 15_how_to_get_metadata_GEO.Rmd ├── 16_get_mouse_gene_exon_lengths.Rmd ├── R_tips_01_add_percentage_to_y_axis.Rmd ├── R_tips_02_add_side_to_scatterplot.Rmd ├── R_tips_03_extract_tables_from_PDF.Rmd ├── R_tips_04_list_column_dataframe_in_dataframe.Rmd ├── R_tips_05_read_all_files_in_a_folder.Rmd ├── R_tips_06_avoid_overplotting_ggblend.Rmd ├── R_tips_06_hierarchical_clustering.Rmd ├── R_tips_06_hierarchical_clustering.html ├── R_tips_07_rownames.Rmd ├── R_tips_08_convert_gene_ids.Rmd ├── R_tips_09_biomart_mouse_ortholog.Rmd ├── R_tips_10_ggplot2_percentage.Rmd ├── R_tips_11_read_all_tabs_spreadsheet.Rmd ├── R_tips_12_save_vs_saveRDS.Rmd ├── R_tips_13_copy_paste_vector_datapasta.Rmd ├── R_tips_14_janitor_clean_column_names.Rmd ├── R_tips_15_calculate_cpm.Rmd ├── R_tips_16_liftover_bedpe.Rmd ├── R_tips_17_upset_plot_for_gene_sets.Rmd ├── R_tips_18_tile_a_bed_file.Rmd ├── R_tips_19_kmeans_clustering.Rmd ├── R_tips_20_scatterplot_with_cor_p_value.Rmd └── R_tips_janitor_clean_column_names.Rmd /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Ming Tang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 
| in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # compbio_tutorials
2 |
3 | Rmarkdown files for my youtube videos on chatomics channel https://www.youtube.com/@chatomics
4 |
5 | Make sure you subscribe to the channel and join my FREE newsletter https://divingintogeneticsandgenomics.ck.page/newsletter
6 |
--------------------------------------------------------------------------------
/R_tips_12_save_vs_saveRDS.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "save vs saveRDS"
3 | output: html_document
4 | date: "2024-08-29"
5 | editor_options:
6 |   chunk_output_type: console
7 | ---
8 |
9 | ### save() vs saveRDS()
10 |
11 | ```{r}
12 | # Load necessary libraries
13 | library(tidymodels)
14 | library(broom)
15 | library(ggplot2)
16 | library(dplyr)
17 |
18 | # Preview the first rows of the built-in mtcars data
19 | head(mtcars)
20 |
21 | # Plot the predictor (wt) on x and the response (mpg) on y so the
22 | # scatter plot matches the model formula mpg ~ wt fitted below
23 | ggplot(mtcars, aes(x = wt, y = mpg)) +
24 |   geom_point()
25 |
26 | # Define the linear regression model specification
27 | lm_spec <- linear_reg() %>%
28 |   set_engine("lm")
29 |
30 | # Fit the model
31 | lm_fit <- lm_spec %>%
32 |   fit(mpg ~ wt, data = mtcars)
33 |
34 | # Extract the coefficients and p-values as a tidy table
35 | model_summary <- tidy(lm_fit)
36 |
37 | # saveRDS() stores a single object, without its name
38 | saveRDS(lm_fit, file = "data/lm_fit.rds")
39 |
40 | # save() stores one or more objects together with their names
41 | save(lm_fit, model_summary, file = "data/lm_fit.rda")
42 | ```
43 |
44 |
45 | ### read the data back
46 |
47 | ```{r}
48 | # readRDS() returns the object, so it can be bound to any name
49 | my_saved_fit <- readRDS("data/lm_fit.rds")
50 |
51 | # load() recreates lm_fit and model_summary under their saved names,
52 | # overwriting any objects with those names in the current environment.
53 | # It returns the restored names invisibly; keep and print them so the
54 | # side effect is visible.
55 | restored_names <- load("data/lm_fit.rda")
56 | restored_names
57 | ```
--------------------------------------------------------------------------------
/compbio_tutorials.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
--------------------------------------------------------------------------------
/data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/data/.DS_Store
--------------------------------------------------------------------------------
/data/dirty_data.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/data/dirty_data.xlsx
--------------------------------------------------------------------------------
/data/gene_counts.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/data/gene_counts.xlsx
-------------------------------------------------------------------------------- /data/lm_fit.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/data/lm_fit.rda -------------------------------------------------------------------------------- /data/lm_fit.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/data/lm_fit.rds -------------------------------------------------------------------------------- /images/Bioc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/Bioc.png -------------------------------------------------------------------------------- /images/Cheng_question.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/Cheng_question.png -------------------------------------------------------------------------------- /images/GATK_best_practise_somatic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/GATK_best_practise_somatic.png -------------------------------------------------------------------------------- /images/HiPerGator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/HiPerGator.png -------------------------------------------------------------------------------- 
/images/Howie_question.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/Howie_question.png -------------------------------------------------------------------------------- /images/Mikhael_question.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/Mikhael_question.png -------------------------------------------------------------------------------- /images/R_and_python.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/R_and_python.png -------------------------------------------------------------------------------- /images/Sajad_question.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/Sajad_question.png -------------------------------------------------------------------------------- /images/UCSC_format.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/UCSC_format.png -------------------------------------------------------------------------------- /images/age_pvalue.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/age_pvalue.jpeg -------------------------------------------------------------------------------- /images/aging.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/aging.png -------------------------------------------------------------------------------- /images/barchart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/barchart.png -------------------------------------------------------------------------------- /images/bashrc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/bashrc.png -------------------------------------------------------------------------------- /images/bedtools.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/bedtools.png -------------------------------------------------------------------------------- /images/books.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/books.jpeg -------------------------------------------------------------------------------- /images/boxplot_pvalue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/boxplot_pvalue.png -------------------------------------------------------------------------------- /images/cellline2commandline.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/cellline2commandline.png -------------------------------------------------------------------------------- /images/cheatsheet1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/cheatsheet1.png -------------------------------------------------------------------------------- /images/cheatsheet2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/cheatsheet2.png -------------------------------------------------------------------------------- /images/cloud.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/cloud.png -------------------------------------------------------------------------------- /images/cluster_species.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/cluster_species.png -------------------------------------------------------------------------------- /images/cluster_tissue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/cluster_tissue.png -------------------------------------------------------------------------------- /images/clustered_dotplot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/clustered_dotplot.png -------------------------------------------------------------------------------- /images/clustertissue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/clustertissue.png -------------------------------------------------------------------------------- /images/colormap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/colormap.png -------------------------------------------------------------------------------- /images/confusion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/confusion.png -------------------------------------------------------------------------------- /images/cron.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/cron.jpeg -------------------------------------------------------------------------------- /images/cross_validation_wrong.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/cross_validation_wrong.jpeg -------------------------------------------------------------------------------- /images/data-life-cycle.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/data-life-cycle.png -------------------------------------------------------------------------------- /images/data-science-explore.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/data-science-explore.png -------------------------------------------------------------------------------- /images/datatoviz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/datatoviz.png -------------------------------------------------------------------------------- /images/dates.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/dates.jpeg -------------------------------------------------------------------------------- /images/deeplearning.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/deeplearning.jpeg -------------------------------------------------------------------------------- /images/dendrogram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/dendrogram.png -------------------------------------------------------------------------------- /images/dimredu.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/dimredu.png -------------------------------------------------------------------------------- /images/dist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/dist.png -------------------------------------------------------------------------------- /images/distance_measures.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/distance_measures.jpeg -------------------------------------------------------------------------------- /images/doc_version.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/doc_version.png -------------------------------------------------------------------------------- /images/experiment_design.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/experiment_design.png -------------------------------------------------------------------------------- /images/fair_data.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/fair_data.jpeg -------------------------------------------------------------------------------- /images/filenames.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/filenames.png -------------------------------------------------------------------------------- /images/filenames2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/filenames2.png -------------------------------------------------------------------------------- /images/filenames3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/filenames3.png -------------------------------------------------------------------------------- /images/folder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/folder.png -------------------------------------------------------------------------------- /images/gct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/gct.png -------------------------------------------------------------------------------- /images/geek.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/geek.png -------------------------------------------------------------------------------- /images/genomic_coordinate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/genomic_coordinate.png 
-------------------------------------------------------------------------------- /images/ggplot2_cheatsheet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/ggplot2_cheatsheet.png -------------------------------------------------------------------------------- /images/git_cheatsheet.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/git_cheatsheet.jpeg -------------------------------------------------------------------------------- /images/github-flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/github-flow.png -------------------------------------------------------------------------------- /images/google_arg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/google_arg.png -------------------------------------------------------------------------------- /images/google_more.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/google_more.png -------------------------------------------------------------------------------- /images/google_rotate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/google_rotate.png 
-------------------------------------------------------------------------------- /images/google_tar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/google_tar.png -------------------------------------------------------------------------------- /images/gorilla.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/gorilla.png -------------------------------------------------------------------------------- /images/gsea.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/gsea.png -------------------------------------------------------------------------------- /images/gsea_out1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/gsea_out1.png -------------------------------------------------------------------------------- /images/gsea_out2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/gsea_out2.png -------------------------------------------------------------------------------- /images/hclust.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/hclust.png -------------------------------------------------------------------------------- /images/hpcscheduler.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/hpcscheduler.png -------------------------------------------------------------------------------- /images/logistic_deep.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/logistic_deep.png -------------------------------------------------------------------------------- /images/logistic_rnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/logistic_rnn.png -------------------------------------------------------------------------------- /images/mac_terminal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/mac_terminal.png -------------------------------------------------------------------------------- /images/machine_learning_map.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/machine_learning_map.jpeg -------------------------------------------------------------------------------- /images/matrix_factorization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/matrix_factorization.png -------------------------------------------------------------------------------- /images/merfish.jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/merfish.jpeg -------------------------------------------------------------------------------- /images/meta_excel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/meta_excel.png -------------------------------------------------------------------------------- /images/multinomial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/multinomial.png -------------------------------------------------------------------------------- /images/notebook1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/notebook1.png -------------------------------------------------------------------------------- /images/notebook2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/notebook2.png -------------------------------------------------------------------------------- /images/out2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/out2.png -------------------------------------------------------------------------------- /images/out3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/out3.png -------------------------------------------------------------------------------- /images/phenotype_label.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/phenotype_label.png -------------------------------------------------------------------------------- /images/protein_mRNA_cor.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/protein_mRNA_cor.jpeg -------------------------------------------------------------------------------- /images/protein_vs_rna.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/protein_vs_rna.png -------------------------------------------------------------------------------- /images/protein_vs_rna_ccle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/protein_vs_rna_ccle.png -------------------------------------------------------------------------------- /images/pval_cry.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/pval_cry.jpeg -------------------------------------------------------------------------------- /images/qcfail.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/qcfail.png -------------------------------------------------------------------------------- /images/regular_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/regular_1.png -------------------------------------------------------------------------------- /images/regular_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/regular_2.png -------------------------------------------------------------------------------- /images/rnaseq_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/rnaseq_workflow.png -------------------------------------------------------------------------------- /images/scatter_pvalue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/scatter_pvalue.png -------------------------------------------------------------------------------- /images/scientific_method.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/scientific_method.jpeg -------------------------------------------------------------------------------- /images/shell_terminal.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/shell_terminal.png -------------------------------------------------------------------------------- /images/stackover_rotate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/stackover_rotate.png -------------------------------------------------------------------------------- /images/statquest.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/statquest.png -------------------------------------------------------------------------------- /images/superman.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/superman.jpeg -------------------------------------------------------------------------------- /images/survival1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/survival1.png -------------------------------------------------------------------------------- /images/survival2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/survival2.png -------------------------------------------------------------------------------- /images/survival3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/survival3.png -------------------------------------------------------------------------------- /images/tcga.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/tcga.png -------------------------------------------------------------------------------- /images/terminal.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/terminal.jpeg -------------------------------------------------------------------------------- /images/terminal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/terminal.png -------------------------------------------------------------------------------- /images/tidy_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/tidy_data.png -------------------------------------------------------------------------------- /images/tidyverse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/tidyverse.png -------------------------------------------------------------------------------- /images/twin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/twin.png 
-------------------------------------------------------------------------------- /images/ucsc_example.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/ucsc_example.jpeg -------------------------------------------------------------------------------- /images/umap_vs_tsne.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/umap_vs_tsne.png -------------------------------------------------------------------------------- /images/unix_vs_linux.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/unix_vs_linux.png -------------------------------------------------------------------------------- /images/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/workflow.png -------------------------------------------------------------------------------- /scripts/01_how_to_make_a_heatmap.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "1_how_to_make_a_heatmap" 3 | author: "Ming Tang" 4 | date: "5/2/2023" 5 | output: html_document 6 | editor_options: 7 | chunk_output_type: console 8 | --- 9 | 10 | 11 | Making heatmap is an essential skill for any computational biologist. 
12 | 13 | 14 | ### load the libraries 15 | 16 | ```{r} 17 | library(ComplexHeatmap) 18 | ``` 19 | 20 | ### make dummy data 21 | 22 | ```{r} 23 | 24 | h1 <- c(10,20,10,20,10,20,10,20) 25 | h2 <- c(20,10,20,10,20,10,20,10) 26 | 27 | l1 <- c(1,3,1,3,1,3,1,3) 28 | l2 <- c(3,1,3,1,3,1,3,1) 29 | 30 | mat <- rbind(h1,h2,l1,l2) 31 | colnames(mat)<- paste0("timepoint_", 1:8) 32 | mat 33 | ``` 34 | 35 | 36 | visualize the data 37 | 38 | ```{r} 39 | par(mfrow =c(1,1), mar=c(4,4,1,1)) 40 | plot(1:8,rep(0,8), ylim=c(0,35), pch="", xlab="Time", ylab="Gene Expression") 41 | 42 | for (i in 1:nrow(mat)) { 43 | lines(1:8,mat[i,], lwd=3, col=i) 44 | } 45 | 46 | legend(1,35,rownames(mat), 1:4, cex=0.7) 47 | ``` 48 | 49 | ### Making a heatmap is easy! 50 | 51 | ```{r} 52 | Heatmap(mat, cluster_columns = FALSE) 53 | 54 | quantile(mat, c(0, 0.1,0.5, 0.9)) 55 | ``` 56 | 57 | 58 | change color mapping 59 | 60 | ```{r} 61 | col_fun<- circlize::colorRamp2(c(0, 3, 20), c("blue", "white", "red")) 62 | 63 | Heatmap(mat, cluster_columns = FALSE, col = col_fun) 64 | ``` 65 | 66 | 67 | outlier 68 | 69 | ```{r} 70 | mat2<- mat 71 | mat2[1,1]<- 1000 72 | 73 | Heatmap(mat2, cluster_columns = FALSE) 74 | 75 | Heatmap(mat2, cluster_columns = FALSE, col = col_fun) 76 | 77 | ``` 78 | 79 | Let's scale the data/gene expression level across columns (time points) first 80 | 81 | ```{r} 82 | scaled_mat<- t(scale(t(mat))) 83 | 84 | 85 | ?Heatmap 86 | Heatmap(scaled_mat, cluster_columns = FALSE) 87 | ``` 88 | 89 | Note, after scaling, h1 and l1 are close to each other! 90 | 91 | ### understand clustering? 92 | 93 | define distances 94 | 95 | ```{r} 96 | ?dist 97 | 98 | d<- dist(mat) 99 | 100 | d 101 | ``` 102 | 103 | 104 | ```{r} 105 | ?hclust 106 | 107 | hclust(d) 108 | 109 | 110 | plot(hclust(d)) 111 | ``` 112 | 113 | 114 | After scaling 115 | ```{r} 116 | 117 | d2<- dist(scaled_mat) 118 | 119 | plot(hclust(d2)) 120 | ``` 121 | 122 | Key takeaways: 123 | 1. color mapping is critical 124 | 2. 
scaling is critical 125 | 3. making a heatmap is easy, but it is better to understand the details. 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | -------------------------------------------------------------------------------- /scripts/02_differential_expression_heatmap.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "differential_expression_heatmap" 3 | author: "Ming Tang" 4 | date: "5/9/2023" 5 | output: html_document 6 | editor_options: 7 | chunk_output_type: console 8 | --- 9 | 10 | Let's use a real example https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE197576 11 | 12 | How to download the files from ftp https://www.ncbi.nlm.nih.gov/geo/info/download.html 13 | 14 | https://ftp.ncbi.nlm.nih.gov/geo/series/GSE197nnn/GSE197576/suppl/ 15 | 16 | Alternatively, use GEOquery https://bioconductor.org/packages/release/bioc/html/GEOquery.html 17 | 18 | ```{bash eval=FALSE} 19 | wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE197nnn/GSE197576/suppl/GSE197576_raw_gene_counts_matrix.tsv.gz 20 | 21 | csvtk pretty -t GSE197576_raw_gene_counts_matrix.tsv.gz | less -S 22 | 23 | csvtk headers -t GSE197576_raw_gene_counts_matrix.tsv.gz 24 | 25 | csvtk cut -t -f1,2,3,12,13 GSE197576_raw_gene_counts_matrix.tsv.gz| head 26 | 27 | csvtk cut -t -f1,2,3,12,13 GSE197576_raw_gene_counts_matrix.tsv.gz > raw_counts.tsv 28 | ``` 29 | 30 | Get csvtk at https://github.com/shenwei356/csvtk 31 | 32 | ### read the data into R and make a DESeq2 object 33 | 34 | follow the tutorial http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html 35 | 36 | ```{r} 37 | library(dplyr) 38 | library(readr) 39 | library(here) 40 | library(DESeq2) 41 | 42 | raw_counts<- read_tsv(here("data/raw_counts.tsv")) 43 | 44 | raw_counts_mat<- raw_counts[, -1] %>% as.matrix 45 | 46 | head(raw_counts_mat) 47 | 48 | rownames(raw_counts_mat)<- raw_counts$gene 49 | head(raw_counts_mat) 50 | 51 | ``` 52 | 53 | Make a sample sheet 54 | 55 | ```{r} 56
| coldata<- data.frame(condition = c("normoxia", "normoxia", "hypoxia", "hypoxia")) 57 | 58 | rownames(coldata)<- colnames(raw_counts_mat) 59 | 60 | coldata 61 | ``` 62 | 63 | Make a DEseq2 object 64 | 65 | ```{r} 66 | all(rownames(coldata) == colnames(raw_counts_mat)) 67 | 68 | dds <- DESeqDataSetFromMatrix(countData = raw_counts_mat, 69 | colData = coldata, 70 | design = ~ condition) 71 | dds <- DESeq(dds) 72 | res <- results(dds, contrast = c("condition", "hypoxia", "normoxia")) 73 | 74 | res %>% 75 | as.data.frame() %>% 76 | arrange((padj), desc(log2FoldChange)) %>% 77 | head(n=30) 78 | 79 | 80 | significant_genes<- res %>% 81 | as.data.frame() %>% 82 | filter(padj <=0.01, abs(log2FoldChange) >= 2) %>% 83 | rownames() 84 | 85 | 86 | significant_genes 87 | ``` 88 | 89 | 90 | ### PCA analysis 91 | 92 | ```{r} 93 | vsd <- vst(dds, blind=FALSE) 94 | 95 | plotPCA(vsd, intgroup=c("condition")) 96 | ``` 97 | 98 | 99 | Plot PCA by ourselves. 100 | 101 | ```{r} 102 | vsd <- vst(dds, blind=FALSE) 103 | head(assay(vsd), 3) 104 | 105 | normalized_counts<- assay(vsd) %>% 106 | as.matrix() 107 | 108 | pca_prcomp<- prcomp(t(normalized_counts), center = TRUE, scale. = FALSE) 109 | 110 | names(pca_prcomp) 111 | pca_prcomp$x 112 | 113 | PC1_and_PC2<- data.frame(PC1=pca_prcomp$x[,1], PC2= pca_prcomp$x[,2], 114 | type = rownames(pca_prcomp$x)) 115 | 116 | ## plot PCA plot 117 | library(ggplot2) 118 | 119 | ggplot(PC1_and_PC2, aes(x=PC1, y=PC2, col=type)) + 120 | geom_point() + 121 | geom_text(aes(label = type), hjust=0, vjust=0) + 122 | coord_fixed() 123 | ``` 124 | 125 | It is not exactly the same, what's going on? 
126 | 127 | 128 | ```{r} 129 | ?plotPCA #using the top 500 most variable genes 130 | 131 | ``` 132 | 133 | https://github.com/mikelove/DESeq2/blob/48b80aaac5efd4b9e0d054fc1e4a6e1fa78e782a/R/plots.R#LL245C71-L245C71 134 | 135 | 136 | ### Make a perfect heatmap 137 | 138 | ```{r} 139 | library(ComplexHeatmap) 140 | 141 | significant_mat<- normalized_counts[significant_genes, ] 142 | 143 | Heatmap(t(scale(t(significant_mat)))) 144 | 145 | ``` 146 | 147 | 148 | Yeah, you get this perfect looking heatmap because we select the genes that are different. So, no surprise at all! 149 | 150 | 151 | ```{r} 152 | coldata 153 | 154 | col_anno <- HeatmapAnnotation(df = coldata, 155 | col = list( condition = c("hypoxia" = "red", "normoxia" = "blue"))) 156 | 157 | 158 | Heatmap(t(scale(t(significant_mat))), 159 | top_annotation = col_anno, 160 | show_row_names = FALSE, 161 | name = "scaled normalized\nexpression") 162 | ``` 163 | 164 | Why is scaling important? 165 | 166 | 167 | ```{r} 168 | Heatmap(significant_mat, 169 | top_annotation = col_anno, 170 | show_row_names = FALSE, 171 | name = "normalized\nexpression") 172 | ``` 173 | 174 | -------------------------------------------------------------------------------- /scripts/03_volcano_plot_with_ggrepel.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "volcano plot with ggrepel" 3 | author: "Ming Tang" 4 | date: "5/16/2023" 5 | output: html_document 6 | editor_options: 7 | chunk_output_type: console 8 | --- 9 | 10 | 11 | Continue with https://github.com/crazyhottommy/compbio_tutorials/blob/main/scripts/02_differential_expression_heatmap.Rmd 12 | 13 | ### read the data into R and make a DESeq2 object 14 | 15 | follow the tutorial http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html 16 | 17 | ```{r} 18 | library(dplyr) 19 | library(readr) 20 | library(here) 21 | library(DESeq2) 22 | library(ggplot2) 23 | 24 | raw_counts<-
read_tsv(here("data/raw_counts.tsv")) 25 | 26 | raw_counts_mat<- raw_counts[, -1] %>% as.matrix 27 | 28 | head(raw_counts_mat) 29 | 30 | rownames(raw_counts_mat)<- raw_counts$gene 31 | head(raw_counts_mat) 32 | 33 | ``` 34 | 35 | Make a sample sheet 36 | 37 | ```{r} 38 | coldata<- data.frame(condition = c("normoxia", "normoxia", "hypoxia", "hypoxia")) 39 | 40 | rownames(coldata)<- colnames(raw_counts_mat) 41 | 42 | coldata 43 | ``` 44 | 45 | Make a DEseq2 object 46 | 47 | ```{r} 48 | all(rownames(coldata) == colnames(raw_counts_mat)) 49 | 50 | dds <- DESeqDataSetFromMatrix(countData = raw_counts_mat, 51 | colData = coldata, 52 | design = ~ condition) 53 | dds <- DESeq(dds) 54 | res <- results(dds, contrast = c("condition", "hypoxia", "normoxia")) 55 | 56 | head(res) 57 | ``` 58 | 59 | ### Make a volcano plot 60 | 61 | what is a volcano plot? 62 | 63 | It is a scatter plot: 64 | x-axis is the log2Fold change 65 | 66 | y-axis is -log10(p-value) 67 | 68 | 69 | ```{r} 70 | res %>% 71 | as.data.frame() %>% 72 | ggplot(aes(x = log2FoldChange, y = -log10(pvalue))) + 73 | geom_point() 74 | 75 | ``` 76 | 77 | hmm, what are the points on the top? 78 | 79 | ```{r} 80 | res %>% 81 | as.data.frame() %>% 82 | arrange((padj), desc(log2FoldChange)) %>% 83 | head(n = 30) 84 | ``` 85 | 86 | A basic volcano plot 87 | 88 | ```{r} 89 | res %>% 90 | as.data.frame() %>% 91 | ggplot(aes(x = log2FoldChange, y = -log10(pvalue))) + 92 | geom_point() + 93 | theme_bw(base_size = 14) 94 | ``` 95 | 96 | How to label the genes? 
97 | 98 | ```{r} 99 | 100 | res %>% 101 | as.data.frame() %>% 102 | tibble::rownames_to_column(var = "gene") %>% 103 | filter(!stringr::str_detect(gene, "LOC")) %>% 104 | filter(abs(log2FoldChange)>=4, padj <= 0.001) %>% 105 | dim() 106 | 107 | 108 | res_sig<- res %>% 109 | as.data.frame() %>% 110 | tibble::rownames_to_column(var = "gene") %>% 111 | filter(!stringr::str_detect(gene, "LOC")) %>% 112 | filter(abs(log2FoldChange)>=4, padj <= 0.001) 113 | 114 | 115 | res %>% 116 | as.data.frame() %>% 117 | ggplot(aes(x = log2FoldChange, y = -log10(pvalue))) + 118 | geom_point() + 119 | geom_label(data = res_sig, aes(label = gene))+ 120 | theme_bw(base_size = 14) 121 | ``` 122 | 123 | 124 | the labels are overlapping, let's improve by ggrepel 125 | ```{r} 126 | #install.packages("ggrepel") 127 | library(ggrepel ) 128 | 129 | 130 | res %>% 131 | as.data.frame() %>% 132 | ggplot(aes(x = log2FoldChange, y = -log10(pvalue))) + 133 | geom_point() + 134 | ggrepel::geom_label_repel(data = res_sig, aes(label = gene))+ 135 | theme_bw(base_size = 14) 136 | ``` 137 | 138 | 139 | Let's color the points 140 | 141 | ```{r} 142 | res2<- res %>% 143 | as.data.frame() %>% 144 | tibble::rownames_to_column(var = "gene") %>% 145 | mutate(sig = case_when( 146 | !stringr::str_detect(gene, "LOC") & abs(log2FoldChange)>=4 & 147 | padj <= 0.001 ~ "sig", 148 | TRUE ~ "not sig" 149 | )) 150 | 151 | head(res2) 152 | 153 | ggplot(res2, aes(x = log2FoldChange, y = -log10(pvalue))) + 154 | geom_point(aes(color = sig)) + 155 | ggrepel::geom_label_repel(data = res_sig, aes(label = gene))+ 156 | theme_bw(base_size = 14) 157 | ``` 158 | 159 | 160 | fix the color of the points 161 | 162 | ```{r} 163 | ggplot(res2, aes(x = log2FoldChange, y = -log10(pvalue))) + 164 | geom_point(aes(color = sig)) + 165 | scale_color_manual(values = c("blue", "red")) + 166 | ggrepel::geom_label_repel(data = res_sig, aes(label = gene))+ 167 | theme_bw(base_size = 14) 168 | ``` 169 | 170 | add horizontal and vertical lines 171 
| ```{r} 172 | ggplot(res2, aes(x = log2FoldChange, y = -log10(pvalue))) + 173 | geom_point(aes(color = sig)) + 174 | scale_color_manual(values = c("blue", "red")) + 175 | ggrepel::geom_label_repel(data = res_sig, aes(label = gene))+ 176 | geom_hline(yintercept = 100, linetype = 2, color = "red") + 177 | geom_vline(xintercept = c(-4, 4), linetype = 2, color = "red")+ 178 | theme_bw(base_size = 14) 179 | ``` 180 | 181 | Enhanced volcanoplot: https://bioconductor.org/packages/devel/bioc/vignettes/EnhancedVolcano/inst/doc/EnhancedVolcano.html 182 | 183 | -------------------------------------------------------------------------------- /scripts/04_create_seurat_object_from_GEO.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "How to create a Seurat object from a GEO dataset" 3 | author: "Ming Tang" 4 | date: "07/12/2023" 5 | output: html_document 6 | editor_options: 7 | chunk_output_type: console 8 | --- 9 | 10 | 11 | ### Download the data 12 | 13 | https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE116256 14 | 15 | ```{bash} 16 | 17 | cd /Users/tommytang/github_repos/compbio_tutorials/data/GSE116256 18 | wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE116nnn/GSE116256/suppl/GSE116256_RAW.tar 19 | tar xvf GSE116256_RAW.tar 20 | rm GSE116256_RAW.tar 21 | ``` 22 | 23 | ```{r} 24 | library(here) 25 | library(stringr) 26 | library(dplyr) 27 | library(ggplot2) 28 | library(Seurat) 29 | library(purrr) 30 | library(readr) 31 | library(harmony) 32 | library(scCustomize) 33 | library(SeuratDisk) 34 | ``` 35 | 36 | read in the count matrix 37 | 38 | ```{r} 39 | read_counts<- function(file){ 40 | x<- read_tsv(file) 41 | x<- as.data.frame(x) 42 | genes<- x$Gene 43 | x<- x[, -1] 44 | rownames(x)<- genes 45 | return(as.matrix(x)) 46 | } 47 | 48 | 49 | counts_files<- list.files(here("data/GSE116256"), full.names = TRUE, pattern = "*dem.txt.gz") 50 | 51 | samples<- map_chr(counts_files, basename) 52 | 53 | samples<- str_replace(samples,
"(GSM[0-9]+_.+).dem.txt.gz", "\\1") 54 | 55 | names(counts_files)<- samples 56 | 57 | counts<- purrr::map(counts_files[1:4], read_counts) 58 | 59 | ``` 60 | 61 | 62 | ```{r} 63 | read_meta<- function(file){ 64 | y<- read_tsv(file) 65 | y<- as.data.frame(y) 66 | cells<- y$Cell 67 | y<- y[,-1] 68 | rownames(y)<- cells 69 | return(y) 70 | } 71 | 72 | 73 | meta_files<- list.files(here("data/GSE116256"), full.names = TRUE, pattern = "*anno.txt.gz") 74 | meta_names<- map_chr(meta_files, basename) 75 | meta_names<- str_replace(meta_names, "(GSM[0-9]+_.+).anno.txt.gz", "\\1") 76 | names(meta_files)<- meta_names 77 | 78 | meta<- purrr::map(meta_files[1:4], read_meta) 79 | ``` 80 | 81 | ### create a seurat object 82 | 83 | ```{r} 84 | library(Matrix) #for sparse matrix 85 | objs<- purrr::map2(counts, meta, 86 | ~CreateSeuratObject(counts = as(.x, "sparseMatrix"), 87 | meta.data = .y)) 88 | 89 | 90 | # merge to a single object 91 | merged_seurat<- purrr::reduce(objs, function(x,y) {merge(x,y)}) 92 | 93 | ## free memory 94 | rm(counts) 95 | rm(objs) 96 | rm(meta) 97 | gc() 98 | ``` 99 | 100 | ### preprocess the data 101 | 102 | ```{r} 103 | merged_seurat<- merged_seurat %>% 104 | NormalizeData(normalization.method = "LogNormalize", scale.factor = 10000) %>% 105 | FindVariableFeatures( selection.method = "vst", nfeatures = 2000) %>% 106 | ScaleData() %>% 107 | RunPCA() %>% 108 | RunHarmony(group.by.vars = "orig.ident", dims.use = 1:30) %>% 109 | RunUMAP(reduction = "harmony", dims = 1:30) %>% 110 | FindNeighbors(reduction = "harmony", dims = 1:30) %>% 111 | FindClusters(resolution = 0.6) 112 | ``` 113 | 114 | 115 | ### visualization 116 | 117 | ```{r} 118 | DimPlot_scCustom(seurat_object = merged_seurat) 119 | 120 | ``` 121 | -------------------------------------------------------------------------------- /scripts/05_find_tissue_specific_genes_human_protein_atlas.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 
"05_find_tissue_specific_genes_human_protein_atlas" 3 | author: "Ming Tang" 4 | date: "6/15/2023" 5 | output: html_document 6 | editor_options: 7 | chunk_output_type: console 8 | --- 9 | 10 | http://www.bioconductor.org/packages/devel/bioc/vignettes/HPAanalyze/inst/doc/b_HPAanalyze_indepth.html 11 | 12 | 13 | ```{r} 14 | #BiocManager::install("HPAanalyze") 15 | library(tidyverse) 16 | library(HPAanalyze) 17 | ``` 18 | 19 | Most of the time, you will only need the “histology” datasets, which contain normal_tissue, pathology (basically cancers) and subcellular_location. 20 | 21 | ```{r} 22 | downloadedData <- hpaDownload(downloadList='histology') 23 | ``` 24 | 25 | The `normal_tissue` dataset contains information about protein expression profiles in human tissues based on IHC staining. The datasets contain six columns: ensembl (Ensembl gene identifier); gene (HGNC symbol), tissue (tissue name); cell_type (annotated cell type); level (expression value); reliability (the gene reliability of the expression value) 26 | 27 | ```{r} 28 | 29 | names(downloadedData) 30 | 31 | downloadedData$normal_tissue %>% 32 | head() 33 | 34 | table(downloadedData$normal_tissue$level) 35 | 36 | downloadedData$normal_tissue %>% 37 | filter(cell_type == "smooth muscle cells") %>% 38 | janitor::tabyl(tissue, cell_type) 39 | 40 | downloadedData$normal_tissue %>% 41 | pivot_wider(names_from = c("tissue", "cell_type"), values_from = "level") %>% 42 | View() 43 | 44 | data<- downloadedData$normal_tissue %>% 45 | pivot_wider(names_from = c("tissue", "cell_type"), values_from = "level") %>% 46 | filter(`smooth muscle_smooth muscle cells` %in% c("High", "Medium")) 47 | 48 | View(data) 49 | ``` 50 | 51 | what are the membrane genes 52 | 53 | ```{r} 54 | head(downloadedData$subcellular_location) 55 | 56 | table(downloadedData$subcellular_location$main_location) %>% 57 | sort() 58 | 59 | memberane_genes<- downloadedData$subcellular_location %>% 60 | filter(str_detect(main_location, "Plasma membrane")) 
61 | 62 | inner_join(data, memberane_genes, by = c("ensembl" = "ensembl")) 63 | ``` 64 | 65 | 66 | Make a heatmap 67 | 68 | ```{r} 69 | library(ComplexHeatmap) 70 | 71 | data2<- data %>% 72 | filter(gene %in% memberane_genes$gene) 73 | 74 | mat<- as.matrix(data2[, -c(1,2,3)]) 75 | rownames(mat)<- data2$gene 76 | 77 | ComplexHeatmap::Heatmap(mat, 78 | show_row_names = FALSE, 79 | show_column_names = FALSE) 80 | ``` 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /scripts/06_scRNseq_two_lines_from_fastq_to_count_matrix.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "06_two_lines_command_scRNAseq_from_fastq_to_count_matrix" 3 | author: "Ming Tang" 4 | date: "6/19/2023" 5 | output: html_document 6 | editor_options: 7 | chunk_output_type: console 8 | --- 9 | 10 | ### Let's use 10x Genomics data 11 | 12 | https://www.10xgenomics.com/resources/datasets/human-pbmc-from-a-healthy-donor-1-k-cells-v-2-2-standard-4-0-0 13 | 14 | 15 | install tools 16 | 17 | gget https://github.com/pachterlab/gget 18 | 19 | https://github.com/mamba-org/mamba 20 | 21 | https://www.kallistobus.tools/ 22 | 23 | ```{bash} 24 | mamba create -n kb-python python=3.7 25 | conda activate kb-python 26 | pip install kb-python gget ffq 27 | 28 | ``` 29 | 30 | 31 | ```{bash} 32 | time kb ref \ 33 | -i index.idx \ 34 | -g t2g.txt \ 35 | -f1 transcriptome.fa \ 36 | $(gget ref --ftp -w dna,gtf homo_sapiens) 37 | 38 | 39 | Mon Jun 19 23:26:27 2023 INFO Fetching reference information for homo_sapiens from Ensembl release: 109.
40 | [2023-06-19 23:26:30,564] INFO [ref] Preparing http://ftp.ensembl.org/pub/release-109/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz, http://ftp.ensembl.org/pub/release-109/gtf/homo_sapiens/Homo_sapiens.GRCh38.109.gtf.gz 41 | [2023-06-19 23:28:19,733] INFO [ref] Splitting genome http://ftp.ensembl.org/pub/release-109/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz into cDNA at /Users/tommytang/playground/tmp/tmp6352jm21 42 | [2023-06-19 23:52:12,318] INFO [ref] Concatenating 1 cDNAs to transcriptome.fa 43 | [2023-06-19 23:52:14,048] INFO [ref] Creating transcript-to-gene mapping at t2g.txt 44 | [2023-06-19 23:52:17,463] INFO [ref] Indexing transcriptome.fa to index.idx 45 | kb ref -i index.idx -g t2g.txt -f1 transcriptome.fa 716.34s user 32.85s system 36% cpu 34:25.42 total 46 | ``` 47 | 48 | A little over 10 minutes of CPU time (716 s user), but about 34 minutes of wall-clock time (34:25 total). 49 | 50 | prepare fastq input 51 | 52 | ```{bash} 53 | cd sc5p_v2_hs_PBMC_1k_5gex_fastqs 54 | 55 | ls -1d sc5p_v2_hs_PBMC_1k_5gex_fastqs/* | grep -E "R1|R2" 56 | 57 | ls -1d sc5p_v2_hs_PBMC_1k_5gex_fastqs/* | grep -E "R1|R2" | tr "\n" " " 58 | 59 | sc5p_v2_hs_PBMC_1k_5gex_S1_L001_R1_001.fastq.gz sc5p_v2_hs_PBMC_1k_5gex_S1_L001_R2_001.fastq.gz sc5p_v2_hs_PBMC_1k_5gex_S1_L002_R1_001.fastq.gz sc5p_v2_hs_PBMC_1k_5gex_S1_L002_R2_001.fastq.gz 60 | ``` 61 | 62 | ```{bash} 63 | time kb count \ 64 | -i index.idx \ 65 | -g t2g.txt \ 66 | -x 10xv2 \ 67 | -t 8 \ 68 | -m 16G \ 69 | -o out \ 70 | sc5p_v2_hs_PBMC_1k_5gex_fastqs/sc5p_v2_hs_PBMC_1k_5gex_S1_L001_R1_001.fastq.gz sc5p_v2_hs_PBMC_1k_5gex_fastqs/sc5p_v2_hs_PBMC_1k_5gex_S1_L001_R2_001.fastq.gz sc5p_v2_hs_PBMC_1k_5gex_fastqs/sc5p_v2_hs_PBMC_1k_5gex_S1_L002_R1_001.fastq.gz sc5p_v2_hs_PBMC_1k_5gex_fastqs/sc5p_v2_hs_PBMC_1k_5gex_S1_L002_R2_001.fastq.gz 71 | 72 | 73 | 74 | [2023-06-21 22:04:33,465] INFO [count] Using index index.idx to generate BUS file to out from 75 | [2023-06-21 22:04:33,466] INFO [count]
sc5p_v2_hs_PBMC_1k_5gex_fastqs/sc5p_v2_hs_PBMC_1k_5gex_S1_L001_R1_001.fastq.gz 76 | [2023-06-21 22:04:33,466] INFO [count] sc5p_v2_hs_PBMC_1k_5gex_fastqs/sc5p_v2_hs_PBMC_1k_5gex_S1_L001_R2_001.fastq.gz 77 | [2023-06-21 22:04:33,466] INFO [count] sc5p_v2_hs_PBMC_1k_5gex_fastqs/sc5p_v2_hs_PBMC_1k_5gex_S1_L002_R1_001.fastq.gz 78 | [2023-06-21 22:04:33,466] INFO [count] sc5p_v2_hs_PBMC_1k_5gex_fastqs/sc5p_v2_hs_PBMC_1k_5gex_S1_L002_R2_001.fastq.gz 79 | 80 | [2023-06-21 22:06:50,383] INFO [count] Sorting BUS file out/output.bus to out/tmp/output.s.bus 81 | [2023-06-21 22:07:05,398] INFO [count] Whitelist not provided 82 | [2023-06-21 22:07:05,399] INFO [count] Copying pre-packaged 10XV2 whitelist to out 83 | [2023-06-21 22:07:05,513] INFO [count] Inspecting BUS file out/tmp/output.s.bus 84 | [2023-06-21 22:07:08,064] INFO [count] Correcting BUS records in out/tmp/output.s.bus to out/tmp/output.s.c.bus with whitelist out/10x_version2_whitelist.txt 85 | [2023-06-21 22:07:12,351] INFO [count] Sorting BUS file out/tmp/output.s.c.bus to out/output.unfiltered.bus 86 | [2023-06-21 22:07:26,625] INFO [count] Generating count matrix out/counts_unfiltered/cells_x_genes from BUS file out/output.unfiltered.bus 87 | kb count -i index.idx -g t2g.txt -x 10xv2 -t 8 -m 16G -o out 851.81s user 26.97s system 469% cpu 3:07.26 total 88 | 89 | ``` 90 | 91 | About 3 minutes!
92 | 93 | read in the unfiltered count matrix and filter out the empty droplets 94 | ```{r} 95 | library(Matrix, quietly = TRUE) # load libraries 96 | library(DropletUtils, quietly = TRUE) 97 | library(dplyr) 98 | library(ggplot2) 99 | ``` 100 | 101 | read in the matrix, genes and barcodes 102 | 103 | ```{r} 104 | raw_mtx <- readMM('~/playground/out/counts_unfiltered/cells_x_genes.mtx') 105 | genes <- read.csv('~/playground/out/counts_unfiltered/cells_x_genes.genes.txt', sep = '\t', header = FALSE) 106 | barcodes<- read.csv('~/playground/out/counts_unfiltered/cells_x_genes.barcodes.txt', sep = '\t', header = FALSE) 107 | 108 | 109 | raw_mtx<- t(raw_mtx) 110 | rownames(raw_mtx) <- genes[,1] # attach gene_ids 111 | colnames(raw_mtx) <- barcodes[,1] 112 | ``` 113 | 114 | draw knee plot 115 | 116 | ```{r} 117 | tot_counts <- colSums(raw_mtx) 118 | 119 | df <- tibble(total = tot_counts, 120 | rank = row_number(desc(total))) %>% 121 | distinct() %>% 122 | arrange(rank) 123 | 124 | ggplot(df, aes(total, rank)) + 125 | geom_path() + 126 | scale_x_log10() + 127 | scale_y_log10() + 128 | annotation_logticks() + 129 | labs(y = "Barcode rank", x = "Total UMI count") 130 | ``` 131 | 132 | 133 | 134 | ```{r} 135 | out <- emptyDrops(raw_mtx) # get probability that each barcode is a cell 136 | keep <- out$FDR <= 0.05 # define threshold probability for calling a cell 137 | keep[is.na(keep)] <- FALSE 138 | filt_mtx <- raw_mtx[,keep] # subset raw mtx to remove empty drops 139 | 140 | dim(filt_mtx) 141 | ``` 142 | 143 | ### SRR files from SRA 144 | 145 | https://github.com/pachterlab/ffq 146 | 147 | 148 | ```{bash} 149 | ffq SRR9990627 150 | ``` 151 | 152 | 153 | 154 | ```{bash} 155 | time kb count \ 156 | -i index.idx \ 157 | -g t2g.txt \ 158 | -x 10xv2 \ 159 | -t 8 \ 160 | -m 16G \ 161 | -o out \ 162 | $(ffq --ftp SRR10668798 | jq -r '.[] | .url' | tr '\n' ' ') 163 | ``` 164 | 165 | 166 | --------------------------------------------------------------------------------
/scripts/07_gene_set_enrichment_RNAseq.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "gene set enrichment analysis from RNAseq data" 3 | author: "Ming Tang" 4 | date: '2023-06-26' 5 | output: html_document 6 | editor_options: 7 | chunk_output_type: console 8 | --- 9 | 10 | 11 | ## Gene set enrichment analysis 12 | 13 | Let's use a real example https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE197576 14 | 15 | How to download the files from ftp https://www.ncbi.nlm.nih.gov/geo/info/download.html 16 | 17 | https://ftp.ncbi.nlm.nih.gov/geo/series/GSE197nnn/GSE197576/suppl/ 18 | 19 | Alternatively, use GEOquery https://bioconductor.org/packages/release/bioc/html/GEOquery.html 20 | 21 | ```{bash eval=FALSE} 22 | wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE197nnn/GSE197576/suppl/GSE197576_raw_gene_counts_matrix.tsv.gz 23 | 24 | csvtk pretty -t GSE197576_raw_gene_counts_matrix.tsv.gz | less -S 25 | 26 | csvtk headers -t GSE197576_raw_gene_counts_matrix.tsv.gz 27 | 28 | csvtk cut -t -f1,2,3,12,13 GSE197576_raw_gene_counts_matrix.tsv.gz| head 29 | 30 | csvtk cut -t -f1,2,3,12,13 GSE197576_raw_gene_counts_matrix.tsv.gz > raw_counts.tsv 31 | ``` 32 | 33 | Get csvtk at https://github.com/shenwei356/csvtk 34 | 35 | ### read the data into R and make a DESeq2 object 36 | 37 | follow the tutorial http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html 38 | 39 | ```{r} 40 | library(dplyr) 41 | library(readr) 42 | library(here) 43 | library(DESeq2) 44 | 45 | raw_counts<- read_tsv(here("data/raw_counts.tsv")) 46 | 47 | raw_counts_mat<- raw_counts[, -1] %>% as.matrix 48 | 49 | head(raw_counts_mat) 50 | 51 | rownames(raw_counts_mat)<- raw_counts$gene 52 | head(raw_counts_mat) 53 | 54 | ``` 55 | 56 | Make a sample sheet 57 | 58 | ```{r} 59 | coldata<- data.frame(condition = c("normoxia", "normoxia", "hypoxia", "hypoxia")) 60 | 61 | rownames(coldata)<- colnames(raw_counts_mat) 62 | 63 | coldata 64 |
``` 65 | 66 | Make a DEseq2 object 67 | 68 | ```{r} 69 | all(rownames(coldata) == colnames(raw_counts_mat)) 70 | 71 | dds <- DESeqDataSetFromMatrix(countData = raw_counts_mat, 72 | colData = coldata, 73 | design = ~ condition) 74 | dds <- DESeq(dds) 75 | res <- results(dds, contrast = c("condition", "hypoxia", "normoxia")) 76 | 77 | res %>% 78 | as.data.frame() %>% 79 | arrange((padj), desc(log2FoldChange)) %>% 80 | head(n=30) 81 | 82 | 83 | significant_genes<- res %>% 84 | as.data.frame() %>% 85 | filter(padj <=0.01, abs(log2FoldChange) >= 2) %>% 86 | rownames() 87 | 88 | 89 | significant_genes 90 | ``` 91 | 92 | ## pathway analysis 93 | 94 | https://yulab-smu.top/biomedical-knowledge-mining-book/enrichment-overview.html 95 | 96 | ### over-representation test 97 | 98 | ```{r} 99 | library(clusterProfiler) 100 | 101 | #convert gene symbol to Entrez ID for 102 | 103 | significant_genes_map<- clusterProfiler::bitr(geneID = significant_genes, 104 | fromType="SYMBOL", toType="ENTREZID", 105 | OrgDb="org.Hs.eg.db") 106 | 107 | head(significant_genes_map) 108 | 109 | ## background genes are genes that are detected in the RNAseq experiment 110 | background_genes<- res %>% 111 | as.data.frame() %>% 112 | filter(baseMean != 0) %>% 113 | tibble::rownames_to_column(var = "gene") %>% 114 | pull(gene) 115 | 116 | 117 | res_df<- res %>% 118 | as.data.frame() %>% 119 | filter(baseMean != 0) %>% 120 | tibble::rownames_to_column(var = "gene") 121 | 122 | background_genes_map<- bitr(geneID = background_genes, 123 | fromType="SYMBOL", 124 | toType="ENTREZID", 125 | OrgDb="org.Hs.eg.db") 126 | ``` 127 | 128 | GO term enrichment 129 | 130 | Gene Ontology(GO) defines concepts/classes used to describe gene function, and relationships between these concepts. 
It classifies functions along three aspects: 131 | 132 | MF: Molecular Function 133 | molecular activities of gene products 134 | 135 | CC: Cellular Component 136 | where gene products are active 137 | 138 | BP: Biological Process 139 | pathways and larger processes made up of the activities of multiple gene products 140 | 141 | GO terms are organized in a directed acyclic graph, where edges between terms represent parent-child relationship. 142 | 143 | ```{r} 144 | ego <- enrichGO(gene = significant_genes_map$ENTREZID, 145 | universe = background_genes_map$ENTREZID, 146 | OrgDb = org.Hs.eg.db, 147 | ont = "BP", 148 | pAdjustMethod = "BH", 149 | pvalueCutoff = 0.01, 150 | qvalueCutoff = 0.05, 151 | readable = TRUE) 152 | head(ego) 153 | 154 | library(enrichplot) 155 | barplot(ego, showCategory=20) 156 | dotplot(ego) 157 | ``` 158 | 159 | 160 | H: hallmark gene sets 161 | C1: positional gene sets 162 | C2: curated gene sets 163 | C3: motif gene sets 164 | C4: computational gene sets 165 | C5: GO gene sets 166 | C6: oncogenic signatures 167 | C7: immunologic signatures 168 | 169 | 170 | ```{r} 171 | # install.packages("msigdbr") 172 | library(msigdbr) 173 | 174 | m_df <- msigdbr(species = "Homo sapiens") 175 | head(m_df) 176 | 177 | m_t2g <- msigdbr(species = "Homo sapiens", category = "H") %>% 178 | dplyr::select(gs_name, entrez_gene) 179 | 180 | 181 | table(m_t2g$gs_name) 182 | head(m_t2g) 183 | 184 | em <- enricher(significant_genes_map$ENTREZID, TERM2GENE=m_t2g, 185 | universe = background_genes_map$ENTREZID ) 186 | head(em) 187 | ``` 188 | 189 | ### Gene set enrichment analysis 190 | 191 | ```{r} 192 | ## you need all the genes and pre-rank them by p-value 193 | ## rank all the genes by signed fold change * -log10pvalue. 
194 | 195 | res_df<- res_df %>% 196 | mutate(signed_rank_stats = sign(log2FoldChange) * -log10(pvalue)) %>% 197 | left_join(background_genes_map, by= c("gene" = "SYMBOL")) %>% 198 | arrange(desc(signed_rank_stats)) 199 | 200 | gene_list<- res_df$signed_rank_stats 201 | names(gene_list)<- res_df$ENTREZID 202 | 203 | em2 <- GSEA(gene_list, TERM2GENE=m_t2g) 204 | 205 | ## change the inf to big numbers 206 | res_df<- res_df %>% 207 | mutate(negative_log10pvalue = -log10(pvalue)) %>% 208 | mutate(negative_log10pvalue = ifelse(is.infinite(negative_log10pvalue), 1000, negative_log10pvalue)) %>% 209 | mutate(signed_rank_stats = sign(log2FoldChange) * negative_log10pvalue) 210 | 211 | gene_list<- res_df$signed_rank_stats 212 | names(gene_list)<- res_df$ENTREZID 213 | 214 | 215 | em2 <- GSEA(gene_list, TERM2GENE=m_t2g) 216 | head(em2) 217 | 218 | em2@result %>% View() 219 | ``` 220 | 221 | ### visualization 222 | 223 | 224 | ```{r} 225 | p1<- gseaplot(em2, geneSetID = "HALLMARK_G2M_CHECKPOINT", 226 | by = "runningScore", title = "HALLMARK_G2M_CHECKPOINT") 227 | 228 | p2 <- gseaplot(em2, geneSetID = "HALLMARK_HYPOXIA", 229 | by = "runningScore", title = "HALLMARK_HYPOXIA") 230 | 231 | p1/p2 232 | ``` 233 | 234 | 235 | important thread on background gene selection https://twitter.com/mdziemann/status/1626407797939384320 by Mark Ziemann 236 | 237 | Further reading https://twitter.com/tangming2005/status/1671873310257295360 238 | 239 | -------------------------------------------------------------------------------- /scripts/08_intro_to_singleCellExperiment.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "singleCellExperiment" 3 | author: "Ming Tang" 4 | date: '2024-06-01' 5 | output: html_document 6 | editor_options: 7 | chunk_output_type: console 8 | --- 9 | 10 | ### introduction to singleCellExperiment 11 | 12 | references: 13 | 14 | 
https://www.bioconductor.org/packages/release/bioc/vignettes/SingleCellExperiment/inst/doc/intro.html 15 | 16 | 17 | ```{r} 18 | if (!requireNamespace("BiocManager", quietly = TRUE)) 19 | install.packages("BiocManager") 20 | 21 | BiocManager::install("scater") 22 | 23 | BiocManager::install("SingleCellExperiment") 24 | 25 | library(SingleCellExperiment) 26 | library(scater) 27 | ``` 28 | 29 | https://ftp.ncbi.nlm.nih.gov/geo/series/GSE126nnn/GSE126030/suppl/ 30 | 31 | ```{bash eval=FALSE} 32 | wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE126nnn/GSE126030/suppl/GSE126030_RAW.tar 33 | 34 | tar -xvf GSE126030_RAW.tar 35 | ``` 36 | 37 | 38 | ```{r} 39 | library(tidyverse) 40 | library(here) 41 | counts_df<- read_tsv(here("data/GSM3589407_PP002swap.filtered.matrix.txt.gz")) 42 | 43 | colnames(counts_df) %>% tail() 44 | 45 | counts_df$...2466 %>% head() 46 | 47 | table(counts_df$...2466, useNA="ifany") 48 | ``` 49 | 50 | remove the last column 51 | ```{r} 52 | length(colnames(counts_df)) 53 | counts_df<- counts_df[, -2466] 54 | 55 | counts_df[1:5, 1:5] 56 | 57 | counts_mat<- counts_df[, -c(1,2)] %>% 58 | as.matrix() 59 | 60 | rownames(counts_mat)<- counts_df$Gene 61 | 62 | counts_mat[1:5, 1:5] 63 | ``` 64 | 65 | ### Create a SingleCellExperiment Object 66 | 67 | ```{r} 68 | sce <- SingleCellExperiment(assays = list(counts = counts_mat)) 69 | 70 | sce 71 | ``` 72 | To access the count data we just supplied, we can do any one of the following: 73 | 74 | ```{r} 75 | assay(sce, "counts") 76 | 77 | assay(sce, "counts")[1:5, 1:5] 78 | 79 | counts(sce)[1:5, 1:5] 80 | ``` 81 | 82 | One can access the slots with the `@` operator. This is considered bad practice as the class developers are free to alter the internal structure of the class, at which point any code using `@` may no longer work. 
Rather, it is best to use the provided **getter** functions like `assay()` and `counts()` 83 | 84 | ```{r} 85 | sce@assays 86 | 87 | sce@assays@data 88 | 89 | sce@assays@data$counts[1:5, 1:5] 90 | ``` 91 | 92 | add more assays 93 | 94 | ```{r} 95 | sce <- scuttle::logNormCounts(sce) 96 | sce 97 | ``` 98 | 99 | ```{r} 100 | sce@assays@data$logcounts[1:5, 1:5] 101 | 102 | assay(sce, "logcounts")[1:5, 1:5] 103 | 104 | dim(logcounts(sce)) 105 | ``` 106 | 107 | You may add the assay manually 108 | 109 | ```{r} 110 | counts_1 <- counts(sce) + 1 111 | 112 | assay(sce, "counts_1") <- counts_1 # assign a new entry to assays slot 113 | 114 | assays(sce) 115 | 116 | # not recommended way to add new assay data 117 | sce@assays@data$counts_2<- counts(sce) + 2 118 | assays(sce) 119 | ``` 120 | 121 | ```{r} 122 | assayNames(sce) 123 | ``` 124 | 125 | ### add metadata 126 | 127 | ```{r} 128 | sce <- scuttle::addPerCellQC(sce) 129 | 130 | 131 | colData(sce) 132 | 133 | coldata<- colData(sce) 134 | 135 | identical(rownames(coldata), colnames(counts_mat)) 136 | 137 | sce <- scuttle::addPerFeatureQC(sce) 138 | rowData(sce) 139 | ``` 140 | 141 | ```{r} 142 | rowRanges(sce) 143 | ``` 144 | 145 | ### other metadata 146 | 147 | the metadata slot, a named list of arbitrary objects. For example, say we have some favorite genes (e.g., highly variable genes) that we want to store inside of sce for use in our analysis at a later point. 
148 | 149 | ```{r} 150 | my_genes <- c("gene_1", "gene_5") 151 | metadata(sce) <- list(favorite_genes = my_genes) 152 | metadata(sce) 153 | ``` 154 | 155 | ### dimension reduction 156 | 157 | ```{r} 158 | sce <- scater::logNormCounts(sce) 159 | 160 | sce <- scater::runPCA(sce) 161 | 162 | sce 163 | 164 | 165 | reducedDim(sce, "PCA")[1:5, 1:5] 166 | 167 | dim(reducedDim(sce, "PCA")) 168 | 169 | sce <- scater::runTSNE(sce, perplexity = 0.1) 170 | 171 | reducedDim(sce, "TSNE")[1:5, 1:2] 172 | 173 | reducedDims(sce) 174 | 175 | 176 | reducedDimNames(sce) 177 | ``` 178 | 179 | ### add 180 | 181 | ```{r} 182 | u <- uwot::umap(t(logcounts(sce)), n_neighbors = 2) 183 | reducedDim(sce, "UMAP_uwot") <- u 184 | 185 | reducedDims(sce) # Now stored in the object. 186 | ``` 187 | 188 | 189 | ```{r} 190 | colLabels(sce) <- scran::clusterCells(sce, use.dimred="PCA") 191 | table(colLabels(sce)) 192 | ``` 193 | 194 | ```{r} 195 | scater::plotReducedDim(sce, dimred="TSNE") 196 | ``` 197 | 198 | -------------------------------------------------------------------------------- /scripts/09_intro_to_seurat_V5.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "09_intro_to_seurat_v5" 3 | author: "Ming Tang" 4 | date: '2024-06-06' 5 | output: html_document 6 | editor_options: 7 | chunk_output_type: console 8 | --- 9 | 10 | ### Overview of Layers in Seurat V5 11 | 12 | In Seurat V5, a Layer is a new abstraction that can encapsulate different modalities or features of the single-cell data. This allows users to seamlessly integrate and analyze data from various sources while retaining the ability to use familiar Seurat functions. 13 | 14 | Key Concepts of Layers 15 | Layered Data: Each layer can represent a modality (like RNA, protein, etc.) or a different version of data (like log-normalized versus raw counts). 
16 | 17 | ```{r} 18 | options(Seurat.object.assay.version = "v5") 19 | library(Seurat) 20 | library(dplyr) 21 | 22 | # packageVersion("Seurat") 23 | 24 | ``` 25 | 26 | Load the example data 27 | ```{r} 28 | # devtools::install_github('satijalab/seurat-data') 29 | library(SeuratData) 30 | InstallData("pbmc3k") 31 | data("pbmc3k") 32 | pbmc3k 33 | 34 | pbmc3k<- UpdateSeuratObject(pbmc3k) 35 | 36 | pbmc3k 37 | ``` 38 | 39 | 40 | ```{r} 41 | pbmc3k<- pbmc3k %>% 42 | NormalizeData(normalization.method = "LogNormalize", scale.factor = 10000) %>% 43 | FindVariableFeatures(selection.method = "vst", nfeatures = 2000) %>% 44 | ScaleData() %>% 45 | RunPCA(verbose = FALSE) %>% 46 | RunUMAP(dims = 1:10, verbose = FALSE) 47 | ``` 48 | 49 | ### Seurat V5 50 | 51 | Seurat v5 assays store data in layers. These layers can store raw, un-normalized counts (layer='counts'), normalized data (layer='data'), or z-scored/variance-stabilized data (layer='scale.data'). 52 | 53 | ```{r} 54 | pbmc3k@meta.data %>% head() 55 | pbmc3k@assays 56 | pbmc3k@assays$RNA 57 | pbmc3k[["RNA"]] 58 | 59 | 60 | pbmc3k@assays$RNA$counts[1:5, 1:5] 61 | pbmc3k[["RNA"]]@counts[1:5, 1:5] 62 | 63 | Layers(pbmc3k) 64 | LayerData(pbmc3k, "counts")[1:5, 1:5] 65 | 66 | pbmc3k[["RNA"]]$data 67 | LayerData(pbmc3k, "data")[1:5, 1:5] 68 | 69 | 70 | ##use getter function 71 | GetAssayData(object = pbmc3k, slot = 'data')[1:5, 1:5] 72 | 73 | ``` 74 | 75 | dimension reduction 76 | 77 | ```{r} 78 | pbmc3k@reductions 79 | 80 | pbmc3k@reductions$pca 81 | 82 | pbmc3k@reductions$pca@cell.embeddings %>% head() 83 | 84 | pbmc3k[["pca"]] 85 | pbmc3k[["pca"]]@cell.embeddings %>% head() 86 | 87 | ## getter function 88 | 89 | Embeddings(pbmc3k, "pca") %>% head() 90 | Embeddings(pbmc3k, "umap") %>% head() 91 | 92 | ### cell loadings 93 | Loadings(pbmc3k, "pca") %>% head() 94 | 95 | ``` 96 | 97 | ### convert between V4 and V5 98 | 99 | ```{r} 100 | obj<- pbmc3k 101 | 102 | #convert v5 assay to v4 assay within same object 103 | 
obj[["RNA"]]<- as(obj[["RNA"]], Class= "Assay") 104 | 105 | class(obj[["RNA"]]) 106 | # convert it back 107 | obj[["RNA"]]<- as(obj[["RNA"]], Class= "Assay5") 108 | 109 | class(obj[["RNA"]]) 110 | 111 | pbmc3k_v3 <- Convert(pbmc3k, version = "3.0") 112 | 113 | ``` 114 | 115 | -------------------------------------------------------------------------------- /scripts/10_analyze_my_tweets.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Untitled" 3 | output: html_document 4 | date: "2024-10-03" 5 | editor_options: 6 | chunk_output_type: console 7 | --- 8 | 9 | ```{r} 10 | # Load necessary libraries 11 | library(jsonlite) 12 | library(dplyr) 13 | library(tidyr) 14 | library(purrr) 15 | 16 | # Step 1: Read the raw file 17 | raw_data <- readLines("~/Downloads/twitter-2024-10-01-52b49f0122d5bd108bcb56a0d40e5f809f4ade7fdbf0c0e457d924d980f12230/data/tweets.js") 18 | 19 | # Step 2: Remove the JavaScript variable declaration ("window.YTD.tweets.part0 =") 20 | json_data <- gsub("window.YTD.tweets.part0 = ", "", raw_data) 21 | 22 | # Step 3: Collapse the data into a single string (in case it's split across multiple lines) 23 | json_data <- paste(json_data, collapse = "") 24 | 25 | # Step 4: Remove any trailing semi-colon at the end (if present) 26 | json_data <- gsub(";$", "", json_data) 27 | 28 | # Step 5: Now parse the cleaned JSON 29 | tweets_data <- fromJSON(json_data) 30 | 31 | head(tweets_data) 32 | colnames(tweets_data) 33 | class(tweets_data) 34 | 35 | ## get the dataframe 36 | tweets_df<- tweets_data$tweet 37 | 38 | 39 | colnames(tweets_df) 40 | # Step 7: View the structure of the resulting dataframe 41 | glimpse(tweets_df) 42 | 43 | # Optionally, you can now extract specific fields like "full_text" and "created_at" 44 | tweets_cleaned <- tweets_df %>% 45 | select(created_at, full_text, favorite_count, retweet_count) %>% 46 | mutate(favorite_count = as.numeric(favorite_count), 47 | retweet_count = 
as.numeric(retweet_count)) 48 | 49 | # View cleaned dataframe 50 | head(tweets_cleaned) 51 | 52 | tweets_cleaned %>% 53 | arrange(desc(favorite_count)) %>% 54 | View() 55 | 56 | ``` 57 | 58 | 59 | 60 | ```{r} 61 | # Step 6: Convert to a tibble (dataframe) using tidyverse 62 | tweets_df <- as_tibble(tweets_data) 63 | 64 | # Step 7: Extract key tweet information, including URLs 65 | # Unnest the 'tweet' column, extract relevant fields, and handle nested 'urls' 66 | tweets_cleaned <- tweets_df %>% 67 | unnest_wider(tweet) %>% 68 | unnest_wider(entities) %>% 69 | unnest_longer(urls, keep_empty = TRUE) %>% 70 | mutate(url = map(urls, ~ if (is.data.frame(.x) && "url" %in% colnames(.x)) .x$url else NA_character_)) %>% 71 | select(created_at, full_text, favorite_count, retweet_count, url) 72 | 73 | 74 | temp %>% 75 | mutate(url = map_chr(urls, ~ if (is.data.frame(.x) && "url" %in% colnames(.x)) .x$url else NA_character_)) 76 | 77 | temp$url[[1]] 78 | # Step 8: View cleaned dataframe with URLs 79 | head(tweets_cleaned) 80 | ``` 81 | 82 | -------------------------------------------------------------------------------- /scripts/11_change_rownames_ENSEMBL_to_symbol_RNAseq.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "change gene names" 3 | output: html_document 4 | date: "2024-10-10" 5 | editor_options: 6 | chunk_output_type: console 7 | --- 8 | 9 | ### The problem 10 | You have a count matrix with rownames are ENSEMBL ids, and you want 11 | to change them to gene symbols. 12 | 13 | 14 | ```{r} 15 | #BiocManager::install("recount3") 16 | library(recount3) 17 | library(purrr) 18 | library(dplyr) 19 | library(ggplot2) 20 | human_projects <- available_projects() 21 | 22 | tcga_info = subset( 23 | human_projects, 24 | file_source == "tcga" & project_type == "data_sources" 25 | ) 26 | 27 | head(tcga_info) 28 | ``` 29 | 30 | 31 | ```{r} 32 | tcga_info[1, ] 33 | 34 | ## create the RangedSummarizedExperiment. 
the create_rse function works on 35 | ## one row at a time 36 | 37 | rse_acc<- create_rse(tcga_info[1,]) 38 | 39 | rse_acc 40 | ``` 41 | 42 | 43 | ```{r} 44 | rse_acc@assays@data$raw_counts[1:5, 1:5] 45 | 46 | rse_acc@rowRanges 47 | ``` 48 | 49 | ### mapping ENSEMBL ID to gene symbol 50 | 51 | ```{r} 52 | if (!requireNamespace("BiocManager", quietly = TRUE)) 53 | install.packages("BiocManager") 54 | BiocManager::install(c("AnnotationDbi", "org.Hs.eg.db")) 55 | library(AnnotationDbi) 56 | library(org.Hs.eg.db) 57 | ``` 58 | 59 | 60 | ```{r} 61 | 62 | mat<- rse_acc@assays@data$raw_counts 63 | mat[1:5, 1:5] 64 | ensembl_ids<- rownames(mat) 65 | 66 | 67 | # Map ENSEMBL IDs to HGNC symbols using org.Hs.eg.db 68 | # you get errors! 69 | gene_symbols <- AnnotationDbi::select(org.Hs.eg.db, 70 | keys = ensembl_ids, 71 | columns = "SYMBOL", # The output you want (gene symbol) 72 | keytype = "ENSEMBL", # The input key type (ENSEMBL ID) 73 | multiVals = "first") # NOTE(review): multiVals is documented for mapIds(), not select(); duplicates are still removed manually below 74 | 75 | head(ensembl_ids) 76 | 77 | # remove the version number 78 | ensembl_ids<- ensembl_ids %>% 79 | stringr::str_replace("\\.[0-9]+$", "") 80 | 81 | rownames(mat)<- ensembl_ids 82 | mat[1:5, 1:5] 83 | 84 | gene_symbols <- AnnotationDbi::select(org.Hs.eg.db, 85 | keys = ensembl_ids, 86 | columns = "SYMBOL", # The output you want (gene symbol) 87 | keytype = "ENSEMBL", # The input key type (ENSEMBL ID) 88 | multiVals = "first") 89 | 90 | head(gene_symbols) 91 | ``` 92 | 93 | 94 | ```{r} 95 | gene_symbols %>% 96 | janitor::get_dupes(SYMBOL) %>% 97 | head() 98 | 99 | gene_symbols %>% 100 | janitor::get_dupes(SYMBOL) %>% 101 | filter(!is.na(SYMBOL)) 102 | ``` 103 | 104 | ### remove the NA and remove the duplicates 105 | 106 | ```{r} 107 | gene_symbols_uniq<- gene_symbols %>% 108 | filter(!is.na(SYMBOL)) %>% 109 | distinct(SYMBOL, .keep_all = TRUE) %>% 110 | distinct(ENSEMBL, .keep_all = TRUE) 111 | 112 | 113 | dim(gene_symbols_uniq) 114 | 115 | gene_symbols_uniq %>% 116 | 
janitor::get_dupes(ENSEMBL) 117 | 118 | gene_symbols_uniq %>% 119 | janitor::get_dupes(SYMBOL) 120 | ``` 121 | 122 | 123 | ### subset the orignal matrix 124 | 125 | 126 | ```{r} 127 | mat_subset<- mat[gene_symbols_uniq$ENSEMBL, ] 128 | 129 | all.equal(rownames(mat_subset), gene_symbols_uniq$ENSEMBL) 130 | 131 | rownames(mat_subset)<- gene_symbols_uniq$SYMBOL 132 | 133 | mat_subset[1:5, 1:5] 134 | ``` -------------------------------------------------------------------------------- /scripts/15_how_to_get_metadata_GEO.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "15_how_to_get_metadata_from_GEO" 3 | author: "Ming Tang" 4 | date: '2023-12-07' 5 | output: html_document 6 | editor_options: 7 | chunk_output_type: console 8 | --- 9 | 10 | ```{r} 11 | # BiocManager::install("GEOquery") 12 | library(GEOquery) 13 | library(tidyverse) 14 | 15 | meta<- getGEO(GEO="GSE185507",GSEMatrix=FALSE) 16 | ``` 17 | 18 | ```{r} 19 | meta@gsms$GSM5616943@header$characteristics_ch1 20 | ``` 21 | 22 | ```{r} 23 | purrr::map(meta@gsms, ~.x@header$characteristics_ch1) %>% 24 | stack() %>% 25 | tidyr::separate(values, into = c("feature", "value"), sep= ": ")%>% 26 | pivot_wider(names_from= feature, values_from = value) %>% 27 | janitor::clean_names() %>% 28 | write_csv("~/Downloads/GSE185507_meta.csv") 29 | ``` 30 | 31 | ## 5 tools to fetch GEO and other databases' metadata and data 32 | 33 | 1. GEOfetch https://geofetch.databio.org/en/latest/ 34 | 35 | 2. bioconductor package GEOquery https://bioconductor.org/packages/release/bioc/html/GEOquery.html 36 | 37 | 3. [ffq](https://github.com/pachterlab/ffq) Fetch metadata information from databases. https://github.com/pachterlab/ffq 38 | 39 | 4. [pysradb](https://github.com/saketkc/pysradb): a python package to query next-generation sequencing metadata and data from NCBI sequence read archive. 40 | 41 | 5. 
[GEOparse](https://github.com/guma44/GEOparse) 42 | -------------------------------------------------------------------------------- /scripts/16_get_mouse_gene_exon_lengths.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "get gene length" 3 | output: html_document 4 | date: "2025-01-16" 5 | editor_options: 6 | chunk_output_type: console 7 | --- 8 | 9 | ```{r} 10 | #install.packages("BiocManager") 11 | #BiocManager::install("TxDb.Mmusculus.UCSC.mm9.knownGene") 12 | #BiocManager::install("org.Mm.eg.db") 13 | 14 | library(TxDb.Mmusculus.UCSC.mm9.knownGene) 15 | library(org.Mm.eg.db) 16 | ``` 17 | 18 | Now, we can retrieve the gene data: 19 | ```{r} 20 | txdb <- TxDb.Mmusculus.UCSC.mm9.knownGene 21 | mm9_genes <- genes(txdb) 22 | mm9_genes 23 | ``` 24 | 25 | To map the Entrez ID to gene symbols, we use: 26 | 27 | ```{r} 28 | gene_symbol <- AnnotationDbi::select(org.Mm.eg.db, keys = mm9_genes$gene_id, 29 | columns = "SYMBOL", keytype = "ENTREZID") 30 | ``` 31 | 32 | Make sure the gene IDs match: 33 | 34 | ```{r} 35 | all.equal(mm9_genes$gene_id, gene_symbol$ENTREZID) 36 | 37 | # Add gene symbols to the data: 38 | mm9_genes$symbol <- gene_symbol$SYMBOL 39 | ``` 40 | 41 | ```{r} 42 | width(mm9_genes) 43 | 44 | df <- data.frame(EntrezID = mm9_genes$gene_id, 45 | Symbol = mm9_genes$symbol, 46 | Gene_length = width(mm9_genes)) 47 | 48 | head(df) 49 | ``` 50 | 51 | Why is this useful? When normalizing H3K36me3 signals (found in gene bodies), you can use gene lengths for proper normalization. Also, for RNA-seq, exon lengths are essential for calculating metrics like RPKM (though TPM is preferred now). 52 | 53 | ### exon lengths 54 | 55 | ```{r} 56 | exons<- exonsBy(txdb, by = "gene") 57 | exons 58 | ``` 59 | 60 | This returns a GRangesList object and each element of the list is a GRanges containing all the exons for that gene. 
61 | 62 | Let’s calculate the total exon lengths for each gene by the width function 63 | 64 | ```{r} 65 | width(exons) 66 | 67 | head(sum(width(exons))) 68 | ``` 69 | 70 | ```{r} 71 | exon_len<- tibble::enframe(sum(width(exons)), # direct call instead of %>%: this file never attaches magrittr/dplyr 72 | name = "ENTREZID", value = "exon_length") 73 | 74 | head(exon_len) 75 | ``` 76 | 77 | -------------------------------------------------------------------------------- /scripts/R_tips_01_add_percentage_to_y_axis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "add percentage to y axis" 3 | author: "Ming Tang" 4 | date: "5/28/2023" 5 | output: html_document 6 | editor_options: 7 | chunk_output_type: console 8 | --- 9 | 10 | ### How to add percentages to the y-axis of a bar plot and how to order the bar plot by the value of each bar. 11 | 12 | The transcripts and genes for a reference sequence UCSC annotation https://genome.ucsc.edu/ 13 | 14 | The TxDb family of packages and data objects manages information on transcripts and gene models. We consider those derived from annotation tables prepared for the UCSC genome browser. 
15 | 16 | ```{r} 17 | library(TxDb.Hsapiens.UCSC.hg19.knownGene) 18 | library(dplyr) 19 | library(ggplot2) 20 | 21 | txdb<- TxDb.Hsapiens.UCSC.hg19.knownGene # abbreviate 22 | txdb 23 | 24 | ``` 25 | 26 | ```{r} 27 | genes(txdb) 28 | 29 | #exons(txdb) 30 | ``` 31 | 32 | always check the first several rows first 33 | 34 | ```{r} 35 | genes(txdb) %>% 36 | as.data.frame() %>% 37 | head() 38 | ``` 39 | 40 | Let's count how many genes in each chromosomes 41 | 42 | ```{r} 43 | genes_df<- genes(txdb) %>% 44 | as.data.frame() 45 | 46 | genes_df %>% 47 | dplyr::count(seqnames) 48 | ``` 49 | 50 | Let's remove the unconventional chromosomes first 51 | 52 | ```{r} 53 | conventional_chrs<- paste0("chr", c(1:22, "X", "Y")) 54 | 55 | total_gene<- genes_df %>% 56 | filter(seqnames %in% conventional_chrs) %>% 57 | nrow() 58 | 59 | 60 | genes_per_chr<- genes_df %>% 61 | filter(seqnames %in% conventional_chrs) %>% 62 | dplyr::count(seqnames) 63 | 64 | genes_per_chr %>% 65 | mutate(genes_percent = n/total_gene) 66 | ``` 67 | 68 | make the figure 69 | ```{r} 70 | genes_per_chr %>% 71 | mutate(genes_percent = n/total_gene) %>% 72 | ggplot(aes(x= seqnames, y = genes_percent)) + 73 | geom_bar(stat = "identity") 74 | ``` 75 | 76 | 77 | rotate the x-axis 78 | 79 | ```{r} 80 | genes_per_chr %>% 81 | mutate(genes_percent = n/total_gene) %>% 82 | ggplot(aes(x= seqnames, y = genes_percent)) + 83 | geom_bar( stat = "identity") + 84 | xlab("") + 85 | theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) 86 | ``` 87 | 88 | use them_bw 89 | ```{r} 90 | genes_per_chr %>% 91 | mutate(genes_percent = n/total_gene) %>% 92 | ggplot(aes(x= seqnames, y = genes_percent)) + 93 | geom_bar( stat = "identity") + 94 | xlab("") + 95 | theme_bw(base_size = 14) + 96 | theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) 97 | ``` 98 | 99 | change y-axis to percentage 100 | 101 | ```{r} 102 | genes_per_chr %>% 103 | mutate(genes_percent = n/total_gene) %>% 104 | ggplot(aes(x= seqnames, y = 
genes_percent)) + 105 | geom_bar(stat = "identity") + 106 | xlab("") + 107 | scale_y_continuous(labels = scales::percent) + 108 | theme_bw(base_size = 14) + 109 | theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) 110 | ``` 111 | 112 | Reorder the bar from small to big: 113 | 114 | https://forcats.tidyverse.org/reference/fct_reorder.html 115 | 116 | ```{r} 117 | genes_per_chr %>% 118 | mutate(genes_percent = n/total_gene) %>% 119 | ggplot(aes(x= seqnames %>% 120 | forcats::fct_reorder(genes_percent) 121 | , y = genes_percent)) + 122 | geom_bar(stat = "identity") + 123 | xlab("") + 124 | scale_y_continuous(labels = scales::percent) + 125 | theme_bw(base_size = 14) + 126 | theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) 127 | ``` 128 | 129 | reverse the order 130 | 131 | ```{r} 132 | genes_per_chr %>% 133 | mutate(genes_percent = n/total_gene) %>% 134 | ggplot(aes(x= seqnames %>% 135 | forcats::fct_reorder(genes_percent, .desc = TRUE) 136 | , y = genes_percent)) + 137 | geom_bar(stat = "identity") + 138 | xlab("") + 139 | scale_y_continuous(labels = scales::percent) + 140 | theme_bw(base_size = 14) + 141 | theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) 142 | ``` 143 | 144 | -------------------------------------------------------------------------------- /scripts/R_tips_02_add_side_to_scatterplot.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R_tips_02_add_side_to_scatterplot" 3 | author: "Ming Tang" 4 | date: "6/2/2023" 5 | output: html_document 6 | editor_options: 7 | chunk_output_type: console 8 | --- 9 | 10 | ### introduce ggside using single cell data 11 | 12 | The ggside R package provides a new way to visualize data by combining the flexibility of ggplot2 with the power of side-by-side plots. 13 | 14 | We will use a single cell dataset to demonstrate its usage. 
15 | 16 | ggside allows users to create side-by-side plots of multiple variables, such as gene expression, cell type, and experimental conditions. This can be helpful for identifying patterns and trends in scRNA-seq data that would be difficult to see in individual plots. Additionally, ggside provides a number of features that make it easy to customize the appearance of side-by-side plots, such as changing the color scheme, adding labels, and adjusting the layout. 17 | 18 | https://cran.r-project.org/web/packages/ggside/vignettes/ggside_basic_usage.html 19 | 20 | ### Load libraries 21 | ```{r} 22 | # install.packages("ggside") 23 | library(ggside) 24 | library(Seurat) 25 | library(dplyr) 26 | library(SeuratData) 27 | ``` 28 | 29 | load data 30 | ```{r} 31 | data("pbmc3k") 32 | 33 | pbmc3k 34 | ``` 35 | 36 | 37 | ### routine processing 38 | 39 | ```{r} 40 | pbmc3k<- pbmc3k %>% 41 | NormalizeData(normalization.method = "LogNormalize", scale.factor = 10000) %>% 42 | FindVariableFeatures(selection.method = "vst", nfeatures = 2000) %>% 43 | ScaleData() %>% 44 | RunPCA(verbose = FALSE) %>% 45 | FindNeighbors(dims = 1:10, verbose = FALSE) %>% 46 | FindClusters(resolution = 0.5, verbose = FALSE) %>% 47 | RunUMAP(dims = 1:10, verbose = FALSE) 48 | 49 | Idents(pbmc3k)<- pbmc3k$seurat_annotations 50 | 51 | DimPlot(pbmc3k, label = TRUE, repel=TRUE) + NoLegend() 52 | ``` 53 | 54 | 55 | some helper functions to extract the gene expression values from the seurat object 56 | 57 | ```{r} 58 | matrix_to_expression_df<- function(x, obj){ 59 | df<- x %>% 60 | as.matrix() %>% 61 | as.data.frame() %>% 62 | tibble::rownames_to_column(var= "gene") %>% 63 | tidyr::pivot_longer(cols = -1, names_to = "cell", values_to = "expression") %>% 64 | tidyr::pivot_wider(names_from = "gene", values_from = expression) %>% 65 | left_join(obj@meta.data %>% 66 | tibble::rownames_to_column(var = "cell")) 67 | return(df) 68 | } 69 | 70 | 71 | get_expression_data<- function(obj, assay = "RNA", slot = "data", 
72 | genes = NULL, cells = NULL){ # genes/cells: optional row/column selectors; NULL keeps every row/column 73 | if (is.null(genes) && !is.null(cells)){ # only cells requested 74 | df<- GetAssayData(obj, assay = assay, slot = slot)[, cells, drop = FALSE] %>% 75 | matrix_to_expression_df(obj = obj) 76 | } else if (!is.null(genes) && is.null(cells)){ # only genes requested 77 | df <- GetAssayData(obj, assay = assay, slot = slot)[genes, , drop = FALSE] %>% 78 | matrix_to_expression_df(obj = obj) 79 | } else if (is.null(genes) && is.null(cells)){ # neither requested: keep the whole matrix 80 | df <- GetAssayData(obj, assay = assay, slot = slot)[, , drop = FALSE] %>% 81 | matrix_to_expression_df(obj = obj) 82 | } else { # both requested: subset rows and columns 83 | df<- GetAssayData(obj, assay = assay, slot = slot)[genes, cells, drop = FALSE] %>% 84 | matrix_to_expression_df(obj = obj) 85 | } 86 | return(df) 87 | } 88 | ``` 89 | 90 | 91 | test the function 92 | 93 | ```{r} 94 | df<- get_expression_data(obj = pbmc3k, genes = c("CD14", "FCGR3A")) 95 | 96 | head(df) 97 | ``` 98 | 99 | 100 | a plain scatter plot 101 | ```{r} 102 | df %>% 103 | filter(seurat_annotations %in% c("CD14+ Mono", "FCGR3A+ Mono")) %>% 104 | ggplot(aes(x= CD14, y = FCGR3A)) + 105 | geom_point(aes(color = seurat_annotations)) 106 | ``` 107 | 108 | 109 | a scatter plot adding two boxplots 110 | 111 | ```{r} 112 | df %>% 113 | filter(seurat_annotations %in% c("CD14+ Mono", "FCGR3A+ Mono")) %>% 114 | ggplot(aes(x= CD14, y = FCGR3A)) + 115 | geom_point(aes(color = seurat_annotations)) + 116 | geom_xsideboxplot(aes(y = seurat_annotations, color = seurat_annotations), 117 | orientation = "y") + 118 | geom_ysideboxplot(aes(x = seurat_annotations, color = seurat_annotations), 119 | orientation = "x")+ 120 | scale_xsidey_discrete() + 121 | scale_ysidex_discrete()+ 122 | theme(ggside.panel.scale.x = 0.2, 123 | ggside.panel.scale.y = 0.3) 124 | ``` 125 | 126 | a scatterplot adding one boxplot and one density plot 127 | 128 | ```{r} 129 | df %>% 130 | filter(seurat_annotations %in% c("CD14+ Mono", "FCGR3A+ Mono")) %>% 131 | ggplot(aes(x= CD14, y = FCGR3A)) + 132 | geom_point(aes(color = seurat_annotations)) + 133 | 
geom_xsideboxplot(aes(y = seurat_annotations, color = seurat_annotations), 134 | orientation = "y") + 135 | geom_ysidedensity(aes(x = after_stat(density), color = seurat_annotations, fill = seurat_annotations), 136 | position = "stack", alpha = 0.4) + 137 | scale_xsidey_discrete() + 138 | scale_ysidex_continuous(guide = guide_axis(angle = 90), minor_breaks = NULL) + 139 | theme(ggside.panel.scale.x = 0.2, 140 | ggside.panel.scale.y = 0.4) 141 | ``` 142 | 143 | ### alternative way: use patchwork 144 | 145 | https://patchwork.data-imaginist.com/ 146 | 147 | ```{r} 148 | library(patchwork) 149 | 150 | p1<- df %>% 151 | filter(seurat_annotations %in% c("CD14+ Mono", "FCGR3A+ Mono")) %>% 152 | ggplot(aes(x= seurat_annotations, y = CD14)) + 153 | geom_boxplot(aes(color = seurat_annotations)) + 154 | xlab("") + 155 | theme( 156 | axis.title.x = element_blank(), 157 | axis.text.x = element_blank(), 158 | axis.ticks.x = element_blank(), 159 | #legend.position = "none", legend.text = element_blank() 160 | )+ 161 | coord_flip() 162 | 163 | p2<- df %>% 164 | filter(seurat_annotations %in% c("CD14+ Mono", "FCGR3A+ Mono")) %>% 165 | ggplot(aes(x= CD14, y = FCGR3A)) + 166 | geom_point(aes(color = seurat_annotations)) + 167 | theme(legend.position = "none", legend.text = element_blank()) 168 | 169 | p3<- df %>% 170 | filter(seurat_annotations %in% c("CD14+ Mono", "FCGR3A+ Mono")) %>% 171 | ggplot(aes(x= seurat_annotations, y = FCGR3A)) + 172 | geom_boxplot(aes(color = seurat_annotations)) + 173 | theme(legend.position = "none") + 174 | ylab("") + 175 | xlab("") + 176 | theme( 177 | axis.title.y = element_blank(), 178 | axis.text.y = element_blank(), 179 | axis.ticks.y = element_blank() 180 | ) 181 | 182 | p1 + plot_spacer() + p2 + p3 + 183 | plot_layout(widths = c(4, 2), heights = c(1, 5), 184 | guides = 'collect') 185 | 186 | ``` 187 | 188 | 189 | 190 | -------------------------------------------------------------------------------- /scripts/R_tips_03_extract_tables_from_PDF.Rmd: 
-------------------------------------------------------------------------------- 1 | --- 2 | title: "extract tables" 3 | author: "Ming Tang" 4 | date: "6/9/2023" 5 | output: html_document 6 | --- 7 | 8 | ### how to extract tables from a PDF file using Tabulizer https://github.com/ropensci/tabulizer 9 | 10 | ```{r} 11 | # install.packages("tabulizer") does not work; install it from GitHub instead 12 | 13 | if (!requireNamespace("remotes", quietly = TRUE)) { 14 | install.packages("remotes") 15 | } 16 | 17 | remotes::install_github(c("ropensci/tabulizerjars", "ropensci/tabulizer")) 18 | 19 | library(tabulizer) 20 | 21 | out <- extract_tables("/cloud/project/sciadv.abm1831.pdf", pages = 3, guess = TRUE, 22 | output = "data.frame") 23 | ``` 24 | 25 | 26 | ```{r} 27 | out[[1]] # first extracted table 28 | View(out[[1]]) 29 | 30 | ``` 31 | -------------------------------------------------------------------------------- /scripts/R_tips_04_list_column_dataframe_in_dataframe.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R_tips_04_list_column_dataframe_in_dataframe" 3 | author: "Ming Tang" 4 | date: "6/13/2023" 5 | output: html_document 6 | editor_options: 7 | chunk_output_type: console 8 | --- 9 | 10 | ```{r} 11 | library(tibble) 12 | library(dplyr) 13 | library(tidyr) 14 | library(purrr) 15 | library(ggplot2) 16 | ``` 17 | 18 | 19 | make a list column by using tidyr::nest() 20 | ```{r} 21 | mtcars_list<- mtcars %>% 22 | group_by(cyl) %>% 23 | tidyr::nest() 24 | 25 | mtcars_list$data[[1]] 26 | mtcars_list$data[[2]] 27 | 28 | ``` 29 | 30 | a list column can even contain ggplot2 objects 31 | 32 | ```{r} 33 | mtcars_list<- mtcars_list %>% 34 | mutate(plots = purrr::map(data, ~ggplot(.x, aes(x= hp, y = mpg)) + geom_point())) 35 | 36 | mtcars_list 37 | ``` 38 | 39 | save the plots 40 | 41 | ```{r} 42 | walk2(mtcars_list$cyl, mtcars_list$plots, 43 | ~ ggsave(filename = paste0("~/Downloads/cyl", .x, "_plot.pdf"), 44 | plot = .y, width = 4, height = 4)) 45 | 46 | # map2() produces the same side effect but also returns (and prints) a list of results 47 
| map2(mtcars_list$cyl, mtcars_list$plots, 48 | ~ ggsave(filename = paste0("~/Downloads/cyl", .x, "_plot.pdf"), 49 | plot = .y, width = 4, height = 4)) 50 | ``` 51 | 52 | 53 | nest by two variables 54 | 55 | ```{r} 56 | mtcars_list2<- mtcars %>% 57 | group_by(cyl, gear) %>% 58 | nest() 59 | 60 | mtcars_list2 %>% 61 | mutate(filename = paste0("cyl", cyl, "_gear", gear)) 62 | ``` 63 | 64 | -------------------------------------------------------------------------------- /scripts/R_tips_05_read_all_files_in_a_folder.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "read in all files" 3 | author: "Ming Tang" 4 | date: "6/14/2023" 5 | output: html_document 6 | editor_options: 7 | chunk_output_type: console 8 | --- 9 | 10 | ### combine all the counts into a single dataframe 11 | 12 | https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE197320 13 | 14 | ```{r} 15 | library(tidyverse) 16 | library(purrr) 17 | 18 | file1<- read_tsv("~/Downloads/GSE197320_RAW/GSM5914555_SKOV3_ALDH_Veh_Rep1_Genes_ReadCount.txt.gz", 19 | col_names = FALSE) 20 | 21 | files<- list.files("~/Downloads/GSE197320_RAW", full.names = TRUE) 22 | 23 | 24 | samples<- basename(files) %>% 25 | str_replace(".+_ALDH_(.+)_Genes_ReadCount.txt.gz", "\\1") 26 | 27 | # read one two-column count file; `x` is the file path, `sample` names the count column 28 | read_count<- function(x, sample){ 29 | df<- read_tsv(x, col_names = FALSE) 30 | names(df)<- c("gene", sample) 31 | return(df) 32 | } 33 | 34 | counts<- map2(files,samples,read_count) 35 | # join on the shared gene column explicitly instead of relying on the natural-join guess 36 | counts_table<- purrr::reduce(counts, inner_join, by = "gene") 37 | 38 | head(counts_table) 39 | ``` 40 | 41 | 42 | 43 | ```{r} 44 | bind_cols(counts) %>% 45 | head() 46 | 47 | ``` 48 | 49 | ### dummy files 50 | 51 | 52 | ```{r} 53 | library(tidyr) 54 | library(dplyr) 55 | library(readr) 56 | 57 | list.files("~/playground", pattern = "sample[0-9]\\.tsv$") 58 | 59 | files<- list.files("~/playground", pattern = "sample[0-9]\\.tsv$", full.names = TRUE) 60 | 61 | files 62 | # name each path by its file name without the extension, independent of where the folder lives 63 | names(files)<- basename(files) %>% 
64 | stringr::str_replace("\\.tsv$", "") 65 | 66 | files 67 | 68 | dat1<- read_tsv(files[1]) 69 | dat2<- read_tsv(files[2]) 70 | dat3<- read_tsv(files[3]) 71 | dat4<-read_tsv(files[4]) 72 | 73 | ``` 74 | 75 | use a for loop (it works, but `rbind()` copies `results` on every iteration; the `lapply`/`map` versions below avoid that) 76 | 77 | ```{r} 78 | 79 | results<- data.frame() 80 | 81 | for (file in files) { 82 | x<- read_tsv(file) 83 | sample_name<- basename(file) %>% 84 | stringr::str_replace("\\.tsv$", "") 85 | x$sample<- sample_name 86 | results<- rbind(results, x) 87 | } 88 | 89 | 90 | 91 | results 92 | 93 | results %>% 94 | tidyr::pivot_wider(names_from = sample, values_from = count) 95 | ``` 96 | 97 | 98 | use lapply 99 | 100 | ```{r} 101 | counts<- lapply(files, read_tsv) 102 | 103 | do.call(rbind, counts) 104 | 105 | purrr::reduce(counts, rbind) 106 | 107 | ## add a sample name column (the file name without its directory and .tsv extension) 108 | read_count<- function(file){ 109 | x<- read_tsv(file) 110 | sample_name<- basename(file) %>% 111 | stringr::str_replace("\\.tsv$", "") 112 | x$sample<- sample_name 113 | return(x) 114 | } 115 | 116 | # lapply from base R 117 | counts<- lapply(files, read_count) 118 | 119 | #base R 120 | do.call(rbind, counts) 121 | 122 | # purrr 123 | purrr::reduce(counts, rbind) %>% 124 | tidyr::pivot_wider(names_from = sample, values_from = count) 125 | 126 | ``` 127 | 128 | use purrr::map and bind_rows 129 | ```{r} 130 | # map function from purrr; the names of `files` become the `sample` id column below 131 | counts<- purrr::map(files, read_tsv) 132 | 133 | dplyr::bind_rows(counts, .id = "sample") %>% 134 | tidyr::pivot_wider(names_from = sample, values_from = count) 135 | ``` 136 | 137 | ```{r} 138 | counts<- purrr::map_df(files, readr::read_tsv, .id = "sample") 139 | counts %>% 140 | tidyr::pivot_wider(names_from = sample, values_from = count) 141 | ``` 142 | 143 | -------------------------------------------------------------------------------- /scripts/R_tips_06_avoid_overplotting_ggblend.Rmd: 
-------------------------------------------------------------------------------- 1 | --- 2 | title: "avoid overplotting ggblend" 3 | author: "Ming Tang" 4 | date: "6/21/2023" 5 | output: html_document 6 | editor_options: 7 | chunk_output_type: console 8 | --- 9 | 10 | ## How to avoid overplotting with ggblend 11 | 12 | https://mjskay.github.io/ggblend/ 13 | 14 | We’ll construct a simple dataset with two semi-overlapping point clouds. We’ll have two versions of the dataset: one with all the "a" points listed first, and one with all the "b" points listed first. 15 | 16 | ```{r} 17 | library(ggplot2) 18 | library(ggblend) 19 | 20 | theme_set(ggdist::theme_ggdist() + theme( 21 | plot.title = element_text(size = rel(1), lineheight = 1.1, face = "bold"), 22 | plot.subtitle = element_text(face = "italic"), 23 | panel.border = element_rect(color = "gray75", fill = NA) 24 | )) 25 | 26 | ``` 27 | 28 | 29 | 30 | ```{r} 31 | set.seed(1234) 32 | df_a <- data.frame(x = rnorm(500, 0), y = rnorm(500, 1), set = "a") 33 | df_b <- data.frame(x = rnorm(500, 1), y = rnorm(500, 2), set = "b") 34 | 35 | df_ab <- rbind(df_a, df_b) |> 36 | transform(order = "draw a then b") 37 | 38 | df_ba <- rbind(df_b, df_a) |> 39 | transform(order = "draw b then a") 40 | 41 | df <- rbind(df_ab, df_ba) 42 | ``` 43 | 44 | 45 | ```{r} 46 | df |> 47 | ggplot(aes(x, y, color = set)) + 48 | geom_point(size = 3, alpha = 0.5) + 49 | scale_color_brewer(palette = "Set1") + 50 | facet_grid(~ order) + 51 | labs(title = "geom_point() without blending", subtitle = "Draw order matters.") 52 | ``` 53 | 54 | 55 | 56 | ```{r} 57 | df |> 58 | ggplot(aes(x, y, color = set)) + 59 | geom_point(size = 3, alpha = 0.5) |> blend("multiply") + 60 | scale_color_brewer(palette = "Set1") + 61 | facet_grid(~ order) + 62 | labs( 63 | title = "geom_point(alpha = 0.5) |> blend('multiply')", 64 | subtitle = "Draw order does not matter, but color is too dark." 
65 | ) 66 | ``` 67 | 68 | 69 | The RStudio graphics device does not support blending, so draw the plot on a Cairo PDF device instead (last chunk) 70 | ```{r} 71 | df |> 72 | ggplot(aes(x, y, color = set, partition = set)) + 73 | geom_point(size = 3, alpha = 0.5) * (blend("lighten") + blend("multiply", alpha = 0.5)) + 74 | scale_color_brewer(palette = "Set1") + 75 | facet_grid(~ order) + 76 | labs( 77 | title = "geom_point(aes(partition = set)) * (blend('lighten') + blend('multiply', alpha = 0.5))", 78 | subtitle = "Two order-independent blends on one layer using the distributive law." 79 | ) + 80 | theme(plot.subtitle = element_text(lineheight = 1.2)) 81 | ``` 82 | 83 | 84 | ```{r} 85 | cairo_pdf("~/github_repos/compbio_tutorials/results/blend.pdf", width = 6, height = 4) 86 | 87 | df |> 88 | ggplot(aes(x, y, color = set, partition = set)) + 89 | geom_point(size = 3, alpha = 0.5) * (blend("lighten") + blend("multiply", alpha = 0.5)) + 90 | scale_color_brewer(palette = "Set1") + 91 | facet_grid(~ order) + 92 | labs( 93 | title = "geom_point(aes(partition = set)) * (blend('lighten') + blend('multiply', alpha = 0.5))", 94 | subtitle = "Two order-independent blends on one layer using the distributive law." 95 | ) + 96 | theme(plot.subtitle = element_text(lineheight = 1.2)) 97 | 98 | dev.off() 99 | ``` 100 | 101 | -------------------------------------------------------------------------------- /scripts/R_tips_06_hierarchical_clustering.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "clustering in R" 3 | author: "Ming Tang" 4 | date: '2023-06-30' 5 | output: html_document 6 | editor_options: 7 | chunk_output_type: console 8 | --- 9 | 10 | In hierarchical clustering, a dendrogram is a diagram that shows the hierarchical relationship between the clusters that are formed. It is a tree-like diagram that is commonly used to visualize the results of hierarchical clustering. 
A dendrogram can help you to understand how the different clusters are related to each other, and it can also be used to determine the optimal number of clusters for your data. To create a dendrogram, the distances between each pair of clusters are calculated and then represented graphically using a tree-like structure, with the clusters being represented by the branches and the distance between the clusters being represented by the length of the branches. By looking at a dendrogram, you can see which clusters are more closely related to each other and which are more distantly related. 11 | 12 | ```{r dendrogram, echo=FALSE, fig.cap="what is a dendrogram", out.width = '60%'} 13 | 14 | knitr::include_graphics(c("../images/dendrogram.png")) 15 | ``` 16 | 17 | ## distance measures 18 | 19 | Before clustering, one first has to define the distance between data points. There are different 20 | distance measures as shown in Figure \@ref(fig:distmeasure). Credit: https://towardsdatascience.com/9-distance-measures-in-data-science-918109d069fa. 21 | 22 | ```{r distmeasure, echo=FALSE, fig.cap="distance measures", out.width = '80%'} 23 | 24 | knitr::include_graphics(c("../images/distance_measures.jpeg")) 25 | ``` 26 | 27 | ```{r} 28 | ?dist 29 | ``` 30 | 31 | 32 | ## linkages 33 | 34 | After defining the distance measure, one has to define how the clusters are merged. 35 | That's what linkage does. 36 | 37 | Linkage: A measure of dissimilarity between two sets of objects that determines how the two sets are merged. 38 | 39 | * Single linkage: Minimum dissimilarity between points in two sets used to determine 40 | which two sets should be merged. 41 | 42 | * Complete linkage: Maximum dissimilarity between points in two sets used to determine 43 | which two sets should be merged. 44 | 45 | * Average Linkage: Average dissimilarity between points in two sets used to determine 46 | which two sets should be merged. 47 | 48 | * Ward’s Linkage. 
The idea has much in common with analysis of variance (ANOVA). The linkage function specifying the distance between two clusters is computed as the increase in the "error sum of squares" (ESS) after fusing two clusters into a single cluster. In other words, it minimizes the increase in the total within-cluster variance when merging two clusters. It is commonly used when the goal is to create balanced, compact clusters. 49 | 50 | 51 | ```{r} 52 | ?hclust 53 | ``` 54 | 55 | 56 | Read more on clustering: https://www.nature.com/articles/nmeth.4299 57 | 58 | 59 | Let's use the NCI60 microarray data from the ISLR package 60 | 61 | ```{r} 62 | library(ISLR) 63 | ncidat<- t(NCI60$data) # transpose so that the columns carry the sample labels set below 64 | colnames(ncidat)<- NCI60$labs 65 | 66 | dim(ncidat) 67 | 68 | ncidat[1:10, 1:5] 69 | 70 | unique(colnames(ncidat)) 71 | # scale() works column-wise, hence the double transpose: center each row of ncidat, no rescaling 72 | X<- t(scale(t(ncidat),center=TRUE,scale=FALSE)) 73 | ``` 74 | 75 | Let's use complete linkage: 76 | 77 | * Often gives comparable cluster sizes. 78 | * Less sensitive to outliers. 79 | * Works better with spherical distributions. 80 | 81 | ```{r} 82 | # default euclidean distance; dist() compares rows, so transpose to compare the columns (samples) 83 | Dmat<- dist(t(X)) 84 | com.hclust<- hclust(Dmat,method="complete") 85 | plot(com.hclust,cex=.7,main="Complete Linkage") 86 | ``` 87 | 88 | Let's make the dendrogram look a little better by coloring the labels. 
89 | 90 | ```{r} 91 | library(dplyr) 92 | # https://cran.r-project.org/web/packages/dendextend/vignettes/dendextend.html 93 | # better dendrograms 94 | library(dendextend) 95 | 96 | #https://cran.r-project.org/web/packages/Polychrome/index.html 97 | # better color palettes 98 | library(Polychrome) 99 | 100 | set.seed(12042022) 101 | mypal <- kelly.colors(15) 102 | swatch(mypal) 103 | 104 | # remove the white color 105 | mypal<- mypal[-1] 106 | 107 | dend<- com.hclust %>% 108 | as.dendrogram() 109 | 110 | dend_labels<- dend %>% labels() 111 | # turn the labels into integer factor codes so every distinct label gets one palette color 112 | dend %>% 113 | color_labels(col = mypal[as.numeric(as.factor(dend_labels))]) %>% 114 | set("labels_cex", 0.7) %>% 115 | plot() 116 | ``` 117 | 118 | Make a small function that plots a dendrogram with its leaf labels colored 119 | 120 | ```{r} 121 | plot_dend<- function(dend,...){ # uses the global palette `mypal`; `...` is forwarded to plot() 122 | dend_labels<- dend %>% labels() 123 | 124 | dend %>% 125 | color_labels(col = mypal[as.numeric(as.factor(dend_labels))]) %>% 126 | set("labels_cex", 0.7) %>% 127 | plot(...) 128 | } 129 | 130 | ``` 131 | 132 | **Tip**: what is that `...` in the function? 133 | 134 | >Adding `...` to a function is a powerful technique because it allows you to accept any number of additional arguments. Unfortunately it comes with a big downside: any misspelled or extraneous arguments will be silently ignored. This package [ellipsis](https://ellipsis.r-lib.org/) provides tools for making ... safer 135 | 136 | 137 | **single linkage**: 138 | 139 | * Can handle diverse shapes. 140 | * Very sensitive to outliers or noise. 141 | * Often results in unbalanced clusters. 142 | * Tends to produce extended, trailing clusters in which observations are fused one at a time (chaining). 143 | 144 | ```{r} 145 | sing.hclust<- hclust(Dmat,method="single") 146 | plot_dend(as.dendrogram(sing.hclust), main = "single linkage") 147 | 148 | ``` 149 | 150 | 151 | **Average linkage**: 152 | 153 | * A compromise between single and complete linkage. 154 | * Less sensitive to outliers. 
155 | * Works better with spherical distributions. 156 | 157 | ```{r} 158 | ave.hclust<- hclust(Dmat,method="average") 159 | plot_dend(as.dendrogram(ave.hclust), main = "average linkage") 160 | ``` 161 | 162 | 163 | **Ward’s linkage** 164 | 165 | At each step, merge the pair of clusters that gives the smallest increase in the total within-cluster variance (see the linkages section above). 166 | 167 | ```{r} 168 | ward.hclust<- hclust(Dmat,method="ward.D") # NOTE: per ?hclust, "ward.D2" is the variant that implements Ward's criterion on unsquared distances; switching changes the heights, so h = 120 below would need re-tuning 169 | plot_dend(as.dendrogram(ward.hclust), main = " Ward linkage") 170 | 171 | # cut the tree with a height of 120 172 | abline(h=120) 173 | rect.hclust(ward.hclust,h=120) 174 | 175 | cl<- cutree(ward.hclust, h= 120) 176 | table(type=colnames(X), clusters=cl) 177 | ``` 178 | 179 | **Complete linkage with different distances** 180 | 181 | ```{r} 182 | Dmat<- dist(t(ncidat),method="manhattan") #L1 distance 183 | com.hclust<- hclust(Dmat,method="complete") 184 | plot_dend(as.dendrogram(com.hclust), main = " manhattan distance with Complete linage") 185 | ``` 186 | 187 | We can try all different combinations of distance measures and linkages. One can also use `1- cor(X)` as a distance measure! It is commonly used in the clustering of gene expression. Also, use either average linkage or Ward’s linkage. 188 | 189 | **Ward’s linkage for 1- cor(X) distance** 190 | 191 | `cor` calculates the correlation between the columns of a matrix, so there is no need to transpose the matrix to get the distances between samples (columns). 192 | 193 | ```{r} 194 | Dmat<- as.dist(1-cor(ncidat)) 195 | ward.hclust<- hclust(Dmat,method="ward.D") 196 | plot_dend(as.dendrogram(ward.hclust), main = "ward linkage-1-cor(X) distance") 197 | ``` 198 | 199 | 200 | ## sort dendrogram 201 | 202 | The two branches from the same node can be flipped in the dendrogram and it does not 203 | affect the meaning of the cluster structure in the data. However, one can sort the dendrogram 204 | so that it is easier to read. 
205 | 206 | Take a look at [dendsort](https://github.com/evanbiederstedt/dendsort)! 207 | 208 | >The subtrees in the resulting dendrogram are sorted based on the average distance of subtrees at every merging point. The tighter cluster, in other words the cluster with smaller average distance, is placed on the left side of branch. When a leaf merge with a cluster, the leaf is placed on the right side. 209 | 210 | 211 | 212 | ```{r} 213 | library(dendsort) 214 | # unsorted dendrogram first, then the same tree after dendsort() for comparison 215 | plot_dend(as.dendrogram(ward.hclust), main = "ward linkage-1-cor(X) distance") 216 | 217 | plot_dend(as.dendrogram(ward.hclust) %>% 218 | dendsort(), 219 | main = "ward linkage-1-cor(X) distance sorted dendogram") 220 | ``` 221 | 222 | -------------------------------------------------------------------------------- /scripts/R_tips_07_rownames.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "rownames for dataframe" 3 | author: "Ming Tang" 4 | date: '2024-01-04' 5 | output: html_document 6 | editor_options: 7 | chunk_output_type: console 8 | --- 9 | 10 | ```{r} 11 | expression<- data.frame(sample1 = c(1,2,3,4,5), 12 | sample2 = c(2,3,5,6,7), 13 | sample3 = c(10,12,8,9,14)) 14 | 15 | expression 16 | ``` 17 | 18 | add rownames 19 | 20 | ```{r} 21 | rownames(expression)<- paste0("gene", 1:5) 22 | 23 | expression 24 | ``` 25 | 26 | use rownames to subset 27 | 28 | ```{r} 29 | rownames(expression) 30 | 31 | expression[c("gene1", "gene2"), ] 32 | ``` 33 | 34 | duplicated gene names: a data.frame does not allow duplicated row names 35 | 36 | ```{r} 37 | genes<- c("gene1", "gene2", "gene2", "gene3", "gene3") 38 | 39 | genes 40 | # the next assignment errors on purpose: duplicate row names are not allowed in a data.frame 41 | rownames(expression)<- genes 42 | # make.names(unique = TRUE) appends .1, .2, ... to repeated names (gene2, gene2.1, ...) 43 | make.names(genes, unique = TRUE) 44 | 45 | rownames(expression)<- make.names(genes, unique = TRUE) 46 | 47 | expression 48 | ``` 49 | -------------------------------------------------------------------------------- /scripts/R_tips_08_convert_gene_ids.Rmd: -------------------------------------------------------------------------------- 1 | --- 
2 | title: "convert gene ids" 3 | author: "Ming Tang" 4 | date: '2023-07-13' 5 | output: html_document 6 | editor_options: 7 | chunk_output_type: console 8 | --- 9 | 10 | ### GUI tools 11 | 12 | DAVID pathway https://david.ncifcrf.gov/conversion.jsp 13 | 14 | BioDBnet https://biodbnet.abcc.ncifcrf.gov/db/db2db.php 15 | 16 | 17 | ### Convert gene ids using a Bioconductor package 18 | ```{r} 19 | BiocManager::install("org.Hs.eg.db") 20 | library(org.Hs.eg.db) 21 | library(AnnotationDbi) 22 | 23 | genes<- c("VEGFA", "CTCF", "HIF1A") 24 | # call select() with its namespace: several attached packages export a function of that name 25 | AnnotationDbi::select(org.Hs.eg.db, keys = genes, 26 | columns = c('ENTREZID'), keytype = 'SYMBOL') 27 | 28 | columns(org.Hs.eg.db) 29 | clusterProfiler::bitr(genes, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db) 30 | 31 | ``` 32 | 33 | 34 | ### Mygene 35 | 36 | https://mygene.info/ 37 | convert more IDs 38 | 39 | ```{r} 40 | BiocManager::install("mygene") 41 | library(mygene) 42 | 43 | queryMany(genes, scopes="symbol", fields=c("uniprot", "ensembl.gene", "reporter"), species="human") 44 | ``` 45 | 46 | -------------------------------------------------------------------------------- /scripts/R_tips_09_biomart_mouse_ortholog.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "mouse ortholog" 3 | author: "Ming Tang" 4 | date: '2024-01-11' 5 | output: html_document 6 | editor_options: 7 | chunk_output_type: console 8 | --- 9 | 10 | Get mouse orthologs of human genes 11 | 12 | https://bioconductor.org/packages/release/bioc/html/biomaRt.html 13 | 14 | ```{r} 15 | if (!requireNamespace("BiocManager", quietly = TRUE)) { 16 | install.packages("BiocManager") 17 | } 18 | BiocManager::install("biomaRt") 19 | 20 | 21 | library(dplyr) 22 | library(biomaRt) 23 | human<- useMart("ensembl", dataset = "hsapiens_gene_ensembl") 24 | 25 | attributes<- c("ensembl_gene_id", "external_gene_name", 26 | "mmusculus_homolog_ensembl_gene", 27 | "mmusculus_homolog_associated_gene_name", 28 | 
"mmusculus_homolog_orthology_type", 29 | "mmusculus_homolog_perc_id_r1") 30 | 31 | listAttributes(human) %>% 32 | head() 33 | 34 | # list the attributes that describe mouse homologs 35 | listAttributes(human) %>% 36 | dplyr::filter(stringr::str_detect(name, "mmusculus_homolog_")) 37 | 38 | listAttributes(human) %>% head() 39 | # the boolean filter with values = TRUE requests only genes flagged as having a mouse homolog 40 | orth.mouse<- getBM(attributes, filters="with_mmusculus_homolog", 41 | values=TRUE, mart = human, uniqueRows=TRUE) 42 | 43 | listFilters(human)%>% head() 44 | listFilters(human)%>% 45 | dplyr::filter(stringr::str_detect(name, "mmusculus")) 46 | 47 | head(orth.mouse) 48 | ``` 49 | 50 | 51 | ```{r} 52 | orth.mouse %>% 53 | dplyr::filter(external_gene_name == "VEGFA") 54 | 55 | orth.mouse %>% 56 | dplyr::filter(external_gene_name == "CTCF") 57 | 58 | orth.mouse %>% 59 | dplyr::filter(external_gene_name == "LILRB1") 60 | ``` 61 | -------------------------------------------------------------------------------- /scripts/R_tips_10_ggplot2_percentage.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R_tips_percentage_ggplot" 3 | author: "Ming Tang" 4 | date: '2024-01-15' 5 | output: html_document 6 | editor_options: 7 | chunk_output_type: console 8 | --- 9 | 10 | How to label the y-axis with % for percentages? 
11 | 12 | ```{r} 13 | library(Seurat) 14 | library(SeuratData) 15 | library(dplyr) 16 | library(ggplot2) 17 | 18 | data("pbmc3k") 19 | 20 | pbmc3k 21 | pbmc3k<- UpdateSeuratObject(pbmc3k) 22 | 23 | pbmc3k 24 | ``` 25 | 26 | 27 | ```{r} 28 | table(pbmc3k$seurat_annotations) 29 | ``` 30 | 31 | 32 | ```{r} 33 | pbmc3k@meta.data %>% 34 | head() 35 | ``` 36 | 37 | Let's calculate the percentage of each cell type. First count the cells per type: 38 | 39 | ```{r} 40 | cell_number<- pbmc3k@meta.data %>% 41 | count(seurat_annotations) 42 | 43 | cell_number 44 | ``` 45 | 46 | calculate the percentage (on a 0-100 scale) 47 | ```{r} 48 | cell_number<- cell_number %>% 49 | mutate(percent = n/ sum(n) * 100) %>% 50 | rename(cell_type = seurat_annotations) 51 | 52 | cell_number 53 | ``` 54 | 55 | plotting 56 | 57 | ```{r} 58 | ggplot(cell_number, aes(x= cell_type, y = percent)) + 59 | geom_bar(stat = "identity") + 60 | xlab("") 61 | ``` 62 | 63 | add a percent sign to the labels (wrong: `scales::percent` multiplies the values by 100, but `percent` is already on a 0-100 scale) 64 | 65 | ```{r} 66 | ggplot(cell_number, aes(x= cell_type, y = percent)) + 67 | geom_bar(stat = "identity") + 68 | scale_y_continuous(labels = scales::percent) + 69 | xlab("") 70 | ``` 71 | 72 | fix it by telling the labeller not to rescale (`scale = 1`) 73 | 74 | ```{r} 75 | ggplot(cell_number, aes(x= cell_type, y = percent)) + 76 | geom_bar(stat = "identity") + 77 | scale_y_continuous(labels = scales::percent_format(scale = 1)) + 78 | xlab("") 79 | 80 | 81 | ``` 82 | 83 | show more digits 84 | 85 | ```{r} 86 | ggplot(cell_number, aes(x= cell_type, y = percent)) + 87 | geom_bar(stat = "identity") + 88 | scale_y_continuous(labels = scales::percent_format(scale = 1, accuracy = 0.01)) + 89 | xlab("") 90 | ``` 91 | 92 | rotate the x-axis labels 93 | 94 | ```{r} 95 | ggplot(cell_number, aes(x= cell_type, y = percent)) + 96 | geom_bar(stat = "identity") + 97 | scale_y_continuous(labels = scales::percent_format(scale = 1, accuracy = 0.1)) + 98 | xlab("") + 99 | theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) 100 | ``` 101 | 102 | re-order the bars by percentage 103 | 104 | ```{r} 105 | ggplot(cell_number, 106 | 
aes(x= cell_type %>% forcats::fct_reorder(percent), y = percent)) + 107 | geom_bar(stat = "identity") + 108 | scale_y_continuous(labels = scales::percent_format(scale = 1, accuracy = 0.1)) + 109 | xlab("") + 110 | theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) 111 | ``` 112 | 113 | in descending order 114 | 115 | ```{r} 116 | ggplot(cell_number, 117 | aes(x= cell_type %>% forcats::fct_reorder(percent,.desc=TRUE), y = percent)) + 118 | geom_bar(stat = "identity") + 119 | scale_y_continuous(labels = scales::percent_format(scale = 1, accuracy = 0.1)) + 120 | xlab("") + 121 | theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) 122 | ``` 123 | 124 | -------------------------------------------------------------------------------- /scripts/R_tips_11_read_all_tabs_spreadsheet.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "read all tabs from spreadsheet" 3 | author: "Ming Tang" 4 | date: '2024-04-25' 5 | output: html_document 6 | editor_options: 7 | chunk_output_type: console 8 | --- 9 | 10 | ```{r} 11 | library(readxl) 12 | library(here) 13 | library(tidyverse) 14 | 15 | # read in one sheet 16 | read_excel(here("data/gene_counts.xlsx"), sheet = "sample1") 17 | 18 | ``` 19 | 20 | 21 | ```{r} 22 | # reading the sheets one by one is repetitive ... 23 | read_excel(here("data/gene_counts.xlsx"), sheet = "sample1") 24 | read_excel(here("data/gene_counts.xlsx"), sheet = "sample2") 25 | read_excel(here("data/gene_counts.xlsx"), sheet = "sample3") 26 | read_excel(here("data/gene_counts.xlsx"), sheet = "sample4") 27 | 28 | path<- here("data/gene_counts.xlsx") 29 | # ... so map over the sheet names instead; set_names() names each list element by its sheet 30 | path %>% 31 | excel_sheets() %>% 32 | set_names() %>% 33 | map(read_excel, path = path) 34 | 35 | # without set_names() the resulting list is unnamed 36 | path %>% 37 | excel_sheets() %>% 38 | map(read_excel, path = path) 39 | ``` 40 | 41 | 42 | merge into a single dataframe 43 | ```{r} 44 | path %>% 45 | excel_sheets() %>% 46 | set_names() %>% 47 | map(read_excel, path = path) %>% 48 | bind_rows(.id="sample") 49 | ``` 50 | 51 | 52 
| make it a wide-format count table 53 | 54 | ```{r} 55 | path %>% 56 | excel_sheets() %>% 57 | set_names() %>% 58 | map(read_excel, path = path) %>% 59 | bind_rows(.id="sample") %>% 60 | tidyr::pivot_wider(names_from = "sample", values_from = "count") 61 | ``` 62 | 63 | -------------------------------------------------------------------------------- /scripts/R_tips_12_save_vs_saveRDS.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "save vs saveRDS" 3 | output: html_document 4 | date: "2024-08-29" 5 | editor_options: 6 | chunk_output_type: console 7 | --- 8 | 9 | ### save() vs saveRDS() 10 | 11 | ```{r} 12 | # Load necessary libraries 13 | library(tidymodels) 14 | library(broom) 15 | library(ggplot2) 16 | library(dplyr) 17 | 18 | head(mtcars) 19 | 20 | ggplot(mtcars, aes(x= mpg, y = wt)) + 21 | geom_point() 22 | 23 | 24 | # Define the linear regression model specification 25 | lm_spec <- linear_reg() %>% 26 | set_engine("lm") 27 | 28 | # Fit the model 29 | lm_fit <- lm_spec %>% 30 | fit(mpg ~ wt, data = mtcars) 31 | 32 | # Extract the coefficient and p-value 33 | model_summary <- tidy(lm_fit) 34 | 35 | # saveRDS() stores a single object without its name 36 | saveRDS(lm_fit, file = "data/lm_fit.rds") 37 | # save() stores one or more objects together with their names 38 | save(lm_fit, model_summary, file = "data/lm_fit.rda") 39 | ``` 40 | 41 | 42 | ### read the data back 43 | 44 | ```{r} 45 | my_saved_fit<- readRDS("data/lm_fit.rds") # readRDS() returns the object, so it can be given any name 46 | 47 | load("data/lm_fit.rda") # load() restores lm_fit and model_summary under their original names 48 | ``` -------------------------------------------------------------------------------- /scripts/R_tips_13_copy_paste_vector_datapasta.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "datapasta demo" 3 | output: html_document 4 | date: "2024-09-04" 5 | editor_options: 6 | chunk_output_type: console 7 | --- 8 | 9 | introduction to datapasta https://github.com/MilesMcBain/datapasta 10 | 11 | ```{r} 12 | install.packages( 13 | "datapasta", 14 | repos = c(mm = "https://milesmcbain.r-universe.dev", 
getOption("repos"))) 15 | ``` 16 | 17 | ### copy and paste vector 18 | 19 | ```{r} 20 | my_genes<- c("VEGFA", "CTCF", "TP53", "FOXP3", "CD3D", "CD8A", "CD4") 21 | 22 | c("VEGFA", 23 | "CTCF", 24 | "TP53", 25 | "FOXP3", 26 | "CD3D", 27 | "CD8A", 28 | "CD4") 29 | 30 | 31 | ``` 32 | 33 | ### copy and paste tibble 34 | 35 | ```{r} 36 | my_dataframe<- tibble::tribble( 37 | ~gene, ~expression, 38 | "VEGFA", 3L, 39 | "CTCF", 4L, 40 | "TP53", 10L, 41 | "FOXP3", 1L, 42 | "CD3D", 14L, 43 | "CD8A", 15L, 44 | "CD4", 0L 45 | ) 46 | 47 | 48 | ``` 49 | -------------------------------------------------------------------------------- /scripts/R_tips_14_janitor_clean_column_names.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "janitor clean column name" 3 | output: html_document 4 | date: "2024-09-04" 5 | editor_options: 6 | chunk_output_type: console 7 | --- 8 | 9 | ### Janitor R package 10 | 11 | showcase how to use https://github.com/sfirke/janitor 12 | 13 | ```{r} 14 | install.packages("janitor") 15 | 16 | library(readxl) 17 | library(janitor) 18 | library(dplyr) 19 | library(here) 20 | ``` 21 | 22 | ```{r} 23 | 24 | roster_raw <- read_excel(here("data/dirty_data.xlsx")) # I included the copy in the repo 25 | 26 | head(roster_raw) 27 | ``` 28 | 29 | ```{r} 30 | roster_raw <- roster_raw %>% 31 | row_to_names(row_number = 1) %>% 32 | clean_names() 33 | 34 | # or 35 | read_excel(here("data/dirty_data.xlsx"), skip=1) 36 | 37 | head(roster_raw) 38 | 39 | View(roster_raw) 40 | ``` 41 | 42 | clean it further 43 | ```{r} 44 | roster <- roster_raw %>% 45 | remove_empty(c("rows", "cols")) %>% 46 | remove_constant(na.rm = TRUE, quiet = FALSE) %>% # remove the column of all "Yes" values 47 | mutate( 48 | hire_date = convert_to_date( 49 | hire_date, # handle the mixed-format dates 50 | character_fun = lubridate::mdy 51 | ), 52 | cert = dplyr::coalesce(certification, certification_2), 53 | cert2 = dplyr::coalesce(certification_2, 
certification) 54 | ) %>% 55 | select(-certification, -certification_2) 56 | 57 | ``` 58 | 59 | ### better table using tabyl() 60 | 61 | ```{r} 62 | table(roster$subject) 63 | 64 | roster %>% 65 | tabyl(subject) 66 | 67 | roster %>% 68 | tabyl(employee_status, full_time) 69 | 70 | roster %>% 71 | tabyl(full_time, subject, employee_status, show_missing_levels = FALSE) 72 | 73 | roster %>% 74 | tabyl(subject, employee_status, full_time, show_missing_levels = FALSE) 75 | ``` -------------------------------------------------------------------------------- /scripts/R_tips_15_calculate_cpm.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "calculate cpm" 3 | output: html_document 4 | date: "2024-10-03" 5 | editor_options: 6 | chunk_output_type: console 7 | --- 8 | # how to calculate counts per million (CPM) from scratch for RNAseq or ChIP-seq count table 9 | 10 | 11 | ### use sweep 12 | 13 | ```{r} 14 | mat <- matrix(1:9, nrow=3, byrow=TRUE) 15 | 16 | matrix(1:9, nrow=3) 17 | # Divide each column by its sum 18 | normalized_mat1 <- sweep(mat, 2, colSums(mat)/10^6, FUN="/") 19 | 20 | 21 | normalized_mat1 22 | 23 | ``` 24 | 25 | ### use apply 26 | 27 | ```{r} 28 | normalized_mat2<- apply(mat, 2, function(x) x/sum(x) * 10^6) 29 | normalized_mat2 30 | 31 | all.equal(normalized_mat1, normalized_mat2) 32 | ``` 33 | 34 | ### use vectorization 35 | 36 | ```{r} 37 | mat/colSums(mat) # not CPM: the vector is recycled down the columns, so each ROW is divided; transpose first (next line) 38 | 39 | normalized_mat3<- t(t(mat)/colSums(mat) * 10^6) 40 | normalized_mat3 41 | ``` 42 | 43 | ### use a package 44 | ```{r} 45 | library(edgeR) 46 | cpm(mat) 47 | ``` 48 | 49 | 50 | -------------------------------------------------------------------------------- /scripts/R_tips_16_liftover_bedpe.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "lift over bedpe" 3 | output: html_document 4 | date: "2024-11-22" 5 | editor_options: 6 | chunk_output_type: console 7 | --- 8 | 9 | ### 
liftover hg19 bedpe to hg38 10 | 11 | download the bedpe file here https://gitlab.com/tangming2005/Enhancer_promoter_interaction_data/-/blob/master/bedpe/ENCODE_EP.bedpe 12 | 13 | 14 | ```{r} 15 | library(rtracklayer) 16 | library(AnnotationHub) 17 | ahub <- AnnotationHub() 18 | 19 | ahub.chain <- subset(ahub, rdataclass == "ChainFile" & species == "Homo sapiens") 20 | query(ahub.chain, c("hg19", "hg38")) 21 | 22 | chain <- ahub.chain[ahub.chain$title == "hg19ToHg38.over.chain.gz"] 23 | chain <- chain[[1]] 24 | 25 | ?liftOver 26 | 27 | bedpe<- import(here::here("data/ENCODE_EP_clean.bedpe")) # here is not attached in this file, so call it with its namespace 28 | 29 | bedpe@first 30 | 31 | bedpe@second 32 | 33 | first_liftover<- liftOver(bedpe@first, chain = chain) 34 | second_liftover<- liftOver(bedpe@second, chain = chain) 35 | ``` 36 | 37 | There are regions in hg19 that map to multiple regions of hg38, 38 | and there are hg19 regions that do not map to hg38 at all. 39 | 40 | ```{r} 41 | indx<- elementNROWS(first_liftover) >1 42 | 43 | first_liftover[indx] 44 | 45 | table(elementNROWS(first_liftover) ==0) 46 | table(elementNROWS(second_liftover) ==0) 47 | 48 | 49 | length(first_liftover) 50 | length(second_liftover) 51 | 52 | invalid_idx_first<- which(elementNROWS(first_liftover) ==0) 53 | invalid_idx_second<- which(elementNROWS(second_liftover) ==0) 54 | 55 | valid_idx<- setdiff(seq_along(first_liftover), c(invalid_idx_first, invalid_idx_second)) 56 | 57 | # remove the pairs where either anchor has no lifted-over range 58 | first_liftover<- first_liftover[valid_idx] 59 | second_liftover<- second_liftover[valid_idx] 60 | 61 | 62 | ## looping over the full GRangesList object is very slow 63 | ## only pick the problematic ones 64 | 65 | indx_1<- which(elementNROWS(first_liftover) >1) 66 | indx_2<- which(elementNROWS(second_liftover) >1) 67 | 68 | first_liftover[indx_1] 69 | 70 | select_valid_regions<- function(gr){ 71 | # keep only the widest range when one region lifted over to several; otherwise return gr unchanged 72 | if (length(gr) > 1) gr<- gr[which.max(width(gr))] 73 | gr 74 | } 75 | 76 | lapply(first_liftover[indx_1], select_valid_regions) 77 | lapply(second_liftover[indx_2], 
select_valid_regions) 78 | 79 | # check endoapply and mendoapply 80 | # lapply returns a plain list of GRanges; endoapply keeps the GRangesList class 81 | # replace the multi-hit elements in place so both anchors stay in the same pair order before unlisting 82 | first_liftover[indx_1]<- endoapply(first_liftover[indx_1], 83 | select_valid_regions) 84 | first_liftover_valid<- unlist(first_liftover) 85 | 86 | 87 | 88 | second_liftover[indx_2]<- endoapply(second_liftover[indx_2], 89 | select_valid_regions) 90 | second_liftover_valid<- unlist(second_liftover) 91 | 92 | first_liftover_valid 93 | 94 | ``` 95 | 96 | write it to file 97 | ```{r} 98 | hg38_bedpe<- Pairs(first_liftover_valid, second_liftover_valid) 99 | rtracklayer::export(hg38_bedpe, here::here("data/ENCODE_EP_hg38.bedpe"), format="bedpe") 100 | ``` 101 | 102 | -------------------------------------------------------------------------------- /scripts/R_tips_17_upset_plot_for_gene_sets.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "upset plot" 3 | output: html_document 4 | date: "2024-12-12" 5 | editor_options: 6 | chunk_output_type: console 7 | --- 8 | 9 | ```{r} 10 | # install.packages("msigdbr") 11 | library(msigdbr) 12 | library(dplyr) 13 | 14 | h_gene_sets<- msigdbr(species = "human", category = "H") 15 | 16 | h_gene_sets 17 | 18 | head(h_gene_sets) 19 | 20 | table(h_gene_sets$gs_name) 21 | ``` 22 | 23 | 24 | ```{r} 25 | gs_nest<- h_gene_sets %>% 26 | group_by(gs_name) %>% 27 | tidyr::nest() 28 | ``` 29 | 30 | 31 | ```{r} 32 | gs_nest$data[[1]] 33 | 34 | gene_list<- purrr::map(gs_nest$data, ~pull(.x, gene_symbol)) 35 | 36 | names(gene_list)<- gs_nest$gs_name %>% stringr::str_replace("HALLMARK_", "") 37 | 38 | names(gene_list) 39 | 40 | gene_sub<- gene_list[c(13,14,15,16,18)] # positional pick; confirm the chosen sets with names(gene_sub) printed next 41 | 42 | names(gene_sub) 43 | ``` 44 | 45 | 46 | 47 | ```{r} 48 | library(ComplexHeatmap) 49 | m<- make_comb_mat(gene_sub) 50 | m 51 | 52 | UpSet(m) 53 | 54 | UpSet(m, comb_order = order(-comb_size(m))) 55 | ``` 56 | 
57 | 58 | ```{r} 59 | UpSet(m, left_annotation = upset_left_annotation(m), 60 | comb_order = order(comb_size(m))) # ascending order: smallest combination sizes first 61 | ``` 62 | 63 | add numbers on the bar 64 | ```{r} 65 | UpSet(m, left_annotation = upset_left_annotation(m), 66 | top_annotation = upset_top_annotation(m, add_numbers = TRUE), 67 | comb_order = order(comb_size(m))) 68 | ``` 69 | 70 | change set order 71 | ```{r} 72 | UpSet(m, 73 | left_annotation = upset_left_annotation(m), 74 | top_annotation = upset_top_annotation(m, add_numbers = TRUE), 75 | set_order = c("E2F_TARGETS","G2M_CHECKPOINT","ESTROGEN_RESPONSE_EARLY", 76 | "ESTROGEN_RESPONSE_LATE", "EPITHELIAL_MESENCHYMAL_TRANSITION"), 77 | comb_order = order(comb_size(m))) 78 | ``` 79 | 80 | ```{r} 81 | UpSet(m, 82 | left_annotation = upset_left_annotation(m), 83 | top_annotation = upset_top_annotation(m, add_numbers = TRUE), 84 | set_order = c("E2F_TARGETS","G2M_CHECKPOINT","ESTROGEN_RESPONSE_EARLY", 85 | "ESTROGEN_RESPONSE_LATE", "EPITHELIAL_MESENCHYMAL_TRANSITION"), 86 | comb_order = order(-comb_size(m))) # same plot, descending order: largest combination sizes first 87 | ``` 88 | 89 | Change the rowname size 90 | 91 | ```{r} 92 | UpSet(m, left_annotation = upset_left_annotation(m), 93 | top_annotation = upset_top_annotation(m, add_numbers = TRUE), 94 | comb_order = order(comb_size(m)), 95 | row_names_gp = gpar(fontsize = 8)) 96 | ``` 97 | -------------------------------------------------------------------------------- /scripts/R_tips_18_tile_a_bed_file.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "tile a bed file" 3 | output: html_document 4 | date: "2024-12-31" 5 | editor_options: 6 | chunk_output_type: console 7 | --- 8 | 9 | ```{r} 10 | library(GenomicRanges) 11 | library(rtracklayer) 12 | library(dplyr) 13 | library(readr) 14 | library(here) 15 | 16 | ``` 17 | 18 | ```{r} 19 | gr2 <- GRanges( 20 | seqnames=Rle(c("chr1", "chr2", "chr1", "chr3"), c(1, 3, 2, 4)), 21 | ranges=IRanges(1:10, end=11), 22 | strand=Rle(strand(c("-", "+", "*", "+", "-")), 
c(1, 2, 2, 3, 2)), 23 | seqlengths=c(chr1=11, chr2=12, chr3=13)) 24 | 25 | gr2 26 | # split every range to 2 base pair bins 27 | tiles <- tile(gr2, width = 2L) 28 | tiles 29 | ``` 30 | 31 | 32 | We want to tile it to 5 base pair bins 33 | ```{r} 34 | gr<- rtracklayer::import(here("data/test.bed")) 35 | gr 36 | 37 | bin_size<- 5 38 | 39 | gr_width<- width(gr) 40 | 41 | gr_width 42 | 43 | bin_num<- ceiling(gr_width/bin_size) # number of bins needed per range, rounded up 44 | 45 | ## after extending, the peaks are overlapping 46 | gr_center<- resize(gr, fix = "center", width = bin_num * bin_size) # widen around the center to a multiple of bin_size 47 | 48 | ## this works fine 49 | unlist(tile(gr_center, width = bin_size)) 50 | 51 | ## merge before tile 52 | gr_center_merge<- reduce(gr_center) # NOTE(review): merged widths need not be multiples of bin_size; check the tile widths in out_bed 53 | 54 | gr_center_merge 55 | 56 | out_bed<- unlist(tile(gr_center_merge, width = bin_size)) 57 | 58 | out_bed 59 | ``` 60 | 61 | check `tileGenome`. It is very helpful when you need to bin the genome into 62 | bins and calculate the number of reads in each bin. For example, for scATACseq, 63 | one can calculate the number of reads per bin in the whole genome and use that matrix 64 | to cluster the cells. 65 | 66 | ```{r} 67 | 68 | library(TxDb.Hsapiens.UCSC.hg38.knownGene) 69 | txdb<- TxDb.Hsapiens.UCSC.hg38.knownGene 70 | 71 | bins <- tileGenome(seqinfo(txdb), tilewidth=10000, 72 | cut.last.tile.in.chrom=TRUE) 73 | ``` 74 | 75 | -------------------------------------------------------------------------------- /scripts/R_tips_19_kmeans_clustering.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "k-means" 3 | output: html_document 4 | date: "2024-12-31" 5 | editor_options: 6 | chunk_output_type: console 7 | --- 8 | 9 | Let's use k-means clustering 10 | 11 | ```{r} 12 | # install the package if you do not have it. 
13 | # install.packages("ISLR") 14 | library(ISLR) 15 | 16 | ncidat<- t(NCI60$data) 17 | colnames(ncidat)<- NCI60$labs 18 | 19 | dim(ncidat) 20 | 21 | ncidat[1:5, 1:50] 22 | ``` 23 | 24 | 25 | ```{r} 26 | unique(colnames(ncidat)) 27 | ``` 28 | 29 | ### PCA analysis 30 | 31 | ```{r} 32 | # scale() works on columns, so transpose to standardize each gene across samples, then transpose back 33 | X<- t(scale(t(ncidat),center=TRUE,scale= TRUE)) 34 | 35 | # we transpose X again for svd, or use prcomp 36 | sv<- svd(t(X)) 37 | U<- sv$u 38 | V<- sv$v 39 | D<- sv$d 40 | ``` 41 | 42 | Further reading https://divingintogeneticsandgenomics.com/post/pca-in-action/ 43 | and https://divingintogeneticsandgenomics.com/post/pca-projection/ 44 | 45 | ```{r} 46 | Z<- t(X)%*%V 47 | 48 | pc_dat<- data.frame(type = rownames(Z), PC1 = Z[,1], PC2= Z[,2]) 49 | 50 | library(ggplot2) 51 | 52 | ggplot(pc_dat,aes(x=PC1, y=PC2, col=type)) + 53 | geom_point() + 54 | theme_classic(base_size = 14) 55 | ``` 56 | 57 | ### K-means on the raw data 58 | 59 | `kmeans()` clusters the rows of its input, so we pass `t(X)` to cluster the samples. 60 | 61 | ```{r} 62 | library(ComplexHeatmap) 63 | library(dplyr) # provides the %>% pipe used in the heatmap chunks below 64 | 65 | K<- 9 66 | 67 | km<- kmeans(t(X), centers = K) 68 | 69 | table(km$cluster) 70 | ``` 71 | 72 | ### set.seed() to make it reproducible 73 | 74 | ```{r} 75 | set.seed(123) 76 | 77 | km<- kmeans(t(X), centers = K) 78 | table(km$cluster) 79 | ``` 80 | 81 | In your original matrix: 82 | 83 | Rows (genes): 6830 — these are the features or "attributes" of your samples. 84 | Columns (samples): 64 — these are what you want to group (cluster). 85 | 86 | When you run K-means clustering on the samples `t(X)`, K-means groups the columns of `X` (the samples) based on their similarity across the 6830 genes. 87 | 88 | ### Dimensions of Outputs When Clustering Samples 89 | 90 | Cluster Assignments: 91 | For each of the 64 samples, K-means will assign it to one of the 9 groups (clusters). This is a vector of length 64. 92 | 93 | Dimension: 64 (one number per sample). 
94 | 95 | ```{r} 96 | km$cluster 97 | 98 | length(km$cluster) 99 | ``` 100 | 101 | 102 | visualize it with a heatmap 103 | ```{r} 104 | km$cluster %>% 105 | tibble::enframe() %>% 106 | janitor::tabyl(name, value) %>% 107 | tibble::column_to_rownames(var="name") %>% 108 | as.matrix() %>% 109 | Heatmap(cluster_columns = FALSE) 110 | ``` 111 | 112 | Centers Matrix: 113 | 114 | The centers matrix now represents the "average sample" for each cluster. Each center is calculated based on the genes (rows). Since you have 9 clusters and each cluster center is described by the 6830 genes, the centers matrix will have: 115 | 116 | 9 rows (clusters) × 6830 columns (genes). 117 | 118 | ```{r} 119 | cens<- km$centers 120 | 121 | dim(cens) 122 | ``` 123 | 124 | 125 | ### how do we visualize K-means results? 126 | 127 | overlay K-means result on the PCA plot. 128 | 129 | ```{r} 130 | par(mfrow=c(1,1)) 131 | 132 | plot(Z[,1],Z[,2],col=km$cluster,type="n") 133 | 134 | text(Z[,1],Z[,2],colnames(ncidat),cex=.75,col=km$cluster) 135 | 136 | 137 | points(cens%*%V[,1],cens%*%V[,2],col=1:K,pch=16,cex=3) 138 | ``` 139 | 140 | ### K-means on the PCA space 141 | 142 | ```{r} 143 | Z 144 | 145 | km2<- kmeans(Z, centers = K) 146 | 147 | km2$cluster %>% 148 | tibble::enframe() %>% 149 | janitor::tabyl(name, value) %>% 150 | tibble::column_to_rownames(var="name") %>% 151 | as.matrix() %>% 152 | Heatmap(cluster_columns = FALSE) 153 | ``` 154 | 155 | 156 | K-means on the PCA-transformed Z matrix is generally preferred for clustering when working with high-dimensional data like gene expression. It focuses on the meaningful variation while avoiding noise and redundancy. Note that `Z` as computed above keeps all 64 principal components, which is only a rotation of the scaled data; to actually reduce the dimensionality, cluster on the leading columns only (e.g. `Z[, 1:10]`). 157 | 158 | 159 | - Reduces noise and redundancy: PCA captures the most important variation in the data, filtering out noise or low-variance genes. 160 | - Better distance metrics: By focusing on a few top principal components, clustering is based on meaningful differences rather than noise. 
161 | 162 | - Efficient computation: Working in a smaller-dimensional space speeds up K-means, especially with large datasets. 163 | 164 | 165 | K-means on the original matrix can be useful if you suspect that low-variance genes or subtle patterns might be biologically relevant and want to preserve them. 166 | 167 | 168 | | **Aspect** | **Original Matrix** | **PCA (Z Matrix)** | 169 | |----------------------------|------------------------------------|------------------------------------------| 170 | | **Dimensionality** | High (6830 genes) | Low (e.g., 10-50 PCs, depending on variance) | 171 | | **Noise Sensitivity** | High | Low | 172 | | **Focus** | Includes all variance | Focuses on major variance | 173 | | **Computational Cost** | Higher | Lower | 174 | | **Risk of Overfitting** | Higher (due to noise) | Lower | 175 | -------------------------------------------------------------------------------- /scripts/R_tips_20_scatterplot_with_cor_p_value.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Untitled" 3 | output: html_document 4 | date: "2025-01-06" 5 | editor_options: 6 | chunk_output_type: console 7 | --- 8 | 9 | ```{r} 10 | # BiocManager::install(c("airway", "ggpubr","DESeq2")) 11 | 12 | library(airway) 13 | library(ggpubr) 14 | library(DESeq2) # for normalization 15 | 16 | data("airway") 17 | dds <- DESeqDataSet(airway, design = ~ cell + dex) 18 | 19 | ``` 20 | 21 | The airway dataset contains RNA-seq data from airway smooth muscle cells. 22 | 23 | ```{r} 24 | dds <- DESeq(dds) 25 | norm_counts <- counts(dds, normalized = TRUE) 26 | ``` 27 | 28 | 29 | ```{r} 30 | gene1 <- "ENSG00000075624" # ACTB (Beta-actin) 31 | gene2 <- "ENSG00000111640" # GAPDH 32 | 33 | ``` 34 | Both ACTB and GAPDH are well-known housekeeping genes commonly used as controls in gene expression studies. 
35 | 36 | ```{r} 37 | plot_data <- data.frame( 38 | Gene1 = norm_counts[gene1, ], 39 | Gene2 = norm_counts[gene2, ] 40 | ) 41 | 42 | data<- iris # NOTE(review): plot_data built above is never used; the plot below is drawn from iris - confirm which dataset is intended 43 | ``` 44 | 45 | ```{r} 46 | p <- ggscatter(data, 47 | x = "Sepal.Length", 48 | y = "Petal.Length", 49 | add = "reg.line", 50 | conf.int = TRUE, 51 | cor.coef = TRUE, 52 | cor.method = "pearson", 53 | cor.coeff.args = list(label.sep = "\n"), 54 | xlab = "Sepal Length", 55 | ylab = "Petal Length") 56 | print(p) 57 | 58 | ``` -------------------------------------------------------------------------------- /scripts/R_tips_janitor_clean_column_names.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "janitor clean column name" 3 | output: html_document 4 | date: "2024-09-04" 5 | editor_options: 6 | chunk_output_type: console 7 | --- 8 | 9 | ### Janitor R package 10 | 11 | showcase how to use https://github.com/sfirke/janitor 12 | 13 | ```{r} 14 | # install.packages("janitor") # run once by hand; kept commented so running the chunk does not reinstall 15 | 16 | library(readxl) 17 | library(janitor) 18 | library(dplyr) 19 | library(here) 20 | ``` 21 | 22 | ```{r} 23 | 24 | roster_raw <- read_excel(here("data/dirty_data.xlsx")) # I included the copy in the repo 25 | 26 | head(roster_raw) 27 | ``` 28 | 29 | ```{r} 30 | roster_raw <- roster_raw %>% 31 | row_to_names(row_number = 1) %>% 32 | clean_names() 33 | 34 | # or 35 | read_excel(here("data/dirty_data.xlsx"), skip=1) 36 | 37 | head(roster_raw) 38 | 39 | View(roster_raw) 40 | ``` 41 | 42 | clean it further 43 | ```{r} 44 | roster <- roster_raw %>% 45 | remove_empty(c("rows", "cols")) %>% 46 | remove_constant(na.rm = TRUE, quiet = FALSE) %>% # remove the column of all "Yes" values 47 | mutate( 48 | hire_date = convert_to_date( 49 | hire_date, # handle the mixed-format dates 50 | character_fun = lubridate::mdy 51 | ), 52 | cert = dplyr::coalesce(certification, certification_2), 53 | cert2 = dplyr::coalesce(certification_2, certification) 54 | ) %>% 55 | select(-certification, -certification_2) 56 | 
``` 57 | 58 | ### better table using tabyl() 59 | 60 | ```{r} 61 | table(roster$subject) # base R count of each value in the subject column 62 | 63 | roster %>% 64 | tabyl(subject) # the same single-column tabulation via tabyl() 65 | 66 | roster %>% 67 | tabyl(employee_status, full_time) # two columns: cross-tabulation 68 | 69 | roster %>% 70 | tabyl(full_time, subject, employee_status, show_missing_levels = FALSE) # three columns; presumably split by the third one - see ?tabyl 71 | 72 | roster %>% 73 | tabyl(subject, employee_status, full_time, show_missing_levels = FALSE) 74 | ``` --------------------------------------------------------------------------------