├── LICENSE
├── README.md
├── R_tips_12_save_vs_saveRDS.Rmd
├── compbio_tutorials.Rproj
├── data
    ├── .DS_Store
    ├── dirty_data.xlsx
    ├── gene_counts.xlsx
    ├── lm_fit.rda
    └── lm_fit.rds
├── images
    ├── Bioc.png
    ├── Cheng_question.png
    ├── GATK_best_practise_somatic.png
    ├── HiPerGator.png
    ├── Howie_question.png
    ├── Mikhael_question.png
    ├── R_and_python.png
    ├── Sajad_question.png
    ├── UCSC_format.png
    ├── age_pvalue.jpeg
    ├── aging.png
    ├── barchart.png
    ├── bashrc.png
    ├── bedtools.png
    ├── books.jpeg
    ├── boxplot_pvalue.png
    ├── cellline2commandline.png
    ├── cheatsheet1.png
    ├── cheatsheet2.png
    ├── cloud.png
    ├── cluster_species.png
    ├── cluster_tissue.png
    ├── clustered_dotplot.png
    ├── clustertissue.png
    ├── colormap.png
    ├── confusion.png
    ├── cron.jpeg
    ├── cross_validation_wrong.jpeg
    ├── data-life-cycle.png
    ├── data-science-explore.png
    ├── datatoviz.png
    ├── dates.jpeg
    ├── deeplearning.jpeg
    ├── dendrogram.png
    ├── dimredu.png
    ├── dist.png
    ├── distance_measures.jpeg
    ├── doc_version.png
    ├── experiment_design.png
    ├── fair_data.jpeg
    ├── filenames.png
    ├── filenames2.png
    ├── filenames3.png
    ├── folder.png
    ├── gct.png
    ├── geek.png
    ├── genomic_coordinate.png
    ├── ggplot2_cheatsheet.png
    ├── git_cheatsheet.jpeg
    ├── github-flow.png
    ├── google_arg.png
    ├── google_more.png
    ├── google_rotate.png
    ├── google_tar.png
    ├── gorilla.png
    ├── gsea.png
    ├── gsea_out1.png
    ├── gsea_out2.png
    ├── hclust.png
    ├── hpcscheduler.png
    ├── logistic_deep.png
    ├── logistic_rnn.png
    ├── mac_terminal.png
    ├── machine_learning_map.jpeg
    ├── matrix_factorization.png
    ├── merfish.jpeg
    ├── meta_excel.png
    ├── multinomial.png
    ├── notebook1.png
    ├── notebook2.png
    ├── out2.png
    ├── out3.png
    ├── phenotype_label.png
    ├── protein_mRNA_cor.jpeg
    ├── protein_vs_rna.png
    ├── protein_vs_rna_ccle.png
    ├── pval_cry.jpeg
    ├── qcfail.png
    ├── regular_1.png
    ├── regular_2.png
    ├── rnaseq_workflow.png
    ├── scatter_pvalue.png
    ├── scientific_method.jpeg
    ├── shell_terminal.png
    ├── stackover_rotate.png
    ├── statquest.png
    ├── superman.jpeg
    ├── survival1.png
    ├── survival2.png
    ├── survival3.png
    ├── tcga.png
    ├── terminal.jpeg
    ├── terminal.png
    ├── tidy_data.png
    ├── tidyverse.png
    ├── twin.png
    ├── ucsc_example.jpeg
    ├── umap_vs_tsne.png
    ├── unix_vs_linux.png
    └── workflow.png
└── scripts
    ├── 01_how_to_make_a_heatmap.Rmd
    ├── 02_differential_expression_heatmap.Rmd
    ├── 03_volcano_plot_with_ggrepel.Rmd
    ├── 04_create_seurat_object_from_GEO.Rmd
    ├── 05_find_tissue_specific_genes_human_protein_atlas.Rmd
    ├── 06_scRNseq_two_lines_from_fastq_to_count_matrix.Rmd
    ├── 07_gene_set_enrichment_RNAseq.Rmd
    ├── 08_intro_to_singleCellExperiment.Rmd
    ├── 09_intro_to_seurat_V5.Rmd
    ├── 10_analyze_my_tweets.Rmd
    ├── 11_change_rownames_ENSEMBL_to_symbol_RNAseq.Rmd
    ├── 15_how_to_get_metadata_GEO.Rmd
    ├── 16_get_mouse_gene_exon_lengths.Rmd
    ├── R_tips_01_add_percentage_to_y_axis.Rmd
    ├── R_tips_02_add_side_to_scatterplot.Rmd
    ├── R_tips_03_extract_tables_from_PDF.Rmd
    ├── R_tips_04_list_column_dataframe_in_dataframe.Rmd
    ├── R_tips_05_read_all_files_in_a_folder.Rmd
    ├── R_tips_06_avoid_overplotting_ggblend.Rmd
    ├── R_tips_06_hierarchical_clustering.Rmd
    ├── R_tips_06_hierarchical_clustering.html
    ├── R_tips_07_rownames.Rmd
    ├── R_tips_08_convert_gene_ids.Rmd
    ├── R_tips_09_biomart_mouse_ortholog.Rmd
    ├── R_tips_10_ggplot2_percentage.Rmd
    ├── R_tips_11_read_all_tabs_spreadsheet.Rmd
    ├── R_tips_12_save_vs_saveRDS.Rmd
    ├── R_tips_13_copy_paste_vector_datapasta.Rmd
    ├── R_tips_14_janitor_clean_column_names.Rmd
    ├── R_tips_15_calculate_cpm.Rmd
    ├── R_tips_16_liftover_bedpe.Rmd
    ├── R_tips_17_upset_plot_for_gene_sets.Rmd
    ├── R_tips_18_tile_a_bed_file.Rmd
    ├── R_tips_19_kmeans_clustering.Rmd
    ├── R_tips_20_scatterplot_with_cor_p_value.Rmd
    └── R_tips_janitor_clean_column_names.Rmd


/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Ming Tang
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # compbio_tutorials
2 | 
3 | R Markdown files for my YouTube videos on the chatomics channel: https://www.youtube.com/@chatomics
4 | 
5 | Make sure you subscribe to the channel and join my FREE newsletter https://divingintogeneticsandgenomics.ck.page/newsletter
6 | 


--------------------------------------------------------------------------------
/R_tips_12_save_vs_saveRDS.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "save vs saveRDS"
 3 | output: html_document
 4 | date: "2024-08-29"
 5 | editor_options: 
 6 |   chunk_output_type: console
 7 | ---
 8 | 
 9 | ### save() vs saveRDS()
10 | 
11 | ```{r}
12 | # Load the packages used in this chunk
13 | library(tidymodels)
14 | library(broom)
15 | library(ggplot2)
16 | library(dplyr)
17 | 
18 | head(mtcars)
19 | # Scatter plot of mtcars with mpg on the x-axis and wt on the y-axis
20 | ggplot(mtcars, aes(x= mpg, y = wt)) +
21 |   geom_point()
22 | 
23 | 
24 | # Define a linear regression model specification that uses the "lm" engine
25 | lm_spec <- linear_reg() %>%
26 |   set_engine("lm")
27 | 
28 | # Fit the specification to mtcars, modelling mpg as a function of wt
29 | lm_fit <- lm_spec %>%
30 |   fit(mpg ~ wt, data = mtcars)
31 | 
32 | # Tidy the fitted model into a summary table (coefficient estimates and p-values)
33 | model_summary <- tidy(lm_fit)
34 | 
35 | # saveRDS() writes exactly one object (lm_fit) to the .rds file
36 | saveRDS(lm_fit, file = "data/lm_fit.rds")
37 | # save() writes both named objects (lm_fit and model_summary) into a single .rda file
38 | save(lm_fit, model_summary, file = "data/lm_fit.rda")
39 | ```
40 | 
41 | 
42 | ### Read the data back
43 | 
44 | ```{r}
45 | my_saved_fit<- readRDS("data/lm_fit.rds")  # readRDS() returns the object, so it can be bound to any name
46 | # load() restores objects under the names they were saved with (here lm_fit and model_summary), replacing same-named objects in the workspace
47 | load("data/lm_fit.rda")
48 | ```


--------------------------------------------------------------------------------
/compbio_tutorials.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 


--------------------------------------------------------------------------------
/data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/data/.DS_Store


--------------------------------------------------------------------------------
/data/dirty_data.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/data/dirty_data.xlsx


--------------------------------------------------------------------------------
/data/gene_counts.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/data/gene_counts.xlsx


--------------------------------------------------------------------------------
/data/lm_fit.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/data/lm_fit.rda


--------------------------------------------------------------------------------
/data/lm_fit.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/data/lm_fit.rds


--------------------------------------------------------------------------------
/images/Bioc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/Bioc.png


--------------------------------------------------------------------------------
/images/Cheng_question.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/Cheng_question.png


--------------------------------------------------------------------------------
/images/GATK_best_practise_somatic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/GATK_best_practise_somatic.png


--------------------------------------------------------------------------------
/images/HiPerGator.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/HiPerGator.png


--------------------------------------------------------------------------------
/images/Howie_question.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/Howie_question.png


--------------------------------------------------------------------------------
/images/Mikhael_question.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/Mikhael_question.png


--------------------------------------------------------------------------------
/images/R_and_python.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/R_and_python.png


--------------------------------------------------------------------------------
/images/Sajad_question.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/Sajad_question.png


--------------------------------------------------------------------------------
/images/UCSC_format.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/UCSC_format.png


--------------------------------------------------------------------------------
/images/age_pvalue.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/age_pvalue.jpeg


--------------------------------------------------------------------------------
/images/aging.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/aging.png


--------------------------------------------------------------------------------
/images/barchart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/barchart.png


--------------------------------------------------------------------------------
/images/bashrc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/bashrc.png


--------------------------------------------------------------------------------
/images/bedtools.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/bedtools.png


--------------------------------------------------------------------------------
/images/books.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/books.jpeg


--------------------------------------------------------------------------------
/images/boxplot_pvalue.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/boxplot_pvalue.png


--------------------------------------------------------------------------------
/images/cellline2commandline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/cellline2commandline.png


--------------------------------------------------------------------------------
/images/cheatsheet1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/cheatsheet1.png


--------------------------------------------------------------------------------
/images/cheatsheet2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/cheatsheet2.png


--------------------------------------------------------------------------------
/images/cloud.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/cloud.png


--------------------------------------------------------------------------------
/images/cluster_species.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/cluster_species.png


--------------------------------------------------------------------------------
/images/cluster_tissue.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/cluster_tissue.png


--------------------------------------------------------------------------------
/images/clustered_dotplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/clustered_dotplot.png


--------------------------------------------------------------------------------
/images/clustertissue.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/clustertissue.png


--------------------------------------------------------------------------------
/images/colormap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/colormap.png


--------------------------------------------------------------------------------
/images/confusion.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/confusion.png


--------------------------------------------------------------------------------
/images/cron.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/cron.jpeg


--------------------------------------------------------------------------------
/images/cross_validation_wrong.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/cross_validation_wrong.jpeg


--------------------------------------------------------------------------------
/images/data-life-cycle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/data-life-cycle.png


--------------------------------------------------------------------------------
/images/data-science-explore.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/data-science-explore.png


--------------------------------------------------------------------------------
/images/datatoviz.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/datatoviz.png


--------------------------------------------------------------------------------
/images/dates.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/dates.jpeg


--------------------------------------------------------------------------------
/images/deeplearning.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/deeplearning.jpeg


--------------------------------------------------------------------------------
/images/dendrogram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/dendrogram.png


--------------------------------------------------------------------------------
/images/dimredu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/dimredu.png


--------------------------------------------------------------------------------
/images/dist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/dist.png


--------------------------------------------------------------------------------
/images/distance_measures.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/distance_measures.jpeg


--------------------------------------------------------------------------------
/images/doc_version.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/doc_version.png


--------------------------------------------------------------------------------
/images/experiment_design.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/experiment_design.png


--------------------------------------------------------------------------------
/images/fair_data.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/fair_data.jpeg


--------------------------------------------------------------------------------
/images/filenames.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/filenames.png


--------------------------------------------------------------------------------
/images/filenames2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/filenames2.png


--------------------------------------------------------------------------------
/images/filenames3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/filenames3.png


--------------------------------------------------------------------------------
/images/folder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/folder.png


--------------------------------------------------------------------------------
/images/gct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/gct.png


--------------------------------------------------------------------------------
/images/geek.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/geek.png


--------------------------------------------------------------------------------
/images/genomic_coordinate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/genomic_coordinate.png


--------------------------------------------------------------------------------
/images/ggplot2_cheatsheet.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/ggplot2_cheatsheet.png


--------------------------------------------------------------------------------
/images/git_cheatsheet.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/git_cheatsheet.jpeg


--------------------------------------------------------------------------------
/images/github-flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/github-flow.png


--------------------------------------------------------------------------------
/images/google_arg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/google_arg.png


--------------------------------------------------------------------------------
/images/google_more.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/google_more.png


--------------------------------------------------------------------------------
/images/google_rotate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/google_rotate.png


--------------------------------------------------------------------------------
/images/google_tar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/google_tar.png


--------------------------------------------------------------------------------
/images/gorilla.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/gorilla.png


--------------------------------------------------------------------------------
/images/gsea.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/gsea.png


--------------------------------------------------------------------------------
/images/gsea_out1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/gsea_out1.png


--------------------------------------------------------------------------------
/images/gsea_out2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/gsea_out2.png


--------------------------------------------------------------------------------
/images/hclust.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/hclust.png


--------------------------------------------------------------------------------
/images/hpcscheduler.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/hpcscheduler.png


--------------------------------------------------------------------------------
/images/logistic_deep.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/logistic_deep.png


--------------------------------------------------------------------------------
/images/logistic_rnn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/logistic_rnn.png


--------------------------------------------------------------------------------
/images/mac_terminal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/mac_terminal.png


--------------------------------------------------------------------------------
/images/machine_learning_map.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/machine_learning_map.jpeg


--------------------------------------------------------------------------------
/images/matrix_factorization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/matrix_factorization.png


--------------------------------------------------------------------------------
/images/merfish.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/merfish.jpeg


--------------------------------------------------------------------------------
/images/meta_excel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/meta_excel.png


--------------------------------------------------------------------------------
/images/multinomial.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/multinomial.png


--------------------------------------------------------------------------------
/images/notebook1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/notebook1.png


--------------------------------------------------------------------------------
/images/notebook2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/notebook2.png


--------------------------------------------------------------------------------
/images/out2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/out2.png


--------------------------------------------------------------------------------
/images/out3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/out3.png


--------------------------------------------------------------------------------
/images/phenotype_label.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/phenotype_label.png


--------------------------------------------------------------------------------
/images/protein_mRNA_cor.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/protein_mRNA_cor.jpeg


--------------------------------------------------------------------------------
/images/protein_vs_rna.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/protein_vs_rna.png


--------------------------------------------------------------------------------
/images/protein_vs_rna_ccle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/protein_vs_rna_ccle.png


--------------------------------------------------------------------------------
/images/pval_cry.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/pval_cry.jpeg


--------------------------------------------------------------------------------
/images/qcfail.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/qcfail.png


--------------------------------------------------------------------------------
/images/regular_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/regular_1.png


--------------------------------------------------------------------------------
/images/regular_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/regular_2.png


--------------------------------------------------------------------------------
/images/rnaseq_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/rnaseq_workflow.png


--------------------------------------------------------------------------------
/images/scatter_pvalue.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/scatter_pvalue.png


--------------------------------------------------------------------------------
/images/scientific_method.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/scientific_method.jpeg


--------------------------------------------------------------------------------
/images/shell_terminal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/shell_terminal.png


--------------------------------------------------------------------------------
/images/stackover_rotate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/stackover_rotate.png


--------------------------------------------------------------------------------
/images/statquest.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/statquest.png


--------------------------------------------------------------------------------
/images/superman.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/superman.jpeg


--------------------------------------------------------------------------------
/images/survival1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/survival1.png


--------------------------------------------------------------------------------
/images/survival2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/survival2.png


--------------------------------------------------------------------------------
/images/survival3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/survival3.png


--------------------------------------------------------------------------------
/images/tcga.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/tcga.png


--------------------------------------------------------------------------------
/images/terminal.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/terminal.jpeg


--------------------------------------------------------------------------------
/images/terminal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/terminal.png


--------------------------------------------------------------------------------
/images/tidy_data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/tidy_data.png


--------------------------------------------------------------------------------
/images/tidyverse.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/tidyverse.png


--------------------------------------------------------------------------------
/images/twin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/twin.png


--------------------------------------------------------------------------------
/images/ucsc_example.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/ucsc_example.jpeg


--------------------------------------------------------------------------------
/images/umap_vs_tsne.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/umap_vs_tsne.png


--------------------------------------------------------------------------------
/images/unix_vs_linux.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/unix_vs_linux.png


--------------------------------------------------------------------------------
/images/workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/compbio_tutorials/b5c2afae14aefd718c32e6188d03af52b10b2525/images/workflow.png


--------------------------------------------------------------------------------
/scripts/01_how_to_make_a_heatmap.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "1_how_to_make_a_heatmap"
  3 | author: "Ming Tang"
  4 | date: "5/2/2023"
  5 | output: html_document
  6 | editor_options: 
  7 |   chunk_output_type: console
  8 | ---
  9 | 
 10 | 
 11 | Making a heatmap is an essential skill for any computational biologist.
 12 | 
 13 | 
 14 | ### load the libraries 
 15 | 
 16 | ```{r}
 17 | library(ComplexHeatmap)
 18 | ```
 19 | 
 20 | ### make dummy data 
 21 | 
 22 | ```{r}
 23 | 
 24 | h1 <- c(10,20,10,20,10,20,10,20)
 25 | h2 <- c(20,10,20,10,20,10,20,10)
 26 | 
 27 | l1 <- c(1,3,1,3,1,3,1,3)
 28 | l2 <- c(3,1,3,1,3,1,3,1)
 29 | 
 30 | mat <- rbind(h1,h2,l1,l2)
 31 | colnames(mat)<- paste0("timepoint_", 1:8)
 32 | mat
 33 | ```
 34 | 
 35 | 
 36 | visualize the data 
 37 | 
 38 | ```{r}
 39 | par(mfrow = c(1, 1), mar = c(4, 4, 1, 1))
 40 | plot(1:8, rep(0, 8), ylim = c(0, 35), pch = "", xlab = "Time", ylab = "Gene Expression")
 41 | # draw one line per row of mat; seq_len() yields no iterations for a zero-row matrix
 42 | for (i in seq_len(nrow(mat))) {
 43 |   lines(1:8, mat[i, ], lwd = 3, col = i)
 44 | }
 45 | # fill colours 1:4 mirror the col = i used for the four rows drawn above
 46 | legend(1, 35, rownames(mat), 1:4, cex = 0.7)
 47 | ```
 48 | 
 49 | ### Making a heatmap is easy!
 50 | 
 51 | ```{r}
 52 | Heatmap(mat, cluster_columns = FALSE)
 53 | 
 54 | quantile(mat, c(0, 0.1,0.5, 0.9))
 55 | ```
 56 | 
 57 | 
 58 | change color mapping 
 59 | 
 60 | ```{r}
 61 | col_fun<- circlize::colorRamp2(c(0, 3, 20), c("blue", "white", "red"))
 62 | 
 63 | Heatmap(mat, cluster_columns = FALSE, col = col_fun)
 64 | ```
 65 | 
 66 | 
 67 | outlier 
 68 | 
 69 | ```{r}
 70 | mat2<- mat
 71 | mat2[1,1]<- 1000
 72 | 
 73 | Heatmap(mat2, cluster_columns = FALSE)
 74 | 
 75 | Heatmap(mat2, cluster_columns = FALSE, col = col_fun)
 76 | 
 77 | ```
 78 | 
 79 | Let's scale the data/gene expression level across columns (time points) first
 80 | 
 81 | ```{r}
 82 | scaled_mat<- t(scale(t(mat)))
 83 | 
 84 | 
 85 | ?Heatmap
 86 | Heatmap(scaled_mat, cluster_columns = FALSE)
 87 | ```
 88 | 
 89 | Note, after scaling, h1 and l1 are close to each other!
 90 | 
 91 | ### understand clustering?
 92 | 
 93 | define distances 
 94 | 
 95 | ```{r}
 96 | ?dist
 97 | 
 98 | d<- dist(mat)
 99 | 
100 | d
101 | ```
102 | 
103 | 
104 | ```{r}
105 | ?hclust
106 | 
107 | hclust(d)
108 | 
109 | 
110 | plot(hclust(d))
111 | ```
112 | 
113 | 
114 | After scaling 
115 | ```{r}
116 | 
117 | d2<- dist(scaled_mat)
118 | 
119 | plot(hclust(d2))
120 | ```
121 | 
122 | Key takeaways:
123 | 1. color mapping is critical
124 | 2. scaling is critical 
125 | 3. making a heatmap is easy, but it is better to understand the details.
126 | 
127 | 
128 | 
129 | 
130 | 
131 | 
132 | 
133 | 


--------------------------------------------------------------------------------
/scripts/02_differential_expression_heatmap.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "differential_expression_heatmap"
  3 | author: "Ming Tang"
  4 | date: "5/9/2023"
  5 | output: html_document
  6 | editor_options: 
  7 |   chunk_output_type: console
  8 | ---
  9 | 
 10 | Let's use a real example https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE197576
 11 | 
 12 | How to download the files from ftp https://www.ncbi.nlm.nih.gov/geo/info/download.html
 13 | 
 14 | https://ftp.ncbi.nlm.nih.gov/geo/series/GSE197nnn/GSE197576/suppl/
 15 | 
 16 | Alternatively, use GEOquery https://bioconductor.org/packages/release/bioc/html/GEOquery.html
 17 | 
 18 | ```{bash eval=FALSE}
 19 | wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE197nnn/GSE197576/suppl/GSE197576_raw_gene_counts_matrix.tsv.gz
 20 | 
 21 | csvtk pretty -t  GSE197576_raw_gene_counts_matrix.tsv.gz | less -S
 22 | 
 23 | csvtk headers -t   GSE197576_raw_gene_counts_matrix.tsv.gz
 24 | 
 25 | csvtk cut -t -f1,2,3,12,13 GSE197576_raw_gene_counts_matrix.tsv.gz| head
 26 | 
 27 | csvtk cut -t -f1,2,3,12,13 GSE197576_raw_gene_counts_matrix.tsv.gz > raw_counts.tsv
 28 | ```
 29 | 
 30 | Get csvtk at https://github.com/shenwei356/csvtk
 31 | 
 32 | ### read the data into R and make a DESeq2 object 
 33 | 
 34 | follow the tutorial http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html
 35 | 
 36 | ```{r}
 37 | library(dplyr)
 38 | library(readr)
 39 | library(here)
 40 | library(DESeq2)
 41 | 
 42 | raw_counts<- read_tsv(here("data/raw_counts.tsv"))
 43 | 
 44 | raw_counts_mat<- raw_counts[, -1] %>% as.matrix
 45 | 
 46 | head(raw_counts_mat)
 47 | 
 48 | rownames(raw_counts_mat)<- raw_counts$gene
 49 | head(raw_counts_mat)
 50 | 
 51 | ```
 52 | 
 53 | Make a sample sheet 
 54 | 
 55 | ```{r}
 56 | coldata<- data.frame(condition = c("normoxia", "normoxia", "hypoxia", "hypoxia"))
 57 | 
 58 | rownames(coldata)<- colnames(raw_counts_mat)
 59 | 
 60 | coldata
 61 | ```
 62 | 
 63 | Make a DESeq2 object
 64 | 
 65 | ```{r}
 66 | all(rownames(coldata) == colnames(raw_counts_mat))
 67 | 
 68 | dds <- DESeqDataSetFromMatrix(countData = raw_counts_mat,
 69 |                               colData = coldata,
 70 |                               design = ~ condition)
 71 | dds <- DESeq(dds)
 72 | res <- results(dds, contrast = c("condition", "hypoxia", "normoxia"))
 73 | 
 74 | res %>%
 75 |   as.data.frame() %>%
 76 |   arrange((padj), desc(log2FoldChange)) %>%
 77 |   head(n=30)
 78 | 
 79 | 
 80 | significant_genes<- res %>%
 81 |   as.data.frame() %>%
 82 |   filter(padj <=0.01, abs(log2FoldChange) >= 2) %>% 
 83 |   rownames()
 84 | 
 85 | 
 86 | significant_genes
 87 | ```
 88 | 
 89 | 
 90 | ### PCA analysis
 91 | 
 92 | ```{r}
 93 | vsd <- vst(dds, blind=FALSE)
 94 | 
 95 | plotPCA(vsd, intgroup=c("condition"))
 96 | ```
 97 | 
 98 | 
 99 | Plot PCA by ourselves.
100 | 
101 | ```{r}
102 | vsd <- vst(dds, blind=FALSE)
103 | head(assay(vsd), 3)
104 | 
105 | normalized_counts<- assay(vsd) %>% 
106 |   as.matrix()
107 | 
108 | pca_prcomp<- prcomp(t(normalized_counts), center = TRUE, scale. = FALSE)
109 | 
110 | names(pca_prcomp)
111 | pca_prcomp$x
112 | 
113 | PC1_and_PC2<- data.frame(PC1=pca_prcomp$x[,1], PC2= pca_prcomp$x[,2], 
114 |                          type = rownames(pca_prcomp$x))
115 | 
116 | ## plot PCA plot
117 | library(ggplot2)
118 | 
119 | ggplot(PC1_and_PC2, aes(x=PC1, y=PC2, col=type)) + 
120 |   geom_point() + 
121 |   geom_text(aes(label = type), hjust=0, vjust=0) +
122 |   coord_fixed()
123 | ```
124 | 
125 | It is not exactly the same, what's going on? 
126 | 
127 | 
128 | ```{r}
129 | ?plotPCA #using the top 500 most variable genes
130 | 
131 | ```
132 | 
133 | https://github.com/mikelove/DESeq2/blob/48b80aaac5efd4b9e0d054fc1e4a6e1fa78e782a/R/plots.R#LL245C71-L245C71
134 | 
135 | 
136 | ### Make a perfect heatmap
137 | 
138 | ```{r}
139 | library(ComplexHeatmap)
140 | 
141 | significant_mat<- normalized_counts[significant_genes, ] 
142 | 
143 | Heatmap(t(scale(t(significant_mat))))
144 | 
145 | ```
146 | 
147 | 
148 | Yeah, you get this perfect looking heatmap because we select the genes that are different. So, no surprise at all!
149 | 
150 | 
151 | ```{r}
152 | coldata
153 | 
154 | col_anno <- HeatmapAnnotation(df = coldata, 
155 |                              col = list( condition = c("hypoxia" = "red", "normoxia" = "blue")))
156 | 
157 | 
158 | Heatmap(t(scale(t(significant_mat))), 
159 |         top_annotation = col_anno,
160 |         show_row_names = FALSE,
161 |         name = "scaled normalized\nexpression")
162 | ```
163 | 
164 | Why is scaling important?
165 | 
166 | 
167 | ```{r}
168 | Heatmap(significant_mat,  # unscaled values, for contrast with the scaled heatmap
169 |         top_annotation = col_anno,
170 |         show_row_names = FALSE,
171 |         name = "normalized\nexpression")
172 | ```
173 | 
174 | 


--------------------------------------------------------------------------------
/scripts/03_volcano_plot_with_ggrepel.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "volcano plot with ggrepel"
  3 | author: "Ming Tang"
  4 | date: "5/16/2023"
  5 | output: html_document
  6 | editor_options: 
  7 |   chunk_output_type: console
  8 | ---
  9 | 
 10 | 
 11 | Continue with https://github.com/crazyhottommy/compbio_tutorials/blob/main/scripts/02_differential_expression_heatmap.Rmd
 12 | 
 13 | ### read the data into R and make a DESeq2 object 
 14 | 
 15 | follow the tutorial http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html
 16 | 
 17 | ```{r}
 18 | library(dplyr)
 19 | library(readr)
 20 | library(here)
 21 | library(DESeq2)
 22 | library(ggplot2)
 23 | 
 24 | raw_counts<- read_tsv(here("data/raw_counts.tsv"))
 25 | 
 26 | raw_counts_mat<- raw_counts[, -1] %>% as.matrix
 27 | 
 28 | head(raw_counts_mat)
 29 | 
 30 | rownames(raw_counts_mat)<- raw_counts$gene
 31 | head(raw_counts_mat)
 32 | 
 33 | ```
 34 | 
 35 | Make a sample sheet 
 36 | 
 37 | ```{r}
 38 | coldata<- data.frame(condition = c("normoxia", "normoxia", "hypoxia", "hypoxia"))
 39 | 
 40 | rownames(coldata)<- colnames(raw_counts_mat)
 41 | 
 42 | coldata
 43 | ```
 44 | 
 45 | Make a DESeq2 object
 46 | 
 47 | ```{r}
 48 | all(rownames(coldata) == colnames(raw_counts_mat))
 49 | 
 50 | dds <- DESeqDataSetFromMatrix(countData = raw_counts_mat,
 51 |                               colData = coldata,
 52 |                               design = ~ condition)
 53 | dds <- DESeq(dds)
 54 | res <- results(dds, contrast = c("condition", "hypoxia", "normoxia"))
 55 | 
 56 | head(res)
 57 | ```
 58 | 
 59 | ### Make a volcano plot
 60 | 
 61 | what is a volcano plot?
 62 | 
 63 | It is a scatter plot:
 64 | x-axis is the log2Fold change
 65 | 
 66 | y-axis is -log10(p-value)
 67 | 
 68 | 
 69 | ```{r}
 70 | res %>% 
 71 |   as.data.frame() %>%
 72 |   ggplot(aes(x = log2FoldChange, y = -log10(pvalue))) +
 73 |   geom_point()
 74 |   
 75 | ```
 76 | 
 77 | hmm, what are the points on the top?
 78 | 
 79 | ```{r}
 80 | res %>%
 81 |   as.data.frame() %>%
 82 |   arrange((padj), desc(log2FoldChange)) %>%
 83 |   head(n = 30)
 84 | ```
 85 | 
 86 | A basic volcano plot
 87 | 
 88 | ```{r}
 89 | res %>% 
 90 |   as.data.frame() %>%
 91 |   ggplot(aes(x = log2FoldChange, y = -log10(pvalue))) +
 92 |   geom_point() +
 93 |   theme_bw(base_size = 14)
 94 | ```
 95 | 
 96 | How to label the genes?
 97 | 
 98 | ```{r}
 99 | 
100 | res %>% 
101 |   as.data.frame() %>%
102 |   tibble::rownames_to_column(var = "gene") %>%
103 |   filter(!stringr::str_detect(gene, "LOC")) %>% 
104 |   filter(abs(log2FoldChange)>=4, padj <= 0.001) %>%
105 |   dim()
106 | 
107 | 
108 | res_sig<- res %>% 
109 |   as.data.frame() %>%
110 |   tibble::rownames_to_column(var = "gene") %>%
111 |   filter(!stringr::str_detect(gene, "LOC")) %>% 
112 |   filter(abs(log2FoldChange)>=4, padj <= 0.001) 
113 |   
114 | 
115 | res %>% 
116 |   as.data.frame() %>%
117 |   ggplot(aes(x = log2FoldChange, y = -log10(pvalue))) +
118 |   geom_point() +
119 |   geom_label(data = res_sig, aes(label = gene))+
120 |   theme_bw(base_size = 14)
121 | ```
122 | 
123 | 
124 | The labels are overlapping; let's improve them with ggrepel.
125 | ```{r}
126 | #install.packages("ggrepel")
127 | library(ggrepel )
128 | 
129 | 
130 | res %>% 
131 |   as.data.frame() %>%
132 |   ggplot(aes(x = log2FoldChange, y = -log10(pvalue))) +
133 |   geom_point() +
134 |   ggrepel::geom_label_repel(data = res_sig, aes(label = gene))+
135 |   theme_bw(base_size = 14)
136 | ```
137 | 
138 | 
139 | Let's color the points 
140 | 
141 | ```{r}
142 | res2<- res %>% 
143 |   as.data.frame() %>%
144 |   tibble::rownames_to_column(var = "gene") %>%
145 |   mutate(sig = case_when(
146 |     !stringr::str_detect(gene, "LOC") & abs(log2FoldChange)>=4 & 
147 |       padj <= 0.001 ~ "sig",
148 |     TRUE ~ "not sig"
149 |   ))
150 |   
151 | head(res2)
152 | 
153 | ggplot(res2, aes(x = log2FoldChange, y = -log10(pvalue))) +
154 |   geom_point(aes(color = sig)) +
155 |   ggrepel::geom_label_repel(data = res_sig, aes(label = gene))+
156 |   theme_bw(base_size = 14)
157 | ```
158 | 
159 | 
160 | fix the color of the points 
161 | 
162 | ```{r}
163 | ggplot(res2, aes(x = log2FoldChange, y = -log10(pvalue))) +
164 |   geom_point(aes(color = sig)) +
165 |   scale_color_manual(values = c("blue", "red")) +
166 |   ggrepel::geom_label_repel(data = res_sig, aes(label = gene))+
167 |   theme_bw(base_size = 14)
168 | ```
169 | 
170 | add horizontal and vertical lines
171 | ```{r}
172 | ggplot(res2, aes(x = log2FoldChange, y = -log10(pvalue))) +
173 |   geom_point(aes(color = sig)) +
174 |   scale_color_manual(values = c("blue", "red")) +
175 |   ggrepel::geom_label_repel(data = res_sig, aes(label = gene))+
176 |   geom_hline(yintercept = 100, linetype = 2, color = "red") +
177 |   geom_vline(xintercept = c(-4, 4), linetype = 2, color = "red")+
178 |   theme_bw(base_size = 14)
179 | ```
180 | 
181 | Enhanced volcanoplot: https://bioconductor.org/packages/devel/bioc/vignettes/EnhancedVolcano/inst/doc/EnhancedVolcano.html
182 | 
183 | 


--------------------------------------------------------------------------------
/scripts/04_create_seurat_object_from_GEO.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "How to create a seurat object from GEO dataset"
  3 | author: "Ming Tang"
  4 | date: "07/12/2023"
  5 | output: html_document
  6 | editor_options: 
  7 |   chunk_output_type: console
  8 | ---
  9 | 
 10 | 
 11 | ### Download the data 
 12 | 
 13 | https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE116256
 14 | 
 15 | ```{bash}
 16 | 
 17 | cd /Users/tommytang/github_repos/compbio_tutorials/data/GSE116256
 18 | wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE116nnn/GSE116256/suppl/GSE116256_RAW.tar
 19 | tar xvf GSE116256_RAW.tar
 20 | rm  GSE116256_RAW.tar
 21 | ```
 22 | 
 23 | ```{r}
 24 | library(here)
 25 | library(stringr)
 26 | library(dplyr)
 27 | library(ggplot2)
 28 | library(Seurat)
 29 | library(purrr)
 30 | library(readr)
 31 | library(harmony)
 32 | library(scCustomize)
 33 | library(SeuratDisk)
 34 | ```
 35 | 
 36 | read in the count matrix 
 37 | 
 38 | ```{r}
 39 | read_counts <- function(file) {
 40 |   # read the TSV and convert to a base data.frame before assigning row names
 41 |   tbl <- as.data.frame(read_tsv(file))
 42 |   gene_ids <- tbl$Gene
 43 |   tbl <- tbl[, -1]  # drop the first column; assumes it is `Gene` -- TODO confirm
 44 |   rownames(tbl) <- gene_ids
 45 |   as.matrix(tbl)
 46 | }
 47 | 
 48 | 
 49 | # list.files() takes a regular expression, not a glob: escape dots, anchor the end
 50 | counts_files <- list.files(here("data/GSE116256"), full.names = TRUE, pattern = "dem\\.txt\\.gz$")
 51 | # basename() is vectorised, so no map_chr() is needed
 52 | samples <- basename(counts_files)
 53 | samples <- str_replace(samples, "(GSM[0-9]+_.+)\\.dem\\.txt\\.gz$", "\\1")
 54 | # name each path by its sample id so the list read below is named too
 55 | names(counts_files) <- samples
 56 | # only the first four samples are read
 57 | counts <- purrr::map(counts_files[1:4], read_counts)
 58 | 
 59 | ```
 60 | 
 61 | 
 62 | ```{r}
 63 | read_meta <- function(file) {
 64 |   # read the TSV and convert to a base data.frame before assigning row names
 65 |   anno <- as.data.frame(read_tsv(file))
 66 |   cell_ids <- anno$Cell
 67 |   anno <- anno[, -1]  # drop the first column; assumes it is `Cell` -- TODO confirm
 68 |   rownames(anno) <- cell_ids
 69 |   anno
 70 | }
 71 | 
 72 | 
 73 | meta_files <- list.files(here("data/GSE116256"), full.names = TRUE, pattern = "anno\\.txt\\.gz$")
 74 | meta_names <- basename(meta_files)
 75 | meta_names <- str_replace(meta_names, "(GSM[0-9]+_.+)\\.anno\\.txt\\.gz$", "\\1")
 76 | names(meta_files) <- meta_names
 77 | # pattern is a regex (dots escaped, end anchored); only the first four files are read
 78 | meta <- purrr::map(meta_files[1:4], read_meta)
 79 | ```
 80 | 
 81 | ### create a seurat object
 82 | 
 83 | ```{r}
 84 | library(Matrix) #for sparse matrix
 85 | objs<- purrr::map2(counts, meta,  
 86 |                    ~CreateSeuratObject(counts = as(.x, "sparseMatrix"), 
 87 |                                        meta.data = .y))
 88 | 
 89 | 
 90 | # merge to a single object 
 91 | merged_seurat<- purrr::reduce(objs, function(x,y) {merge(x,y)})
 92 | 
 93 | ## free memory
 94 | rm(counts)
 95 | rm(objs)
 96 | rm(meta)
 97 | gc()
 98 | ```
 99 | 
100 | ### preprocess the data 
101 | 
102 | ```{r}
103 | merged_seurat<- merged_seurat %>%
104 |   NormalizeData(normalization.method = "LogNormalize", scale.factor = 10000) %>%
105 |   FindVariableFeatures( selection.method = "vst", nfeatures = 2000) %>%
106 |   ScaleData() %>%
107 |   RunPCA() %>%
108 |   RunHarmony(group.by.vars = "orig.ident", dims.use = 1:30) %>%
109 |   RunUMAP(reduction = "harmony", dims = 1:30) %>%
110 |   FindNeighbors(reduction = "harmony", dims = 1:30) %>% 
111 |   FindClusters(resolution = 0.6)
112 | ```
113 | 
114 | 
115 | ### visualization
116 | 
117 | ```{r}
118 | DimPlot_scCustom(seurat_object = merged_seurat)
119 | 
120 | ```
121 | 


--------------------------------------------------------------------------------
/scripts/05_find_tissue_specific_genes_human_protein_atlas.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "05_find_tissue_specific_genes_human_protein_atlas"
 3 | author: "Ming Tang"
 4 | date: "6/15/2023"
 5 | output: html_document
 6 | editor_options: 
 7 |   chunk_output_type: console
 8 | ---
 9 | 
10 | http://www.bioconductor.org/packages/devel/bioc/vignettes/HPAanalyze/inst/doc/b_HPAanalyze_indepth.html
11 | 
12 | 
13 | ```{r}
14 | #BiocManager::install("HPAanalyze")
15 | library(tidyverse)
16 | library(HPAanalyze)
17 | ```
18 | 
19 | Most of the time, you will only need the “histology” datasets, which contain normal_tissue, pathology (basically cancers) and subcellular_location.
20 | 
21 | ```{r}
22 | downloadedData <- hpaDownload(downloadList='histology')
23 | ```
24 | 
25 | The `normal_tissue` dataset contains information about protein expression profiles in human tissues based on IHC staining. The datasets contain six columns: ensembl (Ensembl gene identifier); gene (HGNC symbol), tissue (tissue name); cell_type (annotated cell type); level (expression value); reliability (the gene reliability of the expression value)
26 | 
27 | ```{r}
28 | 
29 | names(downloadedData)
30 | 
31 | downloadedData$normal_tissue %>%
32 |   head()
33 | 
34 | table(downloadedData$normal_tissue$level)
35 | 
36 | downloadedData$normal_tissue %>%
37 |   filter(cell_type == "smooth muscle cells") %>%
38 |   janitor::tabyl(tissue, cell_type)
39 | 
40 | downloadedData$normal_tissue %>%
41 |   pivot_wider(names_from = c("tissue", "cell_type"), values_from = "level") %>%
42 |   View()
43 | 
44 | data<- downloadedData$normal_tissue %>%
45 |   pivot_wider(names_from = c("tissue", "cell_type"), values_from = "level") %>%
46 |   filter(`smooth muscle_smooth muscle cells` %in% c("High", "Medium"))
47 | 
48 | View(data)
49 | ```
50 | 
51 | What are the membrane genes?
52 | 
53 | ```{r}
54 | head(downloadedData$subcellular_location)
55 | 
56 | table(downloadedData$subcellular_location$main_location) %>% 
57 |   sort()
58 | 
59 | memberane_genes<- downloadedData$subcellular_location %>% 
60 |   filter(str_detect(main_location, "Plasma membrane")) 
61 | 
62 | inner_join(data, memberane_genes, by = c("ensembl" = "ensembl")) 
63 | ```
64 | 
65 | 
66 | Make a heatmap 
67 | 
68 | ```{r}
69 | library(ComplexHeatmap)
70 | 
71 | data2<- data %>% 
72 |   filter(gene %in% memberane_genes$gene)
73 |   
74 | mat<- as.matrix(data2[, -c(1,2,3)])
75 | rownames(mat)<- data2$gene
76 | 
77 | ComplexHeatmap::Heatmap(mat,
78 |                         show_row_names = FALSE,
79 |                         show_column_names = FALSE)
80 | ```
81 | 
82 | 
83 | 
84 | 


--------------------------------------------------------------------------------
/scripts/06_scRNseq_two_lines_from_fastq_to_count_matrix.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "06_two_lines_command_scRNAseq_from_fastq_to_count_matrix"
  3 | author: "Ming Tang"
  4 | date: "6/19/2023"
  5 | output: html_document
  6 | editor_options: 
  7 |   chunk_output_type: console
  8 | ---
  9 | 
 10 | ### Let's use 10x genomics data 
 11 | 
 12 | https://www.10xgenomics.com/resources/datasets/human-pbmc-from-a-healthy-donor-1-k-cells-v-2-2-standard-4-0-0
 13 | 
 14 | 
 15 | install tools 
 16 | 
 17 | gget https://github.com/pachterlab/gget
 18 | 
 19 | https://github.com/mamba-org/mamba
 20 | 
 21 | https://www.kallistobus.tools/
 22 | 
 23 | ```{bash}
 24 | mamba create -n kb-python python=3.7
 25 | conda activate kb-python
 26 | pip install kb-python gget ffq
 27 | 
 28 | ```
 29 | 
 30 | 
 31 | ```{bash}
 32 | time kb ref \
 33 |   -i index.idx \
 34 |   -g t2g.txt \
 35 |   -f1 transcriptome.fa \
 36 |   $(gget ref --ftp -w dna,gtf homo_sapiens)
 37 |   
 38 |   
 39 | Mon Jun 19 23:26:27 2023 INFO Fetching reference information for homo_sapiens from Ensembl release: 109.
 40 | [2023-06-19 23:26:30,564]    INFO [ref] Preparing http://ftp.ensembl.org/pub/release-109/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz, http://ftp.ensembl.org/pub/release-109/gtf/homo_sapiens/Homo_sapiens.GRCh38.109.gtf.gz
 41 | [2023-06-19 23:28:19,733]    INFO [ref] Splitting genome http://ftp.ensembl.org/pub/release-109/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz into cDNA at /Users/tommytang/playground/tmp/tmp6352jm21
 42 | [2023-06-19 23:52:12,318]    INFO [ref] Concatenating 1 cDNAs to transcriptome.fa
 43 | [2023-06-19 23:52:14,048]    INFO [ref] Creating transcript-to-gene mapping at t2g.txt
 44 | [2023-06-19 23:52:17,463]    INFO [ref] Indexing transcriptome.fa to index.idx
 45 | kb ref -i index.idx -g t2g.txt -f1 transcriptome.fa   716.34s user 32.85s system 36% cpu 34:25.42 total
 46 | ```
 47 | 
 48 | About 34 minutes of wall-clock time (roughly 12 minutes of user CPU time)
 49 | 
 50 | prepare fastq input
 51 | 
 52 | ```{bash}
 53 | cd sc5p_v2_hs_PBMC_1k_5gex_fastqs
 54 | 
 55 | ls -1d  sc5p_v2_hs_PBMC_1k_5gex_fastqs/* | grep -E "R1|R2"
 56 | 
 57 | ls -1d  sc5p_v2_hs_PBMC_1k_5gex_fastqs/* | grep -E "R1|R2" | tr "\n" " " 
 58 | 
 59 | sc5p_v2_hs_PBMC_1k_5gex_S1_L001_R1_001.fastq.gz	sc5p_v2_hs_PBMC_1k_5gex_S1_L001_R2_001.fastq.gz	sc5p_v2_hs_PBMC_1k_5gex_S1_L002_R1_001.fastq.gz	sc5p_v2_hs_PBMC_1k_5gex_S1_L002_R2_001.fastq.gz
 60 | ```
 61 | 
 62 | ```{bash}
 63 | time kb count \
 64 |   -i index.idx \
 65 |   -g t2g.txt \
 66 |   -x 10xv2 \
 67 |   -t 8 \
 68 |   -m 16G \
 69 |   -o out \
 70 |   sc5p_v2_hs_PBMC_1k_5gex_fastqs/sc5p_v2_hs_PBMC_1k_5gex_S1_L001_R1_001.fastq.gz sc5p_v2_hs_PBMC_1k_5gex_fastqs/sc5p_v2_hs_PBMC_1k_5gex_S1_L001_R2_001.fastq.gz sc5p_v2_hs_PBMC_1k_5gex_fastqs/sc5p_v2_hs_PBMC_1k_5gex_S1_L002_R1_001.fastq.gz sc5p_v2_hs_PBMC_1k_5gex_fastqs/sc5p_v2_hs_PBMC_1k_5gex_S1_L002_R2_001.fastq.gz
 71 |   
 72 |   
 73 | 
 74 | [2023-06-21 22:04:33,465]    INFO [count] Using index index.idx to generate BUS file to out from
 75 | [2023-06-21 22:04:33,466]    INFO [count]         sc5p_v2_hs_PBMC_1k_5gex_fastqs/sc5p_v2_hs_PBMC_1k_5gex_S1_L001_R1_001.fastq.gz
 76 | [2023-06-21 22:04:33,466]    INFO [count]         sc5p_v2_hs_PBMC_1k_5gex_fastqs/sc5p_v2_hs_PBMC_1k_5gex_S1_L001_R2_001.fastq.gz
 77 | [2023-06-21 22:04:33,466]    INFO [count]         sc5p_v2_hs_PBMC_1k_5gex_fastqs/sc5p_v2_hs_PBMC_1k_5gex_S1_L002_R1_001.fastq.gz
 78 | [2023-06-21 22:04:33,466]    INFO [count]         sc5p_v2_hs_PBMC_1k_5gex_fastqs/sc5p_v2_hs_PBMC_1k_5gex_S1_L002_R2_001.fastq.gz
 79 | 
 80 | [2023-06-21 22:06:50,383]    INFO [count] Sorting BUS file out/output.bus to out/tmp/output.s.bus
 81 | [2023-06-21 22:07:05,398]    INFO [count] Whitelist not provided
 82 | [2023-06-21 22:07:05,399]    INFO [count] Copying pre-packaged 10XV2 whitelist to out
 83 | [2023-06-21 22:07:05,513]    INFO [count] Inspecting BUS file out/tmp/output.s.bus
 84 | [2023-06-21 22:07:08,064]    INFO [count] Correcting BUS records in out/tmp/output.s.bus to out/tmp/output.s.c.bus with whitelist out/10x_version2_whitelist.txt
 85 | [2023-06-21 22:07:12,351]    INFO [count] Sorting BUS file out/tmp/output.s.c.bus to out/output.unfiltered.bus
 86 | [2023-06-21 22:07:26,625]    INFO [count] Generating count matrix out/counts_unfiltered/cells_x_genes from BUS file out/output.unfiltered.bus
 87 | kb count -i index.idx -g t2g.txt -x 10xv2 -t 8 -m 16G -o out      851.81s user 26.97s system 469% cpu 3:07.26 total
 88 | 
 89 | ```
 90 | 
 91 | About 3 minutes!
 92 | 
 93 | read in the unfiltered count matrix and filter out the empty droplets
 94 | ```{r}
 95 | library(Matrix, quietly=T) # load libraries
 96 | library(DropletUtils, quietly=T)
 97 | library(dplyr)
 98 | library(ggplot2)
 99 | ```
100 | 
101 | read in the matrix, genes and barcodes
102 | 
103 | ```{r}
104 | raw_mtx <- readMM('~/playground/out/counts_unfiltered/cells_x_genes.mtx')
105 | genes <- read.csv('~/playground/out/counts_unfiltered/cells_x_genes.genes.txt', sep = '\t', header = FALSE) 
106 | barcodes<- read.csv('~/playground/out/counts_unfiltered/cells_x_genes.barcodes.txt', sep = '\t', header = FALSE) 
107 | 
108 | # the file is stored cells x genes; transpose so rows are genes and columns are barcodes
109 | raw_mtx<- t(raw_mtx)
110 | rownames(raw_mtx) <- genes[,1] # attach gene_ids
111 | colnames(raw_mtx) <- barcodes[,1] # attach cell barcodes
112 | ```
113 | 
114 | draw knee plot
115 | 
116 | ```{r}
117 | tot_counts <- colSums(raw_mtx)
118 | 
119 | df <- tibble(total = tot_counts,
120 |              rank = row_number(desc(total))) %>%
121 |       distinct() %>%
122 |       arrange(rank)
123 | 
124 | ggplot(df, aes(total, rank)) +
125 |   geom_path() +
126 |   scale_x_log10() + 
127 |   scale_y_log10() + 
128 |   annotation_logticks() +
129 |   labs(y = "Barcode rank", x = "Total UMI count")
130 | ```
131 | 
132 | 
133 | 
134 | ```{r}
135 | set.seed(100) # emptyDrops() relies on random simulations; fix the seed so the cell calls are reproducible
136 | out <- emptyDrops(raw_mtx) # test each barcode for being a cell rather than an empty droplet
137 | keep <- !is.na(out$FDR) & out$FDR <= 0.05 # FDR cutoff for calling a cell; a barcode with NA FDR is not kept
138 | filt_mtx <- raw_mtx[,keep] # subset raw mtx to remove empty drops
139 | 
140 | dim(filt_mtx)
141 | ```
142 | 
143 | ### SRR files from SRA 
144 | 
145 | https://github.com/pachterlab/ffq
146 | 
147 | 
148 | ```{bash}
149 | ffq SRR9990627
150 | ```
151 | 
152 | 
153 | 
154 | ```{bash}
155 | time kb count \
156 |   -i index.idx \
157 |   -g t2g.txt \
158 |   -x 10xv2 \
159 |   -t 8 \
160 |   -m 16G \
161 |   -o out \
162 |   $(ffq --ftp SRR10668798 | jq -r '.[] | .url' | tr '\n' ' ')
163 | ```
164 | 
165 | 
166 | 


--------------------------------------------------------------------------------
/scripts/07_gene_set_enrichment_RNAseq.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "gene set enrichment analysis from RNAseq data"
  3 | author: "Ming Tang"
  4 | date: '2023-06-26'
  5 | output: html_document
  6 | editor_options: 
  7 |   chunk_output_type: console
  8 | ---
  9 | 
 10 | 
 11 | ## Gene set enrichment analysis 
 12 | 
 13 | Let's use a real example https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE197576
 14 | 
 15 | How to download the files from ftp https://www.ncbi.nlm.nih.gov/geo/info/download.html
 16 | 
 17 | https://ftp.ncbi.nlm.nih.gov/geo/series/GSE197nnn/GSE197576/suppl/
 18 | 
 19 | Alternatively, use GEOquery https://bioconductor.org/packages/release/bioc/html/GEOquery.html
 20 | 
 21 | ```{bash eval=FALSE}
 22 | wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE197nnn/GSE197576/suppl/GSE197576_raw_gene_counts_matrix.tsv.gz
 23 | 
 24 | csvtk pretty -t  GSE197576_raw_gene_counts_matrix.tsv.gz | less -S
 25 | 
 26 | csvtk headers -t   GSE197576_raw_gene_counts_matrix.tsv.gz
 27 | 
 28 | csvtk cut -t -f1,2,3,12,13 GSE197576_raw_gene_counts_matrix.tsv.gz| head
 29 | 
 30 | csvtk cut -t -f1,2,3,12,13 GSE197576_raw_gene_counts_matrix.tsv.gz > raw_counts.tsv
 31 | ```
 32 | 
 33 | Get csvtk at https://github.com/shenwei356/csvtk
 34 | 
 35 | ### read the data into R and make a DESeq2 object 
 36 | 
 37 | follow the tutorial http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html
 38 | 
 39 | ```{r}
 40 | library(dplyr)
 41 | library(readr)
 42 | library(here)
 43 | library(DESeq2)
 44 | 
 45 | raw_counts<- read_tsv(here("data/raw_counts.tsv"))
 46 | 
 47 | raw_counts_mat<- raw_counts[, -1] %>% as.matrix
 48 | 
 49 | head(raw_counts_mat)
 50 | 
 51 | rownames(raw_counts_mat)<- raw_counts$gene
 52 | head(raw_counts_mat)
 53 | 
 54 | ```
 55 | 
 56 | Make a sample sheet 
 57 | 
 58 | ```{r}
 59 | coldata<- data.frame(condition = c("normoxia", "normoxia", "hypoxia", "hypoxia"))
 60 | 
 61 | rownames(coldata)<- colnames(raw_counts_mat)
 62 | 
 63 | coldata
 64 | ```
 65 | 
 66 | Make a DESeq2 object
 67 | 
 68 | ```{r}
 69 | all(rownames(coldata) == colnames(raw_counts_mat))
 70 | 
 71 | dds <- DESeqDataSetFromMatrix(countData = raw_counts_mat,
 72 |                               colData = coldata,
 73 |                               design = ~ condition)
 74 | dds <- DESeq(dds)
 75 | res <- results(dds, contrast = c("condition", "hypoxia", "normoxia"))
 76 | 
 77 | res %>%
 78 |   as.data.frame() %>%
 79 |   arrange((padj), desc(log2FoldChange)) %>%
 80 |   head(n=30)
 81 | 
 82 | 
 83 | significant_genes<- res %>%
 84 |   as.data.frame() %>%
 85 |   filter(padj <=0.01, abs(log2FoldChange) >= 2) %>% 
 86 |   rownames()
 87 | 
 88 | 
 89 | significant_genes
 90 | ```
 91 | 
 92 | ## pathway analysis
 93 | 
 94 | https://yulab-smu.top/biomedical-knowledge-mining-book/enrichment-overview.html
 95 | 
 96 | ### over-representation test
 97 | 
 98 | ```{r}
 99 | library(clusterProfiler)
100 | library(org.Hs.eg.db) # attach explicitly: enrichGO() below is passed the OrgDb object itself
101 | 
102 | # convert gene symbols to Entrez IDs, the ID type passed to the enrichment calls below
103 | significant_genes_map<- clusterProfiler::bitr(geneID = significant_genes,
104 |                       fromType="SYMBOL", toType="ENTREZID",
105 |                       OrgDb="org.Hs.eg.db")
106 | 
107 | head(significant_genes_map)
108 | 
109 | ## background genes are genes that are detected in the RNAseq experiment 
110 | ## (non-zero baseMean); keep the filtered table because the GSEA chunk reuses it
111 | res_df<- res %>% 
112 |   as.data.frame() %>% 
113 |   filter(baseMean != 0) %>%
114 |   tibble::rownames_to_column(var = "gene")
115 | 
116 | # derive the background symbols from res_df instead of repeating the same pipeline
117 | background_genes<- res_df$gene
118 | 
119 | 
120 | # map the background the same way; bitr() warns about symbols it cannot map
121 | # and leaves them out of the returned table
122 | background_genes_map<- bitr(geneID = background_genes, 
123 |                             fromType="SYMBOL", 
124 |                             toType="ENTREZID",
125 |                       OrgDb="org.Hs.eg.db")
126 | ```
127 | 
128 | GO term enrichment 
129 | 
130 | Gene Ontology(GO) defines concepts/classes used to describe gene function, and relationships between these concepts. It classifies functions along three aspects:
131 | 
132 | MF: Molecular Function
133 | molecular activities of gene products
134 | 
135 | CC: Cellular Component
136 | where gene products are active
137 | 
138 | BP: Biological Process
139 | pathways and larger processes made up of the activities of multiple gene products
140 | 
141 | GO terms are organized in a directed acyclic graph, where edges between terms represent parent-child relationship.
142 | 
143 | ```{r}
144 | ego <- enrichGO(gene          = significant_genes_map$ENTREZID,
145 |                 universe      = background_genes_map$ENTREZID,
146 |                 OrgDb         = org.Hs.eg.db,
147 |                 ont           = "BP",
148 |                 pAdjustMethod = "BH",
149 |                 pvalueCutoff  = 0.01,
150 |                 qvalueCutoff  = 0.05,
151 |                 readable      = TRUE)
152 | head(ego)
153 | 
154 | library(enrichplot)
155 | barplot(ego, showCategory=20) 
156 | dotplot(ego)
157 | ```
158 | 
159 | 
160 | H: hallmark gene sets
161 | C1: positional gene sets
162 | C2: curated gene sets
163 | C3: motif gene sets
164 | C4: computational gene sets
165 | C5: GO gene sets
166 | C6: oncogenic signatures
167 | C7: immunologic signatures
168 | 
169 | 
170 | ```{r}
171 | # install.packages("msigdbr")
172 | library(msigdbr)
173 | 
174 | m_df <- msigdbr(species = "Homo sapiens")
175 | head(m_df)
176 | 
177 | m_t2g <- msigdbr(species = "Homo sapiens", category = "H") %>% 
178 |   dplyr::select(gs_name, entrez_gene)
179 | 
180 | 
181 | table(m_t2g$gs_name)
182 | head(m_t2g)
183 | 
184 | em <- enricher(significant_genes_map$ENTREZID, TERM2GENE=m_t2g, 
185 |                universe = background_genes_map$ENTREZID )
186 | head(em)
187 | ```
188 | 
189 | ### Gene set enrichment analysis
190 | 
191 | ```{r}
192 | ## you need all the genes and pre-rank them by p-value
193 | ## rank all the genes by signed fold change * -log10pvalue.
194 | 
195 | res_df<- res_df %>% 
196 |   mutate(signed_rank_stats = sign(log2FoldChange) * -log10(pvalue)) %>%
197 |   left_join(background_genes_map, by= c("gene" = "SYMBOL")) %>%
198 |   arrange(desc(signed_rank_stats))
199 | 
200 | gene_list<- res_df$signed_rank_stats
201 | names(gene_list)<- res_df$ENTREZID
202 | 
203 | em2 <- GSEA(gene_list, TERM2GENE=m_t2g)
204 | 
205 | ## change the inf to big numbers, then rebuild the ranked gene list
206 | res_df<- res_df %>%
207 |   mutate(negative_log10pvalue = -log10(pvalue)) %>%
208 |   mutate(negative_log10pvalue = ifelse(is.infinite(negative_log10pvalue), 1000, negative_log10pvalue)) %>%
209 |   mutate(signed_rank_stats = sign(log2FoldChange) * negative_log10pvalue) %>%
210 |   arrange(desc(signed_rank_stats)) # keep the table sorted after recomputing the statistic
211 | ranked_df<- res_df %>% filter(!is.na(ENTREZID)) %>% distinct(ENTREZID, .keep_all = TRUE) # first (top-ranked) row per Entrez ID
212 | gene_list<- ranked_df$signed_rank_stats
213 | names(gene_list)<- ranked_df$ENTREZID
214 | # GSEA() needs a decreasingly sorted numeric vector with one unique, non-missing name per gene
215 | em2 <- GSEA(gene_list, TERM2GENE=m_t2g)
216 | head(em2)
217 | 
218 | em2@result %>% View()
219 | ```
220 | 
221 | ### visualization 
222 | 
223 | 
224 | ```{r}
225 | library(patchwork) # provides the `/` operator used below to stack two ggplot objects
226 | p1<- gseaplot(em2, geneSetID = "HALLMARK_G2M_CHECKPOINT", 
227 |               by = "runningScore", title = "HALLMARK_G2M_CHECKPOINT")
228 | p2 <- gseaplot(em2, geneSetID = "HALLMARK_HYPOXIA", 
229 |                by = "runningScore", title = "HALLMARK_HYPOXIA")
230 | 
231 | p1/p2
232 | ```
233 | 
234 | 
235 | important thread on background gene selection https://twitter.com/mdziemann/status/1626407797939384320 by Mark Ziemann
236 | 
237 | Further reading https://twitter.com/tangming2005/status/1671873310257295360
238 | 
239 | 


--------------------------------------------------------------------------------
/scripts/08_intro_to_singleCellExperiment.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "singleCellExperiment"
  3 | author: "Ming Tang"
  4 | date: '2024-06-01'
  5 | output: html_document
  6 | editor_options: 
  7 |   chunk_output_type: console
  8 | ---
  9 | 
 10 | ### introduction to singleCellExperiment 
 11 | 
 12 | references:
 13 | 
 14 | https://www.bioconductor.org/packages/release/bioc/vignettes/SingleCellExperiment/inst/doc/intro.html
 15 | 
 16 | 
 17 | ```{r}
 18 | if (!requireNamespace("BiocManager", quietly = TRUE))
 19 |     install.packages("BiocManager")
 20 | 
 21 | BiocManager::install("scater")
 22 | 
 23 | BiocManager::install("SingleCellExperiment")
 24 | 
 25 | library(SingleCellExperiment)
 26 | library(scater)
 27 | ```
 28 | 
 29 | https://ftp.ncbi.nlm.nih.gov/geo/series/GSE126nnn/GSE126030/suppl/
 30 | 
 31 | ```{bash eval=FALSE}
 32 | wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE126nnn/GSE126030/suppl/GSE126030_RAW.tar
 33 | 
 34 | tar -xvf GSE126030_RAW.tar
 35 | ```
 36 | 
 37 | 
 38 | ```{r}
 39 | library(tidyverse)
 40 | library(here)
 41 | counts_df<- read_tsv(here("data/GSM3589407_PP002swap.filtered.matrix.txt.gz"))
 42 | 
 43 | colnames(counts_df) %>% tail()
 44 | 
 45 | counts_df$...2466 %>% head()
 46 | 
 47 | table(counts_df$...2466, useNA="ifany")
 48 | ```
 49 | 
 50 | remove the last column
 51 | ```{r}
 52 | length(colnames(counts_df))
 53 | counts_df<- counts_df[, -2466]
 54 | 
 55 | counts_df[1:5, 1:5]
 56 | 
 57 | counts_mat<- counts_df[, -c(1,2)] %>%
 58 |   as.matrix()
 59 | 
 60 | rownames(counts_mat)<- counts_df$Gene
 61 | 
 62 | counts_mat[1:5, 1:5]
 63 | ```
 64 | 
 65 | ### Create a SingleCellExperiment Object 
 66 | 
 67 | ```{r}
 68 | sce <- SingleCellExperiment(assays = list(counts = counts_mat))
 69 | 
 70 | sce
 71 | ```
 72 | To access the count data we just supplied, we can do any one of the following:
 73 | 
 74 | ```{r}
 75 | assay(sce, "counts")
 76 | 
 77 | assay(sce, "counts")[1:5, 1:5]
 78 | 
 79 | counts(sce)[1:5, 1:5]
 80 | ```
 81 | 
 82 | One can access the slots with the `@` operator. This is considered bad practice as the class developers are free to alter the internal structure of the class, at which point any code using `@` may no longer work. Rather, it is best to use the provided **getter** functions like `assay()` and `counts()`
 83 | 
 84 | ```{r}
 85 | sce@assays
 86 | 
 87 | sce@assays@data
 88 | 
 89 | sce@assays@data$counts[1:5, 1:5]
 90 | ```
 91 | 
 92 | add more assays
 93 | 
 94 | ```{r}
 95 | sce <- scuttle::logNormCounts(sce)
 96 | sce
 97 | ```
 98 | 
 99 | ```{r}
100 | sce@assays@data$logcounts[1:5, 1:5]
101 | 
102 | assay(sce, "logcounts")[1:5, 1:5]
103 | 
104 | dim(logcounts(sce))
105 | ```
106 | 
107 | You may add the assay manually
108 | 
109 | ```{r}
110 | counts_1 <- counts(sce) + 1
111 | 
112 | assay(sce, "counts_1") <- counts_1 # assign a new entry to assays slot
113 | 
114 | assays(sce)
115 | 
116 | # not recommended way to add new assay data
117 | sce@assays@data$counts_2<- counts(sce) + 2
118 | assays(sce)
119 | ```
120 | 
121 | ```{r}
122 | assayNames(sce)
123 | ```
124 | 
125 | ### add metadata
126 | 
127 | ```{r}
128 | sce <- scuttle::addPerCellQC(sce)
129 | 
130 | 
131 | colData(sce)
132 | 
133 | coldata<- colData(sce)
134 | 
135 | identical(rownames(coldata), colnames(counts_mat))
136 | 
137 | sce <- scuttle::addPerFeatureQC(sce)
138 | rowData(sce)
139 | ```
140 | 
141 | ```{r}
142 | rowRanges(sce)
143 | ```
144 | 
145 | ### other metadata
146 | 
147 | The `metadata` slot is a named list of arbitrary objects. For example, say we have some favorite genes (e.g., highly variable genes) that we want to store inside of sce for use in our analysis at a later point. 
148 | 
149 | ```{r}
150 | my_genes <- c("gene_1", "gene_5")
151 | metadata(sce) <- list(favorite_genes = my_genes)
152 | metadata(sce)
153 | ```
154 | 
155 | ### dimension reduction
156 | 
157 | ```{r}
158 | sce <- scater::logNormCounts(sce)
159 | 
160 | sce <- scater::runPCA(sce)
161 | 
162 | sce
163 | 
164 | 
165 | reducedDim(sce, "PCA")[1:5, 1:5]
166 | 
167 | dim(reducedDim(sce, "PCA"))
168 | 
169 | sce <- scater::runTSNE(sce, perplexity = 0.1)
170 | 
171 | reducedDim(sce, "TSNE")[1:5, 1:2]
172 | 
173 | reducedDims(sce)
174 | 
175 | 
176 | reducedDimNames(sce)
177 | ```
178 | 
179 | ### add a custom dimension reduction
180 | 
181 | ```{r}
182 | u <- uwot::umap(t(logcounts(sce)), n_neighbors = 2)
183 | reducedDim(sce, "UMAP_uwot") <- u
184 | 
185 | reducedDims(sce) # Now stored in the object.
186 | ```
187 | 
188 | 
189 | ```{r}
190 | colLabels(sce) <- scran::clusterCells(sce, use.dimred="PCA")
191 | table(colLabels(sce))
192 | ```
193 | 
194 | ```{r}
195 | scater::plotReducedDim(sce, dimred="TSNE")
196 | ```
197 | 
198 | 


--------------------------------------------------------------------------------
/scripts/09_intro_to_seurat_V5.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "09_intro_to_seurat_v5"
  3 | author: "Ming Tang"
  4 | date: '2024-06-06'
  5 | output: html_document
  6 | editor_options: 
  7 |   chunk_output_type: console
  8 | ---
  9 | 
 10 | ### Overview of Layers in Seurat V5
 11 | 
 12 | In Seurat V5, a Layer is a new abstraction that can encapsulate different modalities or features of the single-cell data. This allows users to seamlessly integrate and analyze data from various sources while retaining the ability to use familiar Seurat functions.
 13 | 
 14 | Key Concepts of Layers
 15 | Layered Data: Each layer can represent a modality (like RNA, protein, etc.) or a different version of data (like log-normalized versus raw counts).
 16 | 
 17 | ```{r}
 18 | options(Seurat.object.assay.version = "v5")
 19 | library(Seurat)
 20 | library(dplyr)
 21 | 
 22 | # packageVersion("Seurat")
 23 | 
 24 | ```
 25 | 
 26 | Load the example data
 27 | ```{r}
 28 | # devtools::install_github('satijalab/seurat-data')
 29 | library(SeuratData)
 30 | InstallData("pbmc3k")
 31 | data("pbmc3k")
 32 | pbmc3k
 33 | 
 34 | pbmc3k<- UpdateSeuratObject(pbmc3k)
 35 | 
 36 | pbmc3k
 37 | ```
 38 | 
 39 | 
 40 | ```{r}
 41 | pbmc3k<- pbmc3k %>% 
 42 |   NormalizeData(normalization.method = "LogNormalize", scale.factor = 10000) %>%
 43 |   FindVariableFeatures(selection.method = "vst", nfeatures = 2000) %>%
 44 |   ScaleData() %>%
 45 |   RunPCA(verbose = FALSE) %>%
 46 |   RunUMAP(dims = 1:10, verbose = FALSE)
 47 | ```
 48 | 
 49 | ### Seurat V5
 50 | 
 51 | Seurat v5 assays store data in layers. These layers can store raw, un-normalized counts (layer='counts'), normalized data (layer='data'), or z-scored/variance-stabilized data (layer='scale.data').
 52 | 
 53 | ```{r}
 54 | pbmc3k@meta.data %>% head()
 55 | pbmc3k@assays
 56 | pbmc3k@assays$RNA
 57 | pbmc3k[["RNA"]]
 58 | 
 59 | 
 60 | pbmc3k@assays$RNA$counts[1:5, 1:5]
 61 | pbmc3k[["RNA"]]@counts[1:5, 1:5]
 62 | 
 63 | Layers(pbmc3k)
 64 | LayerData(pbmc3k, "counts")[1:5, 1:5]
 65 | 
 66 | pbmc3k[["RNA"]]$data
 67 | LayerData(pbmc3k, "data")[1:5, 1:5]
 68 | 
 69 | 
 70 | ## use the getter function (`layer` replaces the deprecated `slot` argument in Seurat v5)
 71 | GetAssayData(object = pbmc3k, layer = 'data')[1:5, 1:5]
 72 | 
 73 | ```
 74 | 
 75 | dimension reduction
 76 | 
 77 | ```{r}
 78 | pbmc3k@reductions
 79 | 
 80 | pbmc3k@reductions$pca
 81 | 
 82 | pbmc3k@reductions$pca@cell.embeddings %>% head()
 83 | 
 84 | pbmc3k[["pca"]]
 85 | pbmc3k[["pca"]]@cell.embeddings %>% head()
 86 | 
 87 | ## getter function
 88 | 
 89 | Embeddings(pbmc3k, "pca") %>% head()
 90 | Embeddings(pbmc3k, "umap") %>% head()
 91 | 
 92 | ### feature (gene) loadings
 93 | Loadings(pbmc3k, "pca") %>% head()
 94 | 
 95 | ```
 96 | 
 97 | ### convert between V4 and V5
 98 | 
 99 | ```{r}
100 | obj<- pbmc3k
101 | 
102 | #convert v5 assay to v4 assay within same object
103 | obj[["RNA"]]<- as(obj[["RNA"]], Class= "Assay")
104 | 
105 | class(obj[["RNA"]])
106 | # convert it back
107 | obj[["RNA"]]<- as(obj[["RNA"]], Class= "Assay5")
108 | 
109 | class(obj[["RNA"]])
110 | 
111 | pbmc3k_v3 <- Convert(pbmc3k, version = "3.0")
112 | 
113 | ```
114 | 
115 | 


--------------------------------------------------------------------------------
/scripts/10_analyze_my_tweets.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Untitled"
 3 | output: html_document
 4 | date: "2024-10-03"
 5 | editor_options: 
 6 |   chunk_output_type: console
 7 | ---
 8 | 
 9 | ```{r}
10 | # Load necessary libraries
11 | library(jsonlite)
12 | library(dplyr)
13 | library(tidyr)
14 | library(purrr)
15 | 
16 | # Step 1: Read the raw file
17 | raw_data <- readLines("~/Downloads/twitter-2024-10-01-52b49f0122d5bd108bcb56a0d40e5f809f4ade7fdbf0c0e457d924d980f12230/data/tweets.js")
18 | 
19 | # Step 2: Remove the JavaScript variable declaration ("window.YTD.tweets.part0 ="); fixed = TRUE matches it literally
20 | json_data <- gsub("window.YTD.tweets.part0 = ", "", raw_data, fixed = TRUE)
21 | 
22 | # Step 3: Collapse the data into a single string (in case it's split across multiple lines)
23 | json_data <- paste(json_data, collapse = "")
24 | 
25 | # Step 4: Remove any trailing semi-colon at the end (if present)
26 | json_data <- gsub(";$", "", json_data)
27 | 
28 | # Step 5: Now parse the cleaned JSON
29 | tweets_data <- fromJSON(json_data)
30 | 
31 | head(tweets_data)
32 | colnames(tweets_data)
33 | class(tweets_data)
34 | 
35 | ## get the dataframe
36 | tweets_df<- tweets_data$tweet
37 | 
38 | 
39 | colnames(tweets_df)
40 | # Step 7: View the structure of the resulting dataframe
41 | glimpse(tweets_df)
42 | 
43 | # Optionally, you can now extract specific fields like "full_text" and "created_at"
44 | tweets_cleaned <- tweets_df %>%
45 |   select(created_at, full_text, favorite_count, retweet_count) %>%
46 |   mutate(favorite_count = as.numeric(favorite_count),
47 |          retweet_count = as.numeric(retweet_count))
48 |   
49 | # View cleaned dataframe
50 | head(tweets_cleaned)
51 | 
52 | tweets_cleaned %>%
53 |   arrange(desc(favorite_count)) %>%
54 |   View()
55 | 
56 | ```
57 | 
58 | 
59 | 
60 | ```{r}
61 | # Step 6: Convert to a tibble (dataframe) using tidyverse
62 | tweets_df <- as_tibble(tweets_data)
63 | 
64 | # Step 7: Extract key tweet information, including URLs
65 | # `temp` keeps the fully unnested table (with `urls` and the new `url` list-column) for the inspection below
66 | temp <- tweets_df %>%
67 |   unnest_wider(tweet) %>%
68 |   unnest_wider(entities) %>%
69 |   unnest_longer(urls, keep_empty = TRUE) %>%
70 |   mutate(url = map(urls, ~ if (is.data.frame(.x) && "url" %in% colnames(.x)) .x$url else NA_character_))
71 | # final table: only the columns of interest
72 | tweets_cleaned <- temp %>%
73 |   select(created_at, full_text, favorite_count, retweet_count, url)
74 |  temp %>%
75 |    mutate(url = map_chr(urls, ~ if (is.data.frame(.x) && "url" %in% colnames(.x)) .x$url else NA_character_))
76 |  
77 |  temp$url[[1]]
78 | # Step 8: View cleaned dataframe with URLs
79 | head(tweets_cleaned)
80 | ```
81 | 
82 | 


--------------------------------------------------------------------------------
/scripts/11_change_rownames_ENSEMBL_to_symbol_RNAseq.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "change gene names"
  3 | output: html_document
  4 | date: "2024-10-10"
  5 | editor_options: 
  6 |   chunk_output_type: console
  7 | ---
  8 | 
  9 | ### The problem
 10 | You have a count matrix whose row names are ENSEMBL IDs, and you want 
 11 | to change them to gene symbols.
 12 | 
 13 | 
 14 | ```{r}
 15 | #BiocManager::install("recount3")
 16 | library(recount3)
 17 | library(purrr)
 18 | library(dplyr)
 19 | library(ggplot2)
 20 | human_projects <- available_projects()
 21 | 
 22 | tcga_info = subset(
 23 |     human_projects,
 24 |     file_source == "tcga" & project_type == "data_sources"
 25 | )
 26 | 
 27 | head(tcga_info)
 28 | ```
 29 | 
 30 | 
 31 | ```{r}
 32 | tcga_info[1, ]
 33 | 
 34 | ## create the RangedSummarizedExperiment. the create_rse function works on 
 35 | ## one row a time 
 36 | 
 37 | rse_acc<- create_rse(tcga_info[1,])
 38 | 
 39 | rse_acc
 40 | ```
 41 | 
 42 | 
 43 | ```{r}
 44 | rse_acc@assays@data$raw_counts[1:5, 1:5]
 45 | 
 46 | rse_acc@rowRanges
 47 | ```
 48 | 
 49 | ### mapping ENSEMBL IDs to gene symbols
 50 | 
 51 | ```{r}
 52 | if (!requireNamespace("BiocManager", quietly = TRUE))
 53 |     install.packages("BiocManager")
 54 | BiocManager::install(c("AnnotationDbi", "org.Hs.eg.db"))
 55 | library(AnnotationDbi)
 56 | library(org.Hs.eg.db)
 57 | ```
 58 | 
 59 | 
 60 | ```{r}
 61 | 
 62 | mat<- rse_acc@assays@data$raw_counts
 63 | mat[1:5, 1:5]
 64 | ensembl_ids<- rownames(mat)
 65 | 
 66 | 
 67 | # Map ENSEMBL IDs to HGNC symbols using org.Hs.eg.db
 68 | # you get errors! (at this point the IDs still carry the version suffix removed further down)
 69 | gene_symbols <- AnnotationDbi::select(org.Hs.eg.db,
 70 |                        keys = ensembl_ids,
 71 |                        columns = "SYMBOL",       # The output you want (gene symbol)
 72 |                        keytype = "ENSEMBL")      # The input key type (ENSEMBL ID)
 73 | 
 74 | # inspect the IDs that were passed as keys
 75 | head(ensembl_ids)
 76 | 
 77 | # remove the version number
 78 | ensembl_ids<- ensembl_ids %>%
 79 |   stringr::str_replace("\\.[0-9]+$", "")
 80 | 
 81 | rownames(mat)<- ensembl_ids
 82 | mat[1:5, 1:5]
 83 | # select() returns every matching row (1:many); `multiVals` belongs to mapIds(), not select()
 84 | gene_symbols <- AnnotationDbi::select(org.Hs.eg.db,
 85 |                        keys = ensembl_ids,
 86 |                        columns = "SYMBOL",       # The output you want (gene symbol)
 87 |                        keytype = "ENSEMBL")      # The input key type (ENSEMBL ID)
 88 | 
 89 | 
 90 | head(gene_symbols)
 91 | ```
 92 | 
 93 | 
 94 | ```{r}
 95 | gene_symbols %>%
 96 |   janitor::get_dupes(SYMBOL) %>%
 97 |   head()
 98 | 
 99 | gene_symbols %>%
100 |   janitor::get_dupes(SYMBOL) %>%
101 |   filter(!is.na(SYMBOL))
102 | ```
103 | 
104 | ### remove the NA and remove the duplicates
105 | 
106 | ```{r}
107 | gene_symbols_uniq<- gene_symbols %>%
108 |   filter(!is.na(SYMBOL)) %>%
109 |   distinct(SYMBOL, .keep_all = TRUE) %>%
110 |   distinct(ENSEMBL, .keep_all = TRUE)
111 | 
112 | 
113 | dim(gene_symbols_uniq)
114 | 
115 | gene_symbols_uniq %>%
116 |   janitor::get_dupes(ENSEMBL)
117 | 
118 | gene_symbols_uniq %>%
119 |   janitor::get_dupes(SYMBOL)
120 | ```
121 | 
122 | 
123 | ### subset the original matrix
124 | 
125 | 
126 | ```{r}
127 | mat_subset<- mat[gene_symbols_uniq$ENSEMBL, ]
128 | 
129 | all.equal(rownames(mat_subset), gene_symbols_uniq$ENSEMBL)
130 | 
131 | rownames(mat_subset)<- gene_symbols_uniq$SYMBOL
132 | 
133 | mat_subset[1:5, 1:5]
134 | ```


--------------------------------------------------------------------------------
/scripts/15_how_to_get_metadata_GEO.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "15_how_to_get_metadata_from_GEO"
 3 | author: "Ming Tang"
 4 | date: '2023-12-07'
 5 | output: html_document
 6 | editor_options: 
 7 |   chunk_output_type: console
 8 | ---
 9 | 
10 | ```{r}
11 | # BiocManager::install("GEOquery")
12 | library(GEOquery)
13 | library(tidyverse)
14 | 
15 | meta<- getGEO(GEO="GSE185507",GSEMatrix=FALSE)
16 | ```
17 | 
18 | ```{r}
19 | meta@gsms$GSM5616943@header$characteristics_ch1
20 | ```
21 | 
22 | ```{r}
23 | purrr::map(meta@gsms, ~.x@header$characteristics_ch1) %>% # characteristics_ch1 strings of each GSM entry
24 |   stack() %>% # long table: `values` holds the strings, `ind` the GSM name
25 |   tidyr::separate(values, into = c("feature", "value"), sep = ": ", extra = "merge") %>% # split at the first ": " only, keep the rest in `value`
26 |         pivot_wider(names_from= feature, values_from = value) %>% # one column per feature
27 |         janitor::clean_names() %>%
28 |   write_csv("~/Downloads/GSE185507_meta.csv")
29 | ```
30 | 
31 | ## 5 tools to fetch GEO and other databases' metadata and data
32 | 
33 | 1. GEOfetch https://geofetch.databio.org/en/latest/
34 | 
35 | 2. bioconductor package GEOquery  https://bioconductor.org/packages/release/bioc/html/GEOquery.html
36 | 
37 | 3. [ffq](https://github.com/pachterlab/ffq) Fetch metadata information from databases. https://github.com/pachterlab/ffq
38 | 
39 | 4. [pysradb](https://github.com/saketkc/pysradb): a python package to query next-generation sequencing metadata and data from NCBI sequence read archive.
40 | 
41 | 5. [GEOparse](https://github.com/guma44/GEOparse)
42 | 


--------------------------------------------------------------------------------
/scripts/16_get_mouse_gene_exon_lengths.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "get gene length"
 3 | output: html_document
 4 | date: "2025-01-16"
 5 | editor_options: 
 6 |   chunk_output_type: console
 7 | ---
 8 | 
 9 | ```{r}
10 | #install.packages("BiocManager")
11 | #BiocManager::install("TxDb.Mmusculus.UCSC.mm9.knownGene")
12 | #BiocManager::install("org.Mm.eg.db")
13 | 
14 | library(TxDb.Mmusculus.UCSC.mm9.knownGene)
15 | library(org.Mm.eg.db)
16 | ```
17 | 
18 | Now, we can retrieve the gene data:
19 | ```{r}
20 | txdb <- TxDb.Mmusculus.UCSC.mm9.knownGene
21 | mm9_genes <- genes(txdb)
22 | mm9_genes
23 | ```
24 | 
25 | To map the Entrez ID to gene symbols, we use: 
26 | 
27 | ```{r}
28 | gene_symbol <- AnnotationDbi::select(org.Mm.eg.db, keys = mm9_genes$gene_id, 
29 |                                      columns = "SYMBOL", keytype = "ENTREZID")
30 | ```
31 | 
32 | Make sure the gene IDs match: 
33 | 
34 | ```{r}
35 | all.equal(mm9_genes$gene_id, gene_symbol$ENTREZID)
36 | 
37 | # Add gene symbols to the data: 
38 | mm9_genes$symbol <- gene_symbol$SYMBOL
39 | ```
40 | 
41 | ```{r}
42 | width(mm9_genes)
43 | 
44 | df <- data.frame(EntrezID = mm9_genes$gene_id, 
45 |                  Symbol = mm9_genes$symbol, 
46 |                  Gene_length = width(mm9_genes))
47 | 
48 | head(df)
49 | ```
50 | 
51 | Why is this useful? When normalizing H3K36me3 signals (found in gene bodies), you can use gene lengths for proper normalization. Also, for RNA-seq, exon lengths are essential for calculating metrics like RPKM (though TPM is preferred now).
52 | 
53 | ### exon lengths 
54 | 
55 | ```{r}
56 | exons<- exonsBy(txdb, by = "gene")
57 | exons
58 | ```
59 | 
60 | This returns a GRangesList object and each element of the list is a GRanges containing all the exons for that gene.
61 | 
62 | Let’s calculate the total exon lengths for each gene by the width function
63 | 
64 | ```{r}
65 | width(exons)
66 | 
67 | head(sum(width(exons)))
68 | ```
69 | 
70 | ```{r}
71 | # call enframe() directly: this document only attaches the TxDb and org.Mm.eg.db packages, so `%>%` is not available
72 | exon_len<- tibble::enframe(sum(width(exons)),
73 |                            name = "ENTREZID", value = "exon_length")
74 | head(exon_len)
75 | ```
76 | 
77 | 


--------------------------------------------------------------------------------
/scripts/R_tips_01_add_percentage_to_y_axis.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "add percentage to y axis"
  3 | author: "Ming Tang"
  4 | date: "5/28/2023"
  5 | output: html_document
  6 | editor_options: 
  7 |   chunk_output_type: console
  8 | ---
  9 | 
 10 | ### How to add percentages to the y-axis of a bar plot and how to order the bar plot by the value of each bar.
 11 | 
 12 | The transcripts and genes for a reference sequence UCSC annotation https://genome.ucsc.edu/
 13 | 
 14 | The TxDb family of packages and data objects manages information on transcripts and gene models. We consider those derived from annotation tables prepared for the UCSC genome browser.
 15 | 
 16 | ```{r}
 17 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
 18 | library(dplyr)
 19 | library(ggplot2)
 20 | 
 21 | txdb<- TxDb.Hsapiens.UCSC.hg19.knownGene # abbreviate
 22 | txdb
 23 | 
 24 | ```
 25 | 
 26 | ```{r}
 27 | genes(txdb)
 28 | 
 29 | #exons(txdb)
 30 | ```
 31 | 
 32 | always check the first several rows first 
 33 | 
 34 | ```{r}
 35 | genes(txdb) %>%
 36 |   as.data.frame() %>%
 37 |   head()
 38 | ```
 39 | 
 40 | Let's count how many genes are on each chromosome
 41 | 
 42 | ```{r}
 43 | genes_df<- genes(txdb) %>%
 44 |   as.data.frame()
 45 | 
 46 | genes_df %>%
 47 |   dplyr::count(seqnames)
 48 | ```
 49 | 
 50 | Let's remove the unconventional chromosomes first
 51 | 
 52 | ```{r}
 53 | conventional_chrs<- paste0("chr", c(1:22, "X", "Y"))
 54 | 
 55 | total_gene<- genes_df %>%
 56 |   filter(seqnames %in% conventional_chrs) %>%
 57 |   nrow()
 58 | 
 59 | 
 60 | genes_per_chr<- genes_df %>%
 61 |   filter(seqnames %in% conventional_chrs) %>%
 62 |   dplyr::count(seqnames)
 63 | 
 64 | genes_per_chr %>%
 65 |   mutate(genes_percent = n/total_gene)
 66 | ```
 67 | 
 68 | make the figure 
 69 | ```{r}
 70 | genes_per_chr %>%
 71 |   mutate(genes_percent = n/total_gene) %>%
 72 |   ggplot(aes(x= seqnames, y = genes_percent)) +
 73 |   geom_bar(stat = "identity")
 74 | ```
 75 | 
 76 | 
 77 | rotate the x-axis
 78 | 
 79 | ```{r}
 80 | genes_per_chr %>%
 81 |   mutate(genes_percent = n/total_gene) %>%
 82 |   ggplot(aes(x= seqnames, y = genes_percent)) +
 83 |   geom_bar( stat = "identity") +
 84 |   xlab("") + 
 85 |   theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
 86 | ```
 87 | 
 88 | use theme_bw
 89 | ```{r}
 90 | genes_per_chr %>%
 91 |   mutate(genes_percent = n/total_gene) %>%
 92 |   ggplot(aes(x= seqnames, y = genes_percent)) +
 93 |   geom_bar( stat = "identity") +
 94 |   xlab("") + 
 95 |   theme_bw(base_size = 14) +
 96 |   theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) 
 97 | ```
 98 | 
 99 | change y-axis to percentage
100 | 
101 | ```{r}
102 | genes_per_chr %>%
103 |   mutate(genes_percent = n/total_gene) %>%
104 |   ggplot(aes(x= seqnames, y = genes_percent)) +
105 |   geom_bar(stat = "identity") +
106 |   xlab("") + 
107 |   scale_y_continuous(labels = scales::percent) + 
108 |   theme_bw(base_size = 14) +
109 |   theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) 
110 | ```
111 | 
112 | Reorder the bar from small to big:
113 | 
114 | https://forcats.tidyverse.org/reference/fct_reorder.html
115 | 
116 | ```{r}
117 | genes_per_chr %>%
118 |   mutate(genes_percent = n/total_gene) %>%
119 |   ggplot(aes(x= seqnames %>% 
120 |                forcats::fct_reorder(genes_percent)
121 |                , y = genes_percent)) +
122 |   geom_bar(stat = "identity") +
123 |   xlab("") + 
124 |   scale_y_continuous(labels = scales::percent) + 
125 |   theme_bw(base_size = 14) +
126 |   theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) 
127 | ```
128 | 
129 | reverse the order 
130 | 
131 | ```{r}
132 | genes_per_chr %>%
133 |   mutate(genes_percent = n/total_gene) %>%
134 |   ggplot(aes(x= seqnames %>% 
135 |                forcats::fct_reorder(genes_percent, .desc = TRUE)
136 |                , y = genes_percent)) +
137 |   geom_bar(stat = "identity") +
138 |   xlab("") + 
139 |   scale_y_continuous(labels = scales::percent) + 
140 |   theme_bw(base_size = 14) +
141 |   theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) 
142 | ```
143 | 
144 | 


--------------------------------------------------------------------------------
/scripts/R_tips_02_add_side_to_scatterplot.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "R_tips_02_add_side_to_scatterplot"
  3 | author: "Ming Tang"
  4 | date: "6/2/2023"
  5 | output: html_document
  6 | editor_options: 
  7 |   chunk_output_type: console
  8 | ---
  9 | 
 10 | ### introduce ggside using single cell data 
 11 | 
 12 | The ggside R package provides a new way to visualize data by combining the flexibility of ggplot2 with the power of side-by-side plots.
 13 | 
 14 | We will use a single cell dataset to demonstrate its usage.
 15 | 
 16 | ggside allows users to create side-by-side plots of multiple variables, such as gene expression, cell type, and experimental conditions. This can be helpful for identifying patterns and trends in scRNA-seq data that would be difficult to see in individual plots. Additionally, ggside provides a number of features that make it easy to customize the appearance of side-by-side plots, such as changing the color scheme, adding labels, and adjusting the layout.
 17 | 
 18 | https://cran.r-project.org/web/packages/ggside/vignettes/ggside_basic_usage.html
 19 | 
 20 | ### Load libraries 
 21 | ```{r}
 22 | # install.packages("ggside")
 23 | library(ggside)
 24 | library(Seurat)
 25 | library(dplyr)
 26 | library(SeuratData)
 27 | ```
 28 | 
 29 | load data 
 30 | ```{r}
 31 | data("pbmc3k")
 32 | 
 33 | pbmc3k
 34 | ```
 35 | 
 36 | 
 37 | ### routine processing
 38 | 
 39 | ```{r}
 40 | pbmc3k<- pbmc3k %>% 
 41 |   NormalizeData(normalization.method = "LogNormalize", scale.factor = 10000) %>%
 42 |   FindVariableFeatures(selection.method = "vst", nfeatures = 2000) %>%
 43 |   ScaleData() %>%
 44 |   RunPCA(verbose = FALSE) %>%
 45 |   FindNeighbors(dims = 1:10, verbose = FALSE) %>%
 46 |   FindClusters(resolution = 0.5, verbose = FALSE) %>%
 47 |   RunUMAP(dims = 1:10, verbose = FALSE)
 48 | 
 49 | Idents(pbmc3k)<- pbmc3k$seurat_annotations
 50 | 
 51 | DimPlot(pbmc3k, label = TRUE, repel=TRUE) + NoLegend()
 52 | ```
 53 | 
 54 | 
 55 | some helper functions to extract the gene expression values from the seurat object 
 56 | 
 57 | ```{r}
 58 | # Reshape a genes x cells matrix into one row per cell (one column per gene)
 59 | # and attach the per-cell metadata from `obj@meta.data`, matched on cell name.
 60 | matrix_to_expression_df <- function(x, obj) {
 61 |   meta <- tibble::rownames_to_column(obj@meta.data, var = "cell")
 62 |   x %>%
 63 |     as.matrix() %>% as.data.frame() %>%
 64 |     tibble::rownames_to_column(var = "gene") %>%
 65 |     tidyr::pivot_longer(cols = -1, names_to = "cell", values_to = "expression") %>%
 66 |     tidyr::pivot_wider(names_from = "gene", values_from = expression) %>%
 67 |     left_join(meta, by = "cell") # explicit key, not every shared column name
 68 | }
 69 | 
 70 | 
 71 | # Extract expression values from one assay slot of `obj` and return them as
 72 | # a per-cell data frame (via matrix_to_expression_df). `genes` and `cells`
 73 | # pick rows and columns of the assay matrix; NULL keeps that whole dimension.
 74 | get_expression_data <- function(obj, assay = "RNA", slot = "data",
 75 |                                 genes = NULL, cells = NULL) {
 76 |   mat <- GetAssayData(obj, assay = assay, slot = slot)
 77 |   # Subset each dimension independently. This replaces a four-way branch
 78 |   # whose both-NULL test, is.null(genes & is.null(cells)), had a misplaced
 79 |   # parenthesis and failed once genes and cells were both supplied.
 80 |   if (!is.null(genes)) {
 81 |     mat <- mat[genes, , drop = FALSE]
 82 |   }
 83 |   if (!is.null(cells)) {
 84 |     mat <- mat[, cells, drop = FALSE]
 85 |   }
 86 |   matrix_to_expression_df(mat, obj = obj)
 87 | }
 88 | ```
 89 | 
 90 | 
 91 | test the function 
 92 | 
 93 | ```{r}
 94 | df<- get_expression_data(obj = pbmc3k, genes = c("CD14", "FCGR3A"))
 95 | 
 96 | head(df)
 97 | ```
 98 | 
 99 | 
100 | a plain scatter plot
101 | ```{r}
102 | df %>%
103 |   filter(seurat_annotations %in% c("CD14+ Mono", "FCGR3A+ Mono")) %>%
104 |   ggplot(aes(x= CD14, y = FCGR3A)) +
105 |   geom_point(aes(color = seurat_annotations))
106 | ```
107 | 
108 | 
109 | a scatter plot adding two boxplots 
110 | 
111 | ```{r}
112 | df %>%
113 |   filter(seurat_annotations %in% c("CD14+ Mono", "FCGR3A+ Mono")) %>%
114 |   ggplot(aes(x= CD14, y = FCGR3A)) +
115 |   geom_point(aes(color = seurat_annotations)) +
116 |   geom_xsideboxplot(aes(y = seurat_annotations, color = seurat_annotations), 
117 |                     orientation = "y") +
118 |   geom_ysideboxplot(aes(x = seurat_annotations, color = seurat_annotations), 
119 |                     orientation = "x")+
120 |   scale_xsidey_discrete() +
121 |   scale_ysidex_discrete()+
122 |   theme(ggside.panel.scale.x = 0.2,
123 |        ggside.panel.scale.y = 0.3)
124 | ```
125 | 
126 | a scatterplot adding one boxplot and one density plot
127 | 
128 | ```{r}
129 | df %>%
130 |   filter(seurat_annotations %in% c("CD14+ Mono", "FCGR3A+ Mono")) %>%
131 |   ggplot(aes(x= CD14, y = FCGR3A)) +
132 |   geom_point(aes(color = seurat_annotations)) +
133 |   geom_xsideboxplot(aes(y = seurat_annotations, color = seurat_annotations), 
134 |                     orientation = "y") +
135 |   geom_ysidedensity(aes(x = after_stat(density), color = seurat_annotations, fill = seurat_annotations), 
136 |                     position = "stack", alpha = 0.4) +
137 |   scale_xsidey_discrete() +
138 |   scale_ysidex_continuous(guide = guide_axis(angle = 90), minor_breaks = NULL) +
139 |   theme(ggside.panel.scale.x = 0.2,
140 |        ggside.panel.scale.y = 0.4)
141 | ```
142 | 
143 | ### alternative way: use patchwork 
144 | 
145 | https://patchwork.data-imaginist.com/
146 | 
147 | ```{r}
148 | library(patchwork)
149 | 
150 | p1<- df %>%
151 |   filter(seurat_annotations %in% c("CD14+ Mono", "FCGR3A+ Mono")) %>%
152 |   ggplot(aes(x= seurat_annotations, y = CD14)) +
153 |   geom_boxplot(aes(color = seurat_annotations)) + 
154 |   xlab("") +
155 |   theme(
156 |     axis.title.x = element_blank(),
157 |     axis.text.x = element_blank(),
158 |     axis.ticks.x = element_blank(),
159 |     #legend.position = "none", legend.text = element_blank()
160 |   )+
161 |   coord_flip()
162 | 
163 | p2<- df %>%
164 |   filter(seurat_annotations %in% c("CD14+ Mono", "FCGR3A+ Mono")) %>%
165 |   ggplot(aes(x= CD14, y = FCGR3A)) +
166 |   geom_point(aes(color = seurat_annotations)) +
167 |   theme(legend.position = "none", legend.text = element_blank()) 
168 | 
169 | p3<- df %>%
170 |   filter(seurat_annotations %in% c("CD14+ Mono", "FCGR3A+ Mono")) %>%
171 |   ggplot(aes(x= seurat_annotations, y = FCGR3A)) +
172 |   geom_boxplot(aes(color = seurat_annotations)) +
173 |   theme(legend.position = "none") +
174 |   ylab("") +
175 |   xlab("") +
176 |   theme(
177 |     axis.title.y = element_blank(),
178 |     axis.text.y = element_blank(),
179 |     axis.ticks.y = element_blank()
180 |   ) 
181 | 
182 | p1 + plot_spacer() + p2 + p3 +
183 |    plot_layout(widths = c(4, 2), heights = c(1, 5),
184 |                guides = 'collect')
185 | 
186 | ```
187 | 
188 | 
189 | 
190 | 


--------------------------------------------------------------------------------
/scripts/R_tips_03_extract_tables_from_PDF.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "extract tables"
 3 | author: "Ming Tang"
 4 | date: "6/9/2023"
 5 | output: html_document
 6 | ---
 7 | 
 8 | ### how to extract tables from PDF file using Tabulizer https://github.com/ropensci/tabulizer
 9 | 
10 | ```{r}
11 | #install.packages("tabulizer") not working, have to install it from github
12 | 
13 | if (!require("remotes")) {
14 |     install.packages("remotes")
15 | }
16 | 
17 | remotes::install_github(c("ropensci/tabulizerjars", "ropensci/tabulizer"))
18 | 
19 | library(tabulizer)
20 | 
21 | out <- extract_tables("/cloud/project/sciadv.abm1831.pdf", pages = 3, guess = TRUE, 
22 |                       output = "data.frame")
23 | ```
24 | 
25 | 
26 | ```{r}
27 | out[[1]]
28 | View(out[[1]])
29 | 
30 | ```
31 | 


--------------------------------------------------------------------------------
/scripts/R_tips_04_list_column_dataframe_in_dataframe.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "R_tips_04_list_column_dataframe_in_dataframe"
 3 | author: "Ming Tang"
 4 | date: "6/13/2023"
 5 | output: html_document
 6 | editor_options: 
 7 |   chunk_output_type: console
 8 | ---
 9 | 
10 | ```{r}
11 | library(tibble)
12 | library(dplyr)
13 | library(tidyr)
14 | library(purrr)
15 | library(ggplot2)
16 | ```
17 | 
18 | 
19 | make a list column by using tidyr::nest()
20 | ```{r}
21 | mtcars_list<- mtcars %>%
22 |   group_by(cyl) %>%
23 |   tidyr::nest()
24 | 
25 | mtcars_list$data[[1]]
26 | mtcars_list$data[[2]]
27 | 
28 | ```
29 | 
30 | list column can even contain ggplot2 object
31 | 
32 | ```{r}
33 | mtcars_list<- mtcars_list %>%
34 |   mutate(plots = purrr::map(data, ~ggplot(.x, aes(x= hp, y = mpg)) + geom_point()))
35 | 
36 | mtcars_list
37 | ```
38 | 
39 | save the plots
40 | 
41 | ```{r}
42 | walk2(mtcars_list$cyl, mtcars_list$plots, 
43 |       ~ ggsave(filename = paste0("~/Downloads/cyl", .x, "_plot.pdf"), 
44 |                plot = .y, width = 4, height = 4))
45 | 
46 | # side effect 
47 | map2(mtcars_list$cyl, mtcars_list$plots, 
48 |       ~ ggsave(filename = paste0("~/Downloads/cyl", .x, "_plot.pdf"), 
49 |                plot = .y, width = 4, height = 4))
50 | ```
51 | 
52 | 
53 | nest by two variables 
54 | 
55 | ```{r}
56 | mtcars_list2<- mtcars %>%
57 |   group_by(cyl, gear) %>%
58 |   nest()
59 | 
60 | mtcars_list2 %>% 
61 |   mutate(filename = paste0("cyl", cyl, "_gear", gear))
62 | ```
63 | 
64 | 


--------------------------------------------------------------------------------
/scripts/R_tips_05_read_all_files_in_a_folder.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "read in all files"
  3 | author: "Ming Tang"
  4 | date: "6/14/2023"
  5 | output: html_document
  6 | editor_options: 
  7 |   chunk_output_type: console
  8 | ---
  9 | 
 10 | ### combine all the counts into a single dataframe
 11 | 
 12 | https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE197320
 13 | 
 14 | ```{r}
 15 | library(tidyverse)
 16 | library(purrr)
 17 | 
 18 | file1<- read_tsv("~/Downloads/GSE197320_RAW/GSM5914555_SKOV3_ALDH_Veh_Rep1_Genes_ReadCount.txt.gz",
 19 |                  col_names = FALSE)
 20 | 
 21 | files<- list.files("~/Downloads/GSE197320_RAW", full.names = TRUE)
 22 | 
 23 | 
 24 | samples<- basename(files) %>%
 25 |   str_replace(".+_ALDH_(.+)_Genes_ReadCount.txt.gz", "\\1")
 26 | 
 27 | 
 28 | # Read one headerless count file and label its columns "gene" and `sample`.
 29 | read_count <- function(x, sample) {
 30 |   counts <- read_tsv(x, col_names = FALSE)
 31 |   stats::setNames(counts, c("gene", sample))
 32 | }
 33 | 
 34 | counts<- map2(files,samples,read_count)
 35 | 
 36 | counts_table<- purrr::reduce(counts, inner_join) 
 37 | 
 38 | head(counts_table)
 39 | ```
 40 | 
 41 | 
 42 | 
 43 | ```{r}
 44 | bind_cols(counts) %>%
 45 |   head()
 46 | 
 47 | ```
 48 | 
 49 | ### dummy files
 50 | 
 51 | 
 52 | ```{r}
 53 | library(tidyr)
 54 | library(dplyr)
 55 | library(readr)
 56 | 
 57 | list.files("~/playground", pattern = "sample[0-9].tsv")
 58 | 
 59 | files<- list.files("~/playground", pattern = "sample[0-9].tsv", full.name=TRUE)
 60 | 
 61 | files
 62 | 
 63 | # name each path by its file name minus extension (independent of path depth)
 64 | names(files)<- tools::file_path_sans_ext(basename(files))
 65 | 
 66 | files
 67 | 
 68 | dat1<- read_tsv(files[1])
 69 | dat2<- read_tsv(files[2])
 70 | dat3<- read_tsv(files[3])
 71 | dat4<-read_tsv(files[4])
 72 | 
 73 | ```
 74 | 
 75 | use a for loop
 76 | 
 77 | ```{r}
 78 | 
 79 | results<- data.frame()
 80 | 
 81 | for (file in files) {
 82 |   x<- read_tsv(file)
 83 |   # sample name = file name without directory or extension (portable across
 84 |   # machines, unlike stripping a hard-coded absolute path)
 85 |   x$sample<- tools::file_path_sans_ext(basename(file))
 86 |   results<- rbind(results, x)
 87 | }
 88 | 
 89 | 
 90 | 
 91 | results
 92 | 
 93 | results %>%
 94 |   tidyr::pivot_wider(names_from = sample, values_from = count)
 95 | ```
 96 | 
 97 | 
 98 | use lapply
 99 | 
100 | ```{r}
101 | counts<- lapply(files, read_tsv)
102 | 
103 | do.call(rbind, counts)
104 | 
105 | purrr::reduce(counts, rbind)
106 | 
107 | ## add a sample name column
108 | # Read one count file and tag every row with its sample name, i.e. the file
109 | # name without directory or extension, so it works for files in any folder.
110 | read_count <- function(file) {
111 |   x <- read_tsv(file)
112 |   x$sample <- tools::file_path_sans_ext(basename(file))
113 |   x
114 | }
115 | 
116 | # lappy from base R
117 | counts<- lapply(files, read_count)
118 | 
119 | #base R
120 | do.call(rbind, counts)
121 | 
122 | # purrr
123 | purrr::reduce(counts, rbind) %>%
124 |   tidyr::pivot_wider(names_from = sample, values_from = count)
125 | 
126 | ```
127 | 
128 | use purrr::map and bind_rows 
129 | ```{r}
130 | # map function from purrr
131 | counts<- purrr::map(files, read_tsv)
132 | 
133 | dplyr::bind_rows(counts, .id = "sample") %>%
134 |   tidyr::pivot_wider(names_from = sample, values_from = count)
135 | ```
136 | 
137 | ```{r}
138 | counts<- purrr::map_df(files, readr::read_tsv, .id = "sample")
139 | counts %>%
140 |   tidyr::pivot_wider(names_from = sample, values_from = count)
141 | ```
142 | 
143 | 


--------------------------------------------------------------------------------
/scripts/R_tips_06_avoid_overplotting_ggblend.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "avoid overplotting ggblend"
  3 | author: "Ming Tang"
  4 | date: "6/21/2023"
  5 | output: html_document
  6 | editor_options: 
  7 |   chunk_output_type: console
  8 | ---
  9 | 
 10 | ## How to avoid overplotting with ggblend
 11 | 
 12 | https://mjskay.github.io/ggblend/
 13 | 
 14 | We’ll construct a simple dataset with two semi-overlapping point clouds. We’ll have two versions of the dataset: one with all the "a" points listed first, and one with all the "b" points listed first.
 15 | 
 16 | ```{r}
 17 | library(ggplot2)
 18 | library(ggblend)
 19 | 
 20 | theme_set(ggdist::theme_ggdist() + theme(
 21 |   plot.title = element_text(size = rel(1), lineheight = 1.1, face = "bold"),
 22 |   plot.subtitle = element_text(face = "italic"),
 23 |   panel.border = element_rect(color = "gray75", fill = NA)
 24 | ))
 25 | 
 26 | ```
 27 | 
 28 | 
 29 | 
 30 | ```{r}
 31 | set.seed(1234)
 32 | df_a = data.frame(x = rnorm(500, 0), y = rnorm(500, 1), set = "a")
 33 | df_b = data.frame(x = rnorm(500, 1), y = rnorm(500, 2), set = "b")
 34 | 
 35 | df_ab = rbind(df_a, df_b) |>
 36 |   transform(order = "draw a then b")
 37 | 
 38 | df_ba = rbind(df_b, df_a) |>
 39 |   transform(order = "draw b then a")
 40 | 
 41 | df = rbind(df_ab, df_ba)
 42 | ```
 43 | 
 44 | 
 45 | ```{r}
 46 | df |>
 47 |   ggplot(aes(x, y, color = set)) +
 48 |   geom_point(size = 3, alpha = 0.5) +
 49 |   scale_color_brewer(palette = "Set1") +
 50 |   facet_grid(~ order) +
 51 |   labs(title = "geom_point() without blending", subtitle = "Draw order matters.")
 52 | ```
 53 | 
 54 | 
 55 | 
 56 | ```{r}
 57 | df |>
 58 |   ggplot(aes(x, y, color = set)) +
 59 |   geom_point(size = 3, alpha = 0.5) |> blend("multiply") +
 60 |   scale_color_brewer(palette = "Set1") +
 61 |   facet_grid(~ order) +
 62 |   labs(
 63 |     title = "geom_point(alpha = 0.5) |> blend('multiply')",
 64 |     subtitle = "Draw order does not matter, but color is too dark."
 65 |   )
 66 | ```
 67 | 
 68 | 
 69 | Rstudio graphic device does not support blend
 70 | ```{r}
 71 | df |>
 72 |   ggplot(aes(x, y, color = set, partition = set)) +
 73 |   geom_point(size = 3, alpha = 0.5) * (blend("lighten") + blend("multiply", alpha = 0.5)) +
 74 |   scale_color_brewer(palette = "Set1") +
 75 |   facet_grid(~ order) +
 76 |   labs(
 77 |     title = "geom_point(aes(partition = set)) * (blend('lighten') + blend('multiply', alpha = 0.5))",
 78 |     subtitle = "Two order-independent blends on one layer using the distributive law."
 79 |   ) +
 80 |   theme(plot.subtitle = element_text(lineheight = 1.2))
 81 | ```
 82 | 
 83 | 
 84 | ```{r}
 85 | cairo_pdf("~/github_repos/compbio_tutorials/results/blend.pdf", width = 6, height = 4)
 86 | 
 87 | df |>
 88 |   ggplot(aes(x, y, color = set, partition = set)) +
 89 |   geom_point(size = 3, alpha = 0.5) * (blend("lighten") + blend("multiply", alpha = 0.5)) +
 90 |   scale_color_brewer(palette = "Set1") +
 91 |   facet_grid(~ order) +
 92 |   labs(
 93 |     title = "geom_point(aes(partition = set)) * (blend('lighten') + blend('multiply', alpha = 0.5))",
 94 |     subtitle = "Two order-independent blends on one layer using the distributive law."
 95 |   ) +
 96 |   theme(plot.subtitle = element_text(lineheight = 1.2))
 97 | 
 98 | dev.off()
 99 | ```
100 | 
101 | 


--------------------------------------------------------------------------------
/scripts/R_tips_06_hierarchical_clustering.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "clustering in R"
  3 | author: "Ming Tang"
  4 | date: '2023-06-30'
  5 | output: html_document
  6 | editor_options: 
  7 |   chunk_output_type: console
  8 | ---
  9 | 
 10 | In hierarchical clustering, a dendrogram is a diagram that shows the hierarchical relationship between the clusters that are formed. It is a tree-like diagram that is commonly used to visualize the results of hierarchical clustering. A dendrogram can help you to understand how the different clusters are related to each other, and it can also be used to determine the optimal number of clusters for your data. To create a dendrogram, the distances between each pair of clusters are calculated and then represented graphically using a tree-like structure, with the clusters being represented by the branches and the distance between the clusters being represented by the length of the branches. By looking at a dendrogram, you can see which clusters are more closely related to each other and which are more distantly related.
 11 | 
 12 | ```{r dendrogram, echo=FALSE, fig.cap="what is a dendrogram", out.width = '60%'}
 13 | 
 14 | knitr::include_graphics(c("../images/dendrogram.png"))
 15 | ```
 16 | 
 17 | ## distance measures 
 18 | 
 19 | Before doing clustering, one has to define the distances between data points first. There are different
 20 | distance measures as shown in Figure \@ref(fig:distmeasure). Credit https://towardsdatascience.com/9-distance-measures-in-data-science-918109d069fa.
 21 | 
 22 | ```{r distmeasure, echo=FALSE, fig.cap="distance measures", out.width = '80%'}
 23 | 
 24 | knitr::include_graphics(c("../images/distance_measures.jpeg"))
 25 | ```
 26 | 
 27 | ```{r}
 28 | ?dist
 29 | ```
 30 | 
 31 | 
 32 | ## linkages 
 33 | 
 34 | After you define the distance measure, one has to define how the clusters are merged.
 35 | That's what linkage does. 
 36 | 
 37 | Linkage: A measure of dissimilarity between two sets of objects that determines how the two sets are merged.
 38 | 
 39 | * Single linkage: Minimum dissimilarity between points in two sets used to determine
 40 | which two sets should be merged.
 41 | 
 42 | * Complete linkage: Maximum dissimilarity between points in two sets used to determine
 43 | which two sets should be merged.
 44 | 
 45 | * Average Linkage: Average dissimilarity between points in two sets used to determine
 46 | which two sets should be merged.
 47 | 
 48 | * Ward’s Linkage. The idea has much in common with analysis of variance (ANOVA). The linkage function specifying the distance between two clusters is computed as the increase in the "error sum of squares" (ESS) after fusing two clusters into a single cluster. In other words, it minimizes the increase in the total within-cluster variance when merging two clusters. It is commonly used when the goal is to create balanced, compact clusters.
 49 | 
 50 | 
 51 | ```{r}
 52 | ?hclust
 53 | ```
 54 | 
 55 | 
 56 | Read more on clustering: https://www.nature.com/articles/nmeth.4299
 57 | 
 58 | 
 59 | Let's use the NCI60 microarray data
 60 | 
 61 | ```{r}
 62 | library(ISLR)
 63 | ncidat<- t(NCI60$data)
 64 | colnames(ncidat)<- NCI60$labs
 65 | 
 66 | dim(ncidat)
 67 | 
 68 | ncidat[1:10, 1:5]
 69 | 
 70 | unique(colnames(ncidat))
 71 | 
 72 | X<- t(scale(t(ncidat),center=TRUE,scale=FALSE))
 73 | ```
 74 | 
 75 | Let's use complete linkage:
 76 | 
 77 | * Often gives comparable cluster sizes.
 78 | * Less sensitive to outliers.
 79 | * Works better with spherical distributions.
 80 | 
 81 | ```{r}
 82 | #default euclidean distance
 83 | Dmat<- dist(t(X))
 84 | com.hclust<- hclust(Dmat,method="complete")
 85 | plot(com.hclust,cex=.7,main="Complete Linkage")
 86 | ```
 87 | 
 88 | Let's make the dendrogram look a little better by coloring the labels.
 89 | 
 90 | ```{r}
 91 | library(dplyr)
 92 | # https://cran.r-project.org/web/packages/dendextend/vignettes/dendextend.html
 93 | # better dendogram
 94 | library(dendextend)
 95 | 
 96 | #https://cran.r-project.org/web/packages/Polychrome/index.html
 97 | # better color
 98 | library(Polychrome)
 99 | 
100 | set.seed(12042022)
101 | mypal <- kelly.colors(15)
102 | swatch(mypal)
103 | 
104 | # remove the white color
105 | mypal<- mypal[-1]
106 | 
107 | dend<- com.hclust %>%
108 |   as.dendrogram()
109 | 
110 | dend_labels<- dend %>% labels()
111 | 
112 | dend %>% 
113 |   color_labels(col = mypal[as.numeric(as.factor(dend_labels))]) %>%
114 |   set("labels_cex", 0.7) %>%
115 |   plot()
116 | ```
117 | 
118 | Make a little function to plot dendrogram with branch label colored
119 | 
120 | ```{r}
121 | # Plot a dendrogram with leaf labels coloured by label value (colours come
122 | # from the global `mypal`); `...` is forwarded to plot().
123 | plot_dend <- function(dend, ...) {
124 |   leaf_names <- labels(dend)
125 |   leaf_cols <- mypal[as.numeric(as.factor(leaf_names))]
126 |   coloured <- set(color_labels(dend, col = leaf_cols), "labels_cex", 0.7)
127 |   plot(coloured, ...)
128 | }
129 | 
130 | ```
131 | 
132 | **Tip**: what is that `...` in the function? 
133 | 
134 | >Adding `...` to a function is a powerful technique because it allows you to accept any number of additional arguments. Unfortunately it comes with a big downside: any misspelled or extraneous arguments will be silently ignored. This package [ellipsis](https://ellipsis.r-lib.org/) provides tools for making ... safer
135 | 
136 | 
137 | **single linkage**:
138 | 
139 | * Can handle diverse shapes.
140 | * Very sensitive to outliers or noise.
141 | * Often results in unbalanced clusters.
142 | * Extended, trailing clusters in which observations fused one at a time -chaining.
143 | 
144 | ```{r}
145 | sing.hclust<- hclust(Dmat,method="single")
146 | plot_dend(as.dendrogram(sing.hclust), main = "single linkage")
147 | 
148 | ```
149 | 
150 | 
151 | **Average linkage**:
152 | 
153 | * A compromise between single and complete linkage.
154 | * Less sensitive to outliers.
155 | * Works better with spherical distributions.
156 | 
157 | ```{r}
158 | ave.hclust<- hclust(Dmat,method="average")
159 | plot_dend(as.dendrogram(ave.hclust), main = "average linkage")
160 | ```
161 | 
162 | 
163 | **Ward’s linkage**
164 | 
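165 | Similar in spirit to average linkage, but at each step Ward's linkage merges the two clusters whose fusion gives the smallest increase in the total within-cluster variance (error sum of squares). Note that in `hclust`, `method = "ward.D2"` implements Ward's criterion on ordinary (unsquared) Euclidean distances, whereas `"ward.D"` (used below) matches Ward's criterion only when the input distances are squared.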
166 | 
167 | ```{r}
168 | ward.hclust<-  hclust(Dmat,method="ward.D")
169 | plot_dend(as.dendrogram(ward.hclust), main = " Ward linkage")
170 | 
171 | # cut the tree with a height of 120
172 | abline(h=120)
173 | rect.hclust(ward.hclust,h=120)
174 | 
175 | cl<- cutree(ward.hclust, h= 120)
176 | table(type=colnames(X), clusters=cl)
177 | ```
178 | 
179 | **Complete linkage with different distances**
180 | 
181 | ```{r}
182 | Dmat<- dist(t(ncidat),method="manhattan") #L1 distance
183 | com.hclust<- hclust(Dmat,method="complete")
184 | plot_dend(as.dendrogram(com.hclust), main = " manhattan distance with Complete linage")
185 | ```
186 | 
187 | We can try all different combinations of distance matrix and different linkages. One can also use `1- cor(X)` as a distance measure! It is commonly used in the clustering of gene expression. Also, use either average linkage or Ward’s linkage.
188 | 
189 | **Ward’s linkage for 1- cor(X) distance**
190 | 
191 | `cor` calculates the correlation between the columns of a matrix, so there is no need to transpose the matrix when computing distances between samples (columns).
192 | 
193 | ```{r}
194 | Dmat<- as.dist(1-cor(ncidat))
195 | ward.hclust<- hclust(Dmat,method="ward.D")
196 | plot_dend(as.dendrogram(ward.hclust), main = "ward linkage-1-cor(X) distance")
197 | ```
198 | 
199 | 
200 | ## sort dendrogram 
201 | 
202 | The two branches from the same node can be flipped in the dendrogram and it does not 
203 | affect the meaning of the cluster structure in the data. However, one can sort the dendrogram
204 | so that it is easier to read visually.
205 | 
206 | Take a look at [dendsort](https://github.com/evanbiederstedt/dendsort)!
207 | 
208 | >The subtrees in the resulting dendrogram are sorted based on the average distance of subtrees at every merging point. The tighter cluster, in other words the cluster with smaller average distance, is placed on the left side of branch. When a leaf merge with a cluster, the leaf is placed on the right side.
209 | 
210 | 
211 | 
212 | ```{r}
213 | library(dendsort)
214 | 
215 | plot_dend(as.dendrogram(ward.hclust), main = "ward linkage-1-cor(X) distance")
216 | 
217 | plot_dend(as.dendrogram(ward.hclust) %>%
218 |             dendsort(), 
219 |           main = "ward linkage-1-cor(X) distance sorted dendogram")
220 | ```
221 | 
222 | 


--------------------------------------------------------------------------------
/scripts/R_tips_07_rownames.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "rownames for dataframe"
 3 | author: "Ming Tang"
 4 | date: '2024-01-04'
 5 | output: html_document
 6 | editor_options: 
 7 |   chunk_output_type: console
 8 | ---
 9 | 
10 | ```{r}
11 | expression<- data.frame(sample1 = c(1,2,3,4,5),
12 |                         sample2 = c(2,3,5,6,7),
13 |                         sample3 = c(10,12,8,9,14))
14 | 
15 | expression
16 | ```
17 | 
18 | add rownames
19 | 
20 | ```{r}
21 | rownames(expression)<- paste0("gene", 1:5)
22 | 
23 | expression
24 | ```
25 | 
26 | use rownames to subset
27 | 
28 | ```{r}
29 | rownames(expression)
30 | 
31 | expression[c("gene1", "gene2"), ]
32 | ```
33 | 
34 | duplicated genes
35 | 
36 | ```{r}
37 | genes<- c("gene1", "gene2", "gene2", "gene3", "gene3")
38 | 
39 | genes
40 | 
41 | rownames(expression)<- genes
42 | 
43 | make.names(genes, unique = TRUE)
44 | 
45 | rownames(expression)<- make.names(genes, unique = TRUE)
46 | 
47 | expression
48 | ```
49 | 


--------------------------------------------------------------------------------
/scripts/R_tips_08_convert_gene_ids.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "convert gene ids"
 3 | author: "Ming Tang"
 4 | date: '2023-07-13'
 5 | output: html_document
 6 | editor_options: 
 7 |   chunk_output_type: console
 8 | ---
 9 | 
10 | ### GUI tools
11 | 
12 | DAVID pathway https://david.ncifcrf.gov/conversion.jsp
13 | 
14 | BioDBnet https://biodbnet.abcc.ncifcrf.gov/db/db2db.php
15 | 
16 | 
17 | ### Convert gene id using Bioconductor package 
18 | ```{r}
19 | BiocManager::install("org.Hs.eg.db")
20 | library(org.Hs.eg.db)
21 | library(AnnotationDbi)
22 | 
23 | genes<- c("VEGFA", "CTCF", "HIF1A")
24 | 
25 | AnnotationDbi::select(org.Hs.eg.db, keys = genes,
26 |   columns = c('ENTREZID'), keytype = 'SYMBOL')
27 | 
28 | columns(org.Hs.eg.db)
29 | clusterProfiler::bitr(genes, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)
30 | 
31 | ```
32 | 
33 | 
34 | ### Mygene 
35 | 
36 | https://mygene.info/
37 | convert more IDs
38 | 
39 | ```{r}
40 | BiocManager::install("mygene")
41 | library(mygene)
42 | 
43 | queryMany(genes, scopes="symbol", fields=c("uniprot", "ensembl.gene", "reporter"), species="human")
44 | ```
45 | 
46 | 


--------------------------------------------------------------------------------
/scripts/R_tips_09_biomart_mouse_ortholog.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "mouse ortholog"
 3 | author: "Ming Tang"
 4 | date: '2024-01-11'
 5 | output: html_document
 6 | editor_options: 
 7 |   chunk_output_type: console
 8 | ---
 9 | 
10 | Get mouse orthologs for human
11 | 
12 | https://bioconductor.org/packages/release/bioc/html/biomaRt.html
13 | 
14 | ```{r}
15 | if (!require("BiocManager", quietly = TRUE))
16 |     install.packages("BiocManager")
17 | 
18 | BiocManager::install("biomaRt")
19 | 
20 | 
21 | library(dplyr)
22 | library(biomaRt)
23 | human<- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
24 | 
25 | attributes<-  c("ensembl_gene_id", "external_gene_name",
26 |                 "mmusculus_homolog_ensembl_gene", 
27 |                 "mmusculus_homolog_associated_gene_name",
28 |                 "mmusculus_homolog_orthology_type",
29 |                 "mmusculus_homolog_perc_id_r1")
30 | 
31 | listAttributes(human) %>%
32 |   head()
33 | 
34 | 
35 | listAttributes(human) %>% 
36 |   filter(stringr::str_detect(name, "mmusculus_homolog_"))
37 | 
38 | listAttributes(human)  %>% head()
39 | 
40 | orth.mouse<-  getBM(attributes, filters="with_mmusculus_homolog",
41 |                     values=TRUE, mart = human, uniqueRows=TRUE)
42 | 
43 | listFilters(human)%>% head()
44 | listFilters(human)%>% 
45 |   filter(stringr::str_detect(name, "mmusculus"))
46 | 
47 | head(orth.mouse)
48 | ```
49 | 
50 | 
51 | ```{r}
52 | orth.mouse %>%
53 |   dplyr::filter(external_gene_name == "VEGFA")
54 | 
55 | orth.mouse %>%
56 |   dplyr::filter(external_gene_name == "CTCF")
57 | 
58 | orth.mouse %>%
59 |   dplyr::filter(external_gene_name == "LILRB1")
60 | ```
61 | 


--------------------------------------------------------------------------------
/scripts/R_tips_10_ggplot2_percentage.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "R_tips_percentage_ggplot"
  3 | author: "Ming Tang"
  4 | date: '2024-01-15'
  5 | output: html_document
  6 | editor_options: 
  7 |   chunk_output_type: console
  8 | ---
  9 | 
 10 | How to label the y-axis with % for percentages?
 11 | 
 12 | ```{r}
 13 | library(Seurat)
 14 | library(SeuratData)
 15 | library(dplyr)
 16 | library(ggplot2)
 17 | 
 18 | data("pbmc3k")
 19 | 
 20 | pbmc3k
 21 | pbmc3k<- UpdateSeuratObject(pbmc3k)
 22 | 
 23 | pbmc3k
 24 | ```
 25 | 
 26 | 
 27 | ```{r}
 28 | table(pbmc3k$seurat_annotations)
 29 | ```
 30 | 
 31 | 
 32 | ```{r}
 33 | pbmc3k@meta.data %>%
 34 |   head()
 35 | ```
 36 | 
 37 | Let's calculate the percentage of each cell type
 38 | 
 39 | ```{r}
 40 | cell_number<- pbmc3k@meta.data %>%
 41 |   count(seurat_annotations)
 42 | 
 43 | cell_number
 44 | ```
 45 | 
 46 | calculate the percentage 
 47 | ```{r}
 48 | cell_number<- cell_number %>%
 49 |   mutate(percent = n/ sum(n) * 100) %>%
 50 |   rename(cell_type = seurat_annotations)
 51 | 
 52 | cell_number
 53 | ```
 54 | 
 55 | plotting 
 56 | 
 57 | ```{r}
 58 | ggplot(cell_number, aes(x= cell_type, y = percent)) +
 59 |   geom_bar(stat = "identity") +
 60 |   xlab("")
 61 | ```
 62 | 
 63 | add percentage
 64 | 
 65 | ```{r}
 66 | ggplot(cell_number, aes(x= cell_type, y = percent)) +
 67 |   geom_bar(stat = "identity") +
 68 |   scale_y_continuous(labels = scales::percent) +
 69 |   xlab("")
 70 | ```
 71 | 
 72 | fix it
 73 | 
 74 | ```{r}
 75 | ggplot(cell_number, aes(x= cell_type, y = percent)) +
 76 |   geom_bar(stat = "identity") +
 77 |   scale_y_continuous(labels = scales::percent_format(scale = 1)) +
 78 |   xlab("")
 79 | 
 80 | 
 81 | ```
 82 | 
 83 | show more digits 
 84 | 
 85 | ```{r}
 86 | ggplot(cell_number, aes(x= cell_type, y = percent)) +
 87 |   geom_bar(stat = "identity") +
 88 |   scale_y_continuous(labels = scales::percent_format(scale = 1, accuracy = 0.01)) +
 89 |   xlab("")
 90 | ```
 91 | 
 92 | rotate x-axis label
 93 | 
 94 | ```{r}
 95 | ggplot(cell_number, aes(x= cell_type, y = percent)) +
 96 |   geom_bar(stat = "identity") +
 97 |   scale_y_continuous(labels = scales::percent_format(scale = 1, accuracy = 0.1)) +
 98 |   xlab("") +
 99 |   theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
100 | ```
101 | 
102 | re-order the bar by percentages
103 | 
104 | ```{r}
105 | ggplot(cell_number, 
106 |        aes(x= cell_type %>% forcats::fct_reorder(percent), y = percent)) +
107 |   geom_bar(stat = "identity") +
108 |   scale_y_continuous(labels = scales::percent_format(scale = 1, accuracy = 0.1)) +
109 |   xlab("") +
110 |   theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
111 | ```
112 | 
113 | descending 
114 | 
115 | ```{r}
116 | ggplot(cell_number, 
117 |        aes(x= cell_type %>% forcats::fct_reorder(percent,.desc=TRUE), y = percent)) +
118 |   geom_bar(stat = "identity") +
119 |   scale_y_continuous(labels = scales::percent_format(scale = 1, accuracy = 0.1)) +
120 |   xlab("") +
121 |   theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
122 | ```
123 | 
124 | 


--------------------------------------------------------------------------------
/scripts/R_tips_11_read_all_tabs_spreadsheet.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "read all tabs from spreadsheet"
 3 | author: "Ming Tang"
 4 | date: '2024-04-25'
 5 | output: html_document
 6 | editor_options: 
 7 |   chunk_output_type: console
 8 | ---
 9 | 
10 | ```{r}
11 | library(readxl)
12 | library(here)
13 | library(tidyverse)
14 | 
15 | # read in one sheet
16 | read_excel(here("data/gene_counts.xlsx"), sheet = "sample1")
17 | 
18 | ```
19 | 
20 | 
21 | ```{r}
22 | 
23 | read_excel(here("data/gene_counts.xlsx"), sheet = "sample1")
24 | read_excel(here("data/gene_counts.xlsx"), sheet = "sample2")
25 | read_excel(here("data/gene_counts.xlsx"), sheet = "sample3")
26 | read_excel(here("data/gene_counts.xlsx"), sheet = "sample4")
27 | 
28 | path<- here("data/gene_counts.xlsx")
29 | 
30 | path %>% 
31 |   excel_sheets() %>% 
32 |   set_names() %>% 
33 |   map(read_excel, path = path)
34 | 
35 | 
36 | path %>% 
37 |   excel_sheets() %>% 
38 |   map(read_excel, path = path)
39 | ```
40 | 
41 | 
42 | merge into a single dataframe
43 | ```{r}
44 | path %>% 
45 |   excel_sheets() %>% 
46 |   set_names() %>% 
47 |   map(read_excel, path = path) %>%
48 |   bind_rows(.id="sample")
49 | ```
50 | 
51 | 
52 | reshape it into a wide-format count table
53 | 
54 | ```{r}
55 | path %>% 
56 |   excel_sheets() %>% 
57 |   set_names() %>% 
58 |   map(read_excel, path = path) %>%
59 |   bind_rows(.id="sample") %>%
60 |   tidyr::pivot_wider(names_from = "sample", values_from = "count") 
61 | ```
62 | 
63 | 


--------------------------------------------------------------------------------
/scripts/R_tips_12_save_vs_saveRDS.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "save vs saveRDS"
 3 | output: html_document
 4 | date: "2024-08-29"
 5 | editor_options: 
 6 |   chunk_output_type: console
 7 | ---
 8 | 
 9 | ### save() vs saveRDS()
10 | 
11 | ```{r}
12 | # Load necessary libraries
13 | library(tidymodels)
14 | library(broom)
15 | library(ggplot2)
16 | library(dplyr)
17 | 
18 | head(mtcars)
19 | 
20 | ggplot(mtcars, aes(x= mpg, y = wt)) +
21 |   geom_point()
22 | 
23 | 
24 | # Define the linear regression model specification
25 | lm_spec <- linear_reg() %>%
26 |   set_engine("lm")
27 | 
28 | # Fit the model
29 | lm_fit <- lm_spec %>%
30 |   fit(mpg ~ wt, data = mtcars)
31 | 
32 | # Extract the coefficient and p-value
33 | model_summary <- tidy(lm_fit)
34 | 
35 | 
36 | saveRDS(lm_fit, file = "data/lm_fit.rds")
37 | 
38 | save(lm_fit, model_summary, file = "data/lm_fit.rda")
39 | ```
40 | 
41 | 
42 | ### read the data back
43 | 
44 | ```{r}
45 | my_saved_fit<- readRDS("data/lm_fit.rds")
46 | 
47 | load("data/lm_fit.rda")
48 | ```


--------------------------------------------------------------------------------
/scripts/R_tips_13_copy_paste_vector_datapasta.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "datapasta demo"
 3 | output: html_document
 4 | date: "2024-09-04"
 5 | editor_options: 
 6 |   chunk_output_type: console
 7 | ---
 8 | 
 9 | introduction to datapasta https://github.com/MilesMcBain/datapasta
10 | 
11 | ```{r}
12 | install.packages(
13 |    "datapasta", 
14 |    repos = c(mm = "https://milesmcbain.r-universe.dev", getOption("repos")))
15 | ```
16 | 
17 | ### copy and paste vector
18 | 
19 | ```{r}
20 | my_genes<- c("VEGFA", "CTCF", "TP53", "FOXP3", "CD3D", "CD8A", "CD4")
21 | 
22 | c("VEGFA",
23 |   "CTCF",
24 |   "TP53",
25 |   "FOXP3",
26 |   "CD3D",
27 |   "CD8A",
28 |   "CD4")
29 | 
30 | 
31 | ```
32 | 
33 | ### copy and paste tibble
34 | 
35 | ```{r}
36 | my_dataframe<- tibble::tribble(
37 |     ~gene, ~expression,
38 |   "VEGFA",          3L,
39 |    "CTCF",          4L,
40 |    "TP53",         10L,
41 |   "FOXP3",          1L,
42 |    "CD3D",         14L,
43 |    "CD8A",         15L,
44 |     "CD4",          0L
45 |   )
46 | 
47 | 
48 | ```
49 | 


--------------------------------------------------------------------------------
/scripts/R_tips_14_janitor_clean_column_names.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "janitor clean column name"
 3 | output: html_document
 4 | date: "2024-09-04"
 5 | editor_options: 
 6 |   chunk_output_type: console
 7 | ---
 8 | 
 9 | ### Janitor R package 
10 | 
11 | showcase how to use https://github.com/sfirke/janitor
12 | 
13 | ```{r}
14 | install.packages("janitor")
15 | 
16 | library(readxl)
17 | library(janitor)
18 | library(dplyr)
19 | library(here)
20 | ```
21 | 
22 | ```{r}
23 | 
24 | roster_raw <- read_excel(here("data/dirty_data.xlsx")) # I included the copy in the repo
25 | 
26 | head(roster_raw)
27 | ```
28 | 
29 | ```{r}
30 | roster_raw <- roster_raw %>%
31 |   row_to_names(row_number = 1) %>%
32 |   clean_names()
33 | 
34 | # or 
35 | read_excel(here("data/dirty_data.xlsx"), skip=1)
36 | 
37 | head(roster_raw)
38 | 
39 | View(roster_raw)
40 | ```
41 | 
42 | clean it further 
43 | ```{r}
44 | # Build the cleaned roster: drop empty rows/columns and constant columns,
45 | # parse the hire dates, and merge the two certification columns.
46 | roster <- roster_raw %>%
47 |   remove_empty(c("rows", "cols")) %>%
48 |   remove_constant(na.rm = TRUE, quiet = FALSE) %>% # remove the column of all "Yes" values
49 |   mutate(
50 |     # handle the mixed-format dates
51 |     hire_date = convert_to_date(hire_date, character_fun = lubridate::mdy),
52 |     cert = dplyr::coalesce(certification, certification_2),
53 |     cert2 = dplyr::coalesce(certification_2, certification)
54 |   ) %>%
55 |   # the pipeline must end in select(); a trailing View() would store NULL in roster
56 |   select(-certification, -certification_2)
57 | ```
58 | 
59 | ### better table using  tabyl()
60 | 
61 | ```{r}
62 | table(roster$subject)
63 | 
64 | roster %>%
65 |   tabyl(subject)
66 | 
67 | roster %>%
68 |   tabyl(employee_status, full_time)
69 | 
70 | roster %>%
71 |   tabyl(full_time, subject, employee_status, show_missing_levels = FALSE)
72 | 
73 | roster %>%
74 |   tabyl(subject, employee_status, full_time, show_missing_levels = FALSE)
75 | ```


--------------------------------------------------------------------------------
/scripts/R_tips_15_calculate_cpm.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "calculate cpm"
 3 | output: html_document
 4 | date: "2024-10-03"
 5 | editor_options: 
 6 |   chunk_output_type: console
 7 | ---
 8 | # how to calculate counts per million (CPM) from scratch for RNAseq or ChIP-seq count table 
 9 | 
10 | 
11 | ### use sweep 
12 | 
13 | ```{r}
14 | mat <- matrix(1:9, nrow=3, byrow=TRUE)  
15 | 
16 | matrix(1:9, nrow=3)
17 | # Divide each column by its sum
18 | normalized_mat1 <- sweep(mat, 2, colSums(mat)/10^6, FUN="/")
19 | 
20 | 
21 | normalized_mat1
22 | 
23 | ```
24 | 
25 | ### use apply
26 | 
27 | ```{r}
28 | normalized_mat2<- apply(mat, 2, function(x) x/sum(x) * 10^6)
29 | normalized_mat2
30 | 
31 | all.equal(normalized_mat1, normalized_mat2)
32 | ```
33 | 
34 | ### use vectorization 
35 | 
36 | ```{r}
37 | mat/colSums(mat) # not CPM: the vector is recycled down each column, so row i is divided by colSums(mat)[i]
38 | # transpose first so the recycling lines up with the original columns, then transpose back
39 | normalized_mat3<- t(t(mat)/colSums(mat) * 10^6)
40 | normalized_mat3
41 | ```
42 | 
43 | ### use a package
44 | ```{r}
45 | library(edgeR)
46 | cpm(mat)
47 | ```
48 | 
49 | 
50 | 


--------------------------------------------------------------------------------
/scripts/R_tips_16_liftover_bedpe.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "lift over bedpe"
  3 | output: html_document
  4 | date: "2024-11-22"
  5 | editor_options: 
  6 |   chunk_output_type: console
  7 | ---
  8 | 
  9 | ### liftover hg19 bedpe to hg38
 10 | 
 11 | download the bedpe file here https://gitlab.com/tangming2005/Enhancer_promoter_interaction_data/-/blob/master/bedpe/ENCODE_EP.bedpe
 12 | 
 13 | 
 14 | ```{r}
 15 | library(rtracklayer)
 16 | library(AnnotationHub)
 17 | library(here) # here() builds the data/ paths used for import() and export()
 18 | ahub <- AnnotationHub()
 19 | ahub.chain <- subset(ahub, rdataclass == "ChainFile" & species == "Homo sapiens")
 20 | query(ahub.chain, c("hg19", "hg38"))
 21 | 
 22 | chain <- ahub.chain[ahub.chain$title == "hg19ToHg38.over.chain.gz"]
 23 | chain <- chain[[1]]
 24 | 
 25 | ?liftOver
 26 | 
 27 | bedpe<- import(here("data/ENCODE_EP_clean.bedpe"))
 28 | 
 29 | bedpe@first
 30 | 
 31 | bedpe@second
 32 | 
 33 | first_liftover<- liftOver(bedpe@first, chain = chain)
 34 | second_liftover<- liftOver(bedpe@second, chain = chain)
 35 | ```
 36 | 
 37 | Some hg19 regions map to multiple regions in hg38,
 38 | and some hg19 regions do not map to hg38 at all.
 39 | 
 40 | ```{r}
 41 | indx<- elementNROWS(first_liftover) >1
 42 | 
 43 | first_liftover[indx]
 44 | 
 45 | table(elementNROWS(first_liftover) ==0)
 46 | table(elementNROWS(second_liftover) ==0)
 47 | 
 48 | 
 49 | length(first_liftover)
 50 | length(second_liftover)
 51 | 
 52 | invalid_idx_first<- which(elementNROWS(first_liftover) ==0)
 53 | invalid_idx_second<- which(elementNROWS(second_liftover) ==0)
 54 | # seq_along() stays empty for an empty list, whereas 1:length(x) would give c(1, 0)
 55 | valid_idx<- setdiff(seq_along(first_liftover), c(invalid_idx_first, invalid_idx_second))
 56 | 
 57 | #remove the invalid pairs
 58 | first_liftover<- first_liftover[valid_idx]
 59 | second_liftover<- second_liftover[valid_idx]
 60 | 
 61 | 
 62 | ## looping over the full GRangesList object is very slow
 63 | ## only pick the problematic ones
 64 | 
 65 | indx_1<- which(elementNROWS(first_liftover) >1)
 66 | indx_2<- which(elementNROWS(second_liftover) >1)
 67 | 
 68 | first_liftover[indx_1]
 69 | 
 70 | # Keep the widest range when a region lifts to several; otherwise return gr
 71 | # unchanged (the original fell through and returned NULL for length <= 1).
 72 | select_valid_regions<- function(gr){
 73 |   if (length(gr) > 1) gr[which.max(width(gr))] else gr
 74 | }
 75 | 
 76 | lapply(first_liftover[indx_1], select_valid_regions)
 77 | lapply(second_liftover[indx_2], select_valid_regions)
 78 | 
 79 | # check endoapply and mendoapply
 80 | # unlist GRangeList object to GRanges object, but lapply will not keep the same GRangesList object
 81 | # it will just return a list of GRanges. use endoapply
 82 | # Collapse multi-mapped regions in place so element i of both lists still
 83 | # describes the same hg19 pair; c(x[indx], x[-indx]) reorders each side
 84 | # differently and x[-integer(0)] drops everything when nothing multi-maps.
 85 | first_liftover[indx_1]<- endoapply(first_liftover[indx_1],
 86 |                                    select_valid_regions)
 87 | second_liftover[indx_2]<- endoapply(second_liftover[indx_2],
 88 |                                     select_valid_regions)
 89 | first_liftover_valid<- unlist(first_liftover)
 90 | second_liftover_valid<- unlist(second_liftover)
 91 | 
 92 | first_liftover_valid
 93 | 
 94 | ```
 95 | 
 96 | write it to file
 97 | ```{r}
 98 | hg38_bedpe<- Pairs(first_liftover_valid, second_liftover_valid)
 99 | rtracklayer::export(hg38_bedpe, here("data/ENCODE_EP_hg38.bedpe"), format="bedpe")
100 | ```
101 | 
102 | 


--------------------------------------------------------------------------------
/scripts/R_tips_17_upset_plot_for_gene_sets.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "upset plot"
 3 | output: html_document
 4 | date: "2024-12-12"
 5 | editor_options: 
 6 |   chunk_output_type: console
 7 | ---
 8 | 
 9 | ```{r}
10 | # install.packages("msigdbr")
11 | library(msigdbr)
12 | library(dplyr)
13 | 
14 | h_gene_sets<-  msigdbr(species = "human", category = "H")
15 | 
16 | h_gene_sets
17 | 
18 | head(h_gene_sets)
19 | 
20 | table(h_gene_sets$gs_name)
21 | ```
22 | 
23 | 
24 | ```{r}
25 | gs_nest<- h_gene_sets %>%
26 |   group_by(gs_name) %>%
27 |   tidyr::nest()
28 | ```
29 | 
30 | 
31 | ```{r}
32 | gs_nest$data[[1]]
33 | 
34 | gene_list<- purrr::map(gs_nest$data, ~pull(.x, gene_symbol))
35 | 
36 | names(gene_list)<- gs_nest$gs_name %>% stringr::str_replace("HALLMARK_", "")
37 | names(gene_list)
38 | # pick the sets by name so the selection does not depend on their position in gene_list
39 | gene_sub<- gene_list[c("E2F_TARGETS", "EPITHELIAL_MESENCHYMAL_TRANSITION",
40 |                        "ESTROGEN_RESPONSE_EARLY", "ESTROGEN_RESPONSE_LATE",
41 |                        "G2M_CHECKPOINT")]
42 | names(gene_sub)
43 | ```
44 | 
45 | 
46 | 
47 | ```{r}
48 | library(ComplexHeatmap)
49 | m<-  make_comb_mat(gene_sub)
50 | m
51 | 
52 | UpSet(m)
53 | 
54 | UpSet(m, comb_order = order(-comb_size(m)))
55 | ```
56 | 
57 | 
58 | ```{r}
59 | UpSet(m, left_annotation = upset_left_annotation(m), 
60 |       comb_order = order(comb_size(m)))
61 | ```
62 | 
63 | add numbers on the bar 
64 | ```{r}
65 | UpSet(m, left_annotation = upset_left_annotation(m), 
66 |       top_annotation = upset_top_annotation(m, add_numbers = TRUE),
67 |       comb_order = order(comb_size(m)))
68 | ```
69 | 
70 | change set order 
71 | ```{r}
72 | UpSet(m, 
73 |       left_annotation = upset_left_annotation(m), 
74 |       top_annotation = upset_top_annotation(m, add_numbers = TRUE),
75 |       set_order = c("E2F_TARGETS","G2M_CHECKPOINT","ESTROGEN_RESPONSE_EARLY",
76 |                     "ESTROGEN_RESPONSE_LATE", "EPITHELIAL_MESENCHYMAL_TRANSITION"),
77 |       comb_order = order(comb_size(m)))
78 | ```
79 | 
80 | ```{r}
81 | UpSet(m, 
82 |       left_annotation = upset_left_annotation(m), 
83 |       top_annotation = upset_top_annotation(m, add_numbers = TRUE),
84 |       set_order = c("E2F_TARGETS","G2M_CHECKPOINT","ESTROGEN_RESPONSE_EARLY",
85 |                     "ESTROGEN_RESPONSE_LATE", "EPITHELIAL_MESENCHYMAL_TRANSITION"),
86 |       comb_order = order(-comb_size(m)))
87 | ```
88 | 
89 | Change the rowname size 
90 | 
91 | ```{r}
92 | UpSet(m, left_annotation = upset_left_annotation(m), 
93 |       top_annotation = upset_top_annotation(m, add_numbers = TRUE),
94 |       comb_order = order(comb_size(m)),
95 |       row_names_gp = gpar(fontsize = 8))
96 | ```
97 | 


--------------------------------------------------------------------------------
/scripts/R_tips_18_tile_a_bed_file.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "tile a bed file"
 3 | output: html_document
 4 | date: "2024-12-31"
 5 | editor_options: 
 6 |   chunk_output_type: console
 7 | ---
 8 | 
 9 | ```{r}
10 | library(GenomicRanges)
11 | library(rtracklayer)
12 | library(dplyr)
13 | library(readr)
14 | library(here)
15 | 
16 | ```
17 | 
18 | ```{r}
19 | gr2 <- GRanges(
20 |         seqnames=Rle(c("chr1", "chr2", "chr1", "chr3"), c(1, 3, 2, 4)),
21 |         ranges=IRanges(1:10, end=11),
22 |         strand=Rle(strand(c("-", "+", "*", "+", "-")), c(1, 2, 2, 3, 2)),
23 |         seqlengths=c(chr1=11, chr2=12, chr3=13))
24 | 
25 | gr2 
26 | # split every range to 2 base pair bins
27 | tiles <- tile(gr2, width = 2L)
28 | tiles
29 | ```
30 | 
31 | 
32 | We want to tile it to 5 base pair bins
33 | ```{r}
34 | gr<- rtracklayer::import(here("data/test.bed"))
35 | gr
36 | 
37 | bin_size<- 5
38 | 
39 | gr_width<- width(gr)
40 | 
41 | gr_width
42 | 
43 | bin_num<- ceiling(gr_width/bin_size)
44 | 
45 | ## after extending, the peaks are overlapping
46 | gr_center<- resize(gr, fix = "center", width = bin_num * bin_size)
47 | 
48 | ## this works fine
49 | unlist(tile(gr_center, width = bin_size))
50 | 
51 | ## merge before tile
52 | gr_center_merge<- reduce(gr_center)
53 | 
54 | gr_center_merge
55 | 
56 | out_bed<- unlist(tile(gr_center_merge, width = bin_size))
57 | 
58 | out_bed
59 | ```
60 | 
61 | check `tileGenome`. It is very helpful when you need to bin the genome into
62 | bins and calculate the number of reads in each bin.  For example, for scATACseq,
63 | one can calculate the number of reads per bin in the whole genome and use that matrix
64 | to cluster the cells.
65 | 
66 | ```{r}
67 | 
68 | library(TxDb.Hsapiens.UCSC.hg38.knownGene)
69 | txdb<- TxDb.Hsapiens.UCSC.hg38.knownGene
70 | 
71 | bins <- tileGenome(seqinfo(txdb), tilewidth=10000,
72 |                    cut.last.tile.in.chrom=TRUE)
73 | ```
74 | 
75 | 


--------------------------------------------------------------------------------
/scripts/R_tips_19_kmeans_clustering.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "k-means"
  3 | output: html_document
  4 | date: "2024-12-31"
  5 | editor_options: 
  6 |   chunk_output_type: console
  7 | ---
  8 | 
  9 | Let's use k-means clustering 
 10 | 
 11 | ```{r}
 12 | # install the package if you do not have it.
 13 | # install.packages("ISLR")
 14 | library(ISLR)
 15 | 
 16 | ncidat<- t(NCI60$data)
 17 | colnames(ncidat)<- NCI60$labs
 18 | 
 19 | dim(ncidat)
 20 | 
 21 | ncidat[1:5, 1:50]
 22 | ```
 23 | 
 24 | 
 25 | ```{r}
 26 | unique(colnames(ncidat))
 27 | ```
 28 | 
 29 | ### PCA analysis
 30 | 
 31 | ```{r}
 32 | 
 33 | X<- t(scale(t(ncidat),center=TRUE,scale= TRUE))
 34 | 
 35 | # we transpose X again for svd, or use prcomp 
 36 | sv = svd(t(X))
 37 | U = sv$u
 38 | V = sv$v
 39 | D = sv$d
 40 | ```
 41 | 
 42 | Further reading https://divingintogeneticsandgenomics.com/post/pca-in-action/
 43 | and https://divingintogeneticsandgenomics.com/post/pca-projection/
 44 | 
 45 | ```{r}
 46 | Z = t(X)%*%V
 47 | 
 48 | pc_dat<- data.frame(type = rownames(Z), PC1 = Z[,1], PC2= Z[,2])
 49 | 
 50 | library(ggplot2)
 51 | 
 52 | ggplot(pc_dat,aes(x=PC1, y=PC2, col=type)) + 
 53 |   geom_point() +
 54 |   theme_classic(base_size = 14)
 55 | ```
 56 | 
 57 | ### K-means on the raw data 
 58 | 
 59 | `kmeans()` clusters the rows of its input, so we pass `t(X)` to cluster the samples.
 60 | 
 61 | ```{r}
 62 | library(ComplexHeatmap)
 63 | # attach a package that exports %>%; the heatmap chunks below pipe with it
 64 | library(dplyr)
 65 | K<- 9  # number of clusters
 66 | 
 67 | km<- kmeans(t(X), centers = K)  # no seed yet, so cluster sizes vary between runs
 68 | 
 69 | table(km$cluster)
 70 | ```
 71 | 
 72 | ### set.seed() to make it reproducible 
 73 | 
 74 | ```{r}
 75 | set.seed(123)
 76 | 
 77 | km<- kmeans(t(X), centers = K)
 78 | table(km$cluster)
 79 | ```
 80 | 
 81 | In your original matrix:
 82 | 
 83 | Rows (genes): 6830 — these are the features or "attributes" of your samples.
 84 | Columns (samples): 64 — these are what you want to group (cluster).
 85 | 
 86 | When you run K-means clustering on the samples `t(X)`, the algorithm groups the columns of the original matrix (the samples) based on their similarity across the 6830 genes.
 87 | 
 88 | ### Dimensions of Outputs When Clustering Samples
 89 | 
 90 | Cluster Assignments:
 91 | For each of the 64 samples, K-means will assign it to one of the 9 groups (clusters). This is a vector of length 64.
 92 | 
 93 | Dimension: 64 (one number per sample).
 94 | 
 95 | ```{r}
 96 | km$cluster 
 97 | 
 98 | length(km$cluster)
 99 | ```
100 | 
101 | 
102 | visualize it with a heatmap 
103 | ```{r}
104 | km$cluster %>% 
105 |   tibble::enframe() %>%
106 |   janitor::tabyl(name, value) %>%
107 |   tibble::column_to_rownames(var="name") %>%
108 |   as.matrix() %>%
109 |   Heatmap(cluster_columns = FALSE)
110 | ```
111 | 
112 | Centers Matrix:
113 | 
114 | The centers matrix now represents the "average sample" for each cluster. Each center is calculated based on the genes (rows). Since you have 9 clusters and each cluster center is described by the 6830 genes, the centers matrix will have:
115 | 
116 | 9 rows (clusters) × 6830 columns (genes).
117 | 
118 | ```{r}
119 | cens<- km$centers
120 | 
121 | dim(cens)
122 | ```
123 | 
124 | 
125 | ###  how do we visualize K-means results?
126 | 
127 | overlay K-means result on the PCA plot.
128 | 
129 | ```{r}
130 | par(mfrow=c(1,1))
131 | 
132 | plot(Z[,1],Z[,2],col=km$cluster,type="n")
133 | 
134 | text(Z[,1],Z[,2],colnames(ncidat),cex=.75,col=km$cluster)
135 | 
136 | 
137 | points(cens%*%V[,1],cens%*%V[,2],col=1:K,pch=16,cex=3)
138 | ```
139 | 
140 | ### K-means on the PCA space
141 | 
142 | ```{r}
143 | Z
144 | 
145 | km2<- kmeans(Z, centers = K)
146 | 
147 | km2$cluster %>% 
148 |   tibble::enframe() %>%
149 |   janitor::tabyl(name, value) %>%
150 |   tibble::column_to_rownames(var="name") %>%
151 |   as.matrix() %>%
152 |   Heatmap(cluster_columns = FALSE)
153 | ```
154 | 
155 | 
156 | K-means on the PCA-transformed Z matrix is generally preferred for clustering when working with high-dimensional data like gene expression. It focuses on the meaningful variation while avoiding noise and redundancy.
157 | 
158 | 
159 | - Reduces noise and redundancy: PCA captures the most important variation in the data, filtering out noise or low-variance genes.
160 | - Better distance metrics: By focusing on a few top principal components, clustering is based on meaningful differences rather than noise.
161 | 
162 | - Efficient computation: Working in a smaller-dimensional space speeds up K-means, especially with large datasets.
163 | 
164 | 
165 | K-means on the original matrix can be useful if you suspect that low-variance genes or subtle patterns might be biologically relevant and want to preserve them.
166 | 
167 | 
168 | | **Aspect**                 | **Original Matrix**                | **PCA (Z Matrix)**                        |
169 | |----------------------------|------------------------------------|------------------------------------------|
170 | | **Dimensionality**          | High (6830 genes)                | Low (e.g., 10-50 PCs, depending on variance) |
171 | | **Noise Sensitivity**       | High                              | Low                                      |
172 | | **Focus**                   | Includes all variance             | Focuses on major variance                |
173 | | **Computational Cost**      | Higher                            | Lower                                    |
174 | | **Risk of Overfitting**     | Higher (due to noise)             | Lower                                    |
175 | 


--------------------------------------------------------------------------------
/scripts/R_tips_20_scatterplot_with_cor_p_value.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "scatterplot with correlation and p-value"
 3 | output: html_document
 4 | date: "2025-01-06"
 5 | editor_options: 
 6 |   chunk_output_type: console
 7 | ---
 8 | 
 9 | ```{r}
10 | # BiocManager::install(c("airway", "ggpubr", "DESeq2"))
11 | 
12 | library(airway)  
13 | library(ggpubr)  
14 | library(DESeq2)  # for normalization
15 | 
16 | data("airway")
17 | dds <- DESeqDataSet(airway, design = ~ cell + dex)
18 | 
19 | ```
20 | 
21 | The airway dataset contains RNA-seq data from airway smooth muscle cells.
22 | 
23 | ```{r}
24 | dds <- DESeq(dds)  
25 | norm_counts <- counts(dds, normalized = TRUE) 
26 | ```
27 | 
28 | 
29 | ```{r}
30 | gene1 <- "ENSG00000075624"  # ACTB (Beta-actin)  
31 | gene2 <- "ENSG00000111640"  # GAPDH  
32 | 
33 | ```
34 | Both ACTB and GAPDH are well-known housekeeping genes commonly used as controls in gene expression studies.
35 | 
36 | ```{r}
37 | plot_data <- data.frame(  
38 |   Gene1 = norm_counts[gene1, ],  
39 |   Gene2 = norm_counts[gene2, ]  
40 | )
41 | 
42 | data<- iris
43 | ```
44 | 
45 | ```{r}
46 | p <- ggscatter(data,  
47 |                x = "Sepal.Length",  
48 |                y = "Petal.Length",  
49 |                add = "reg.line",  
50 |                conf.int = TRUE,  
51 |                cor.coef = TRUE,  
52 |                cor.method = "pearson",  
53 |                cor.coeff.args = list(label.sep = "\n"),  
54 |                xlab = "Sepal Length",  
55 |                ylab = "Petal Length")  
56 | print(p)  
57 | 
58 | ```


--------------------------------------------------------------------------------
/scripts/R_tips_janitor_clean_column_names.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "janitor clean column name"
 3 | output: html_document
 4 | date: "2024-09-04"
 5 | editor_options: 
 6 |   chunk_output_type: console
 7 | ---
 8 | 
 9 | ### Janitor R package 
10 | 
11 | showcase how to use https://github.com/sfirke/janitor
12 | 
13 | ```{r}
14 | install.packages("janitor")
15 | 
16 | library(readxl)
17 | library(janitor)
18 | library(dplyr)
19 | library(here)
20 | ```
21 | 
22 | ```{r}
23 | 
24 | roster_raw <- read_excel(here("data/dirty_data.xlsx")) # I included the copy in the repo
25 | 
26 | head(roster_raw)
27 | ```
28 | 
29 | ```{r}
30 | roster_raw <- roster_raw %>%
31 |   row_to_names(row_number = 1) %>%
32 |   clean_names()
33 | 
34 | # or 
35 | read_excel(here("data/dirty_data.xlsx"), skip=1)
36 | 
37 | head(roster_raw)
38 | 
39 | View(roster_raw)
40 | ```
41 | 
42 | clean it further 
43 | ```{r}
44 | roster <- roster_raw %>%
45 |   remove_empty(c("rows", "cols")) %>%
46 |   remove_constant(na.rm = TRUE, quiet = FALSE) %>% # remove the column of all "Yes" values
47 |   mutate(
48 |     hire_date = convert_to_date(
49 |       hire_date, # handle the mixed-format dates
50 |       character_fun = lubridate::mdy
51 |     ),
52 |     cert = dplyr::coalesce(certification, certification_2),
53 |     cert2 = dplyr::coalesce(certification_2, certification)
54 |   ) %>%
55 |   select(-certification, -certification_2)
56 | ```
57 | 
58 | ### better table using  tabyl()
59 | 
60 | ```{r}
61 | table(roster$subject)
62 | 
63 | roster %>%
64 |   tabyl(subject)
65 | 
66 | roster %>%
67 |   tabyl(employee_status, full_time)
68 | 
69 | roster %>%
70 |   tabyl(full_time, subject, employee_status, show_missing_levels = FALSE)
71 | 
72 | roster %>%
73 |   tabyl(subject, employee_status, full_time, show_missing_levels = FALSE)
74 | ```


--------------------------------------------------------------------------------