├── 3_min_GSEA_tutorial.Rmd ├── 5x_doublets ├── a.txt ├── analyze_doublets.ipynb └── doublet_methods.ipynb ├── AUCell_intro.Rmd ├── DE_results.csv ├── GO_in_R.Rmd ├── GO_in_python.ipynb ├── GSEA_in_python.ipynb ├── PyDeseq2_DE_tutorial.ipynb ├── RNA_Velocity_scvelo.ipynb ├── RNAseq_method_comparison ├── GTEX_240_sample_ids.csv ├── NOTES.txt ├── analysis.ipynb ├── increasing_depth.csv.gz ├── make_counts_for_polyester.ipynb └── read_generation.Rmd ├── Random_forest_single_cell_classification.ipynb ├── Scanpy_intro_pp_clustering_markers.ipynb ├── bitfam_random_forest.ipynb ├── celloracle_pseudotime_GRN.ipynb ├── convert_ensemble_ids.ipynb ├── count_table_for_deseq_example.csv ├── doublet_removal_SOLO_scVI.ipynb ├── h5ad_to_seurat.ipynb ├── high_quality_barplots.ipynb ├── high_quality_lineplots.ipynb ├── high_quality_volcano_plots.ipynb ├── hypergeometric_enrichment_test_p_value.ipynb ├── integration_comparison ├── harmony.Rmd ├── readme.txt ├── scanorama.ipynb ├── scvi.ipynb ├── seurat_cca.Rmd └── seurat_rpca.Rmd ├── monocle3_tutorial.Rmd ├── paCMAP_examples.ipynb ├── pseudobulk_pyDeseq2.ipynb ├── python_sequence_alignment.ipynb ├── salmon_to_deseq.Rmd ├── sars.ipynb ├── sc2024 ├── annotation_integration.ipynb ├── bad_mapping.ipynb ├── iterative_preprocessing.ipynb ├── preprocessing.ipynb └── readme.txt ├── scATAC_intro_R.Rmd ├── scATAC_intro_R.nb.html ├── scVI_tools_introduction.ipynb ├── scvi_label_transfer.ipynb ├── seqs.fasta ├── shifted_transformation.ipynb ├── simple_scanpy_integration.ipynb ├── simpleaf_alevin_fry_tutorial.txt ├── single_cell_analysis_complete_class.ipynb ├── single_cell_gene_co-expression.ipynb ├── single_r.Rmd ├── soupX ├── readme.txt ├── soupX_R_tutorial.Rmd └── soupX_python_test.ipynb ├── spatial_seq_intro.ipynb ├── test_significance_t_u_shapiro.ipynb └── tutorial_complex_Heatmap.Rmd /3_min_GSEA_tutorial.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R Notebook" 3 | output: 
html_notebook 4 | --- 5 | 6 | ```{r} 7 | library(DESeq2) 8 | Counts <- read.delim("count_table.csv", header = TRUE, row.names = 1, sep = ",") # first column holds the gene IDs 9 | Counts <- Counts[which(rowSums(Counts) > 0),] # keep genes with a non-zero total count 10 | condition <- factor(c("C","C","C","C", "S","S","S","S")) # one label per count-table column, in column order 11 | coldata <- data.frame(row.names = colnames(Counts), condition) 12 | dds <- DESeqDataSetFromMatrix(countData = Counts, colData = coldata, design = ~condition) 13 | dds <- DESeq(dds) 14 | res <- results(dds, contrast = c("condition", "S", "C")) # fold changes are S relative to C 15 | res <- na.omit(res) # drop rows with NA in any column 16 | res <- res[res$baseMean > 50,] # keep genes with baseMean above 50 17 | ``` 18 | 19 | ```{r} 20 | res 21 | ``` 22 | 23 | 24 | ```{r} 25 | if (!requireNamespace("BiocManager", quietly = TRUE)) { 26 | install.packages("BiocManager") 27 | } 28 | 29 | # org.Hs.eg.db is the human annotation; use org.Mm.eg.db for mouse 30 | bioc_pkgs <- c("org.Hs.eg.db", "clusterProfiler", "AnnotationDbi") 31 | 32 | # requireNamespace() checks availability without attaching the package (unlike require()) 33 | is_installed <- vapply(bioc_pkgs, requireNamespace, logical(1), quietly = TRUE) 34 | 35 | # install only what is missing, so re-running the notebook does not reinstall every package 36 | if (!all(is_installed)) { 37 | BiocManager::install(bioc_pkgs[!is_installed]) 38 | } 39 | 40 | 41 | ``` 42 | 43 | 44 | ```{r} 45 | library(org.Hs.eg.db) 46 | library(clusterProfiler) 47 | ``` 48 | 49 | 50 | Rank genes by the DESeq2 test statistic; gseGO expects a named vector sorted in decreasing order. 51 | 52 | ```{r} 53 | res <- res[order(-res$stat),] 54 | res 55 | ``` 56 | 57 | ```{r} 58 | gene_list <- res$stat 59 | names(gene_list) <- rownames(res) 60 | gene_list 61 | ``` 62 | 63 | Run GSEA on GO Biological Process terms, matching genes by their Ensembl IDs. 64 | 65 | ```{r} 66 | gse <- gseGO(gene_list, 67 | ont = "BP", 68 | keyType = "ENSEMBL", 69 | OrgDb = "org.Hs.eg.db", 70 | eps = 1e-300) 71 | ``` 72 | 73 | ```{r} 74 | as.data.frame(gse) 75 | ``` 76 | 77 | 78 | ```{r} 79 | fit <- gseaplot(gse, geneSetID = 1) # plot the first gene set in the result 80 | 81 | png("gsea.png", res = 250, width = 2000, height = 1300) 82 | print(fit) 83 | dev.off() 84 | 85 | fit 86 | ``` 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 
128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /5x_doublets/a.txt: -------------------------------------------------------------------------------- 1 | blank 2 | -------------------------------------------------------------------------------- /AUCell_intro.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R Notebook" 3 | output: html_notebook 4 | --- 5 | 6 | 7 | ```{r} 8 | if (!requireNamespace("BiocManager", quietly = TRUE)) { 9 | install.packages("BiocManager") 10 | } 11 | if (!requireNamespace("AUCell", quietly = TRUE)) BiocManager::install("AUCell") # install only when missing 12 | ``` 13 | 14 | ```{r} 15 | library(AUCell) 16 | library(Seurat) 17 | ``` 18 | 19 | ```{r} 20 | load("droplet_Lung_seurat_tiss.Robj") # expected to provide the tiss object used below 21 | tiss <- UpdateSeuratObject(object = tiss) 22 | ``` 23 | 24 | ```{r} 25 | tiss[[]] 26 | ``` 27 | 28 | ```{r} 29 | DimPlot(object = tiss, group.by = "cell_ontology_class", label = TRUE) 30 | ``` 31 | 32 | ```{r} 33 | markers <- read.csv("PanglaoDB_markers_27_Mar_2020.tsv", sep = "\t") 34 | markers <- markers[markers$cell.type == "Endothelial cells" & markers$species != "Hs",] # endothelial markers, excluding rows whose species is exactly "Hs" 35 | markers 36 | ``` 37 | ```{r} 38 | genes <- markers$official.gene.symbol 39 | 40 | # mouse-style symbols: keep the first character, lower-case the rest 41 | mousify <- function(a) { 42 | paste0(substr(a, 1, 1), tolower(substr(a, 2, nchar(a)))) 43 | } 44 | genes <- vapply(genes, mousify, character(1)) # type-stable replacement for sapply() 45 | genes 46 | ``` 47 | 48 | ```{r} 49 | counts <- GetAssayData(object = tiss, slot = "counts") 50 | ``` 51 | 52 | ```{r} 53 | cell_rankings <- AUCell_buildRankings(counts) 54 | ``` 55 | 56 | ```{r} 57 | cells_AUC <- AUCell_calcAUC(genes, cell_rankings) # the single gene set is addressed as "geneSet" below 58 | ``` 59 | 60 | 61 | ```{r} 62 | cells_assignment <- AUCell_exploreThresholds(cells_AUC, plotHist = TRUE, assign=TRUE) 63 | ``` 64 | 65 | ```{r} 66 | cells_assignment$geneSet$assignment 67 | ``` 68 | 69 | 70 | ```{r} 71 | new_cells <- names(which(getAUC(cells_AUC)["geneSet",]>0.15)) # cells whose AUC exceeds the hard-coded 0.15 cutoff 72 | ``` 73 | 74 | ```{r} 75 | tiss$is_ec <- 
ifelse(colnames(tiss) %in% new_cells, "EC", "non_EC") 76 | ``` 77 | 78 | ```{r} 79 | tiss[[]] 80 | ``` 81 | 82 | ```{r} 83 | DimPlot(object = tiss, group.by = "is_ec", label = TRUE) 84 | ``` 85 | 86 | ```{r} 87 | DimPlot(object = tiss, group.by = "cell_ontology_class", label = TRUE) 88 | ``` 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | -------------------------------------------------------------------------------- /GO_in_R.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R Notebook" 3 | output: html_notebook 4 | --- 5 | 6 | ```{r} 7 | library(DESeq2) 8 | Counts <- read.delim("../count_table.csv", header = TRUE, row.names = 1, sep = ",") 9 | Counts <- Counts[which(rowSums(Counts) > 0),] 10 | condition <- factor(c("C","C","C","C", "S","S","S","S")) 11 | coldata <- data.frame(row.names = colnames(Counts), condition) 12 | dds <- DESeqDataSetFromMatrix(countData = Counts, colData = coldata, design = ~condition) 13 | dds <- DESeq(dds) 14 | res <- results(dds, contrast = c("condition", "S", "C")) # fold changes are S relative to C 15 | sigs <- na.omit(res) 16 | sigs <- sigs[sigs$padj < 0.05 & sigs$baseMean > 50,] # significant genes with baseMean above 50 17 | ``` 18 | 19 | ```{r} 20 | sigs 21 | ``` 22 | 23 | 24 | 25 | 26 | ```{r} 27 | if (!requireNamespace("BiocManager", quietly = TRUE)) { 28 | install.packages("BiocManager") 29 | } 30 | 31 | bioc_pkgs <- c("clusterProfiler", "AnnotationDbi", "org.Hs.eg.db") 32 | 33 | # requireNamespace() checks availability without attaching the package (unlike require()) 34 | is_installed <- vapply(bioc_pkgs, requireNamespace, logical(1), quietly = TRUE) 35 | 36 | # install only what is missing, so re-running the notebook does not reinstall every package 37 | if (!all(is_installed)) { 38 | BiocManager::install(bioc_pkgs[!is_installed]) 39 | } 40 | 41 | ``` 42 | 43 | ```{r} 44 | library(clusterProfiler) 45 | library(org.Hs.eg.db) 46 | 
library(AnnotationDbi) 47 | ``` 48 | 49 | ```{r} 50 | genes_to_test <- rownames(sigs[sigs$log2FoldChange > 0.5,]) # genes with log2 fold change above 0.5 51 | ``` 52 | 53 | ```{r} 54 | GO_results <- enrichGO(gene = genes_to_test, OrgDb = "org.Hs.eg.db", keyType = "ENSEMBL", ont = "BP") 55 | ``` 56 | 57 | ```{r} 58 | as.data.frame(GO_results) 59 | ``` 60 | 61 | ```{r} 62 | fit <- barplot(GO_results, showCategory = 15) # keep the plot object itself; it is drawn once at the end of the chunk 63 | 64 | png("out.png", res = 250, width = 1400, height = 1800) 65 | print(fit) 66 | dev.off() 67 | 68 | fit 69 | ``` 70 | 71 | ```{r} 72 | 73 | ``` 74 | 75 | ```{r} 76 | 77 | ``` 78 | 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /RNAseq_method_comparison/GTEX_240_sample_ids.csv: -------------------------------------------------------------------------------- 1 | GTEX-111CU-2026-SM-5GZZC 2 | GTEX-111VG-2226-SM-5N9DU 3 | GTEX-1128S-2326-SM-5GZZY 4 | GTEX-113IC-0226-SM-5HL5C 5 | GTEX-117YX-1226-SM-5H11S 6 | GTEX-11DXW-1226-SM-5H133 7 | GTEX-11DXX-1926-SM-5EGJK 8 | GTEX-11DXX-2326-SM-9YFKS 9 | GTEX-11DXZ-0726-SM-5N9C4 10 | GTEX-11DXZ-2626-SM-59882 11 | GTEX-11EQ9-1826-SM-5Q5AJ 12 | GTEX-11GS4-1826-SM-5HL4T 13 | GTEX-11GS4-3126-SM-5A5LH 14 | GTEX-11GSO-0626-SM-5A5LW 15 | GTEX-11GSP-2326-SM-5HL63 16 | GTEX-11LCK-0926-SM-5A5KA 17 | GTEX-11O72-0006-SM-5O9DB 18 | GTEX-11TTK-0426-SM-5EQLT 19 | GTEX-11TTK-2826-SM-5GU5K 20 | GTEX-11TUW-0006-SM-5LZW2 21 | GTEX-11TUW-0226-SM-5LU8X 22 | GTEX-11TUW-1826-SM-5BC5D 23 | GTEX-11UD1-0011-R5b-SM-5P9FP 24 | GTEX-11UD2-0005-SM-5NQ9B 25 | GTEX-11UD2-0226-SM-5EQKY 26 | GTEX-11WQK-1826-SM-5EQM2 27 | GTEX-11WQK-2926-SM-5EQKT 28 | GTEX-11ZTS-1426-SM-5EQMM 29 | GTEX-11ZU8-0126-SM-5EQ58 30 | GTEX-11ZVC-1426-SM-5EGGA 31 | GTEX-12126-0011-R10b-SM-5BC6T 32 | GTEX-1212Z-0008-SM-5TDBW 33 | GTEX-1269C-2826-SM-5EQ5O 34 | GTEX-12WSG-5016-SM-7SB84 35 | GTEX-12WSK-0008-SM-5YY9R 36 | GTEX-12WSM-0011-R10a-SM-5DUWV 37 | GTEX-12ZZY-1526-SM-5LZWF 38 | GTEX-12ZZZ-0008-SM-5YY9S 39 | GTEX-12ZZZ-0011-R5a-SM-5EQ4M 40 | 
GTEX-13111-1126-SM-5GCMZ 41 | GTEX-13113-5019-SM-7EPH2 42 | GTEX-1313W-0011-R10b-SM-5DUXA 43 | GTEX-1313W-0011-R4b-SM-5KLZV 44 | GTEX-1313W-0011-R5b-SM-5L3EP 45 | GTEX-1313W-0326-SM-5LZU5 46 | GTEX-131XG-2626-SM-5KM17 47 | GTEX-131XH-2626-SM-5GCNP 48 | GTEX-131XH-2926-SM-5LZU4 49 | GTEX-131XW-3226-SM-5LZUO 50 | GTEX-132Q8-0011-R6a-SM-5K7YN 51 | GTEX-1339X-2726-SM-5PNYU 52 | GTEX-1399S-2026-SM-5KM4B 53 | GTEX-1399T-0826-SM-5IFES 54 | GTEX-139D8-1126-SM-5LU8W 55 | GTEX-139T6-0626-SM-5IFF9 56 | GTEX-139TS-0008-SM-62LDG 57 | GTEX-139TU-0226-SM-5J1NM 58 | GTEX-139UW-2826-SM-5L3E7 59 | GTEX-13CF2-0011-R2a-SM-5L3DC 60 | GTEX-13CF2-0011-R5a-SM-5LZWS 61 | GTEX-13D11-0626-SM-5LZYY 62 | GTEX-13D11-0726-SM-5LZZB 63 | GTEX-13D11-2226-SM-5IFEO 64 | GTEX-13FH7-1326-SM-5IFG8 65 | GTEX-13FLV-0626-SM-5IFEY 66 | GTEX-13FTW-1526-SM-5LZXC 67 | GTEX-13FTW-2126-SM-5K7YG 68 | GTEX-13FTX-1126-SM-5N9EN 69 | GTEX-13FXS-0011-R2b-SM-5K7XX 70 | GTEX-13G51-0011-R6b-SM-5LZX4 71 | GTEX-13IVO-0726-SM-5LZYV 72 | GTEX-13JVG-3026-SM-5IJEV 73 | GTEX-13NYB-0226-SM-5N9G4 74 | GTEX-13NYB-2226-SM-5MR58 75 | GTEX-13NYB-2426-SM-5IFF4 76 | GTEX-13NYS-1926-SM-5IJCB 77 | GTEX-13NZ9-1026-SM-5MR5K 78 | GTEX-13NZ9-1226-SM-5MR3J 79 | GTEX-13O1R-1026-SM-5KM2L 80 | GTEX-13O1R-1326-SM-5IJF5 81 | GTEX-13O3O-0011-R2b-SM-5P9H1 82 | GTEX-13O3O-0011-R4b-SM-5KM3F 83 | GTEX-13O3P-0726-SM-5J2OM 84 | GTEX-13O3Q-2726-SM-5KM51 85 | GTEX-13O61-1126-SM-5L3FI 86 | GTEX-13OVG-1026-SM-5KLZ3 87 | GTEX-13OVH-3026-SM-5MR4N 88 | GTEX-13OVI-1026-SM-5L3EM 89 | GTEX-13OVJ-1426-SM-5K7Z2 90 | GTEX-13OVL-0126-SM-5L3GJ 91 | GTEX-13OW5-0011-R7a-SM-5O9C9 92 | GTEX-13OW5-1126-SM-5J1NR 93 | GTEX-13OW6-3026-SM-5J2MI 94 | GTEX-13OW7-0426-SM-5K7VK 95 | GTEX-13PLJ-1726-SM-5L3FT 96 | GTEX-13RTJ-0526-SM-62LDP 97 | GTEX-13SLX-2426-SM-664OM 98 | GTEX-13SLX-3226-SM-5YYA6 99 | GTEX-13U4I-0426-SM-5LU4W 100 | GTEX-13VXT-0726-SM-5SIAD 101 | GTEX-13VXT-1426-SM-5LU4B 102 | GTEX-13X6J-2126-SM-5TDCV 103 | GTEX-144GM-1926-SM-5LUAN 104 | GTEX-145LS-2926-SM-5O99G 105 
| GTEX-145ME-2026-SM-5SIA5 106 | GTEX-145MH-0011-R5a-SM-5P9JT 107 | GTEX-145MI-0011-R6a-SM-5PNZA 108 | GTEX-145MO-0626-SM-5NQAW 109 | GTEX-14753-2926-SM-5LU9J 110 | GTEX-1477Z-2226-SM-5QGPG 111 | GTEX-148VI-1026-SM-5TDDJ 112 | GTEX-148VJ-0006-SM-5NQB1 113 | GTEX-148VJ-2626-SM-5QGPI 114 | GTEX-14ABY-0011-R1a-SM-6EU2W 115 | GTEX-14AS3-0126-SM-5Q5F4 116 | GTEX-14ASI-0011-R4a-SM-69LQ4 117 | GTEX-14B4R-1426-SM-5Q5CG 118 | GTEX-14BIL-0006-SM-5N9F2 119 | GTEX-14BIN-0011-R10a-SM-5S2UA 120 | GTEX-14BIN-0626-SM-793DP 121 | GTEX-14BIN-2426-SM-5TDCF 122 | GTEX-14BIN-2626-SM-5YY8U 123 | GTEX-14C39-0526-SM-664OF 124 | GTEX-14C39-1826-SM-5ZZW4 125 | GTEX-14C5O-0526-SM-62LEI 126 | GTEX-14DAR-2026-SM-5S2O3 127 | GTEX-14E7W-1026-SM-62LEK 128 | GTEX-14E7W-1826-SM-69LQ1 129 | GTEX-14JFF-0005-SM-7P8RF 130 | GTEX-14JG6-2226-SM-6EU2G 131 | GTEX-14JIY-0011-R1a-SM-68713 132 | GTEX-14JIY-0011-R8a-SM-6AJAP 133 | GTEX-14LLW-0126-SM-6LLIO 134 | GTEX-14PII-1026-SM-5ZZVW 135 | GTEX-14PJ2-2126-SM-5YY96 136 | GTEX-14PJ3-0005-SM-9JGFT 137 | GTEX-14PJO-1826-SM-69LPR 138 | GTEX-14PK6-1326-SM-686ZE 139 | GTEX-14PKV-1426-SM-5YYB9 140 | GTEX-14PN3-1226-SM-69LOW 141 | GTEX-15DCD-0226-SM-6LPKC 142 | GTEX-15DYW-2626-SM-6LPK7 143 | GTEX-15DZA-0826-SM-6AJBF 144 | GTEX-15EO6-0011-R11b-SM-6M47M 145 | GTEX-15EO6-3126-SM-6LPIQ 146 | GTEX-15EOM-5016-SM-7P8SR 147 | GTEX-15G1A-2026-SM-7KUFG 148 | GTEX-169BO-0126-SM-79OLS 149 | GTEX-16MT8-1326-SM-6M47R 150 | GTEX-16NPV-2426-SM-6M482 151 | GTEX-16YQH-0006-SM-7P8OQ 152 | GTEX-16YQH-0426-SM-7KUL7 153 | GTEX-16Z82-0126-SM-7DHLF 154 | GTEX-17EVQ-0526-SM-7KFSK 155 | GTEX-17EVQ-2926-SM-7EWE3 156 | GTEX-17F96-1126-SM-7EWDF 157 | GTEX-17F9E-0526-SM-7DUFD 158 | GTEX-17F9Y-1226-SM-7EPGE 159 | GTEX-17GQL-0226-SM-7LTAK 160 | GTEX-17HG3-0011-R5a-SM-7DUEW 161 | GTEX-17HG3-2226-SM-7938L 162 | GTEX-17HGU-2426-SM-7EWDP 163 | GTEX-17JCI-0011-R10b-SM-718A2 164 | GTEX-17JCI-0726-SM-7EPH1 165 | GTEX-183FY-1926-SM-7KFRI 166 | GTEX-18465-1126-SM-7LG6E 167 | GTEX-18465-2726-SM-7LT9I 168 | 
GTEX-18A67-0526-SM-7LT9X 169 | GTEX-18A6Q-0011-R11a-SM-72D6H 170 | GTEX-18A6Q-0011-R1b-SM-731DI 171 | GTEX-18A6Q-0826-SM-7KFRD 172 | GTEX-18A7A-1426-SM-731AP 173 | GTEX-18A7B-0526-SM-7LG68 174 | GTEX-18D9A-1326-SM-7LT8P 175 | GTEX-18D9B-1926-SM-7KFSV 176 | GTEX-18D9B-2526-SM-718BN 177 | GTEX-18QFQ-1726-SM-731C9 178 | GTEX-1A3MV-1726-SM-72D68 179 | GTEX-1A3MX-0005-SM-7MGW7 180 | GTEX-1A3MX-0011-R1b-SM-7P8PH 181 | GTEX-1A3MX-0011-R3b-SM-79OOW 182 | GTEX-1A3MX-1926-SM-72D7F 183 | GTEX-1A8G6-0011-R6b-SM-7P8PE 184 | GTEX-1A8G6-0526-SM-7PC1E 185 | GTEX-1A8G6-1626-SM-7MGWO 186 | GTEX-1AMEY-1526-SM-73KYW 187 | GTEX-1AX9I-2326-SM-7PBXV 188 | GTEX-1AYCT-1026-SM-79ONR 189 | GTEX-1AYCT-2026-SM-793CJ 190 | GTEX-1B8KZ-0126-SM-7DHM5 191 | GTEX-1B8SG-0006-SM-7MKFA 192 | GTEX-1B8SG-1826-SM-731F3 193 | GTEX-1B933-2926-SM-731FO 194 | GTEX-1B996-0011-R5a-SM-7P8PN 195 | GTEX-1BAJH-0926-SM-79OO6 196 | GTEX-1BAJH-2526-SM-7IGOT 197 | GTEX-1C64N-0426-SM-7PC31 198 | GTEX-1C64O-0006-SM-7PC22 199 | GTEX-1C6VS-1226-SM-79OO2 200 | GTEX-1C6WA-0011-R6a-SM-7PBYP 201 | GTEX-1CAMQ-0426-SM-7IGPL 202 | GTEX-1CAMR-0526-SM-7P8RS 203 | GTEX-1CAMS-0326-SM-7PC35 204 | GTEX-1CB4E-0626-SM-7DHMW 205 | GTEX-1EH9U-0226-SM-7PBY8 206 | GTEX-1EH9U-3226-SM-7MKGR 207 | GTEX-1EKGG-0226-SM-9WYTH 208 | GTEX-1EMGI-2626-SM-7IGNR 209 | GTEX-1EU9M-0011-R9a-SM-9WG61 210 | GTEX-1EWIQ-3126-SM-7MXTI 211 | GTEX-1F52S-0011-R3a-SM-CKZNI 212 | GTEX-1F5PL-0826-SM-7MXU7 213 | GTEX-1F5PL-1626-SM-7MXTY 214 | GTEX-1F6IF-0526-SM-7MKHD 215 | GTEX-1F75A-0011-R11a-SM-AHZ35 216 | GTEX-1F75A-0726-SM-7RHHD 217 | GTEX-1F75B-0011-R2a-SM-ARL8I 218 | GTEX-1GF9U-2026-SM-7SB92 219 | GTEX-1GF9U-2126-SM-7SB7L 220 | GTEX-1GF9W-0011-R4a-SM-CE6RH 221 | GTEX-1GF9W-0011-R6b-SM-9QEIE 222 | GTEX-1GF9W-0126-SM-7PC12 223 | GTEX-1GF9X-1126-SM-7MKHC 224 | GTEX-1GMR3-1326-SM-7P8TB 225 | GTEX-1GMR3-2726-SM-7MKFB 226 | GTEX-1GMR8-0005-SM-ARZL2 227 | GTEX-1GN1U-2126-SM-AHZ4J 228 | GTEX-1GN1W-0526-SM-9MQJQ 229 | GTEX-1GN2E-0426-SM-9MQKX 230 | 
GTEX-1GN73-0005-SM-ACKVI 231 | GTEX-1GPI7-0326-SM-7MKH6 232 | GTEX-1GPI7-0926-SM-7PC3J 233 | GTEX-1GTWX-0426-SM-9MQM1 234 | GTEX-1GZ2Q-0226-SM-7P8TL 235 | GTEX-1GZ4H-0126-SM-9JGGI 236 | GTEX-1GZ4I-0011-R4a-SM-9QEI9 237 | GTEX-1H1CY-0011-R5a-SM-CM2SN 238 | GTEX-1H1CY-3026-SM-9OSW7 239 | GTEX-1H1DG-0011-R10b-SM-CE6S7 240 | GTEX-1H1ZS-0526-SM-9WG5L 241 | -------------------------------------------------------------------------------- /RNAseq_method_comparison/NOTES.txt: -------------------------------------------------------------------------------- 1 | 1) Transcript quant methods: 2 | Salmon 3 | kallisto 4 | STAR + RSEM 5 | STAR + StringTie 6 | 7 | 2) Gene count methods: 8 | Salmon 9 | kallisto 10 | STAR + RSEM 11 | STAR + StringTie 12 | STAR + HTSeq-count 13 | STAR + featureCounts 14 | 15 | 16 | 3) References 17 | gencode.v44.transcripts.fa.gz 18 | GRCh38.primary_assembly.genome.fa.gz 19 | gencode.v44.primary_assembly.annotation.gtf.gz 20 | GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_expected_count.gct.gz 21 | 22 | 4) Datasets 23 | 240 random GTEx samples, simulated 24 | 25 | Increasing coverage of the 5 GTEx samples closest to the q75 of total counts. 
simulated 26 | 1,5,10,15,20,25,30,40,50,60,70,80,90,100 % read depth, n = 5 27 | 28 | Samples simulated with polyester, paired, 101 bp, unstranded 29 | ('filtered_fasta.fa.gz' is gencode.v44.transcripts.fa.gz with no transcript < 100 bp) 30 | simulate_experiment_countmat('filtered_fasta.fa.gz', readmat = <>, outdir = <>, 31 | paired = TRUE, error_model = 'uniform', 32 | error_rate = 0.004, readlen = 101, gzip=TRUE) 33 | 34 | 35 | 36 | 37 | #####STAR 38 | STAR --runMode genomeGenerate --genomeDir STAR_index --genomeFastaFiles GRCh38.primary_assembly.genome.fa --sjdbGTFfile gencode.v44.primary_assembly.annotation.gtf --runThreadN 22 39 | 40 | STAR --runMode alignReads --genomeDir STAR_index --outSAMtype BAM SortedByCoordinate --readFilesIn "$file_r1" "$file_r2" --readFilesCommand gunzip -c --runThreadN 12 --outFileNamePrefix "gtex_STAR_out/${samp}" 41 | 42 | 43 | 44 | #####RSEM 45 | rsem-prepare-reference --gtf gencode.v44.primary_assembly.annotation.gtf --star -p 12 GRCh38.primary_assembly.genome.fa rsem_ref/rsem_ref 46 | 47 | rsem-calculate-expression --star --no-bam-output --paired-end --star-gzipped-read-file --no-qualities -p 18 "$file_r1" "$file_r2" rsem_ref/rsem_ref "gtex_rsem/${samp}" 48 | 49 | 50 | 51 | #####stringtie v2.2.1 52 | stringtie -G gencode.v44.primary_assembly.annotation.gtf -p 8 -e -o gtex_stringtie/${samp}.gtf -A gtex_stringtie/${samp}.genes "$file_bam" 53 | prepDE.py -l 101 -t wut_if_god_was.csv -i sample_sheet.txt 54 | 55 | Not easy to use because it requires STAR -> StringTie -> prepDE. Output is in a weird GTF format. 
prepDE is not intuitive 56 | 57 | 58 | 59 | #####salmon v1.10.2 60 | salmon index -t trans_and_deocys.fa.gz -d decoys.txt -p 30 -i human_salmon_index --gencode 61 | 62 | salmon quant -i "$salmon_index" -l A -1 "$file_r1" -2 "$file_r2" -p 18 --validateMappings -o "random_salmon_out/${samp}" 63 | 64 | 65 | 66 | #####kallisto v0.50.0 67 | kallisto index -i kallisto_index gencode.v44.transcripts.fa.gz 68 | 69 | kallisto quant -i "$kallisto_index" -o "gtex_kallisto_out/${samp}" -t 22 "$file_r1" "$file_r2" 70 | 71 | 72 | 73 | #####htseq 2.0.3 74 | for file in *bam; do samtools index $file; done 75 | 76 | htseq-count -f bam -r pos --stranded no --max-reads-in-buffer 90000000 gtex_STAR_out/*bam gencode.v44.primary_assembly.annotation.gtf 77 | 78 | 79 | find gtex_STAR_out -name '*Aligned.sortedByCoord.out.bam' | parallel -j 12 'base=$(basename {} "Aligned.sortedByCoord.out.bam"); htseq-count -f bam -r pos --stranded no --max-reads-in-buffer 90000000 {} gencode.v44.primary_assembly.annotation.gtf > "htseq_gtex_counts/sample_${base}.txt"' 80 | 81 | 82 | 83 | #####featureCounts (subread v2.0.6) 84 | featureCounts -p --countReadPairs -a gencode.v44.primary_assembly.annotation.gtf -T 6 -o gtex_feature_counts.txt gtex_STAR_out/*bam 85 | -------------------------------------------------------------------------------- /RNAseq_method_comparison/increasing_depth.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mousepixels/sanbomics_scripts/521f16b9b4cedb736c28403a557462cffa35f06a/RNAseq_method_comparison/increasing_depth.csv.gz -------------------------------------------------------------------------------- /RNAseq_method_comparison/read_generation.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R Notebook" 3 | output: html_notebook 4 | --- 5 | 6 | 7 | 8 | ```{r} 9 | library(polyester) 10 | library(Biostrings) 11 | library(readr) 12 | ``` 13 | 14 | 15 | 16 
| ```{r} 17 | # remove transcripts shorter than 100 bp 18 | fasta <- readDNAStringSet('gencode.v44.transcripts.fa.gz') 19 | fasta <- fasta[width(fasta) >= 100] 20 | writeXStringSet(fasta, 'filtered_fasta.fa.gz', compress = TRUE) 21 | ``` 22 | 23 | 24 | ```{r} 25 | fasta <- readDNAStringSet('filtered_fasta.fa.gz') # reloads the filtered set; the simulation calls below are given the file path, not this object 26 | ``` 27 | 28 | 29 | simulating gtex samples, I stopped at 240 instead of 500 because I enjoy life 30 | ```{r} 31 | df <- read_csv("gtex_500.csv") 32 | df$transcript_id <- NULL 33 | df <- as.matrix(df) 34 | # readmat: rows presumably follow the transcript order of filtered_fasta.fa.gz, one column per sample -- TODO confirm 35 | 36 | simulate_experiment_countmat('filtered_fasta.fa.gz', readmat = df, outdir = 'test_reads', 37 | paired = TRUE, error_model = 'uniform', 38 | error_rate = 0.004, readlen = 101, gzip=TRUE) 39 | ``` 40 | 41 | 42 | 43 | simulating increasing depths counts 44 | ```{r} 45 | df <- read_csv('increasing_depth.csv') 46 | df <- as.matrix(df) 47 | 48 | # same simulation settings as the GTEx run above; only the count matrix and output directory differ 49 | 50 | simulate_experiment_countmat('filtered_fasta.fa.gz', readmat = df , outdir = 'increasing_depth', 51 | paired = TRUE, error_model = 'uniform', 52 | error_rate = 0.004, readlen = 101, gzip=TRUE) 53 | 54 | ``` 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /convert_ensemble_ids.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 26, 6 | "id": "d89a4260", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "####complete function, may need slight modification based on your gtf format####\n", 11 | "\n", 12 | "def get_ens_dict(file_path):\n", 13 | " with open(file_path) as f:\n", 14 | " gtf = list(f)\n", 15 | "\n", 16 | " gtf = [x for x in gtf if not x.startswith('#')]\n", 17 | " gtf = [x for x in gtf if 'gene_id \"' in x and 'gene_name \"' in x]\n", 18 | " if len(gtf) == 0:\n", 19 | " print('you need to change gene_id \" and gene_name \" formats')\n", 20 | " \n", 21 | " gtf = list(map(lambda x: 
(x.split('gene_id \"')[1].split('\"')[0], x.split('gene_name \"')[1].split('\"')[0]), gtf))\n", 22 | " gtf = dict(set(gtf))\n", 23 | " return gtf\n", 24 | "\n", 25 | "gtf_dict = get_ens_dict('Homo_sapiens.GRCh38.105.gtf') #replace with your file path" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 1, 31 | "id": "771ccf9b", 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "name": "stdout", 36 | "output_type": "stream", 37 | "text": [ 38 | "count_table.csv Homo_sapiens.GRCh38.105.gtf\ttutorial.Rmd\t Untitled.ipynb\r\n", 39 | "deseq_results.csv tutorial.nb.html\t\tUntitled1.ipynb\r\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "!ls" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "id": "df6faf97", 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "#make sure to use the gtf you made your genome index with, or find the same organism/version on Ensembl\n", 55 | "#this gtf was taken straight from Ensembl" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 2, 61 | "id": "a4cfb217", 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "name": "stdout", 66 | "output_type": "stream", 67 | "text": [ 68 | "#!genome-build GRCh38.p13\r\n", 69 | "#!genome-version GRCh38\r\n", 70 | "#!genome-date 2013-12\r\n", 71 | "#!genome-build-accession GCA_000001405.28\r\n", 72 | "#!genebuild-last-updated 2021-08\r\n", 73 | "1\tensembl_havana\tgene\t1211340\t1214153\t.\t-\t.\tgene_id \"ENSG00000186827\"; gene_version \"11\"; gene_name \"TNFRSF4\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\";\r\n", 74 | "1\tensembl_havana\ttranscript\t1211340\t1214153\t.\t-\t.\tgene_id \"ENSG00000186827\"; gene_version \"11\"; transcript_id \"ENST00000379236\"; transcript_version \"4\"; gene_name \"TNFRSF4\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"TNFRSF4-201\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; 
ccds_id \"CCDS11\"; tag \"basic\"; transcript_support_level \"1 (assigned to previous version 3)\";\r\n", 75 | "1\tensembl_havana\texon\t1213983\t1214153\t.\t-\t.\tgene_id \"ENSG00000186827\"; gene_version \"11\"; transcript_id \"ENST00000379236\"; transcript_version \"4\"; exon_number \"1\"; gene_name \"TNFRSF4\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"TNFRSF4-201\"; transcript_source \"ensembl_havana\"; transcript_biotype \"protein_coding\"; tag \"CCDS\"; ccds_id \"CCDS11\"; exon_id \"ENSE00001832731\"; exon_version \"2\"; tag \"basic\"; transcript_support_level \"1 (assigned to previous version 3)\";\r\n" 76 | ] 77 | } 78 | ], 79 | "source": [ 80 | "!head -n 8 Homo_sapiens.GRCh38.105.gtf" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 3, 86 | "id": "da275a2e", 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "with open('Homo_sapiens.GRCh38.105.gtf') as f:\n", 91 | " gtf = list(f)\n", 92 | " \n", 93 | " \n", 94 | "gtf = [x for x in gtf if not x.startswith('#')]" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 4, 100 | "id": "5d4e89b6", 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/plain": [ 106 | "3236571" 107 | ] 108 | }, 109 | "execution_count": 4, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "len(gtf)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 1, 121 | "id": "3389cca8", 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "#gtf" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 6, 131 | "id": "7ec553e4", 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "gtf = [x for x in gtf if 'gene_id \"' in x and 'gene_name \"' in x]" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 7, 141 | "id": "930911ac", 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | 
"data": { 146 | "text/plain": [ 147 | "3080284" 148 | ] 149 | }, 150 | "execution_count": 7, 151 | "metadata": {}, 152 | "output_type": "execute_result" 153 | } 154 | ], 155 | "source": [ 156 | "len(gtf)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 8, 162 | "id": "44aba8e0", 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "data": { 167 | "text/plain": [ 168 | "'1\\tensembl_havana\\tgene\\t1211340\\t1214153\\t.\\t-\\t.\\tgene_id \"ENSG00000186827\"; gene_version \"11\"; gene_name \"TNFRSF4\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\";\\n'" 169 | ] 170 | }, 171 | "execution_count": 8, 172 | "metadata": {}, 173 | "output_type": "execute_result" 174 | } 175 | ], 176 | "source": [ 177 | "gtf[0]" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "id": "ec75cd40", 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "#your gtf might have slightly differnt format. make sure to change the split strings accordingly\n", 188 | "#e.g., some gtf files do not have any quotes" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 9, 194 | "id": "4d4a0b72", 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "gtf = list(map(lambda x: (x.split('gene_id \"')[1].split('\"')[0], x.split('gene_name \"')[1].split('\"')[0]), gtf))" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 10, 204 | "id": "7353a4f0", 205 | "metadata": {}, 206 | "outputs": [ 207 | { 208 | "data": { 209 | "text/plain": [ 210 | "[('ENSG00000186827', 'TNFRSF4'),\n", 211 | " ('ENSG00000186827', 'TNFRSF4'),\n", 212 | " ('ENSG00000186827', 'TNFRSF4'),\n", 213 | " ('ENSG00000186827', 'TNFRSF4'),\n", 214 | " ('ENSG00000186827', 'TNFRSF4')]" 215 | ] 216 | }, 217 | "execution_count": 10, 218 | "metadata": {}, 219 | "output_type": "execute_result" 220 | } 221 | ], 222 | "source": [ 223 | "gtf[0:5]" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 
228 | "execution_count": 12, 229 | "id": "bd41ea0a", 230 | "metadata": {}, 231 | "outputs": [ 232 | { 233 | "data": { 234 | "text/plain": [ 235 | "3080284" 236 | ] 237 | }, 238 | "execution_count": 12, 239 | "metadata": {}, 240 | "output_type": "execute_result" 241 | } 242 | ], 243 | "source": [ 244 | "len(gtf)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 13, 250 | "id": "70208ae3", 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "gtf = list(set(gtf))" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 14, 260 | "id": "21a7f773", 261 | "metadata": {}, 262 | "outputs": [ 263 | { 264 | "data": { 265 | "text/plain": [ 266 | "40839" 267 | ] 268 | }, 269 | "execution_count": 14, 270 | "metadata": {}, 271 | "output_type": "execute_result" 272 | } 273 | ], 274 | "source": [ 275 | "len(gtf)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 3, 281 | "id": "854662be", 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "#gtf" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 16, 291 | "id": "f27e7754", 292 | "metadata": {}, 293 | "outputs": [ 294 | { 295 | "name": "stdout", 296 | "output_type": "stream", 297 | "text": [ 298 | "count_table.csv Homo_sapiens.GRCh38.105.gtf\ttutorial.Rmd\t Untitled.ipynb\r\n", 299 | "deseq_results.csv tutorial.nb.html\t\tUntitled1.ipynb\r\n" 300 | ] 301 | } 302 | ], 303 | "source": [ 304 | "!ls" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 17, 310 | "id": "d4a934dc", 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "import pandas as pd" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 19, 320 | "id": "00c7f321", 321 | "metadata": {}, 322 | "outputs": [ 323 | { 324 | "data": { 325 | "text/html": [ 326 | "
\n", 327 | "\n", 340 | "\n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | "
Unnamed: 0baseMeanlog2FoldChangelfcSEstatpvaluepadj
0ENSG00000160072320.844930-0.4859800.158346-3.0690990.0021470.027228
1ENSG00000069424231.9217121.1425180.2709894.2161000.0000250.000938
2ENSG000001167861218.6330250.3978000.1104443.6018400.0003160.006850
3ENSG0000011625416.7420052.7236290.7079773.8470570.0001200.003336
4ENSG00000204138316.084877-0.5353100.146291-3.6592090.0002530.005816
........................
1447ENSG000001591311000.997666-0.3776180.125794-3.0018840.0026830.031762
1448ENSG00000159259114.924454-1.7910300.474113-3.7776410.0001580.004068
1449ENSG0000016029876.770422-1.1898020.372401-3.1949510.0013990.019928
1450ENSG00000159055123.604477-0.9460330.322999-2.9289000.0034020.038006
1451ENSG000001592001182.5014991.5555950.4708043.3041290.0009530.015224
\n", 466 | "

1452 rows × 7 columns

\n", 467 | "
" 468 | ], 469 | "text/plain": [ 470 | " Unnamed: 0 baseMean log2FoldChange lfcSE stat \\\n", 471 | "0 ENSG00000160072 320.844930 -0.485980 0.158346 -3.069099 \n", 472 | "1 ENSG00000069424 231.921712 1.142518 0.270989 4.216100 \n", 473 | "2 ENSG00000116786 1218.633025 0.397800 0.110444 3.601840 \n", 474 | "3 ENSG00000116254 16.742005 2.723629 0.707977 3.847057 \n", 475 | "4 ENSG00000204138 316.084877 -0.535310 0.146291 -3.659209 \n", 476 | "... ... ... ... ... ... \n", 477 | "1447 ENSG00000159131 1000.997666 -0.377618 0.125794 -3.001884 \n", 478 | "1448 ENSG00000159259 114.924454 -1.791030 0.474113 -3.777641 \n", 479 | "1449 ENSG00000160298 76.770422 -1.189802 0.372401 -3.194951 \n", 480 | "1450 ENSG00000159055 123.604477 -0.946033 0.322999 -2.928900 \n", 481 | "1451 ENSG00000159200 1182.501499 1.555595 0.470804 3.304129 \n", 482 | "\n", 483 | " pvalue padj \n", 484 | "0 0.002147 0.027228 \n", 485 | "1 0.000025 0.000938 \n", 486 | "2 0.000316 0.006850 \n", 487 | "3 0.000120 0.003336 \n", 488 | "4 0.000253 0.005816 \n", 489 | "... ... ... 
\n", 490 | "1447 0.002683 0.031762 \n", 491 | "1448 0.000158 0.004068 \n", 492 | "1449 0.001399 0.019928 \n", 493 | "1450 0.003402 0.038006 \n", 494 | "1451 0.000953 0.015224 \n", 495 | "\n", 496 | "[1452 rows x 7 columns]" 497 | ] 498 | }, 499 | "execution_count": 19, 500 | "metadata": {}, 501 | "output_type": "execute_result" 502 | } 503 | ], 504 | "source": [ 505 | "df = pd.read_csv('deseq_results.csv')\n", 506 | "df" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": 20, 512 | "id": "a3fac37a", 513 | "metadata": {}, 514 | "outputs": [], 515 | "source": [ 516 | "gtf = dict(gtf)" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": 21, 522 | "id": "8d5f4049", 523 | "metadata": {}, 524 | "outputs": [], 525 | "source": [ 526 | "df['Gene Name'] = df['Unnamed: 0'].map(gtf)" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": 22, 532 | "id": "636d86cb", 533 | "metadata": {}, 534 | "outputs": [ 535 | { 536 | "data": { 537 | "text/html": [ 538 | "
\n", 539 | "\n", 552 | "\n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | "
Unnamed: 0baseMeanlog2FoldChangelfcSEstatpvaluepadjGene Name
0ENSG00000160072320.844930-0.4859800.158346-3.0690990.0021470.027228ATAD3B
1ENSG00000069424231.9217121.1425180.2709894.2161000.0000250.000938KCNAB2
2ENSG000001167861218.6330250.3978000.1104443.6018400.0003160.006850PLEKHM2
3ENSG0000011625416.7420052.7236290.7079773.8470570.0001200.003336CHD5
4ENSG00000204138316.084877-0.5353100.146291-3.6592090.0002530.005816PHACTR4
...........................
1447ENSG000001591311000.997666-0.3776180.125794-3.0018840.0026830.031762GART
1448ENSG00000159259114.924454-1.7910300.474113-3.7776410.0001580.004068CHAF1B
1449ENSG0000016029876.770422-1.1898020.372401-3.1949510.0013990.019928C21orf58
1450ENSG00000159055123.604477-0.9460330.322999-2.9289000.0034020.038006MIS18A
1451ENSG000001592001182.5014991.5555950.4708043.3041290.0009530.015224RCAN1
\n", 690 | "

1452 rows × 8 columns

\n", 691 | "
" 692 | ], 693 | "text/plain": [ 694 | " Unnamed: 0 baseMean log2FoldChange lfcSE stat \\\n", 695 | "0 ENSG00000160072 320.844930 -0.485980 0.158346 -3.069099 \n", 696 | "1 ENSG00000069424 231.921712 1.142518 0.270989 4.216100 \n", 697 | "2 ENSG00000116786 1218.633025 0.397800 0.110444 3.601840 \n", 698 | "3 ENSG00000116254 16.742005 2.723629 0.707977 3.847057 \n", 699 | "4 ENSG00000204138 316.084877 -0.535310 0.146291 -3.659209 \n", 700 | "... ... ... ... ... ... \n", 701 | "1447 ENSG00000159131 1000.997666 -0.377618 0.125794 -3.001884 \n", 702 | "1448 ENSG00000159259 114.924454 -1.791030 0.474113 -3.777641 \n", 703 | "1449 ENSG00000160298 76.770422 -1.189802 0.372401 -3.194951 \n", 704 | "1450 ENSG00000159055 123.604477 -0.946033 0.322999 -2.928900 \n", 705 | "1451 ENSG00000159200 1182.501499 1.555595 0.470804 3.304129 \n", 706 | "\n", 707 | " pvalue padj Gene Name \n", 708 | "0 0.002147 0.027228 ATAD3B \n", 709 | "1 0.000025 0.000938 KCNAB2 \n", 710 | "2 0.000316 0.006850 PLEKHM2 \n", 711 | "3 0.000120 0.003336 CHD5 \n", 712 | "4 0.000253 0.005816 PHACTR4 \n", 713 | "... ... ... ... 
\n", 714 | "1447 0.002683 0.031762 GART \n", 715 | "1448 0.000158 0.004068 CHAF1B \n", 716 | "1449 0.001399 0.019928 C21orf58 \n", 717 | "1450 0.003402 0.038006 MIS18A \n", 718 | "1451 0.000953 0.015224 RCAN1 \n", 719 | "\n", 720 | "[1452 rows x 8 columns]" 721 | ] 722 | }, 723 | "execution_count": 22, 724 | "metadata": {}, 725 | "output_type": "execute_result" 726 | } 727 | ], 728 | "source": [ 729 | "df" 730 | ] 731 | }, 732 | { 733 | "cell_type": "code", 734 | "execution_count": null, 735 | "id": "3394d1e1", 736 | "metadata": {}, 737 | "outputs": [], 738 | "source": [] 739 | }, 740 | { 741 | "cell_type": "code", 742 | "execution_count": null, 743 | "id": "283a35eb", 744 | "metadata": {}, 745 | "outputs": [], 746 | "source": [] 747 | }, 748 | { 749 | "cell_type": "code", 750 | "execution_count": null, 751 | "id": "a7a56a7e", 752 | "metadata": {}, 753 | "outputs": [], 754 | "source": [] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "execution_count": null, 759 | "id": "675931c3", 760 | "metadata": {}, 761 | "outputs": [], 762 | "source": [] 763 | }, 764 | { 765 | "cell_type": "code", 766 | "execution_count": null, 767 | "id": "62108215", 768 | "metadata": {}, 769 | "outputs": [], 770 | "source": [] 771 | } 772 | ], 773 | "metadata": { 774 | "kernelspec": { 775 | "display_name": "Python 3 (ipykernel)", 776 | "language": "python", 777 | "name": "python3" 778 | }, 779 | "language_info": { 780 | "codemirror_mode": { 781 | "name": "ipython", 782 | "version": 3 783 | }, 784 | "file_extension": ".py", 785 | "mimetype": "text/x-python", 786 | "name": "python", 787 | "nbconvert_exporter": "python", 788 | "pygments_lexer": "ipython3", 789 | "version": "3.10.1" 790 | } 791 | }, 792 | "nbformat": 4, 793 | "nbformat_minor": 5 794 | } 795 | -------------------------------------------------------------------------------- /h5ad_to_seurat.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": 
"markdown", 5 | "id": "d94f83ba", 6 | "metadata": {}, 7 | "source": [ 8 | "My opinion: learn to use scanpy instead\n", 9 | "\n", 10 | "scanpy is great and integrates with machine learning very nicely, eg scVI tools, scArches, etc" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "id": "b1457418", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import scanpy as sc\n", 21 | "from scipy import io" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "id": "8cff35b4", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "!mkdir matrix_files" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "id": "480f32c2", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "adata = sc.read_h5ad('TS_Heart.h5ad')" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 5, 47 | "id": "4564bb37", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "adata = adata.raw.to_adata() #only if adata has RAW saved and thats what you want!!" 
52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 14, 57 | "id": "eb07b27c", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "with open('matrix_files/barcodes.tsv', 'w') as f:\n", 62 | " for item in adata.obs_names:\n", 63 | " f.write(item + '\\n')" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 15, 69 | "id": "0f64c503", 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "with open('matrix_files/features.tsv', 'w') as f:\n", 74 | " for item in ['\\t'.join([x,x,'Gene Expression']) for x in adata.var_names]:\n", 75 | " f.write(item + '\\n')" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 12, 81 | "id": "f375b043", 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "io.mmwrite('matrix_files/matrix', adata.X.T)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 16, 91 | "id": "fbee4699", 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "name": "stdout", 96 | "output_type": "stream", 97 | "text": [ 98 | "barcodes.tsv features.tsv matrix.mtx\r\n" 99 | ] 100 | } 101 | ], 102 | "source": [ 103 | "!ls matrix_files/" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 18, 109 | "id": "8417f9fe", 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "!gzip matrix_files/*" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 19, 119 | "id": "4891515a", 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "barcodes.tsv.gz features.tsv.gz matrix.mtx.gz\r\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "!ls matrix_files/" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 22, 137 | "id": "0afd24f7", 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "adata.obs.to_csv('metadata.csv')" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 23, 147 | 
"id": "9ec92fc7", 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "#example script\n", 152 | "#to run:\n", 153 | "#mkdir \n", 154 | "#python name_of_this_script.py \n", 155 | "#gzip /*\n", 156 | "import scanpy as sc\n", 157 | "from scipy import io\n", 158 | "import sys\n", 159 | "\n", 160 | "adata = sc.read_h5ad(sys.argv[1])\n", 161 | "out_dir = sys.argv[2]\n", 162 | "\n", 163 | "adata = adata.raw.to_adata() #only if adata has RAW saved and thats what you want!!\n", 164 | "\n", 165 | "with open(out_dir + '/barcodes.tsv', 'w') as f:\n", 166 | " for item in adata.obs_names:\n", 167 | " f.write(item + '\\n')\n", 168 | " \n", 169 | "with open(out_dir + '/features.tsv', 'w') as f:\n", 170 | " for item in ['\\t'.join([x,x,'Gene Expression']) for x in adata.var_names]:\n", 171 | " f.write(item + '\\n')\n", 172 | " \n", 173 | "io.mmwrite(out_dir +'/matrix', adata.X.T)\n", 174 | "\n", 175 | "adata.obs.to_csv(sys.argv[1] + '.metadata.csv')" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "id": "6bbf5690", 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [] 185 | } 186 | ], 187 | "metadata": { 188 | "kernelspec": { 189 | "display_name": "Python 3 (ipykernel)", 190 | "language": "python", 191 | "name": "python3" 192 | }, 193 | "language_info": { 194 | "codemirror_mode": { 195 | "name": "ipython", 196 | "version": 3 197 | }, 198 | "file_extension": ".py", 199 | "mimetype": "text/x-python", 200 | "name": "python", 201 | "nbconvert_exporter": "python", 202 | "pygments_lexer": "ipython3", 203 | "version": "3.9.12" 204 | } 205 | }, 206 | "nbformat": 4, 207 | "nbformat_minor": 5 208 | } 209 | -------------------------------------------------------------------------------- /high_quality_barplots.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "b173b621", 7 | "metadata": {}, 8 | "outputs": 
[], 9 | "source": [ 10 | "import matplotlib.pyplot as plt\n", 11 | "import seaborn as sns" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 68, 17 | "id": "569d4896", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "a = [67, 56, 62, 70, 89]\n", 22 | "b = [42, 55, 61, 51, 62]\n", 23 | "\n", 24 | "vals = a + b\n", 25 | "groups = ['Gene_a']*5 + ['Control']*5" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 27, 31 | "id": "280b161c", 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/plain": [ 37 | "MannwhitneyuResult(statistic=22.5, pvalue=0.046532985074510584)" 38 | ] 39 | }, 40 | "execution_count": 27, 41 | "metadata": {}, 42 | "output_type": "execute_result" 43 | } 44 | ], 45 | "source": [ 46 | "from scipy import stats\n", 47 | "stats.mannwhitneyu(a,b)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 69, 53 | "id": "d37f53ce", 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAJgAAAEyCAYAAAAGMhkdAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAawUlEQVR4nO2deZxcVZXHv50mIekOiAQQBIUEk4AEBXIGcRsQlWWQAQZkUVEkIQ4uQIwLoEhYRFARBEW2YABlERC3zCiOCwYU5JDEIQyfNDFhT0hYk04H0umu+ePcl65UqrtfJXXrvao638+nP139lqr7un7v3HPPPfedlkKhgOPEYkjWDXAaGxeYExUXmBMVF5gTFReYExUXmBMVF5gTFRdYDRCRmSLSlAFHF1gERGSoiOw/wP4dRWT3WrYpKzbLugENyqeBa0TkD8DZyUYR2Ro4E/g8MBd4bzbNqx0usDjcDLQBXwEeBFaE7YuAkWH/+dk0rba0+FxkPERkK+D3gIRNrwNHqupvM2tUjXGBRUBEhgGTsO5xJ8yCbRl+twE3Aheo6pOZNbJGuJMfh08DVwGPA/sBd4ftY4ArgE8At2bTtNriAovDDcCHVPVAVX0w2aiqL6rqNGAscEpmrash7uRHQFW7gT8MsP/pGjYnU9wHc6LiXaQTFReYE5Wm9MFEZF/gHGCLrNsSiZVYGOTvWTekKQWGiesjWTeiBhyedQOaVWCJ5XoVmJdhO2KwF/AGcmKdm1VgCfNU9YCsG1FNROTPQL+ZHLXGnXwnKi4wJyouMCcqLjAnKi4wJyq5GEWKyOXY8Hqeqp5Rg4+cV/K7kZhX8jtTcjHZnQyt99lnH6699tqsm+NUTkt/O7yLdKLiAnOi4gJzopILJz8m8+fPZ8aMGaxatWrQY9vb25k0aRITJkyoQcuag4YX2IwZM5g9e3ZF51x22WWRWtN8NLzAEss1cuRIxo0b1+9xHR0ddHZ2prJ0TnoaXmAJ48aNGzAEMmXKFObMmVPDFjUH7uQ7UXGBOVFxgTlRcYE5UXGBOVFxgTlRaZowRUdHB1OmTBlwv1N9Gl5g7e3tAHR2dqaKcyXHO9Wh4QU2adIkgNRzkZMnT47dpKai4QU2YcIEn1vMEHfynai4wJyouMCcqLjAnKi4wJyouMCcqLjAnKi4wJyouMCcqKSK5ItIO3AucCSwI9ADPAncBlysqj0iMhSrzfMprD7PMuBO4BxVXVn9pjv1QFoL9iPgy1gJlCeAl4EJwIVYyTqw8inTgZ2xsnXbAacDvxYRt5RNStov/l/D73tUdQ9MaIlV2llE9sEKPAGcrqq7AUeHv/fHLJ/ThKSd7J6NWaaDRORRrKjmFlixzW/RJy6Au8LvWcBrwHDgYODnpW8qIicBJ2GPbqqYSy+9lAULFlR83jPPPENXVxdtbW3stNNOFZ8/fvx4pk2bVvF5zUhagZ0CFIATgbeHbd3A/wIvAG8pOnYZgKr2isgLmD/21n7edxc24YnICxYs2KS1jJ2dnSxbtmyjz3cGJ63AzsDE9SBwBLAVcC8mvCGY01+Ofp8bFXgivM9e2LPdK2L8+PGVngL0reIebLV3tT+3GRlUYCLSBlwQ/rxLVZ8HnheRe4FjgQ8B1xWdsh2wJDj2o8K2suXrVHUmMHNjn+2+sd1Usop7sNXezqaTxslvo0+I/wLrSgbvGbatAoprUCfO/WGY/0XJfqeJGFRgqvoC8Jfw50dF5J9Y17Z72Hajqj5MX4ng74vIY/Q5+/cBv6hWg536Im2Y4kjgEqAD2B4YAShW+Pw74ZhPAecDTwG7Ys7/lcBhqtpbvSY79UQqJ19VXwbODD/9HdONRfvPrU7TnEbAI+xOVFxgTlRcYE5UXGBOVFxgTlRcYE5UXGBOVFxgTlRcYE5UXGBOVBr+8U0DsXDhwgGfepjgNYw2nqYW2IoVKyrKiPXnjFVOUwsMvIZRbJpeYF7DKC7u5DtRcYE5UXGBOVFpKB8sbflkL7p
QO3IrsI1Ztb1w4UJWrFhR0TlpK4AMdlwxvvK7j9wKbFNXbaclbQWQtMc565NbgSUMA7ZuGWyBuPFioUA3MBQYNcA5zxcKFMLrFgb+J6zFnpkw2HsCvFQosCZVS5uH3Ats65YWDhs6NNWxs7q7WVooMGqQc25as4bu8LoA614PxPYtLRw0SDuSz3f6yL3AYtICvCmFdRwK7N3aGr09jUhTC2wzSG0dnY3D42BOVFxgTlRcYE5UGtIHe6lQYFZ3/2PDtTVsS7PTUAJL3PU1kCpc4OY7Pg0lsL1bW6GnZ724VnehQCdQ/PyoJHi6FtazdEk4Ytsh2Ugv7Vwq1E8ad0MJbNshQzioRBz3dHfzYj/WrIcylq6nZ4P3qBUzZsxg9uzZFZ2T9zTuhhJYORL7VDzlVG5KKZnmSRPVj0ViudKmcS9btoypU6fm2uI1vMASiqecyk0p5WmaJ20a99KlSyvOOKm1xWsagTUiPT329Pg8L1xxgTUAeV644iN1JyqpLZiIjALOwSp97IgVw3oUK34118v5OeVIWy9yFFZGZldsdL8QG3DtHbbNxcr5fQILOT0OjMHK+e0lIgfGeJT5A2vX9huCSEj2v1gU3U+7rT9GtbSw32buXaQh7X/pQkxIzwIfUNXHAUSkFdi8TDm/H4jI4cCv6Cvnt0G1tU3lxUIh9civmw1jXmm3ZcFAaxKSdQJdXV3r/k7WC5SrJFfJuoJqrydIU6uoBatJBFZo9BYR2R0ruHAlcDVwaNEpNSvnN1gKM/TFvIpTo3uwSH4LkKQRVpIaneZzN4WOjo51I7+B6O21TqHceoFyleSyWFeQxoJtC2wdXr8fWB5+dgeuwr6jTMr5pemm7unu5umQg1/a8RVYfwoJ0qVGx6K9vR1gUGEltLa20tPTs16YolwluUqqy1W7klwagRUf8yLWVXZhRUrfDXweK8lXjqjl/NJQbn6yP7JOjZ40aRJA6sj8smXLKgq0ZlFdLo3AlmMJCsOAjmREKCIPYwLbBbi56PialfNLQ7n5ybwyYcKEiiLtU6dOZcGCBf12kaXbEgtZSwYVmKp2BwEcBIwTkZGYBds7HNKBleu7MPx9NPADclLOb3lvL3MrtGBZZVNUSjmL11932N7ezuTJk2vexrSjyK9jFmYU8E9MYLuEfeep6sMicitwAlbO73NYVwoZl/Ob29PD05WMCjPMpqiUchYvb8VW01Zbe0hEDsCs1H7YzX4vJq4/hcM+hcW/PklfOb87ga9nWc6vXDZFOfKQTVHKxuSH5Y3U0UJVfQArn9zf/lyX8xtsAW+esikSNiY/LG94ODrHVJoflsfHfLrA6oA8Z0sMRn14s07d4gJzouICc6LiPlhGpHmCY9osiOLjirelfSJjpVSScdE0AhtstfdLNQ5RVPIEx0qewljpObFpeIFVutq71nkUI9vbGTt6dNl9jy9aRGdXFyPb2hg7ZswG+1d1dbF0+XI6V62it7eXIWEGInndNnz4umOHtLay/bbb0t7WttFtfXzxYjorDIU0vMDynk0xdvRofnTxRWX3nXrm2cydPx/6mYFYunw5K1b2ZaMn+WHJ686QkJiw5267cem552x0W9e1pwIaXmD1lE1RStuIEQB0rlqV6ovdcost6O3p2cDqJZana/XqqO0tR8MLrJ45+fjjAPoVRnEX+s499mDSCcdz5Q0/Zu78+YwdM2adZdwYy1MtXGA5Zo/x4wbs0hLhjB0zZpO6vpjUZ9/h1A0uMCcqLjAnKi4wJyouMCcqPopsAB5fvJhTzzzbXi9atO73um2LF2fWNhdYHTNQILazq2uDbcnxtcQFVseUC8T2N3/ZNmIEk044vuZtdIHllMuuvY6ORdXr2rpWr+bKG3486HHjxoxm6pRTqva5LrCc0rFo8SZN75TrIrPABZZTxo0pn8IzGM8sWcLq1asZMWIEO+2wQ80+tz9cYDmlmt1UlngczImKC8yJigvMiYoLzImKO/l1zKMLOrjhttt
TpUK3jRjByccfxx7jB36EZrVxgdUxN9x2O/c/9FBF59Q689UFVscklmugpW/giz6cTWSgpW+Q7aIPd/KdqLjAnKi4wJyoVOyDicgdwDHhz7tU9Ziw3autORtQkQUTkU/TJ65SbgCmAztjNY22w6qt/ToUZXCakNRfvIjsClwB/A14pmRfabW13bCCDNBXbc1pQtLWi9wM+ClWO+rjwJ9KDsmk2ppjFC/66G9/VqT1wc4F3gV8QlUXi0jp/kyqrTU7lT59J5eLPsTUdBbwE1X9aT+H9VdCI/Nqa43MYE/fKSbPiz4mYDUhjxGRo8K25DF5R4pIJ/CdouNzVW2tkRns6Tt5oJLR3XCgPfwklqk1/P2bouMS5z4X1dacbElTzm8mMLN4m4g8gYUjiuNguay25mRLNSe7c1ltzcmWjRKYqu5SZluuq6052eARdicqLjAnKi4wJyouMCcqLjAnKi4wJyouMCcqLjAnKi4wJyouMCcqLjAnKi4wJyouMCcqLjAnKi4wJyouMCcqLjAnKi4wJyouMCcqLjAnKi4wJyouMCcqLjAnKi4wJyouMCcqLjAnKi4wJyouMCcqLjAnKi4wJyouMCcqLjAnKi4wJyouMCcqLjAnKi4wJyppKn1Mw555Px7YBliOFcQ6X1UfCcd4KT+nLGks2BeAD2CFsJ4AdsRK+v1NRHYJx3gpP6csab7864HRqvoWVR0PTAvb24GjvJSfMxBpKn1cWLLp3qLXr7ORpfzAy/k1AxvTfX0h/H4RuIN+SvlhlT6g/1J+0FfOzyutNSiVVLwdJiI3YY78CuBIVV3Oxpfyg75yfq+mbYdTX6QSmIhsA/wBOBFYAhygqveF3U8VHbpdOH7QUn5ghbZU9QBgXkWtduqGQQUmIrsDDwLvw4Swr6rOLTqkuFSfl/Jz1iNNMay7gTFFx99ZVFL5elW93kv5Of2RRmDDi15PKNmXWCcv5eeUJU2YYpcUx3gpP6csHmV3ouICc6LiAnOi4gJzouICc6LiAnOi4gJzouICc6LiAnOi4gJzouICc6LiAnOi4gJzouICc6LiAnOi4gJzouICc6LiAnOi4gJzouICc6LiAnOi4gJzouICc6LiAnOi4gJzouICc6LiAnOi4gJzouICc6LiAnOi4gJzouICc6LiAnOi4gJzouICc6KS5iHAFSEixwNfAXYHVgN/BM5U1YXV/iwn/1TVgonIJOBWYG+sYEMr9uz8+0Vk+2p+llMfVE1gIjIMuDj8eZeqjsGs2EqsAsjZ1fosp35oKRQKVXkjEXkvVngB4GOqemvYfg/wYaAjlAMsPuckrNrau4FhI0eOZNy4cQB0dHTQ2dlJCxH68UisBQpA8XX0R3J9Q4YMoW348AGPzQtdr71Gb2/vBtc3Z86ce4F5qnpG6TnV/O42qLoWeD78Lld1bRes2hoAnZ2dzJkzZ70DCkB3ddpXM8pdR3/09vbS2dUVuUXVpcz17d/fsdUU2MZUXXsCq7Y2EfPXXgJqMRjYCysh+CqNV4hrL7K5trKfVU2BbVB1reT1BlXXVHUmMLOKbUiFiPwZu+vmhWpvDUPerq2ao8iHsCKlEKquicibgf3CNq+61oRUzYKp6hoRORu4BjhaRBZhNSO3wIpjXTzQ+TVmJvBnrItuNGaSo2ur2igyQUQ+DnwJC1G8Rl+gtaOqH+TUBVUXmOMU43ORTlRcYE5UXGANgoi0iMh0EXl38nfWbQL3wRqCEA76LVby+pfAx1W1S0RaVDXTL7hpLJiIDCl3V+flTt9EXsAC2a8DBwJTAbIWFzSJBRORzVR1bXg9GtgSWK6qz2Xbsk1HRFpVtUdE9gR+AuyJxcA+o6q/F5EhqtqbVfsa3oKFf3AirrOAvwC/B34bsjkQkbr8P4QusCf8fgT4PjabsjNwpohsp6q9WVrpZrFgb8Ui3AeETc8DbwqvJ6rq3MQSZNC8igmCaSlnmUTkGuDTQA9wlapOq3X7iqnLO7cSRGQ8MAs
T1+3Y9NVPsHRuwmsSS5BFGysh3AiFYJnGiMhpInKyiBwcDjkfUGBz4AQROS6cl8m1NbzAgJeBXbG7+QQswXEadu2rgd1F5NLsmlcZiZUVkdOAR4DLgeuBX4nIecArwLnAUmB7YKqIvE1VC1m4Ag3dRSYOrohMADqxu/sTwCLsizkFmzPdDDhSVX+VVVvTElLTLwFOx3K+7gV2AvbBRPV9Vb1ERC4ATgPagN+o6lFZtLehLVjio6jqfGA34FhgOfAu4L+BN2JJjpDTzGwRKW3X9sBHMHEdqqpHAkcBz4R9h4jIG4DLgH9giZxP1qzBJTS0wErYCRPRtlhK0SPAY8BkYDtV/XmGbduAxGdS1bUisrmITBaRMVgYYlfMx3oiHPM08MVw6v7AGFV9CTgLOKRcrnytyOVdG4n/A+YD7wD+I2z7gar+BvriSVk1rpiSuN2bgF8DgnWLS7BQxEhgCtbtg/maKzHL9gqAqt5f04aXoe4smIi0lns9GKr6V+CrwNeAa4EJqvrr8D4tWYmr3OiuSFznACcDbwd+BtwI/BUT0ObAFBE5XUTeieXgbYHF+Z6tSeNTUFdOfrGVCYmNHwSGAb8D7lXVp4rv/qLzys7JhVFVIasplf6i7CIyHLgZSz1fAzyrqrsW7T8J+C6wdcmps7F5yGeiNbpC6kpgACKyBXAd5rAndGM+1XtV9fVSQZUI842q+nIeJoJDe0YD5wEnhRHvIcCj2FrSq7Ab6GHgOOCJRJDhBjsWGAesAm5T1e9mcAkDUjcCC13J5tid+1ngAWxS92zgUMyf/KaqnlNy3pCiL+XEcOzVqvqXGja/LCKyL+ZfbYtZrDdjVvlE4B7s2k7HJrE/Bfy82DoHF2Eo0K6qL5JD6sYHC9ZmG+AIbIncCeHvw4Fe4KfAI2FaCBEZGs7rFZE3i8gPMB/meOwLzQNPYz7Va1h87oNYd/+oqi7HRrv3YTfWlzFfDOjzG1X1tbyKC3IqsDKxn4SxwI6YoH6F3f3LsLm3z2DW6Ssisrmqdof3OhS4G7N6i4H9VfWuuFcwOMGyLsGmroZjC5RfBaap6rzgHz4OfDtsnwicGkaVuUjFSUPuushi30hEPokNvZ8IE9LbYbGf4ZjfNRv4GCa4GzGBnYUFGQtYF/N1LNj4Y2BKlqGI0gGIiGyD+VvdmJUaBdypqscWHTMS6yYvCJsOVdXf1a7Vm0ZuBFbiiG8D3AW8HxtFLQFOVNX7whTI1zBR/RJb8Pt5zH+5E/iCqj4vIjtg00FHASdkbbVKbpyPAs9hfuQ2WJd9GtZNDgfOUNUrRGQcdu1twAzgPlXN0/rSQcmFwEr++e/Bni92JdaljcISBJ/ERkybAzcBR5a8zbdV9cyS990TeEpVX416AQNQcuPsDNyBBU1fwSzwaar6pIh8EBtNvgfzyS7HRHe/qh4sIsNV9bUMLmGTyIXAYF1XcAtwCPYPvhv7B38RmIRZqJmqenIYPZ2I+WMAv1fVv4f3aQV68+CjlIxg34mNBM+gL9MBbGR4TDjmP7Fr3q3oba7IcqpnU8lMYGViVR/GHPdhmMN7tKreHfK5pmKZDy1Yd3d7aZByoCS8LAl+463Y4612wLJOb8As1IewLnGaql4mIu1YvOtYzFKfpaoPZNHualFzgSU5Sf1EsKcDX8CyHGZhKTQ9ofv4MnBQOPRNYRifnJeLoGk5Qt7W5ViGaSswTlUXisiB2Dzie7C5xUNU9eFwzgjgtbxeUyXUVGAl/sg7MAf8ZWCZqt4W7uBbsHSUlcD5qvq9cPxngYsw5/h9IVsgNww07SQivwD+HRuYXKSq3wjbP4tZ512xyfh35M0Cbyo1zaYoEtfX6Bt2E7ZNxJ7Acw7mg4wFThGRh1R1NvALLAB5bzg+N1ar5MbZG3gvdoM8rap/xPzIiZjP+G8i8qCqzsIGK3thAluCdYurN/yE+iW6BZO+ZVVDMB/qfCxWtRQT2b6Y87sCm+r5joh8HvgGNoT/O/BhVV1
Z9J4bTGjXijK+Y/EI+CIsYyPJkFiDhRyuDrnxt2Ixr18CX1XVxSKyD7BjktnRaEQTmIhshTmsWwLXqeorItKGpZPsAuyBreyZhd3Zc4HLVfXmIMY7sC70ElU9K0ojK6RkVDhCVVeH11sBP8Kudwm26GI0ttL6SeBrqnqLiFyPpd+8gi02OT0vVjgWUaaKQiDxr9g//RIsMwAsjrUPJrpvYc/13BHzrfYH9hORqeFLPAvYLw/iKsou7RWRHcQWV3xLRJKH347HRoRPAe9S1SOwUe+z2BrFY8JxZ2I+5FbAwkYXF1TZBwtziN+hLwNgBmaVHhVbrLAlFr3eD1vd8xjwOcyqXQicClwX5hI7wntmGn4o6QJPxMIMW2G+0mIR+QeW4z8qbFsRTv078D3gUuDDIjJWVR8PaTZr85BtWguqIrDQpQ0DpmPiehk4WVV/GfYfDHwTywxYiuUvtWPd4utYN3kw1rVcpaqvJ+8dvtzM7nS15V6bYXOa38B8qG8CVwOdqvqq9C0Ha8FiWNeH85Lnkz+GXTfJIKVZqEoXGazLDpjPtAaYXCSuH2IrePbBRoYdmNAK2ET1/Zi47gEOV9V/VKNNVUaAT2LXdqyqnqOqz2IjRbDr68G6+8+KyBSxxyidEvb/GVgldbCwt9pUs4s8GhPQMmAOgIgci+VtPYdN9RyK+SU3Y1/KQdiX9FtVvTGck6fFF4lTfyo2MLkfW+DaisW8esOIdkEIqP4QCztcXfQ2dwIXNlp8Ky3VFFg7oZIK8E5s9DQL87H+C1tN/XXMEjynqtOBK0RkmKqugXyJC9abbdgq/B4WuuziNiZWaRaWFTEReBt241yjqj+uQVNzSzVHkU/DutJCh4rIzqq6CiuM9Sq2kmclNpG9IDmpSFyZrewZCBFJ8rQAthGRw8P2lmC9usN843Ss+z8Oc/oPanZxQXWLYbUB/4ONEJ/DrNZ54fVh2PMSJmLBxi+FbM66QEROwdKXC9jysfNV9bGwb0esSzwMy5q9phnCD2mppsBasDnEn2FTHmBxoZVYUBUsLvbF4lFiPRASIG/CUom6sDy1u7AR5STMP/sd9tC3p/p5m6YkRiGG47ApoDFYF/wy5vhPV9XbwzG58rXSIPYAld9QvmrctcDn6u2aakGUqSKxh9K+A1tStRZ4QFVfDvvqTlwJQWRHYNm0T2EPTrlNVf+QZbtyTaFQqMnPxIkTh9Tqs2pwLcPC76FZtyXvP7lJma4nBsr9ctbHBeZEJZcLb53GwQXmRMUF5kTFBeZExQXmRMUF5kTFBeZExQXmROX/AbGgft7b5vaiAAAAAElFTkSuQmCC\n", 59 | "text/plain": [ 60 | "
" 61 | ] 62 | }, 63 | "metadata": { 64 | "needs_background": "light" 65 | }, 66 | "output_type": "display_data" 67 | } 68 | ], 69 | "source": [ 70 | "plt.figure(figsize = (2,4))\n", 71 | "\n", 72 | "ax = sns.barplot(x = groups, y = vals, capsize = 0.5, edgecolor = '0.2', lw = 2.5, errwidth = 2.5, \n", 73 | " palette = ['brown', 'mistyrose'], errcolor = '0.2')\n", 74 | "\n", 75 | "kwargs = {'edgecolor':'0.2', 'linewidth':2.5, 'fc': 'none'}\n", 76 | "\n", 77 | "ax = sns.swarmplot(x = groups, y = vals, marker = 's', s = 10, **kwargs)\n", 78 | "\n", 79 | "prev_lim = ax.get_ylim()[1]\n", 80 | "\n", 81 | "max_val = max(vals)\n", 82 | "plt.plot([0,0,1,1], [max_val+5, max_val+8, max_val+8, max_val+5], lw = 2.5, color = '0.2', \n", 83 | " clip_on = False)\n", 84 | "\n", 85 | "plt.text(x = 0.5, y = max_val+8, s = '*', ha = 'center', size = 20, weight = 'bold', color = '0.2')\n", 86 | "\n", 87 | "\n", 88 | "for axis in ['bottom', 'left']:\n", 89 | " ax.spines[axis].set_linewidth(2.5)\n", 90 | " ax.spines[axis].set_color('0.2')\n", 91 | " \n", 92 | "ax.spines['top'].set_visible(False)\n", 93 | "ax.spines['right'].set_visible(False)\n", 94 | "\n", 95 | "plt.xticks(size = 14, rotation = 35, rotation_mode = 'anchor', ha = 'right', weight = 'bold', color = '0.2')\n", 96 | "plt.yticks(size = 14, weight = 'bold', color = '0.2')\n", 97 | "\n", 98 | "ax.tick_params(width = 2.5, color = '0.2')\n", 99 | "\n", 100 | "plt.ylim(top = prev_lim)\n", 101 | "\n", 102 | "\n", 103 | "plt.savefig('bar_test.svg', bbox_inches = 'tight')\n", 104 | "plt.savefig('bar_test.png', bbox_inches = 'tight', dpi = 250, facecolor = ax.get_facecolor())" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 39, 110 | "id": "dd2b76bc", 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "import pandas as pd" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 40, 120 | "id": "dfc3cee6", 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 
| "a = [67, 56, 62, 70, 89, 45, 37, 50, 60, 34]\n", 125 | "b = [42, 55, 61, 51, 62, 25, 41, 35, 38, 22]\n", 126 | "\n", 127 | "vals = a + b\n", 128 | "gene = ['Gene_a']*10 + ['Control']*10\n", 129 | "\n", 130 | "groups = ['Treated']*5 + ['Untreated']*5 + ['Treated']*5 + ['Untreated']*5\n", 131 | "\n", 132 | "df = pd.DataFrame(zip(vals, gene, groups), columns = ['Value', 'Gene', 'Treatment'])" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 42, 138 | "id": "f8f6280a", 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/html": [ 144 | "
\n", 145 | "\n", 158 | "\n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | "
ValueGeneTreatment
067Gene_aTreated
156Gene_aTreated
262Gene_aTreated
370Gene_aTreated
489Gene_aTreated
\n", 200 | "
" 201 | ], 202 | "text/plain": [ 203 | " Value Gene Treatment\n", 204 | "0 67 Gene_a Treated\n", 205 | "1 56 Gene_a Treated\n", 206 | "2 62 Gene_a Treated\n", 207 | "3 70 Gene_a Treated\n", 208 | "4 89 Gene_a Treated" 209 | ] 210 | }, 211 | "execution_count": 42, 212 | "metadata": {}, 213 | "output_type": "execute_result" 214 | } 215 | ], 216 | "source": [ 217 | "df.head()" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 63, 223 | "id": "db7749d0", 224 | "metadata": {}, 225 | "outputs": [ 226 | { 227 | "data": { 228 | "image/png": "\n", 229 | "text/plain": [ 230 | "
" 231 | ] 232 | }, 233 | "metadata": { 234 | "needs_background": "light" 235 | }, 236 | "output_type": "display_data" 237 | } 238 | ], 239 | "source": [ 240 | "plt.figure(figsize = (4,4))\n", 241 | "\n", 242 | "ax = sns.barplot(data = df, x = 'Gene', y = 'Value', capsize = 0.25, edgecolor = '0.2', lw = 2.5, errwidth = 2.5, \n", 243 | " palette = ['brown', 'mistyrose'], errcolor = '0.2', hue = 'Treatment')\n", 244 | "\n", 245 | "kwargs = {'edgecolor':'0.2', 'linewidth':2.5, 'fc': 'none'}\n", 246 | "\n", 247 | "ax = sns.swarmplot(data = df, x = 'Gene', y = 'Value', hue = 'Treatment',\n", 248 | " dodge = True, marker = 's', s = 10, **kwargs)\n", 249 | "\n", 250 | "prev_lim = ax.get_ylim()[1]\n", 251 | "\n", 252 | "max_val = max(vals)\n", 253 | "\n", 254 | "\n", 255 | "handles, labels = ax.get_legend_handles_labels()\n", 256 | "\n", 257 | "plt.legend(handles[2:], labels[2:], loc = 10, bbox_to_anchor = (0.5, -0.15), \n", 258 | " ncol = 2, frameon = False, fontsize = 14, labelcolor = '0.2', prop = {'weight': 'bold'})\n", 259 | "\n", 260 | "\n", 261 | "\n", 262 | "for axis in ['bottom', 'left']:\n", 263 | " ax.spines[axis].set_linewidth(2.5)\n", 264 | " ax.spines[axis].set_color('0.2')\n", 265 | " \n", 266 | "ax.spines['top'].set_visible(False)\n", 267 | "ax.spines['right'].set_visible(False)\n", 268 | "\n", 269 | "plt.xticks(size = 14, ha = 'center', weight = 'bold', color = '0.2')\n", 270 | "plt.yticks(size = 14, weight = 'bold', color = '0.2')\n", 271 | "\n", 272 | "ax.tick_params(width = 2.5, color = '0.2')\n", 273 | "\n", 274 | "plt.xlabel('')\n", 275 | "plt.ylabel('Values', size = 14, weight = 'bold', color = '0.2')\n", 276 | "\n", 277 | "#plt.ylim(top = prev_lim)\n", 278 | "\n", 279 | "\n", 280 | "plt.savefig('bar_test_big.svg', bbox_inches = 'tight')\n", 281 | "plt.savefig('bar_test_big.png', bbox_inches = 'tight', dpi = 250, facecolor = ax.get_facecolor())" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 67, 287 | "id": "8341a2ef", 
288 | "metadata": {}, 289 | "outputs": [ 290 | { 291 | "name": "stdout", 292 | "output_type": "stream", 293 | "text": [ 294 | "[[-0.2 60.4]\n", 295 | " [-0.2 79.2]]\n", 296 | "[[ 0.8 47.4]\n", 297 | " [ 0.8 60.2]]\n", 298 | "[[ 0.2 37.8 ]\n", 299 | " [ 0.2 53.415]]\n", 300 | "[[ 1.2 25.8]\n", 301 | " [ 1.2 38.6]]\n" 302 | ] 303 | } 304 | ], 305 | "source": [ 306 | "for e in ax.lines:\n", 307 | " xy = e.get_xydata()\n", 308 | " if xy[0,0] == xy[1,0]:\n", 309 | " print(xy)" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "id": "a64e0933", 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [] 319 | } 320 | ], 321 | "metadata": { 322 | "kernelspec": { 323 | "display_name": "Python 3 (ipykernel)", 324 | "language": "python", 325 | "name": "python3" 326 | }, 327 | "language_info": { 328 | "codemirror_mode": { 329 | "name": "ipython", 330 | "version": 3 331 | }, 332 | "file_extension": ".py", 333 | "mimetype": "text/x-python", 334 | "name": "python", 335 | "nbconvert_exporter": "python", 336 | "pygments_lexer": "ipython3", 337 | "version": "3.10.1" 338 | } 339 | }, 340 | "nbformat": 4, 341 | "nbformat_minor": 5 342 | } 343 | -------------------------------------------------------------------------------- /high_quality_lineplots.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 16, 6 | "id": "2dde0ea3", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import seaborn as sns\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "from scipy import stats" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 21, 19 | "id": "a6de9c7a", 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "data": { 24 | "text/html": [ 25 | "
\n", 26 | "\n", 39 | "\n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | "
Tumor sizeWeekTreatment
021Control
121Control
231Control
332Control
442Control
\n", 81 | "
" 82 | ], 83 | "text/plain": [ 84 | " Tumor size Week Treatment\n", 85 | "0 2 1 Control\n", 86 | "1 2 1 Control\n", 87 | "2 3 1 Control\n", 88 | "3 3 2 Control\n", 89 | "4 4 2 Control" 90 | ] 91 | }, 92 | "execution_count": 21, 93 | "metadata": {}, 94 | "output_type": "execute_result" 95 | } 96 | ], 97 | "source": [ 98 | "ctr = [2,2,3,3,4,5,5,6,4,8,6,6,9,11,12,12,15,16,20,25,27]\n", 99 | "drug = [2,3,2,3,4,3,3,4,5,5,6,6,8,7,6,7,8,11,10,11,15]\n", 100 | "\n", 101 | "week = []\n", 102 | "for x in range(1,8): #weeks 1-7\n", 103 | " week += [x,x,x]\n", 104 | "week += week\n", 105 | "\n", 106 | "vals = ctr + drug\n", 107 | "\n", 108 | "labels = ['Control']*21 + ['Drug_z']*21\n", 109 | "\n", 110 | "df = pd.DataFrame(zip(vals, week, labels), columns = ['Tumor size', 'Week', 'Treatment'])\n", 111 | "df.head()" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 50, 117 | "id": "51dfdc8f", 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "image/png": "\n", 123 | "text/plain": [ 124 | "
" 125 | ] 126 | }, 127 | "metadata": { 128 | "needs_background": "light" 129 | }, 130 | "output_type": "display_data" 131 | } 132 | ], 133 | "source": [ 134 | "plt.figure(figsize = (4,4))\n", 135 | "\n", 136 | "err_kws = {'capsize': 5, 'capthick': 2, 'elinewidth':2}\n", 137 | "\n", 138 | "ax = sns.lineplot(data = df, x = 'Week', y = 'Tumor size', hue = 'Treatment', lw = 2.5,\n", 139 | " style = 'Treatment', markers = ['o', '^'], dashes = False, markersize = 8 ,\n", 140 | " err_style = 'bars', err_kws = err_kws, palette = ['gray', 'firebrick'])\n", 141 | "\n", 142 | "\n", 143 | "for axis in ['bottom', 'left']:\n", 144 | " ax.spines[axis].set_linewidth(2.5)\n", 145 | " ax.spines[axis].set_color('0.2')\n", 146 | "\n", 147 | "ax.spines['top'].set_visible(False)\n", 148 | "ax.spines['right'].set_visible(False)\n", 149 | "\n", 150 | "ax.tick_params(width = 2.5, color = '0.2')\n", 151 | "\n", 152 | "plt.xticks(size = 14, weight = 'bold', color = '0.2')\n", 153 | "plt.yticks(size = 14, weight = 'bold', color = '0.2')\n", 154 | "\n", 155 | "ax.set_xlabel(ax.get_xlabel(), fontsize = 14, weight = 'bold', color = '0.2')\n", 156 | "ax.set_ylabel(ax.get_ylabel(), fontsize = 14, weight = 'bold', color = '0.2')\n", 157 | "\n", 158 | "plt.legend(frameon = False, prop = {'weight':'bold', 'size':14}, labelcolor = '0.2')\n", 159 | "\n", 160 | "\n", 161 | "for week in range(1,8):\n", 162 | " z = df[(df.Week == week) & (df.Treatment == 'Drug_z')]['Tumor size'].values\n", 163 | " c = df[(df.Week == week) & (df.Treatment == 'Control')]['Tumor size'].values\n", 164 | " \n", 165 | " p = stats.ttest_ind(z,c).pvalue\n", 166 | " \n", 167 | " max_v = df[df.Week == week]['Tumor size'].max()\n", 168 | " \n", 169 | " if p < 0.05:\n", 170 | " plt.text(x = week- 0.05, y = max_v - 0.5, s = \"*\",\n", 171 | " fontsize = 20, ha = 'center', va = 'bottom', color = '0.2', weight = 'bold')\n", 172 | " else:\n", 173 | " plt.text(x = week, y = max_v, s = \"ns\", fontsize = 12, ha = 'center', va = 'bottom', 
color = '0.2')\n", 174 | "\n", 175 | "\n", 176 | "\n", 177 | "\n", 178 | "plt.savefig('line_test.png', bbox_inches = 'tight', dpi = 250, facecolor = ax.get_facecolor())" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 45, 184 | "id": "212202ce", 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "for week in range(1,8):\n", 189 | " z = df[(df.Week == week) & (df.Treatment == 'Drug_z')]['Tumor size'].values\n", 190 | " c = df[(df.Week == week) & (df.Treatment == 'Control')]['Tumor size'].values\n", 191 | " \n", 192 | " p = stats.ttest_ind(z,c).pvalue\n", 193 | " \n", 194 | " max_v = df[df.Week == week]['Tumor size'].max()\n", 195 | " \n", 196 | " if p < 0.05:\n", 197 | " plt.text(x = week, y = max_v, s = \"*\", fontsize = 20, ha = 'center', va = 'bottom')\n", 198 | " else:\n", 199 | " plt.text(x = week, y = max_v, s = \"ns\", fontsize = 14, ha = 'center', va = 'bottom')" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 46, 205 | "id": "174fc492", 206 | "metadata": {}, 207 | "outputs": [ 208 | { 209 | "data": { 210 | "text/plain": [ 211 | "Ttest_indResult(statistic=-4.6475800154489, pvalue=0.009678951648207292)" 212 | ] 213 | }, 214 | "execution_count": 46, 215 | "metadata": {}, 216 | "output_type": "execute_result" 217 | } 218 | ], 219 | "source": [ 220 | "s" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "id": "f87879ca", 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [] 230 | } 231 | ], 232 | "metadata": { 233 | "kernelspec": { 234 | "display_name": "Python 3 (ipykernel)", 235 | "language": "python", 236 | "name": "python3" 237 | }, 238 | "language_info": { 239 | "codemirror_mode": { 240 | "name": "ipython", 241 | "version": 3 242 | }, 243 | "file_extension": ".py", 244 | "mimetype": "text/x-python", 245 | "name": "python", 246 | "nbconvert_exporter": "python", 247 | "pygments_lexer": "ipython3", 248 | "version": "3.10.1" 249 | } 250 | }, 
251 | "nbformat": 4, 252 | "nbformat_minor": 5 253 | } 254 | -------------------------------------------------------------------------------- /integration_comparison/harmony.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R Notebook" 3 | output: html_notebook 4 | --- 5 | 6 | ```{r} 7 | library(Seurat) 8 | library(harmony) 9 | ``` 10 | 11 | 12 | 13 | ```{r} 14 | files <- list.files('dedif_data/') 15 | files <- grep("2i|Dox", files, value = TRUE) 16 | files <- grep("h5", files, value = TRUE) 17 | files <- grep("C1_", files, value = TRUE) 18 | files 19 | ``` 20 | 21 | 22 | 23 | ```{r} 24 | prep_integration <- function(h5_path){ 25 | #print(h5_path) 26 | day <- sub("^.*_D([^_]*)_Dox.*$", "\\1", h5_path) 27 | day <- sub("^.*_D([^_]*)_2i.*$", "\\1", day) 28 | print(day) 29 | 30 | data <- Read10X_h5(paste0('dedif_data/', h5_path)) 31 | 32 | data <- CreateSeuratObject(data, min.cells = 0, min.features = 300) 33 | 34 | ub <- quantile(data[["nFeature_RNA"]]$nFeature_RNA, probs = 0.97) 35 | data <- data[, data[["nFeature_RNA"]] < ub] 36 | data <- NormalizeData(object = data, verbose = FALSE) 37 | 38 | # 39 | 40 | 41 | data$day <- day 42 | data$Dataset <- h5_path 43 | return(data) 44 | } 45 | 46 | ``` 47 | 48 | 49 | 50 | ```{r} 51 | data_list <- sapply(files, prep_integration) 52 | ``` 53 | 54 | 55 | ```{r} 56 | data <- merge(data_list[[1]], y = data_list[2:length(data_list)]) # first object taken by position, not by a hard-coded file name 57 | ``` 58 | 59 | 60 | ```{r} 61 | data <- FindVariableFeatures(object = data, nfeatures = 2000, verbose = FALSE, selection.method = 'vst') 62 | data <- ScaleData(data, verbose = FALSE) 63 | data <- RunPCA(data, npcs = 40, verbose = FALSE) 64 | ``` 65 | 66 | 67 | ```{r} 68 | start <- Sys.time() 69 | data <- RunHarmony(data, "Dataset") 70 | print( Sys.time() - start ) 71 | ``` 72 | 73 | 9.710755 mins 74 | 75 | ```{r} 76 | data <- RunUMAP(data, reduction = "harmony", dims = 1:40) 77 | ``` 78 | 79 | ```{r} 80 | 
data$dayint <- data[[]]$day 81 | data$dayint <- ifelse(data$dayint == "iPSC", 20, data$dayint) 82 | data$dayint <- as.numeric(data$dayint) 83 | ``` 84 | 85 | 86 | ```{r} 87 | FeaturePlot(data, "dayint") 88 | ``` 89 | 90 | 91 | 92 | 93 | 94 | 95 | ```{r} 96 | files <- list.files('pbmc_cd16/') 97 | files <- grep("MH|new", files, value = TRUE) 98 | files 99 | ``` 100 | 101 | 102 | 103 | ```{r} 104 | prep_integration <- function(h5_path){ 105 | 106 | data <-CreateSeuratObject(counts = Read10X(paste0('pbmc_cd16/', h5_path)), min.cells = 0, min.features = 300) 107 | 108 | 109 | ub <- quantile(data[["nFeature_RNA"]]$nFeature_RNA, probs = 0.97) 110 | data <- data[, data[["nFeature_RNA"]] < ub] 111 | data <- NormalizeData(object = data, verbose = FALSE) 112 | 113 | data <- FindVariableFeatures(object = data, nfeatures = 2000, verbose = FALSE, selection.method = 'vst') 114 | 115 | 116 | data$Sample <- h5_path 117 | return(data) 118 | } 119 | 120 | ``` 121 | 122 | 123 | ```{r} 124 | data_list <- sapply(files, prep_integration) 125 | ``` 126 | 127 | 128 | 129 | 130 | ```{r} 131 | data <- merge(data_list[[1]], y = data_list[2:length(data_list)]) # first object taken by position, not by a hard-coded sample name 132 | ``` 133 | 134 | 135 | 136 | ```{r} 137 | data <- FindVariableFeatures(object = data, nfeatures = 2000, verbose = FALSE, selection.method = 'vst') 138 | data <- ScaleData(data, verbose = FALSE) 139 | data <- RunPCA(data, npcs = 40, verbose = FALSE) 140 | ``` 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | ```{r} 149 | start <- Sys.time() 150 | data <- RunHarmony(data, "Sample") 151 | print( Sys.time() - start ) 152 | ``` 153 | 154 | 2.746183 mins 155 | 156 | 157 | ```{r} 158 | data <- RunUMAP(data, reduction = "harmony", dims = 1:40) 159 | ``` 160 | 161 | 162 | ```{r} 163 | df <- read.csv('pbmc_cd16/cell_types.csv') 164 | df 165 | ``` 166 | 167 | 168 | ```{r} 169 | data$cell_types <- df$initial_clustering[match(rownames(data[[]]), df$covid_index)] 170 | ``` 171 | 172 | 173 | ```{r} 174 | DimPlot(data, group.by = 
'cell_types') 175 | ``` 176 | 177 | 178 | 179 | ```{r} 180 | DimPlot(data[,data$cell_types == "CD16" | data$cell_types == "CD14"], group.by = 'cell_types') 181 | ``` 182 | 183 | 184 | 185 | 186 | Two datasets 187 | 188 | 189 | 190 | ```{r} 191 | prep_integration <- function(h5_path){ 192 | 193 | data <-CreateSeuratObject(counts = Read10X(paste0('two_different/', h5_path)), min.cells = 0, min.features = 300) 194 | 195 | data$Sample <- h5_path 196 | 197 | if (h5_path == 'lung'){ 198 | data$cell_type = 'lung_na' 199 | } 200 | else{ 201 | df <- read.csv('two_different/muscle_meta.csv') 202 | data$cell_type <- df$cell_type 203 | } 204 | 205 | ub <- quantile(data[["nFeature_RNA"]]$nFeature_RNA, probs = 0.97) 206 | data <- data[, data[["nFeature_RNA"]] < ub] 207 | data <- NormalizeData(object = data, verbose = FALSE) 208 | 209 | data <- FindVariableFeatures(object = data, nfeatures = 2000, verbose = FALSE, selection.method = 'vst') 210 | 211 | 212 | 213 | return(data) 214 | } 215 | 216 | ``` 217 | 218 | 219 | 220 | ```{r} 221 | data_list <- c(prep_integration('lung'), prep_integration('muscle')) 222 | ``` 223 | 224 | 225 | ```{r} 226 | data <- merge(prep_integration('lung') , y = prep_integration('muscle')) 227 | ``` 228 | 229 | ```{r} 230 | data 231 | ``` 232 | 233 | ```{r} 234 | data <- FindVariableFeatures(object = data, nfeatures = 2000, verbose = FALSE, selection.method = 'vst') 235 | data <- ScaleData(data, verbose = FALSE) 236 | data <- RunPCA(data, npcs = 40, verbose = FALSE) 237 | ``` 238 | 239 | 240 | 241 | ```{r} 242 | start <- Sys.time() 243 | data <- RunHarmony(data, "Sample") 244 | print( Sys.time() - start ) 245 | ``` 246 | 247 | 28.89322 secs 248 | 249 | 250 | 251 | 252 | ```{r} 253 | data <- RunUMAP(data, reduction = "harmony", dims = 1:40) 254 | ``` 255 | 256 | 257 | 258 | 259 | ```{r} 260 | DimPlot(data, group.by = 'Sample') 261 | ``` 262 | 263 | 264 | 265 | 266 | ```{r} 267 | DimPlot(data, group.by = 'cell_type') 268 | ``` 269 | 270 | 271 | 272 | 
```{r} 273 | DimPlot(data[, data$cell_type == 'lung_na' | data$cell_type == 'Macrophage'], group.by = 'cell_type') 274 | ``` 275 | 276 | 277 | 278 | 279 | ```{r} 280 | DimPlot(data[, data$cell_type == 'lung_na' | data$cell_type == 'Endothelial'], group.by = 'cell_type') 281 | ``` 282 | 283 | 284 | 285 | ```{r} 286 | DimPlot(data[, data$cell_type == 'lung_na' | data$cell_type == 'SkMus_1'], group.by = 'cell_type') 287 | ``` 288 | 289 | 290 | 291 | 292 | -------------------------------------------------------------------------------- /integration_comparison/readme.txt: -------------------------------------------------------------------------------- 1 | comparison of multiple RNA integration methods 2 | -------------------------------------------------------------------------------- /integration_comparison/seurat_cca.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R Notebook" 3 | output: html_notebook 4 | --- 5 | 6 | ```{r} 7 | library(Seurat) 8 | ``` 9 | 10 | 11 | ```{r} 12 | files <- list.files('dedif_data/') 13 | files <- grep("2i|Dox", files, value = TRUE) 14 | files <- grep("h5", files, value = TRUE) 15 | files <- grep("C1_", files, value = TRUE) 16 | files 17 | ``` 18 | 19 | 20 | ########################## CCA ###################################### 21 | 22 | dedif 23 | 24 | ```{r} 25 | prep_integration <- function(h5_path){ 26 | #print(h5_path) 27 | day <- sub("^.*_D([^_]*)_Dox.*$", "\\1", h5_path) 28 | day <- sub("^.*_D([^_]*)_2i.*$", "\\1", day) 29 | print(day) 30 | 31 | data <- Read10X_h5(paste0('dedif_data/', h5_path)) 32 | 33 | data <- CreateSeuratObject(data, min.cells = 0, min.features = 300) 34 | 35 | ub <- quantile(data[["nFeature_RNA"]]$nFeature_RNA, probs = 0.97) 36 | data <- data[, data[["nFeature_RNA"]] < ub] 37 | data <- NormalizeData(object = data, verbose = FALSE) 38 | 39 | data <- FindVariableFeatures(object = data, nfeatures = 2000, verbose = FALSE, selection.method = 'vst') 40 | 41 | 42 
| data$day <- day 43 | return(data) 44 | } 45 | 46 | ``` 47 | 48 | 49 | 50 | ```{r} 51 | data_list <- sapply(files, prep_integration) 52 | ``` 53 | 54 | ```{r} 55 | features <- SelectIntegrationFeatures(object.list = data_list) 56 | ``` 57 | 58 | 59 | ```{r} 60 | start <- Sys.time() 61 | anchors <- FindIntegrationAnchors(object.list = data_list, anchor.features = features, reduction = "cca") 62 | 63 | 64 | data <- IntegrateData(anchorset = anchors) 65 | 66 | print( Sys.time() - start ) 67 | ``` 68 | 69 | 70 | 71 | FAILED, out of memory 72 | 73 | 74 | 75 | 76 | CD16 removed 77 | 78 | ```{r} 79 | files <- list.files('pbmc_cd16/') 80 | files <- grep("MH|new", files, value = TRUE) 81 | files 82 | ``` 83 | 84 | 85 | 86 | ```{r} 87 | prep_integration <- function(h5_path){ 88 | 89 | data <-CreateSeuratObject(counts = Read10X(paste0('pbmc_cd16/', h5_path)), min.cells = 0, min.features = 300) 90 | 91 | 92 | ub <- quantile(data[["nFeature_RNA"]]$nFeature_RNA, probs = 0.97) 93 | data <- data[, data[["nFeature_RNA"]] < ub] 94 | data <- NormalizeData(object = data, verbose = FALSE) 95 | 96 | data <- FindVariableFeatures(object = data, nfeatures = 2000, verbose = FALSE, selection.method = 'vst') 97 | 98 | 99 | data$Sample <- h5_path 100 | return(data) 101 | } 102 | 103 | ``` 104 | 105 | 106 | ```{r} 107 | data_list <- sapply(files, prep_integration) 108 | ``` 109 | 110 | 111 | 112 | ```{r} 113 | features <- SelectIntegrationFeatures(object.list = data_list) 114 | ``` 115 | 116 | 117 | ```{r} 118 | start <- Sys.time() 119 | anchors <- FindIntegrationAnchors(object.list = data_list, anchor.features = features, reduction = "cca") 120 | 121 | 122 | data <- IntegrateData(anchorset = anchors) 123 | 124 | print( Sys.time() - start ) 125 | ``` 126 | 127 | 128 | 1.3 hours 129 | 130 | 131 | 132 | ```{r} 133 | data <- ScaleData(data, verbose = FALSE) 134 | data <- RunPCA(data, npcs = 40, verbose = FALSE) 135 | data <- RunUMAP(data, reduction = "pca", dims = 1:40) 136 | data <- 
FindNeighbors(data, reduction = "pca", dims = 1:40) 137 | data <- FindClusters(data, resolution = 0.5) 138 | 139 | ``` 140 | 141 | 142 | ```{r} 143 | data 144 | ``` 145 | 146 | 147 | ```{r} 148 | df <- read.csv('pbmc_cd16/cell_types.csv') 149 | df 150 | ``` 151 | 152 | 153 | ```{r} 154 | data$cell_types <- df$initial_clustering[match(rownames(data[[]]), df$covid_index)] 155 | ``` 156 | 157 | 158 | ```{r} 159 | DimPlot(data, group.by = 'cell_types') 160 | ``` 161 | 162 | 163 | ```{r} 164 | DimPlot(data[,data$cell_types == "CD16" | data$cell_types == "CD14"], group.by = 'cell_types') 165 | ``` 166 | 167 | 168 | 169 | 170 | 171 | Two datasets 172 | 173 | 174 | 175 | ```{r} 176 | prep_integration <- function(h5_path){ 177 | 178 | data <-CreateSeuratObject(counts = Read10X(paste0('two_different/', h5_path)), min.cells = 0, min.features = 300) 179 | 180 | data$Sample <- h5_path 181 | 182 | if (h5_path == 'lung'){ 183 | data$cell_type = 'lung_na' 184 | } 185 | else{ 186 | df <- read.csv('two_different/muscle_meta.csv') 187 | data$cell_type <- df$cell_type 188 | } 189 | 190 | ub <- quantile(data[["nFeature_RNA"]]$nFeature_RNA, probs = 0.97) 191 | data <- data[, data[["nFeature_RNA"]] < ub] 192 | data <- NormalizeData(object = data, verbose = FALSE) 193 | 194 | data <- FindVariableFeatures(object = data, nfeatures = 2000, verbose = FALSE, selection.method = 'vst') 195 | 196 | 197 | 198 | return(data) 199 | } 200 | 201 | ``` 202 | 203 | 204 | ```{r} 205 | data_list <- c(prep_integration('lung'), prep_integration('muscle')) 206 | ``` 207 | 208 | 209 | 210 | ```{r} 211 | features <- SelectIntegrationFeatures(object.list = data_list) 212 | ``` 213 | 214 | 215 | ```{r} 216 | start <- Sys.time() 217 | anchors <- FindIntegrationAnchors(object.list = data_list, anchor.features = features, reduction = "cca") 218 | 219 | 220 | data <- IntegrateData(anchorset = anchors) 221 | 222 | print( Sys.time() - start ) 223 | ``` 224 | 225 | 3.075165 mins 226 | 227 | 228 | ```{r} 229 | data <- 
ScaleData(data, verbose = FALSE) 230 | data <- RunPCA(data, npcs = 40, verbose = FALSE) 231 | data <- RunUMAP(data, reduction = "pca", dims = 1:40) 232 | data <- FindNeighbors(data, reduction = "pca", dims = 1:40) 233 | data <- FindClusters(data, resolution = 0.5) 234 | 235 | ``` 236 | 237 | 238 | 239 | ```{r} 240 | DimPlot(data, group.by = 'cell_type') 241 | ``` 242 | 243 | ```{r} 244 | DimPlot(data[, data$cell_type == 'lung_na' | data$cell_type == 'Macrophage'], group.by = 'cell_type') 245 | ``` 246 | 247 | ```{r} 248 | DimPlot(data[, data$cell_type == 'lung_na' | data$cell_type == 'Endothelial'], group.by = 'cell_type') 249 | ``` 250 | 251 | 252 | ```{r} 253 | DimPlot(data[, data$cell_type == 'lung_na' | data$cell_type == 'SkMus_1'], group.by = 'cell_type') 254 | ``` 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | ```{r} 263 | FeaturePlot(data, c("Cdh5"), max.cutoff = 2) 264 | ``` 265 | 266 | 267 | ```{r} 268 | FeaturePlot(data, c("Ptprc"), max.cutoff = 2) 269 | ``` 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | -------------------------------------------------------------------------------- /integration_comparison/seurat_rpca.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R Notebook" 3 | output: html_notebook 4 | --- 5 | 6 | 7 | 8 | ```{r} 9 | library(Seurat) 10 | ``` 11 | 12 | 13 | ```{r} 14 | files <- list.files('dedif_data/') 15 | files <- grep("2i|Dox", files, value = TRUE) 16 | files <- grep("h5", files, value = TRUE) 17 | files <- grep("C1_", files, value = TRUE) 18 | files 19 | ``` 20 | 21 | 22 | ########################## rpca ###################################### 23 | 24 | 25 | ```{r} 26 | prep_integration <- function(h5_path){ 27 | #print(h5_path) 28 | day <- sub("^.*_D([^_]*)_Dox.*$", "\\1", h5_path) 29 | day <- sub("^.*_D([^_]*)_2i.*$", "\\1", day) 30 | print(day) 31 | 32 | data <- Read10X_h5(paste0('dedif_data/', h5_path)) 33 | 34 | data <- CreateSeuratObject(data, min.cells = 
0, min.features = 300) 35 | 36 | ub <- quantile(data[["nFeature_RNA"]]$nFeature_RNA, probs = 0.97) 37 | data <- data[, data[["nFeature_RNA"]] < ub] 38 | data <- NormalizeData(object = data, verbose = FALSE) 39 | 40 | data <- FindVariableFeatures(object = data, nfeatures = 2000, verbose = FALSE, selection.method = 'vst') 41 | 42 | 43 | data$day <- day 44 | return(data) 45 | } 46 | 47 | ``` 48 | 49 | 50 | 51 | ```{r} 52 | data_list <- sapply(files, prep_integration) 53 | ``` 54 | 55 | ```{r} 56 | features <- SelectIntegrationFeatures(object.list = data_list) 57 | ``` 58 | 59 | ```{r} 60 | #dont do in cca 61 | scale_pca <- function(x){ 62 | x <- ScaleData(x, features = features, verbose = FALSE) 63 | x <- RunPCA(x, features = features, verbose = FALSE) 64 | return(x) 65 | } 66 | 67 | 68 | data_list <- lapply(X = data_list, scale_pca) 69 | ``` 70 | 71 | 72 | 73 | ```{r} 74 | start <- Sys.time() 75 | anchors <- FindIntegrationAnchors(object.list = data_list, anchor.features = features, reduction = "rpca") 76 | 77 | 78 | data <- IntegrateData(anchorset = anchors) 79 | 80 | print( Sys.time() - start ) 81 | ``` 82 | 83 | 84 | 85 | 86 | ```{r} 87 | data <- ScaleData(data, verbose = FALSE) 88 | data <- RunPCA(data, npcs = 40, verbose = FALSE) 89 | data <- RunUMAP(data, reduction = "pca", dims = 1:40) 90 | data <- FindNeighbors(data, reduction = "pca", dims = 1:40) 91 | data <- FindClusters(data, resolution = 0.5) 92 | 93 | data$dayint <- data[[]]$day 94 | data$dayint <- ifelse(data$dayint == "iPSC", 20, data$dayint) 95 | data$dayint <- as.numeric(data$dayint) 96 | ``` 97 | 98 | ```{r} 99 | data <- readRDS("rpca_dedif.rds") 100 | ``` 101 | 102 | ```{r} 103 | FeaturePlot(data, "dayint") 104 | ``` 105 | 106 | 107 | oops, R crashed and it didn't save notebook.. 
similar enough to above that just reloading object 108 | 109 | ```{r} 110 | data <- readRDS("rpca_cd16.rds") 111 | ``` 112 | 113 | 114 | 115 | 116 | ```{r} 117 | df <- read.csv('pbmc_cd16/cell_types.csv') 118 | data$cell_types <- df$initial_clustering[match(rownames(data[[]]), df$covid_index)] 119 | ``` 120 | 121 | 122 | 123 | ```{r} 124 | DimPlot(data, group.by = 'cell_types') 125 | ``` 126 | 127 | 128 | 129 | ```{r} 130 | DimPlot(data[,data$cell_types == "CD16" | data$cell_types == "CD14"], group.by = 'cell_types') 131 | ``` 132 | 133 | 134 | 135 | 136 | 137 | Two datasets 138 | 139 | 140 | 141 | ```{r} 142 | prep_integration <- function(h5_path){ 143 | 144 | data <-CreateSeuratObject(counts = Read10X(paste0('two_different/', h5_path)), min.cells = 0, min.features = 300) 145 | 146 | data$Sample <- h5_path 147 | 148 | if (h5_path == 'lung'){ 149 | data$cell_type = 'lung_na' 150 | } 151 | else{ 152 | df <- read.csv('two_different/muscle_meta.csv') 153 | data$cell_type <- df$cell_type 154 | } 155 | 156 | ub <- quantile(data[["nFeature_RNA"]]$nFeature_RNA, probs = 0.97) 157 | data <- data[, data[["nFeature_RNA"]] < ub] 158 | data <- NormalizeData(object = data, verbose = FALSE) 159 | 160 | data <- FindVariableFeatures(object = data, nfeatures = 2000, verbose = FALSE, selection.method = 'vst') 161 | 162 | 163 | 164 | return(data) 165 | } 166 | 167 | ``` 168 | 169 | 170 | 171 | ```{r} 172 | data_list <- c(prep_integration('lung'), prep_integration('muscle')) 173 | ``` 174 | 175 | ```{r} 176 | features <- SelectIntegrationFeatures(object.list = data_list) 177 | ``` 178 | 179 | 180 | ```{r} 181 | #dont do in cca 182 | scale_pca <- function(x){ 183 | x <- ScaleData(x, features = features, verbose = FALSE) 184 | x <- RunPCA(x, features = features, verbose = FALSE) 185 | return(x) 186 | } 187 | 188 | 189 | data_list <- lapply(X = data_list, scale_pca) 190 | ``` 191 | 192 | 193 | ```{r} 194 | start <- Sys.time() 195 | anchors <- FindIntegrationAnchors(object.list = 
data_list, anchor.features = features, reduction = "rpca") 196 | 197 | 198 | data <- IntegrateData(anchorset = anchors) 199 | 200 | print( Sys.time() - start ) 201 | ``` 202 | 203 | 34.42784 secs 204 | 205 | 206 | ```{r} 207 | data <- ScaleData(data, verbose = FALSE) 208 | data <- RunPCA(data, npcs = 40, verbose = FALSE) 209 | data <- RunUMAP(data, reduction = "pca", dims = 1:40) 210 | data <- FindNeighbors(data, reduction = "pca", dims = 1:40) 211 | data <- FindClusters(data, resolution = 0.5) 212 | 213 | ``` 214 | 215 | 216 | 217 | ```{r} 218 | DimPlot(data, group.by = 'cell_type') 219 | ``` 220 | 221 | 222 | ```{r} 223 | DimPlot(data[, data$cell_type == 'lung_na' | data$cell_type == 'Macrophage'], group.by = 'cell_type') 224 | ``` 225 | 226 | 227 | 228 | ```{r} 229 | DimPlot(data[, data$cell_type == 'lung_na' | data$cell_type == 'Endothelial'], group.by = 'cell_type') 230 | ``` 231 | 232 | 233 | 234 | 235 | -------------------------------------------------------------------------------- /monocle3_tutorial.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R Notebook" 3 | output: html_notebook 4 | --- 5 | 6 | 7 | ```{r} 8 | BiocManager::install(c('BiocGenerics', 'DelayedArray', 'DelayedMatrixStats', 9 | 'limma', 'lme4', 'S4Vectors', 'SingleCellExperiment', 10 | 'SummarizedExperiment', 'batchelor', 'HDF5Array', 11 | 'terra', 'ggrastr')) 12 | ``` 13 | 14 | ```{r} 15 | devtools::install_github('cole-trapnell-lab/monocle3') 16 | ``` 17 | 18 | ```{r} 19 | install.packages('Seurat') 20 | ``` 21 | 22 | ```{r} 23 | devtools::install_github("satijalab/seurat-wrappers") 24 | ``` 25 | 26 | 27 | 28 | ```{r} 29 | library(monocle3) 30 | library(Seurat) 31 | ``` 32 | 33 | 34 | ```{r} 35 | files <- list.files('data') 36 | files <- grep("2i|Dox", files, value = TRUE) 37 | files <- grep("h5", files, value = TRUE) 38 | files 39 | ``` 40 | 41 | 42 | 43 | ```{r} 44 | input_files <- function(h5_path){ 45 | day <- 
sub("^.*_D([^_]*)_Dox.*$", "\\1", h5_path) 46 | day <- sub("^.*_D([^_]*)_2i.*$", "\\1", day) 47 | 48 | data <- Read10X_h5(paste0('data/', h5_path)) 49 | data <- CreateSeuratObject(data, min.cells = 0, min.features = 200) 50 | data[["percent.mt"]] <- PercentageFeatureSet(data, pattern = "^mt-") # anchored so only feature names that start with mt- are counted 51 | lb <- quantile(data[["nFeature_RNA"]]$nFeature_RNA, probs = 0.02) 52 | ub <- quantile(data[["nFeature_RNA"]]$nFeature_RNA, probs = 0.97) 53 | data <- data[, data[["nFeature_RNA"]] > lb & data[["nFeature_RNA"]] < ub & data[["percent.mt"]] < 15] 54 | 55 | data$day <- day 56 | return(data) 57 | } 58 | ``` 59 | 60 | 61 | ```{r} 62 | data_list <- sapply(files, input_files) 63 | ``` 64 | 65 | 66 | #### This is where you would integrate if you wanted to do so #### 67 | #### Example at the bottom of the notebook ###### 68 | 69 | 70 | ```{r} 71 | data <- merge(data_list[[1]], y = data_list[2:length(data_list)]) # first object taken by position, not by a hard-coded file name 72 | ``` 73 | 74 | 75 | ```{r} 76 | data 77 | ``` 78 | 79 | 80 | ```{r} 81 | data <- NormalizeData(object = data, verbose = FALSE) 82 | data <- FindVariableFeatures(object = data, nfeatures = 2000, verbose = FALSE, selection.method = 'vst') 83 | data <- ScaleData(data, verbose = FALSE) 84 | data <- RunPCA(data, npcs = 30, verbose = FALSE) 85 | data <- FindNeighbors(data, dims = 1:30) 86 | 87 | data <- RunUMAP(data, reduction = "pca", dims = 1:30) 88 | 89 | 90 | data@active.assay = 'RNA' 91 | ``` 92 | 93 | 94 | 95 | ```{r} 96 | DimPlot(data, group.by = c("day")) 97 | ``` 98 | 99 | 100 | ```{r} 101 | data$dayint <- data[[]]$day 102 | data$dayint <- ifelse(data$dayint == "iPSC", 20, data$dayint) 103 | data$dayint <- as.numeric(data$dayint) 104 | ``` 105 | 106 | 107 | 108 | ```{r} 109 | FeaturePlot(data, "dayint") 110 | ``` 111 | 112 | 113 | ```{r} 114 | cds <- SeuratWrappers::as.cell_data_set(data) #change to cds here 115 | ``` 116 | 117 | 118 | ```{r} 119 | cds <- cluster_cells(cds) 120 | ``` 121 | 122 | 123 | ```{r} 124 | plot_cells(cds, 
show_trajectory_graph = FALSE, 125 | color_cells_by = "partition") 126 | ``` 127 | 128 | 129 | 130 | ```{r} 131 | cds <- learn_graph(cds, use_partition = FALSE) #graph learned across all partitions 132 | ``` 133 | 134 | 135 | ```{r} 136 | cds <- order_cells(cds) 137 | ``` 138 | 139 | 140 | 141 | ```{r} 142 | plot_cells(cds, color_cells_by = "pseudotime", label_branch_points=FALSE, label_leaves=FALSE) 143 | ``` 144 | 145 | 146 | ```{r} 147 | rowData(cds)$gene_name <- rownames(cds) 148 | rowData(cds)$gene_short_name <- rowData(cds)$gene_name 149 | ``` 150 | 151 | 152 | 153 | ```{r} 154 | plot_cells(cds, 155 | genes=c('Sox2', 'Nanog', 'Col6a2'), 156 | label_cell_groups=FALSE, 157 | show_trajectory_graph=FALSE, 158 | min_expr = 3) 159 | ``` 160 | 161 | 162 | ```{r} 163 | cds_pt_res <- graph_test(cds, neighbor_graph="principal_graph", cores=8) 164 | ``` 165 | 166 | 167 | ```{r} 168 | cds_pt_res <- readRDS("cds_pt_res.rds") 169 | ``` 170 | 171 | ```{r} 172 | cds_pt_res 173 | ``` 174 | 175 | ```{r} 176 | cds_pt_res <- na.omit(cds_pt_res) 177 | cds_pt_res <- cds_pt_res[cds_pt_res$p_value < 0.05 & cds_pt_res$status == "OK", ] 178 | cds_pt_res 179 | ``` 180 | 181 | 182 | ```{r} 183 | cds_pt_res[order(-cds_pt_res$morans_test_statistic),] 184 | ``` 185 | 186 | 187 | ```{r} 188 | plot_cells(cds, genes=c("Col1a2", "Uba52", "Serpine1", "Dppa5a"), 189 | show_trajectory_graph=FALSE, 190 | label_cell_groups=FALSE, 191 | label_leaves=FALSE) 192 | ``` 193 | 194 | 195 | 196 | ```{r} 197 | cds_subset <- choose_cells(cds) 198 | ``` 199 | 200 | 201 | 202 | ```{r} 203 | cds_subset 204 | ``` 205 | 206 | 207 | ```{r} 208 | cds_subset_pt_res <- graph_test(cds_subset, neighbor_graph="principal_graph", cores=8) 209 | cds_subset_pt_res <- na.omit(cds_subset_pt_res) 210 | cds_subset_pt_res <- cds_subset_pt_res[cds_subset_pt_res$p_value < 0.05 & cds_subset_pt_res$status == "OK", ] 211 | cds_subset_pt_res 212 | ``` 213 | 214 | 215 | 216 | 217 | ```{r} 218 | 
cds_subset_pt_res[order(-cds_subset_pt_res$morans_test_statistic),] 219 | ``` 220 | 221 | 222 | ```{r} 223 | plot_cells(cds_subset, genes=c("Rpl7a", "Eef1a1", "Mgst1", "Lgals1"), 224 | show_trajectory_graph=FALSE, 225 | label_cell_groups=FALSE, 226 | label_leaves=FALSE) 227 | ``` 228 | 229 | 230 | 231 | ```{r} 232 | cds_subset_subset <- cds_subset[rowData(cds_subset)$gene_short_name %in% c("Rpl7a", "Eef1a1", "Mgst1", "Lgals1")] 233 | ``` 234 | 235 | ```{r} 236 | plot_genes_in_pseudotime(cds_subset_subset, 237 | color_cells_by="dayint", 238 | min_expr=0.5) 239 | ``` 240 | 241 | 242 | 243 | ################################ Integration Example ###################### 244 | 245 | ```{r} 246 | features <- SelectIntegrationFeatures(object.list = data_list) 247 | ``` 248 | 249 | ```{r} 250 | 251 | scale_pca <- function(x){ 252 | x <- ScaleData(x, features = features, verbose = FALSE) 253 | x <- RunPCA(x, features = features, verbose = FALSE) 254 | return(x) 255 | } 256 | 257 | 258 | data_list <- lapply(X = data_list, scale_pca) 259 | ``` 260 | 261 | 262 | ```{r} 263 | anchors <- FindIntegrationAnchors(object.list = data_list, anchor.features = features, reduction = "rpca") 264 | saveRDS(anchors, file = "integration_anchors.rds") 265 | 266 | 267 | data <- IntegrateData(anchorset = anchors) 268 | 269 | ``` 270 | 271 | 272 | -------------------------------------------------------------------------------- /python_sequence_alignment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "2bb017e7", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "#conda install -c bioconda mafft \n", 11 | "#!pip install Biopython" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "id": "c64c0ba6", 17 | "metadata": {}, 18 | "source": [ 19 | "Here I convert a SeqIO object to a string then to bytes. 
I then pass this encoded string to a subprocess call of mafft through STDIN. Mafft reads the encoded fasta through STDIN and outputs the aligned fasta through STDOUT. This STDOUT is then decoded back into a python string and read as a new aligned SeqIO object.\n", 20 | "\n", 21 | "I have supplied a function at the bottom that takes unaligned SeqIO objects and returns aligned SeqIO objects" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "id": "146e2ffc", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "from Bio import SeqIO\n", 32 | "import subprocess\n", 33 | "from io import StringIO" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "id": "d0655331", 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "name": "stdout", 44 | "output_type": "stream", 45 | "text": [ 46 | "alignment.ipynb sars_map.ipynb test_500_seqs.fasta\r\n" 47 | ] 48 | } 49 | ], 50 | "source": [ 51 | "!ls" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "id": "22bd34fd", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "seqs = list(SeqIO.parse('test_500_seqs.fasta', 'fasta'))" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 5, 67 | "id": "3e87ed3a", 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/plain": [ 73 | "500" 74 | ] 75 | }, 76 | "execution_count": 5, 77 | "metadata": {}, 78 | "output_type": "execute_result" 79 | } 80 | ], 81 | "source": [ 82 | "len(seqs)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 6, 88 | "id": "ea18c1ac", 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "#This is just a command line example of what is happening in the python code below\n", 93 | "!cat test_500_seqs.fasta | mafft --quiet - > aligned_file.fasta" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 7, 99 | "id": "a2aa430b", 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | 
"name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "aligned_file.fasta alignment.ipynb sars_map.ipynb test_500_seqs.fasta\r\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "!ls" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 8, 117 | "id": "28029134", 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "seq_str = ''\n", 122 | "for seq in seqs:\n", 123 | "    seq_str += '>' + seq.description + '\\n'\n", 124 | "    seq_str += str(seq.seq) + '\\n'" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 10, 130 | "id": "cc16fddc", 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "child = subprocess.Popen(['mafft', '--quiet', '-'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)\n", 135 | "#communicate() sends the fasta to stdin, closes stdin and collects stdout in one call;\n", 136 | "#writing to child.stdin directly can deadlock once the OS pipe buffer fills up\n", 137 | "child_out = child.communicate(input=seq_str.encode())[0].decode('utf8')\n", 138 | "seq_ali = list(SeqIO.parse(StringIO(child_out), 'fasta'))" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 14, 144 | "id": "b530f795", 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "#write the aligned records (seq_ali, not the unaligned input seqs) to file\n", 149 | "with open('aligned_file.fasta', 'w') as f:\n", 150 | "    for seq in seq_ali:\n", 151 | "        f.write( '>' + seq.description + '\\n')\n", 152 | "        f.write(str(seq.seq) + '\\n')\n" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "id": "dae165f9", 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "#in a function. 
takes unaligned SeqIO object and returns aligned SeqIO object\n", 163 | "def align_seqs(seqs):\n", 164 | "    seq_str = ''\n", 165 | "    for seq in seqs:\n", 166 | "        seq_str += '>' + seq.description + '\\n'\n", 167 | "        seq_str += str(seq.seq) + '\\n'\n", 168 | "    child = subprocess.Popen(['mafft', '--quiet', '-'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)\n", 169 | "    #communicate() sends the fasta to stdin, closes stdin and collects stdout in one call;\n", 170 | "    #writing to child.stdin directly can deadlock once the OS pipe buffer fills up\n", 171 | "    child_out = child.communicate(input=seq_str.encode())[0].decode('utf8')\n", 172 | "    seq_ali = list(SeqIO.parse(StringIO(child_out), 'fasta'))\n", 173 | "    return seq_ali" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "id": "afc242a5", 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [] 183 | } 184 | ], 185 | "metadata": { 186 | "kernelspec": { 187 | "display_name": "Python 3 (ipykernel)", 188 | "language": "python", 189 | "name": "python3" 190 | }, 191 | "language_info": { 192 | "codemirror_mode": { 193 | "name": "ipython", 194 | "version": 3 195 | }, 196 | "file_extension": ".py", 197 | "mimetype": "text/x-python", 198 | "name": "python", 199 | "nbconvert_exporter": "python", 200 | "pygments_lexer": "ipython3", 201 | "version": "3.10.1" 202 | } 203 | }, 204 | "nbformat": 4, 205 | "nbformat_minor": 5 206 | } 207 | -------------------------------------------------------------------------------- /salmon_to_deseq.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R Notebook" 3 | output: html_notebook 4 | --- 5 | ######## PREPARE INDEX ####### 6 | ##ref: https://combine-lab.github.io/alevin-tutorial/2019/selective-alignment/ 7 | 8 | curl -O https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/gencode.v44.transcripts.fa.gz 9 | curl -O https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/GRCh38.primary_assembly.genome.fa.gz 10 | 11 | grep "^>" <(gunzip -c GRCh38.primary_assembly.genome.fa.gz) | cut -d " " -f 1 > 
decoys.txt 12 | sed -i -e 's/>//g' decoys.txt 13 | 14 | cat gencode.v44.transcripts.fa.gz GRCh38.primary_assembly.genome.fa.gz > GRCH38_and_decoys.fa.gz 15 | 16 | salmon index -t GRCH38_and_decoys.fa.gz -d decoys.txt -p 30 -i GRCh38_salmon_index --gencode 17 | 18 | 19 | ######## RUN SALMON ON ONE SAMPLE ####### 20 | 21 | #paired 22 | salmon quant -i GRCh38_salmon_index/ -l A -1 path_to_R1.fastq.gz -2 path_to_R2.fastq.gz --validateMappings -o salmon_out/out_directory 23 | 24 | #single-end 25 | salmon quant -i GRCh38_salmon_index/ -l A -r path_to.fastq.gz --validateMappings -o salmon_out/out_directory 26 | 27 | 28 | 29 | ######### START MULTIPLE FILE SALMON SCRIPT ###### 30 | 31 | #!/bin/bash 32 | 33 | # Set the path to the Salmon index 34 | salmon_index="GRCh38_salmon_index" 35 | 36 | # Set the path to the "fastq" folder 37 | fastq_dir="fastq" 38 | 39 | # Loop through all the directories within the "fastq" folder 40 | for dir in "${fastq_dir}"/SRR*; do 41 | # Find the R1 and R2 FASTQ files 42 | r1_file=$(find "$dir" -name "*_1.fastq.gz") 43 | r2_file=$(find "$dir" -name "*_2.fastq.gz") 44 | 45 | # Extract the sample name 46 | samp=$(basename "$dir") 47 | 48 | echo "Processing sample ${samp}" 49 | salmon quant -i "$salmon_index" -l A \ 50 | -1 "$r1_file" \ 51 | -2 "$r2_file" \ 52 | -p 28 --validateMappings -o "salmon_out/${samp}_quant" 53 | done 54 | 55 | ######### END MULTIPLE FILE SALMON SCRIPT ###### 56 | 57 | 58 | #### R CODE BELOW ####### 59 | 60 | ```{r} 61 | library(tximport) 62 | library(ensembldb) 63 | library(AnnotationHub) 64 | library(DESeq2) 65 | ``` 66 | 67 | ```{r} 68 | hub <- AnnotationHub() 69 | ``` 70 | 71 | ```{r} 72 | #make sure to use the right species 73 | ensdb_query <- query(hub, c("EnsDb", "sapiens", "109")) 74 | ensdb_query 75 | ``` 76 | 77 | ```{r} 78 | ensdb_109 <- ensdb_query[['AH109606']] 79 | ``` 80 | 81 | ```{r} 82 | # Extract transcript and gene information 83 | tx_data <- transcripts(ensdb_109, return.type = "DataFrame") 84 | 85 | # 
Create the tx2gene data.frame 86 | tx2gene <- tx_data[, c("tx_id", "gene_id")] 87 | 88 | tx2gene 89 | ``` 90 | 91 | 92 | 93 | 94 | ```{r} 95 | quants_dir <- "salmon_out/" 96 | 97 | quant_files <- list.files(quants_dir, pattern = "quant.sf$", recursive = TRUE, full.names = TRUE) 98 | 99 | # name each quant.sf after its own parent directory so names can never drift out of sync with the files 100 | sample_names <- gsub("_quant$", "", basename(dirname(quant_files))) 101 | 102 | names(quant_files) <- sample_names 103 | 104 | quant_files 105 | ``` 106 | 107 | 108 | ```{r} 109 | txi <- tximport(quant_files, type = "salmon", tx2gene = tx2gene,ignoreTxVersion = TRUE) 110 | ``` 111 | 112 | 113 | ```{r} 114 | sample_names 115 | ``` 116 | 117 | 118 | 119 | ```{r} 120 | condition <- factor(c("KO","KO","KO","KO", "WT","WT","WT","WT")) 121 | coldata <- data.frame(row.names = sample_names, condition) 122 | coldata 123 | ``` 124 | 125 | 126 | ```{r} 127 | dds <- DESeqDataSetFromTximport(txi, coldata, ~condition) 128 | dds 129 | ``` 130 | 131 | ```{r} 132 | dds <- DESeq(dds) 133 | ``` 134 | 135 | ```{r} 136 | vsdata <- vst(dds, blind = FALSE) 137 | plotPCA(vsdata, intgroup = "condition") 138 | ``` 139 | 140 | 141 | ```{r} 142 | res <- results(dds, contrast = c('condition', 'KO', 'WT')) 143 | ``` 144 | 145 | 146 | ```{r} 147 | res 148 | ``` 149 | 150 | 151 | 152 | ```{r} 153 | sigs <- na.omit(res) 154 | sigs <- sigs[sigs$padj < 0.05 & sigs$baseMean > 10, ] 155 | 156 | sigs 157 | ``` 158 | 159 | 160 | 161 | ```{r} 162 | write.csv(counts(dds), "counts.csv") 163 | ``` 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | -------------------------------------------------------------------------------- /sc2024/readme.txt: -------------------------------------------------------------------------------- 1 | Tutorial series on updated single cell workflows 2 | -------------------------------------------------------------------------------- /scATAC_intro_R.Rmd: 
-------------------------------------------------------------------------------- 1 | --- 2 | title: "R Notebook" 3 | output: html_notebook 4 | --- 5 | 6 | 7 | ```{r} 8 | install.packages('hdf5r') #need to read h5 files 9 | ``` 10 | 11 | 12 | ```{r} 13 | install.packages("Signac") #seurat addon for analyzing chromatin 14 | ``` 15 | 16 | ```{r} 17 | install.packages('Seurat') 18 | ``` 19 | 20 | 21 | ```{r} 22 | library(Signac) 23 | library(Seurat) 24 | ``` 25 | 26 | 27 | ```{r} 28 | counts <- Read10X_h5(filename = "GSM5723631_Young_HSC_filtered_peak_bc_matrix.h5") 29 | ``` 30 | 31 | ```{r} 32 | meta <- read.csv( 33 | file = 'GSM5723631_Young_HSC_singlecell.csv.gz', 34 | header = TRUE, 35 | row.names = 1) 36 | ``` 37 | 38 | 39 | ```{r} 40 | chrom_assay <- CreateChromatinAssay( 41 | counts = counts, 42 | sep = c(":", "-"), 43 | genome = 'mm10', 44 | fragments = './GSM5723631_Young_HSC_fragments.tsv.gz', 45 | min.cells = 10, 46 | min.features = 200 47 | ) 48 | ``` 49 | 50 | 51 | ```{r} 52 | data <- CreateSeuratObject( 53 | counts = chrom_assay, 54 | assay = "peaks", 55 | meta.data = meta 56 | ) 57 | ``` 58 | 59 | 60 | ```{r} 61 | data[[]] 62 | ``` 63 | 64 | EnsDb.Hsapiens.v86 for human 65 | ```{r} 66 | if (!require("BiocManager", quietly = TRUE)) 67 | install.packages("BiocManager") 68 | 69 | BiocManager::install("EnsDb.Mmusculus.v79") 70 | BiocManager::install("GenomeInfoDb") #translation between chromosome names 71 | BiocManager::install("biovizBase") 72 | ``` 73 | 74 | ```{r} 75 | library(GenomeInfoDb) 76 | library(EnsDb.Mmusculus.v79) 77 | ``` 78 | 79 | 80 | ```{r} 81 | annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Mmusculus.v79) 82 | seqlevelsStyle(annotations) <- 'UCSC' 83 | ``` 84 | 85 | 86 | 87 | ```{r} 88 | Annotation(data) <- annotations 89 | ``` 90 | 91 | 92 | ```{r} 93 | data <- NucleosomeSignal(object = data) #fragment ratio 147-294: <147 94 | ``` 95 | 96 | ```{r} 97 | data <- TSSEnrichment(object = data, fast = FALSE) 98 | ``` 99 | 100 | ```{r} 101 | 
data$blacklist_ratio <- data$blacklist_region_fragments / data$peak_region_fragments 102 | 103 | #data[[]] 104 | ``` 105 | 106 | 107 | ```{r} 108 | data$pct_reads_in_peaks <- data$peak_region_fragments / data$passed_filters * 100 109 | ``` 110 | 111 | 112 | ```{r} 113 | VlnPlot( 114 | object = data, 115 | features = c('peak_region_fragments', 'pct_reads_in_peaks', 116 | 'blacklist_ratio', 'nucleosome_signal', 'TSS.enrichment'), 117 | pt.size = 0.1, 118 | ncol = 5 119 | ) 120 | ``` 121 | 122 | 123 | 124 | could do this.... 125 | ```{r} 126 | data <- subset( 127 | x = data, 128 | subset = peak_region_fragments > 3000 & 129 | peak_region_fragments < 20000 & 130 | pct_reads_in_peaks > 15 & 131 | blacklist_ratio < 0.05 & 132 | nucleosome_signal < 4 & 133 | TSS.enrichment > 2 134 | ) 135 | ``` 136 | 137 | 138 | 139 | ```{r} 140 | low_prf <- quantile(data[["peak_region_fragments"]]$peak_region_fragments, probs = 0.02) 141 | hig_prf <- quantile(data[["peak_region_fragments"]]$peak_region_fragments, probs = 0.98) 142 | low_prp <- quantile(data[["pct_reads_in_peaks"]]$pct_reads_in_peaks, probs = 0.02) 143 | 144 | high_blr <- quantile(data[["blacklist_ratio"]]$blacklist_ratio, probs = 0.98) 145 | 146 | hig_ns <- quantile(data[["nucleosome_signal"]]$nucleosome_signal, probs = 0.98) 147 | 148 | low_ts <- quantile(data[["TSS.enrichment"]]$TSS.enrichment, probs = 0.02) 149 | ``` 150 | 151 | ```{r} 152 | print(low_prf) 153 | print(hig_prf) 154 | print(low_prp) 155 | print(high_blr) 156 | print(hig_ns) 157 | print(low_ts) 158 | ``` 159 | 160 | ```{r} 161 | data <- subset( 162 | x = data, 163 | subset = peak_region_fragments > low_prf & 164 | peak_region_fragments < hig_prf & 165 | pct_reads_in_peaks > low_prp & 166 | blacklist_ratio < high_blr & 167 | nucleosome_signal < hig_ns & 168 | TSS.enrichment > low_ts 169 | ) 170 | ``` 171 | 172 | 173 | 174 | 175 | 176 | ```{r} 177 | data 178 | ``` 179 | 180 | 181 | Normalization, dimension reduction 182 | 183 | ```{r} 184 | data <- 
RunTFIDF(data) 185 | ``` 186 | 187 | ```{r} 188 | data <- FindTopFeatures(data, min.cutoff = 'q0') 189 | data 190 | ``` 191 | 192 | 193 | ```{r} 194 | data <- RunSVD(data) 195 | ``` 196 | 197 | ```{r} 198 | DepthCor(data) 199 | ``` 200 | 201 | ```{r} 202 | data <- RunUMAP(object = data, reduction = 'lsi', dims = 2:30) 203 | data <- FindNeighbors(object = data, reduction = 'lsi', dims = 2:30) 204 | data <- FindClusters(object = data, verbose = FALSE, algorithm = 3) 205 | DimPlot(object = data, label = TRUE) + NoLegend() 206 | ``` 207 | 208 | 209 | Multiple samples.... 210 | 211 | ```{r} 212 | import_atac <- function(count_path, meta_path, fragment_path){ 213 | counts <- Read10X_h5(filename = count_path) 214 | 215 | meta <- read.csv( 216 | file = meta_path, 217 | header = TRUE, 218 | row.names = 1) 219 | 220 | 221 | 222 | chrom_assay <- CreateChromatinAssay( 223 | counts = counts, 224 | sep = c(":", "-"), 225 | genome = 'mm10', 226 | fragments = fragment_path, 227 | min.cells = 10, 228 | min.features = 200 229 | ) 230 | 231 | data <- CreateSeuratObject( 232 | counts = chrom_assay, 233 | assay = "peaks", 234 | meta.data = meta 235 | ) 236 | 237 | Annotation(data) <- annotations 238 | 239 | 240 | data <- NucleosomeSignal(object = data) #fragment ratio 147-294: <147 --- mononucleosome:nucleosome-free 241 | 242 | 243 | data <- TSSEnrichment(object = data, fast = FALSE) 244 | 245 | data$blacklist_ratio <- data$blacklist_region_fragments / data$peak_region_fragments 246 | 247 | data$pct_reads_in_peaks <- data$peak_region_fragments / data$passed_filters * 100 248 | 249 | low_prf <- quantile(data[["peak_region_fragments"]]$peak_region_fragments, probs = 0.02) 250 | hig_prf <- quantile(data[["peak_region_fragments"]]$peak_region_fragments, probs = 0.98) 251 | low_prp <- quantile(data[["pct_reads_in_peaks"]]$pct_reads_in_peaks, probs = 0.02) 252 | 253 | high_blr <- quantile(data[["blacklist_ratio"]]$blacklist_ratio, probs = 0.98) 254 | 255 | hig_ns <- 
quantile(data[["nucleosome_signal"]]$nucleosome_signal, probs = 0.98) 256 | 257 | low_ts <- quantile(data[["TSS.enrichment"]]$TSS.enrichment, probs = 0.02) 258 | 259 | data <- subset( 260 | x = data, 261 | subset = peak_region_fragments > low_prf & 262 | peak_region_fragments < hig_prf & 263 | pct_reads_in_peaks > low_prp & 264 | blacklist_ratio < high_blr & 265 | nucleosome_signal < hig_ns & 266 | TSS.enrichment > low_ts 267 | ) 268 | 269 | 270 | 271 | #data <- RunTFIDF(data) 272 | #data <- FindTopFeatures(data, min.cutoff = 'q0') 273 | #data <- RunSVD(data) 274 | 275 | return(data) 276 | } 277 | ``` 278 | 279 | 280 | ```{r} 281 | young <- import_atac("GSM5723631_Young_HSC_filtered_peak_bc_matrix.h5", 282 | 'GSM5723631_Young_HSC_singlecell.csv.gz', 283 | './GSM5723631_Young_HSC_fragments.tsv.gz') 284 | 285 | old <- import_atac("GSM5723632_Aged_HSC_filtered_peak_bc_matrix.h5", 286 | 'GSM5723632_Aged_HSC_singlecell.csv.gz', 287 | './GSM5723632_Aged_HSC_fragments.tsv.gz') 288 | ``` 289 | 290 | ```{r} 291 | young$dataset <- "young" 292 | old$dataset <- "old" 293 | ``` 294 | 295 | 296 | ```{r} 297 | data <- merge(young, old) 298 | ``` 299 | 300 | 301 | ```{r} 302 | data 303 | ``` 304 | 305 | 306 | ```{r} 307 | data <- FindTopFeatures(data, min.cutoff = 'q0') 308 | data <- RunTFIDF(data) 309 | data <- RunSVD(data) 310 | data 311 | ``` 312 | 313 | 314 | ```{r} 315 | data <- RunUMAP(object = data, reduction = 'lsi', dims = 2:30) 316 | data <- FindNeighbors(object = data, reduction = 'lsi', dims = 2:30) 317 | ``` 318 | 319 | ```{r} 320 | data <- FindClusters(object = data, verbose = FALSE, algorithm = 3, resolution = .4) 321 | ``` 322 | 323 | ```{r} 324 | DimPlot(object = data, label = TRUE) + NoLegend() 325 | ``` 326 | 327 | ```{r} 328 | DimPlot(object = data, label = TRUE, group.by = "dataset") + NoLegend() 329 | ``` 330 | 331 | Data analysis 332 | 333 | ```{r} 334 | gene.activities <- GeneActivity(data) 335 | ``` 336 | 337 | ```{r} 338 | data[['RNA']] <- 
CreateAssayObject(counts = gene.activities) 339 | 340 | data <- NormalizeData( 341 | object = data, 342 | assay = 'RNA', 343 | normalization.method = 'LogNormalize', 344 | scale.factor = median(data$nCount_RNA) 345 | ) 346 | ``` 347 | 348 | ```{r} 349 | data[['RNA']] 350 | ``` 351 | 352 | 353 | ```{r} 354 | DefaultAssay(data) <- 'RNA' 355 | 356 | 357 | FeaturePlot( 358 | object = data, 359 | features = c('Kit', 'Pecam1', 'Itgam'), 360 | max.cutoff = 'q95' 361 | ) 362 | ``` 363 | 364 | 365 | ```{r} 366 | DefaultAssay(data) <- 'peaks' 367 | 368 | da_peaks <- FindMarkers( 369 | object = data, 370 | ident.1 = rownames(data[[]][data$dataset == "old",]), 371 | ident.2 = rownames(data[[]][data$dataset == "young",]), 372 | min.pct = 0.05, 373 | test.use = 'LR', 374 | latent.vars = 'peak_region_fragments' 375 | ) 376 | 377 | 378 | ``` 379 | 380 | 381 | ```{r} 382 | da_peaks 383 | ``` 384 | 385 | ```{r} 386 | da_peaks$closest_gene <-ClosestFeature(data, regions = rownames(da_peaks))$gene_name 387 | da_peaks$distance <- ClosestFeature(data, regions = rownames(da_peaks))$distance 388 | da_peaks 389 | ``` 390 | 391 | ```{r} 392 | CoveragePlot( 393 | object = data, 394 | region = rownames(da_peaks)[2], 395 | extend.upstream = 10000, 396 | extend.downstream = 5000, 397 | group.by = "dataset" 398 | ) 399 | ``` 400 | 401 | ```{r} 402 | plot1 <- VlnPlot( 403 | object = data, 404 | features = rownames(da_peaks)[2], 405 | group.by = "dataset" 406 | ) 407 | plot2 <- FeaturePlot( 408 | object = data, 409 | features = rownames(da_peaks)[2], 410 | max.cutoff = 'q95' 411 | ) 412 | 413 | plot1 | plot2 414 | ``` 415 | 416 | 417 | 418 | -------------------------------------------------------------------------------- /shifted_transformation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 10, 6 | "id": "ee4391f7", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import 
numpy as np\n", 11 | "import scanpy as sc\n", 12 | "from scipy.sparse import diags\n", 13 | "\n", 14 | "#from scipy.sparse import csr_matrix\n", 15 | "#will only work with sparse, covert prior : adata.X = csr_matrix(adata.X)\n", 16 | "\n", 17 | "def shifted_transformation(adata, y0=1):\n", 18 | " size_factors = adata.X.sum(axis=1) / np.mean(adata.X.sum(axis=1))\n", 19 | "\n", 20 | " adata.X = diags(1 / size_factors.A1).dot(adata.X)\n", 21 | " adata.X.data = np.log(adata.X.data + y0)\n", 22 | " \n", 23 | " return adata\n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "id": "1d135da4", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "adata = sc.datasets.pbmc3k() #loading pbcm toy dataset\n", 34 | "adata = shifted_transformation(adata)" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 11, 40 | "id": "8ff05ad0", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "id": "07b475dc", 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [] 52 | } 53 | ], 54 | "metadata": { 55 | "kernelspec": { 56 | "display_name": "Python 3 (ipykernel)", 57 | "language": "python", 58 | "name": "python3" 59 | }, 60 | "language_info": { 61 | "codemirror_mode": { 62 | "name": "ipython", 63 | "version": 3 64 | }, 65 | "file_extension": ".py", 66 | "mimetype": "text/x-python", 67 | "name": "python", 68 | "nbconvert_exporter": "python", 69 | "pygments_lexer": "ipython3", 70 | "version": "3.10.6" 71 | } 72 | }, 73 | "nbformat": 4, 74 | "nbformat_minor": 5 75 | } 76 | -------------------------------------------------------------------------------- /simpleaf_alevin_fry_tutorial.txt: -------------------------------------------------------------------------------- 1 | ##### setup ##### 2 | conda create -n af -y -c bioconda -c conda-forge simpleaf piscem 3 | export ALEVIN_FRY_HOME="$PWD" 4 | simpleaf set-paths 5 | ulimit -n 4096 6 | 7 | 8 | ##### 
index ##### 9 | gunzip -c fastq/pbmc_1k_v3_S1_L002_R2_001.fastq.gz | head | sed -n '2p' | wc -c 10 | # ***make sure to subtract 1 11 | simpleaf index --output simpleaf_index --fasta genome.fa --gtf genes.gtf --rlen 91 --threads 28 --use-piscem 12 | 13 | 14 | ##### quant ##### 15 | 16 | simpleaf quant --reads1 a_r1.fastq.gz,b_r1.fastq.gz --reads2 a_r2.fastq.gz,b_R2_001.fastq.gz --threads 28 --index simpleaf_index/index --chemistry 10xv3 --resolution cr-like --unfiltered-pl --expected-ori fw --t2g-map simpleaf_index/index/t2g_3col.tsv --output simpleaf_output 17 | 18 | 19 | 20 | %pip install pyroe 21 | 22 | import scanpy as sc 23 | from pyroe import load_fry 24 | 25 | 26 | # all counts summed in .X 27 | adata = load_fry("output_dir/af_quant", output_format = {'X' : ['U','S','A']}) 28 | 29 | # unspliced in unspliced layer 30 | adata = load_fry("output_dir/af_quant", output_format = {'X' : ['S', 'A'],'unspliced' : ['U']}) -------------------------------------------------------------------------------- /single_cell_gene_co-expression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "a664602f", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "'''\n", 11 | "Are two selected genes co-expressed?\n", 12 | "\n", 13 | "Are any genes co-expressed with a selected gene?\n", 14 | "\n", 15 | "'''" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "id": "afc181a0", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import scanpy as sc\n", 26 | "from scipy import stats\n", 27 | "import numpy as np\n", 28 | "import pandas as pd" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "id": "e3b218c6", 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stderr", 39 | "output_type": "stream", 40 | "text": [ 41 | 
"/home/jrlab2019/miniconda3/envs/sc/lib/python3.9/site-packages/scanpy/preprocessing/_normalization.py:170: UserWarning: Received a view of an AnnData. Making a copy.\n", 42 | " view_to_actual(adata)\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "#simple import and preprocessing\n", 48 | "adata = sc.read_10x_mtx('../tutorial_sample/outs/filtered_feature_bc_matrix/')\n", 49 | "sc.pp.filter_cells(adata, min_genes=200)\n", 50 | "sc.pp.filter_genes(adata, min_cells=3)\n", 51 | "adata.var['mt'] = adata.var_names.str.startswith('MT-') \n", 52 | "sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)\n", 53 | "adata = adata[adata.obs.pct_counts_mt < 20]\n", 54 | "sc.pp.normalize_total(adata, target_sum=1e4)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "id": "fb29b19e", 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": [ 66 | "AnnData object with n_obs × n_vars = 8093 × 21949\n", 67 | " obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt'\n", 68 | " var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'" 69 | ] 70 | }, 71 | "execution_count": 3, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "adata" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 4, 83 | "id": "89fb17ac", 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "Index(['AL627309.1', 'AL627309.5', 'LINC01409', 'FAM87B', 'LINC01128',\n", 90 | " 'LINC00115', 'FAM41C', 'AL645608.2', 'AL645608.4', 'LINC02593',\n", 91 | " ...\n", 92 | " 'MT-CYB', 'BX004987.1', 'AC145212.1', 'MAFIP', 'AC011043.1',\n", 93 | " 'AL354822.1', 'AL592183.1', 'AC240274.1', 'AC007325.4', 'zika'],\n", 94 | " dtype='object', length=21949)" 95 | ] 96 | }, 97 | "execution_count": 4, 98 | "metadata": {}, 99 | "output_type": 
"execute_result" 100 | } 101 | ], 102 | "source": [ 103 | "adata.var_names" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 10, 109 | "id": "b060f1a0", 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "i1 = np.where(adata.var_names == 'zika')[0][0]" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 11, 119 | "id": "6e02a079", 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "i2 = np.where(adata.var_names == 'IFITM1')[0][0]" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 9, 129 | "id": "87801fdb", 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "data": { 134 | "text/plain": [ 135 | "(8093, 21949)" 136 | ] 137 | }, 138 | "execution_count": 9, 139 | "metadata": {}, 140 | "output_type": "execute_result" 141 | } 142 | ], 143 | "source": [ 144 | "data = adata.X.toarray()\n", 145 | "data.shape" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 12, 151 | "id": "4a024686", 152 | "metadata": {}, 153 | "outputs": [ 154 | { 155 | "data": { 156 | "text/plain": [ 157 | "array([ 0. , 0.3849559 , 0.47232196, ..., 83.49873 ,\n", 158 | " 0. , 0. 
], dtype=float32)" 159 | ] 160 | }, 161 | "execution_count": 12, 162 | "metadata": {}, 163 | "output_type": "execute_result" 164 | } 165 | ], 166 | "source": [ 167 | "data[:, i1]" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 13, 173 | "id": "c4f5a8ca", 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "data": { 178 | "text/plain": [ 179 | "(-0.004018901858421179, 0.7177328380898428)" 180 | ] 181 | }, 182 | "execution_count": 13, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | } 186 | ], 187 | "source": [ 188 | "stats.pearsonr(data[:, i1], data[:, i2])" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 14, 194 | "id": "4914ceda", 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "out = []\n", 199 | "for gene in adata.var_names:\n", 200 | " i2 = np.where(adata.var_names == gene)[0][0]\n", 201 | " res = stats.pearsonr(data[:, i1], data[:, i2])\n", 202 | " out.append([gene, res[0], res[1]])" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 15, 208 | "id": "8ae63171", 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "data": { 213 | "text/html": [ 214 | "
\n", 215 | "\n", 228 | "\n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | "
generp
0AL627309.10.0077124.878679e-01
1AL627309.5-0.0007739.445847e-01
2LINC014090.0376107.140523e-04
3FAM87B-0.0039927.195206e-01
4LINC01128-0.0085124.438714e-01
............
21944AL354822.1-0.0133472.299292e-01
21945AL592183.10.0045406.830051e-01
21946AC240274.10.1102362.633133e-23
21947AC007325.40.0201177.034826e-02
21948zika1.0000000.000000e+00
\n", 306 | "

21949 rows × 3 columns

\n", 307 | "
" 308 | ], 309 | "text/plain": [ 310 | " gene r p\n", 311 | "0 AL627309.1 0.007712 4.878679e-01\n", 312 | "1 AL627309.5 -0.000773 9.445847e-01\n", 313 | "2 LINC01409 0.037610 7.140523e-04\n", 314 | "3 FAM87B -0.003992 7.195206e-01\n", 315 | "4 LINC01128 -0.008512 4.438714e-01\n", 316 | "... ... ... ...\n", 317 | "21944 AL354822.1 -0.013347 2.299292e-01\n", 318 | "21945 AL592183.1 0.004540 6.830051e-01\n", 319 | "21946 AC240274.1 0.110236 2.633133e-23\n", 320 | "21947 AC007325.4 0.020117 7.034826e-02\n", 321 | "21948 zika 1.000000 0.000000e+00\n", 322 | "\n", 323 | "[21949 rows x 3 columns]" 324 | ] 325 | }, 326 | "execution_count": 15, 327 | "metadata": {}, 328 | "output_type": "execute_result" 329 | } 330 | ], 331 | "source": [ 332 | "df = pd.DataFrame(out, columns = ['gene', 'r', 'p'])\n", 333 | "df" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 16, 339 | "id": "6dd2805b", 340 | "metadata": {}, 341 | "outputs": [ 342 | { 343 | "name": "stderr", 344 | "output_type": "stream", 345 | "text": [ 346 | "/home/jrlab2019/miniconda3/envs/sc/lib/python3.9/site-packages/pandas/core/arraylike.py:397: RuntimeWarning: divide by zero encountered in log10\n", 347 | " result = getattr(ufunc, method)(*inputs, **kwargs)\n" 348 | ] 349 | }, 350 | { 351 | "data": { 352 | "text/html": [ 353 | "
\n", 354 | "\n", 367 | "\n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | "
generpbon-log10_p
0AL627309.10.0077124.878679e-011.070821e+040.311698
1AL627309.5-0.0007739.445847e-012.073269e+040.024759
2LINC014090.0376107.140523e-041.567273e+013.146270
3FAM87B-0.0039927.195206e-011.579276e+040.142957
4LINC01128-0.0085124.438714e-019.742533e+030.352743
..................
21944AL354822.1-0.0133472.299292e-015.046717e+030.638406
21945AL592183.10.0045406.830051e-011.499128e+040.165576
21946AC240274.10.1102362.633133e-235.779463e-1922.579527
21947AC007325.40.0201177.034826e-021.544074e+031.152747
21948zika1.0000000.000000e+000.000000e+00inf
\n", 469 | "

21949 rows × 5 columns

\n", 470 | "
" 471 | ], 472 | "text/plain": [ 473 | " gene r p bon -log10_p\n", 474 | "0 AL627309.1 0.007712 4.878679e-01 1.070821e+04 0.311698\n", 475 | "1 AL627309.5 -0.000773 9.445847e-01 2.073269e+04 0.024759\n", 476 | "2 LINC01409 0.037610 7.140523e-04 1.567273e+01 3.146270\n", 477 | "3 FAM87B -0.003992 7.195206e-01 1.579276e+04 0.142957\n", 478 | "4 LINC01128 -0.008512 4.438714e-01 9.742533e+03 0.352743\n", 479 | "... ... ... ... ... ...\n", 480 | "21944 AL354822.1 -0.013347 2.299292e-01 5.046717e+03 0.638406\n", 481 | "21945 AL592183.1 0.004540 6.830051e-01 1.499128e+04 0.165576\n", 482 | "21946 AC240274.1 0.110236 2.633133e-23 5.779463e-19 22.579527\n", 483 | "21947 AC007325.4 0.020117 7.034826e-02 1.544074e+03 1.152747\n", 484 | "21948 zika 1.000000 0.000000e+00 0.000000e+00 inf\n", 485 | "\n", 486 | "[21949 rows x 5 columns]" 487 | ] 488 | }, 489 | "execution_count": 16, 490 | "metadata": {}, 491 | "output_type": "execute_result" 492 | } 493 | ], 494 | "source": [ 495 | "df['bon'] = df.p * len(df)\n", 496 | "df['-log10_p'] = -np.log10(df.p)\n", 497 | "\n", 498 | "df" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": 18, 504 | "id": "e58c8438", 505 | "metadata": {}, 506 | "outputs": [ 507 | { 508 | "data": { 509 | "text/html": [ 510 | "
\n", 511 | "\n", 524 | "\n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | "
generpbon-log10_p
0zika1.0000000.000000e+000.000000e+00inf
1IFIT20.3190317.119685e-1911.562700e-186190.147539
2OASL0.2991206.186299e-1671.357831e-162166.208569
3IFIT10.2479111.291226e-1132.834112e-109112.888998
4DDX580.2437379.033933e-1101.982858e-105109.044123
..................
693RAE10.0526282.169152e-064.761072e-025.663710
694EXOSC40.0526282.169169e-064.761109e-025.663707
695AC016747.10.0526232.173228e-064.770018e-025.662895
696PPM1L0.0526112.184895e-064.795627e-025.660569
697FUNDC2-0.0525382.256946e-064.953771e-025.646479
\n", 626 | "

698 rows × 5 columns

\n", 627 | "
" 628 | ], 629 | "text/plain": [ 630 | " gene r p bon -log10_p\n", 631 | "0 zika 1.000000 0.000000e+00 0.000000e+00 inf\n", 632 | "1 IFIT2 0.319031 7.119685e-191 1.562700e-186 190.147539\n", 633 | "2 OASL 0.299120 6.186299e-167 1.357831e-162 166.208569\n", 634 | "3 IFIT1 0.247911 1.291226e-113 2.834112e-109 112.888998\n", 635 | "4 DDX58 0.243737 9.033933e-110 1.982858e-105 109.044123\n", 636 | ".. ... ... ... ... ...\n", 637 | "693 RAE1 0.052628 2.169152e-06 4.761072e-02 5.663710\n", 638 | "694 EXOSC4 0.052628 2.169169e-06 4.761109e-02 5.663707\n", 639 | "695 AC016747.1 0.052623 2.173228e-06 4.770018e-02 5.662895\n", 640 | "696 PPM1L 0.052611 2.184895e-06 4.795627e-02 5.660569\n", 641 | "697 FUNDC2 -0.052538 2.256946e-06 4.953771e-02 5.646479\n", 642 | "\n", 643 | "[698 rows x 5 columns]" 644 | ] 645 | }, 646 | "execution_count": 18, 647 | "metadata": {}, 648 | "output_type": "execute_result" 649 | } 650 | ], 651 | "source": [ 652 | "df = df[df.bon < 0.05].sort_values('bon').reset_index(drop = True)\n", 653 | "df" 654 | ] 655 | }, 656 | { 657 | "cell_type": "code", 658 | "execution_count": 20, 659 | "id": "14cacf15", 660 | "metadata": {}, 661 | "outputs": [ 662 | { 663 | "data": { 664 | "image/png": "\n", 665 | "text/plain": [ 666 | "
" 667 | ] 668 | }, 669 | "metadata": { 670 | "needs_background": "light" 671 | }, 672 | "output_type": "display_data" 673 | } 674 | ], 675 | "source": [ 676 | "import seaborn as sns\n", 677 | "import matplotlib.pyplot as plt\n", 678 | "\n", 679 | "plt.figure(figsize = (3,5))\n", 680 | "\n", 681 | "ax = sns.barplot(data = df[1:16], x = '-log10_p', y = 'gene', color = 'grey')\n", 682 | "\n", 683 | "plt.show()\n" 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": null, 689 | "id": "cb162306", 690 | "metadata": {}, 691 | "outputs": [], 692 | "source": [] 693 | }, 694 | { 695 | "cell_type": "code", 696 | "execution_count": null, 697 | "id": "b9e4915d", 698 | "metadata": {}, 699 | "outputs": [], 700 | "source": [] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "execution_count": null, 705 | "id": "c7c9b80d", 706 | "metadata": {}, 707 | "outputs": [], 708 | "source": [] 709 | }, 710 | { 711 | "cell_type": "code", 712 | "execution_count": null, 713 | "id": "3f4d5cd6", 714 | "metadata": {}, 715 | "outputs": [], 716 | "source": [] 717 | }, 718 | { 719 | "cell_type": "code", 720 | "execution_count": null, 721 | "id": "9aad4a5c", 722 | "metadata": {}, 723 | "outputs": [], 724 | "source": [] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": null, 729 | "id": "aeee62a7", 730 | "metadata": {}, 731 | "outputs": [], 732 | "source": [] 733 | }, 734 | { 735 | "cell_type": "code", 736 | "execution_count": null, 737 | "id": "fae68899", 738 | "metadata": {}, 739 | "outputs": [], 740 | "source": [] 741 | }, 742 | { 743 | "cell_type": "code", 744 | "execution_count": null, 745 | "id": "9a14a5f8", 746 | "metadata": {}, 747 | "outputs": [], 748 | "source": [] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": null, 753 | "id": "45697472", 754 | "metadata": {}, 755 | "outputs": [], 756 | "source": [] 757 | } 758 | ], 759 | "metadata": { 760 | "kernelspec": { 761 | "display_name": "Python 3 (ipykernel)", 762 | "language": 
"python", 763 | "name": "python3" 764 | }, 765 | "language_info": { 766 | "codemirror_mode": { 767 | "name": "ipython", 768 | "version": 3 769 | }, 770 | "file_extension": ".py", 771 | "mimetype": "text/x-python", 772 | "name": "python", 773 | "nbconvert_exporter": "python", 774 | "pygments_lexer": "ipython3", 775 | "version": "3.9.12" 776 | } 777 | }, 778 | "nbformat": 4, 779 | "nbformat_minor": 5 780 | } 781 | -------------------------------------------------------------------------------- /single_r.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R Notebook" 3 | output: html_notebook 4 | --- 5 | 6 | ```{r} 7 | if (!require("BiocManager", quietly = TRUE)) 8 | install.packages("BiocManager") 9 | 10 | BiocManager::install("SingleR") 11 | ``` 12 | ```{r} 13 | install.packages('Seurat') 14 | ``` 15 | 16 | ```{r} 17 | #very basic Seurat preprocessing 18 | prep_data <- function(Data.path){ 19 | Raw_data <- Read10X(data.dir = Data.path) 20 | seuset_data <- CreateSeuratObject(counts = Raw_data, min.cells = 3, min.features = 200) 21 | seuset_data[["percent.mt"]] <- PercentageFeatureSet(seuset_data, pattern = "mt-") 22 | lb <- quantile(seuset_data[["nFeature_RNA"]]$nFeature_RNA, probs = 0.01) 23 | ub <- quantile(seuset_data[["nFeature_RNA"]]$nFeature_RNA, probs = 0.99) 24 | seuset_data <- seuset_data[, seuset_data[["nFeature_RNA"]] > lb & seuset_data[["nFeature_RNA"]] < ub & seuset_data[["percent.mt"]] < 15] 25 | seuset_data <- NormalizeData(object = seuset_data, verbose = FALSE) 26 | seuset_data <- FindVariableFeatures(object = seuset_data, nfeatures = 3000, verbose = FALSE, selection.method = 'vst') 27 | seuset_data <- ScaleData(seuset_data, verbose = FALSE) 28 | seuset_data <- RunPCA(seuset_data, npcs = 20, verbose = FALSE) 29 | seuset_data <- FindNeighbors(seuset_data, dims = 1:20) 30 | seuset_data <- FindClusters(seuset_data, resolution = 0.3) 31 | seuset_data <- RunUMAP(seuset_data, reduction = "pca", dims = 
1:20) 32 | return(seuset_data) 33 | } 34 | ``` 35 | 36 | 37 | ```{r} 38 | data <- prep_data("Lung1/outs/filtered_feature_bc_matrix") 39 | ``` 40 | ```{r} 41 | a <- DimPlot(data, reduction = "umap", label=TRUE) 42 | 43 | png("./umap_u.png", res = 250, width = 1500, height = 1500) 44 | 45 | print(a) 46 | dev.off() 47 | #a + b + c 48 | a 49 | ``` 50 | 51 | ```{r} 52 | library(SingleR) 53 | ``` 54 | 55 | ```{r} 56 | ref <- celldex::MouseRNAseqData() 57 | ``` 58 | 59 | ```{r} 60 | results <- SingleR(test = as.SingleCellExperiment(data), ref = ref, labels = ref$label.main) 61 | ``` 62 | 63 | ```{r} 64 | data$singlr_labels <- results$labels 65 | ``` 66 | 67 | ```{r} 68 | DimPlot(data, reduction = 'umap', group.by = 'singlr_labels', label = TRUE) 69 | ``` 70 | 71 | ```{r} 72 | FeaturePlot(data, features = c("Ptprc", "Cd3e")) 73 | ``` 74 | 75 | ```{r} 76 | if (!require("BiocManager", quietly = TRUE)) 77 | install.packages("BiocManager") 78 | 79 | BiocManager::install("scRNAseq") 80 | ``` 81 | ```{r} 82 | if (!require("BiocManager", quietly = TRUE)) 83 | install.packages("BiocManager") 84 | 85 | BiocManager::install("scuttle") 86 | ``` 87 | ```{r} 88 | if (!require("BiocManager", quietly = TRUE)) 89 | install.packages("BiocManager") 90 | 91 | BiocManager::install("TabulaMurisData") 92 | ``` 93 | ```{r} 94 | library(ExperimentHub) 95 | ``` 96 | ```{r} 97 | eh <- ExperimentHub() 98 | ``` 99 | ```{r} 100 | query(eh, "TabulaMurisData") 101 | ``` 102 | ```{r} 103 | eh[['EH1617']] 104 | ``` 105 | 106 | ```{r} 107 | lung_ref <- eh[['EH1617']] 108 | lung_ref <- lung_ref[,lung_ref$tissue == 'Lung'] 109 | lung_ref <- lung_ref[,!is.na(lung_ref$cell_ontology_class)] 110 | ``` 111 | 112 | ```{r} 113 | lung_ref 114 | ``` 115 | 116 | ```{r} 117 | library(scuttle) 118 | ``` 119 | 120 | ```{r} 121 | lung_ref <- logNormCounts(lung_ref) 122 | ``` 123 | 124 | 125 | ```{r} 126 | results <- SingleR(test = as.SingleCellExperiment(data), ref = lung_ref, labels = lung_ref$cell_ontology_class) 127 | 
``` 128 | 129 | 130 | ```{r} 131 | data$singlr_label <- results$labels 132 | ``` 133 | 134 | ```{r} 135 | a <- DimPlot(data, reduction = "umap", group.by = 'singlr_label', label = FALSE) 136 | 137 | png("./umap_l.png", res = 250, width = 2500, height = 1500) 138 | 139 | print(a) 140 | dev.off() 141 | #a + b + c 142 | a 143 | ``` 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /soupX/readme.txt: -------------------------------------------------------------------------------- 1 | R and python scripts for soupX 2 | -------------------------------------------------------------------------------- /soupX/soupX_R_tutorial.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "soupX tutorial" 3 | output: html_notebook 4 | --- 5 | 6 | 7 | install.packages('SoupX') 8 | 9 | ```{r} 10 | library(Seurat) 11 | library(SoupX) 12 | ``` 13 | 14 | 15 | ```{r} 16 | #mad_outlier: logical vector, TRUE where the meta.data column `metric` lies more than nmads MADs from its median 17 | mad_outlier <- function(sobj, metric, nmads){ 18 | M <- sobj@meta.data[[metric]] 19 | median_M <- median(M, na.rm = TRUE) 20 | mad_M <- mad(M, na.rm = TRUE) 21 | outlier <- (M < (median_M - nmads * mad_M)) | (M > (median_M + nmads * mad_M)) 22 | return(outlier) 23 | } 24 | #pp: load one sample's filtered 10x matrix, add QC metrics and drop MAD outliers; returns the filtered Seurat object 25 | pp <- function(sample_id){ 26 | path <- paste0(sample_id, "/outs/filtered_feature_bc_matrix/") 27 | sobj <- Read10X(data.dir = path) 28 | sobj <- CreateSeuratObject(counts = sobj, min.cells = 0, min.features = 200) 29 | sobj$sample_id <- sample_id 30 | 31 | #add QC metrics 32 | sobj$log1p_total_counts <- log1p(sobj@meta.data$nCount_RNA) 33 | sobj$log1p_n_genes_by_counts <- log1p(sobj@meta.data$nFeature_RNA) 34 | sobj[["percent.mt"]] <- PercentageFeatureSet(sobj, pattern = "^mt-") 35 | 36 | #find outliers and subset 37 | bool_vector <- !mad_outlier(sobj, 'log1p_total_counts', 5) & !mad_outlier(sobj, 'log1p_n_genes_by_counts', 5) & !mad_outlier(sobj, 
'percent.mt', 3) 38 | sobj <- subset(sobj, cells = which(bool_vector)) 39 | 40 | return(sobj) 41 | } 42 | ``` 43 | 44 | ```{r} 45 | samples <- c('Lung1', 'Lung2', 'Lung3', 'Lung4', 'Lung6') 46 | ``` 47 | 48 | 49 | ```{r} 50 | data_list <- sapply(samples, pp, simplify = FALSE) #simplify = FALSE always returns a list, named by sample id 51 | ``` 52 | 53 | 54 | ```{r} 55 | get_soup_groups <- function(sobj){ #quick clustering on a local copy; returns the seurat_clusters column 56 | sobj <- NormalizeData(sobj, verbose = FALSE) 57 | sobj <- FindVariableFeatures(object = sobj, nfeatures = 2000, verbose = FALSE, selection.method = 'vst') 58 | sobj <- ScaleData(sobj, verbose = FALSE) 59 | sobj <- RunPCA(sobj, npcs = 20, verbose = FALSE) 60 | sobj <- FindNeighbors(sobj, dims = 1:20, verbose = FALSE) 61 | sobj <- FindClusters(sobj, resolution = 0.5, verbose = FALSE) 62 | 63 | return(sobj@meta.data[['seurat_clusters']]) 64 | 65 | } 66 | ``` 67 | 68 | ```{r} 69 | add_soup_groups <- function(sobj){ #stores the cluster labels as soup_group without keeping the normalized copy 70 | sobj$soup_group <- get_soup_groups(sobj) 71 | return(sobj) 72 | } 73 | data_list <- lapply(data_list, add_soup_groups) 74 | ``` 75 | 76 | ```{r} 77 | data_list$Lung1[[]] 78 | ``` 79 | 80 | ```{r} 81 | make_soup <- function(sobj){ #SoupX correction for one sample; returns sobj with adjusted RNA counts 82 | sample_id <- as.character(sobj$sample_id[1]) #e.g., Lung1 83 | path <- paste0(sample_id, "/outs/raw_feature_bc_matrix/") 84 | raw <- Read10X(data.dir = path) 85 | 86 | sc <- SoupChannel(raw, sobj@assays$RNA@counts) 87 | sc <- setClusters(sc, sobj$soup_group) 88 | sc <- autoEstCont(sc, doPlot = FALSE) 89 | out <- adjustCounts(sc, roundToInt = TRUE) 90 | 91 | #optional keep original 92 | sobj[["original.counts"]] <- CreateAssayObject(counts = sobj@assays$RNA@counts) 93 | 94 | sobj@assays$RNA@counts <- out 95 | 96 | return(sobj) 97 | 98 | } 99 | ``` 100 | 101 | ```{r} 102 | data_list <- lapply(data_list, make_soup) 103 | ``` 104 | 105 | ```{r} 106 | sum(data_list$Lung1@assays$original.counts@counts) 107 | ``` 108 | 109 | ```{r} 110 | sum(data_list$Lung1@assays$RNA@counts)/sum(data_list$Lung1@assays$original.counts@counts) 111 | ``` 112 | 113 | ```{r} 114 | 115 | ``` 116 | 
-------------------------------------------------------------------------------- /test_significance_t_u_shapiro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "a5975753", 6 | "metadata": {}, 7 | "source": [ 8 | "## Three steps that will cover the majority of cases:\n", 9 | "\n", 10 | "### 1) are my data normally distributed? shapiro test\n", 11 | "### 2) if yes: t-test\n", 12 | "### 3) if no: u-test" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "id": "3fe379a1", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "from scipy import stats #the only import you actually need for this\n", 23 | "import numpy as np\n", 24 | "import seaborn as sns\n", 25 | "import pandas as pd\n", 26 | "import matplotlib.pyplot as plt\n", 27 | "%config InlineBackend.print_figure_kwargs={'facecolor' : \"w\"}" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "74cbab56", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "id": "24acca47", 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "id": "06d7bc02", 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "data": { 54 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXgAAAEGCAYAAABvtY4XAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAUO0lEQVR4nO3de3CU9b3H8U+uhIQkQAi3XEiQWxIuuWwEGyBBRbBUpAEBBwVFmko7PToMc2Z0ysCMc06tQxXQlmmoRYVW2iP2IFLtjHECRYm4gjKQAwIm3NJCAiSQhCUk2fNHash9seHZJb/n/Zpxxt3fZp9vZpg3D88+z7N+brfbLQCAcfx9PQAAwBoEHgAMReABwFAEHgAMReABwFCBvh6gpQEDBighIcHXYwBAj1FaWqqKiooO1+6owCckJMjpdPp6DADoMRwOR6drHKIBAEMReAAwFIEHAEMReAAwFIEHAEMReAAwFIEHAEMReAAwFIEHAEMReAAwFIEHAEMReAAwFIEHAEMReAAwFIEHAEMReAAwFIEHAEMReAAwFIEHAEMReAAwFIEHAEMReAAwFIEHAEMReAAwlKWBf+WVV5SSkqKxY8fq0UcflcvlsnJzAIAWLAv8uXPntGHDBjmdTh0+fFgNDQ3atm2bVZsDALRh6R58fX29rl27pvr6etXW1mro0KFWbg4A0IJlgY+JidHKlSsVHx+vIUOGKDIyUg888EC71+Xn58vhcMjhcKi8vNyqcQDAdiwL/OXLl7Vjxw6VlJSorKxMNTU12rp1a7vX5eXlyel0yul0Kjo62qpxAMB2LAv8Rx99pMTEREVHRysoKEi5ubn69NNPrdocAKANywIfHx+voqIi1dbWyu12q6CgQElJSVZtDgDQhmWBnzhxoubNm6f09HSNGzdOjY2NysvLs2pzAIA2/Nxut9vXQ3zL4XDI6XT6egwA6DG66iZXsgKAoQg8ABiKwAOAoQg8ABiKwAOAoQg8ABiKwAOAoQg8ABgq0NcDALCJxkbpZIH0jy+lsGgpeY7Uu6+PhzIbgQdgvapz0h8ekS4cufnch89LP9woJT/su7kMxyEaANZ790et4y5JN2qk7cuky6d8M5MNEHgA1jpfLJ36pOO1hjrpwFvencdGCDwAa10u6Xr90jfemcOGCDwAa/VL7Hq9/3DvzGFDBB6AtQYlS8OyOl4LCJbSF3t3Hhsh8ACsN/d30qCxrZ8LCpPmvi71G+abmWyA0yQBWC9iqPT0XulEi/PgU+ZIIZG+nsxoBB6Ad/j5SSPvb/oPXsEhGgAwFIEHAEMReAAwFIEHAEMReAAwFIEHAEMReAAwFIEHAEMReAAwFIEHAEMReAAwFPeiAeA9Fcelf3zVdLOxhCmSP/uYViLwAKznuiK9myd9/cHN5/oOa7qNcNzdvpvLcPz1CcB6/7u8ddwlqfKUtHWeVF3um5lsgMADsNalEunoro7XrldJB/nSbasQeADWuvB/ktydr58/4rVR7IbAA7BWn0Ee1gd7Zw4bIvAArBWbIQ1M6WTRT0pb5NVx7ITAA7De3E1S2MA2T/pJM/5bGtRZ/NFdnCYJwHqDUqSffSEd+tPNL91OXSQNGOnryYxmaeArKyu1bNkyHT58WH5+fvr973+ve+65x8pNArhThURId//I11PYiqWBf+aZZzRz5ky98847qqurU21trZWbAwC0YFngr1y5oj179uiNN96QJAUHBys4ONiqzQEA2rDsQ9ZvvvlG0dHRevLJJ5WWlqZly5appqam3evy8/PlcDjkcDhUXs4VbQBwu1gW+Pr6eh04cEDLly/XwYMHFRYWphdffLHd6/Ly8uR0OuV0OhUdHW3VOABgO5YFPjY2VrGxsZo4caIkad68eTpw4IBVmwMAtGFZ4AcPHqy4uDgdO3ZMklRQUKDk5GSrNgcAaMPSs2heffVVLVq0SHV1dRo+fLg2b95s5eYAAC1YGvjU1FQ5nU4rNwEA6AS3KgAAQxF4ADA
UgQcAQxF4ADAUgQcAQxF4ADAUgQcAQxF4ADAUgQcAQxF4ADAUgQcAQxF4ADAUgQcAQxF4ADAUgQcAQxF4ADAUgQcAQ3kM/Pnz5/XUU0/pwQcflCQVFxfr9ddft3wwAED3eAz8E088oRkzZqisrEySNGrUKK1bt87quQAA3eQx8BUVFZo/f778/ZteGhgYqICAAMsHAwB0j8fAh4WF6eLFi/Lz85MkFRUVKTIy0vLBAADdE+jpBS+//LJmz56tkydPKisrS+Xl5XrnnXe8MRsAoBs8Bj49PV27d+/WsWPH5Ha7NXr0aAUFBXljNgBAN3gM/FtvvdXq8YEDByRJixcvtmYiAMBt4THwn3/+efP/u1wuFRQUKD09ncADwB3OY+BfffXVVo+rqqr0+OOPWzYQAOD2+M5XsoaGhur48eNWzAIAuI087sE/9NBDzadINjY2qri4WPPnz7d8MABA93gM/MqVK2++ODBQw4YNU2xsrKVDAQC6z2Pgs7OzvTEHAOA26zTw4eHhzYdmWnK73fLz89OVK1csHQyAYS6VSM7XpbIvpbBoKW2RNOJ+X09ltE4Df/XqVW/OAcBkJX+X/jhfulF787kj70qTfiLN/IXv5jKcx0M037pw4YJcLlfz4/j4eEsGAmCYxkZpx09ax/1bRb+RkudI8RO9PpYdeDxN8r333tPIkSOVmJio7OxsJSQkNN8bHgA8Or1Pqjzd+fqhP3lvFpvxGPhVq1apqKhIo0aNUklJiQoKCpSVleWN2QCYwFXlYb3SK2PYkcfABwUFKSoqSo2NjWpsbNS0adP05ZdfemE0AEaISZf8uzgaHDfJe7PYjMdj8H379lV1dbWmTJmiRYsWaeDAgQoMvOVD9wDsLnywlPaY9MUb7dciYqQJC70+kl143IOfOnWqKisrtX79es2cOVN33XWXdu7c6Y3ZAJji+2uliU9Lgb1vPjcsS1qyUwqJ8N1chvO4K+52uzVjxgz1799fCxcu1IIFCxQVFXXLG2hoaJDD4VBMTIzef//9bg0LoIcKCJIe/KWU85x08YQUGiX1T/T1VMbzuAe/evVqHTlyRL/+9a9VVlam7Oxs3X//rV+csH79eiUlJXVrSACG6N1XinUQdy+55btJDhw4UIMHD1ZUVJQuXLhwSz9z9uxZ7dq1S8uWLfu3BwQA/Hs8Bn7jxo3KycnRfffdp4qKCm3atEmHDh26pTd/9tln9dJLL8nf/zvflRgA0E0ej8GfOnVK69atU2pq6nd64/fff18DBw5URkaGCgsLO31dfn6+8vPzJUnl5eXfaRsAgM75ud1utxVv/Nxzz2nLli0KDAyUy+XSlStXlJubq61bt3b6Mw6HQ06n04pxbOfr81d1/UajRg8OV3Ag/4ICTNVVNy0LfEuFhYVau3atx7NoCHz3fXqyQmveO6Kvz1dLkgb06aWf3TtCS76X4NvBAFiiq25yxZJBjv7zip7c/Lmu1zc2P1dRfV2r3zuikCB/LcjkBnGAnXjl3+45OTmcA28Rt9ut6uv1crvd2rSnpFXcW/pN4Ul54R9rgNRQ7+sJ8C/swfdQNxoa9drHJ/SHz06rovq6BvTppfqGjuMuSacu1qr86nUNjAjx4pSwjfo6ae/LknOzVP1Pqf/wpitX786TWn5xUF2NdOmbpgudIob6bl6bIPA91Io/f6WdX5U1P66ovt7l6wP8/dQ7OMDqsWBX//OEdGzXzceXvpE++E/pcmnTF3o01EsfvyB9/rpUd1WSn3TXvdIPXpH6DfPR0Obj9IoeqLjsSqu434r7kwYqPCTIoolga6f2tY57S5/9Vqo6K/3tOemTdf+KuyS5pZMF0ps/kK5Xe2tS2yHwPVDh17d2JfG3hkSG6Oezki2aBrZ3/G+dr7kbpMPvdnwnSanpi0AObbNkLBD4HinoO1wZ3C8sSJufyFRc/1ALJ4Kt+Xk49Fd1Rmqo63z91L7bOw+aEfgeaEbK4FafW3Xlcs0NvbCr2NqBYG9
jZnW+5h8kJUzp+ud7hd/eedCMwPdA8VGh+tGU4bf8+k9OXFRpRY2FE8HWYtKl8Z18aceUFU1/AUTEdv7z4+dbMxcIfE/1/PeT9MqCCUqP76uosGBFhHR9QlRZ1TUvTQZbmrNRevAlKTpJCg6XhqZLuZukac9L/gHS7A1SYAen6DqWSsO+5/15bYLTJHuwH6bF6odpTXtG/7WrWJv+XtLh6wL8/ZQ4IMybo8Fu/P2liT9u+q8jI+6Tfvx3aX++9I+vpLBoKW1R14d30G0E3hALM+P1u70l6uhi1Yz4fhoS2bv9AuBN0aOkWWt9PYWtcIjGEMcvVHcYd0n6Z9U1blMA2BCBN8TOQ51f+HT68jUdOlvlxWkA3AkIvCFcdQ1drl+70fU6APMQeEPcc1dUp2vhvQI1PjbSi9MAuBMQeEPMz4xTTN+OP0jNmzpcocF8ng7YDYE3RERIkP7040manjxIAf5Nl7kOiuiln89K0s/uG+nj6QD4Art1BontF6pNix2qqr2hK64bGhIZosAA/g4H7IrAGygyNEiRodwaGLA7du8AwFDswRumsdGt/aWXVFlbp3GxfTv94BWA+Qi8QfaXXNKKP3+ps5ebbizm7yc9nBqjX+SOU0gQX9cH2A2BN0RZ5TU9uXm/alpc8NTolv5y8JyCA/z1y3njfTgdAF/gGLwh/vjZ6VZxb+kvB895/FJuAOYh8IY4Utb5vWbqGhr19fmrna4DMBOBN0R0eK+u1/t0vQ7APATeEI844jpdS43rq5GD+N5LwG4IvCEyE/rrP+4d0e75QRG9tPaRCT6YCICvcRaNQVY8MFoPpAzW9gNnVVl7Q6lxfZWbHqPwEK5qBeyIwBtmbEykxsZwa2AAHKIBAGMReAAwFIEHAEMReAAwFIEHAEMReAAwFIEHAEMReAAwFIEHAEMReAAwlGWBP3PmjKZNm6akpCSlpKRo/fr1Vm0KANABy+5FExgYqF/96ldKT0/X1atXlZGRoenTpys5OdmqTQIAWrBsD37IkCFKT0+XJIWHhyspKUnnzp2zanMAgDa8cjfJ0tJSHTx4UBMnTmy3lp+fr/z8fElSeXm5N8YBAFuw/EPW6upqzZ07V+vWrVNERES79by8PDmdTjmdTkVHR1s9DgDYhqWBv3HjhubOnatFixYpNzfXyk0BANqwLPBut1tPPfWUkpKStGLFCqs2AwDohGWB/+STT7RlyxZ9/PHHSk1NVWpqqv76179atTkAQBuWfcg6efJkud1uq94eAOABV7ICgKEIPAAYisADgKEIPAAYisADgKEIPAAYisADgKEIPAAYisADgKEIPAAYisADgKEIPAAYisADgKEIPAAYisADgKEIPAAYisADgKEIPAAYisADgKEIPAAYisADgKEIPAAYisADgKEIPAAYisADgKEIPAAYisADgKEIPAAYisADgKEIPAAYisADgKEIPAAYisADgKEIPAAYisADgKEIPAAYisADgKEIPAAYytLAf/jhhxo9erRGjBihF1980cpNAQDasCzwDQ0N+ulPf6oPPvhAxcXFevvtt1VcXGzV5gAAbVgW+P3792vEiBEaPny4goODtXDhQu3YscOqzQEA2gi06o3PnTunuLi45sexsbH67LPP2r0uPz9f+fn5kqSjR4/K4XBYNZKtlJeXKzo62tdjAB3iz+ftU1pa2umaZYF3u93tnvPz82v3XF5envLy8qwaw7YcDoecTqevxwA6xJ9P77DsEE1sbKzOnDnT/Pjs2bMaOnSoVZsDALRhWeAzMzN1/PhxlZSUqK6uTtu2bdPs2bOt2hwAoA3LDtEEBgbqtdde04wZM9TQ0KClS5cqJSXFqs2hDQ574U7Gn0/v8HN3dLAcANDjcSUrABiKwAOAoQg8AK8oLS3V2LFjfT2GrRB4ADAUgTfQnDlzlJGRoZSUlOarhIE7QX19vZYsWaLx48dr3rx5qq2t9fVIRuMsGgNdunRJ/fv317Vr15SZmandu3c
rKirK12PB5kpLS5WYmKi9e/cqKytLS5cuVXJyslauXOnr0YzFHryBNmzYoAkTJmjSpEk6c+aMjh8/7uuRAElSXFycsrKyJEmPPfaY9u7d6+OJzGbZhU7wjcLCQn300Ufat2+fQkNDlZOTI5fL5euxAEnt70fV0f2pcPuwB2+Yqqoq9evXT6GhoTp69KiKiop8PRLQ7PTp09q3b58k6e2339bkyZN9PJHZCLxhZs6cqfr6eo0fP16rVq3SpEmTfD0S0CwpKUlvvvmmxo8fr0uXLmn58uW+HslofMgKAIZiDx4ADEXgAcBQBB4ADEXgAcBQBB4ADEXgAcBQBB5oo76+3tcjALcFgYftvPDCCxozZoymT5+uRx99VGvXrlVOTo6ef/55ZWdna/369SooKFBaWprGjRunpUuX6vr165KkhIQEVVRUSJKcTqdycnIkSWvWrNHjjz+ue++9VyNHjtSmTZt89esBzbgXDWzF6XRq+/btOnjwoOrr65Wenq6MjAxJUmVlpXbv3i2Xy6WRI0eqoKBAo0aN0uLFi7Vx40Y9++yzXb73oUOHVFRUpJqaGqWlpWnWrFkaOnSoF34roGPswcNW9u7dq4cffli9e/dWeHi4Hnrooea1BQsWSJKOHTumxMREjRo1SpK0ZMkS7dmzx+N7f/u+AwYM0LRp07R//35rfgngFhF42EpXd+YICwvz+JrAwEA1NjZKUru7dHKnRNxpCDxsZfLkydq5c6dcLpeqq6u1a9eudq8ZM2aMSktLdeLECUnSli1blJ2dLanpGPwXX3whSdq+fXurn9uxY4dcLpcuXryowsJCZWZmWvzbAF0j8LCVzMxMzZ49WxMmTFBubq4cDociIyNbvSYkJESbN2/WI488onHjxsnf319PP/20JGn16tV65plnNGXKFAUEBLT6ubvvvluzZs3SpEmTtGrVKo6/w+e4myRsp7q6Wn369FFtba2mTp2q/Px8paend+s916xZoz59+vD1c7ijcBYNbCcvL0/FxcVyuVxasmRJt+MO3KnYgwcAQ3EMHgAMReABwFAEHgAMReABwFAEHgAM9f8Ule/46Sio0AAAAABJRU5ErkJggg==\n", 55 | "text/plain": [ 56 | "
" 57 | ] 58 | }, 59 | "metadata": { 60 | "needs_background": "light" 61 | }, 62 | "output_type": "display_data" 63 | } 64 | ], 65 | "source": [ 66 | "#generate normally distributed values\n", 67 | "group_a = np.random.normal(4, 1, 5) #mu, sigma, n\n", 68 | "group_b = np.random.normal(5, 1, 5) #mu, sigma, n\n", 69 | "df = pd.DataFrame(np.concatenate((np.vstack((group_a, np.array(['a']*len(group_a)))).T,\n", 70 | " np.vstack((group_b, np.array(['b']*len(group_a)))).T)), columns = ['value', 'group'])\n", 71 | "df['value'] = df['value'].astype('float')\n", 72 | "ax = sns.swarmplot(data = df, x = 'group', y = 'value', s = 8)\n", 73 | "plt.ylim(bottom = 0, top = df.value.max() + 2)\n", 74 | "plt.show()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 3, 80 | "id": "0db544c9", 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "Ttest_indResult(statistic=-2.141696254533902, pvalue=0.0646055063603198)" 87 | ] 88 | }, 89 | "execution_count": 3, 90 | "metadata": {}, 91 | "output_type": "execute_result" 92 | } 93 | ], 94 | "source": [ 95 | "stats.ttest_ind(group_a, group_b)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 4, 101 | "id": "52f5d905", 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "data": { 106 | "text/plain": [ 107 | "Ttest_indResult(statistic=-4.273273869671518, pvalue=0.0016285682897101618)" 108 | ] 109 | }, 110 | "execution_count": 4, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "stats.ttest_ind([1,3,4,3,2], [4,5,6,6,5,4,5]) #input can just be list of numbers" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 5, 122 | "id": "d23231ee", 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "data": { 127 | "text/plain": [ 128 | "ShapiroResult(statistic=0.9919436573982239, pvalue=0.9807631969451904)" 129 | ] 130 | }, 131 | "execution_count": 5, 132 | "metadata": {}, 133 | "output_type": 
"execute_result" 134 | } 135 | ], 136 | "source": [ 137 | "#shapiro test for normal distribution\n", 138 | "stats.shapiro(np.random.normal(10, 1, 50) ) #p value > 0.05 if it is normally distributed" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 6, 144 | "id": "90f0bc0d", 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "data": { 149 | "image/png": "\n", 150 | "text/plain": [ 151 | "
" 152 | ] 153 | }, 154 | "metadata": { 155 | "needs_background": "light" 156 | }, 157 | "output_type": "display_data" 158 | } 159 | ], 160 | "source": [ 161 | "#generate random distribution of numbers, i.e., not normally distributed\n", 162 | "group_a = np.random.uniform(low=9, high=17, size=(20,))\n", 163 | "group_b = np.random.uniform(low=7, high=15, size=(20,))\n", 164 | "df = pd.DataFrame(np.concatenate((np.vstack((group_a, np.array(['a']*len(group_a)))).T,\n", 165 | " np.vstack((group_b, np.array(['b']*len(group_a)))).T)), columns = ['value', 'group'])\n", 166 | "df['value'] = df['value'].astype('float')\n", 167 | "ax = sns.swarmplot(data = df, x = 'group', y = 'value')\n", 168 | "plt.ylim(bottom = 0, top = df.value.max() + 2)\n", 169 | "plt.show()" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 7, 175 | "id": "e41b85da", 176 | "metadata": {}, 177 | "outputs": [ 178 | { 179 | "data": { 180 | "text/plain": [ 181 | "ShapiroResult(statistic=0.8799253702163696, pvalue=0.017637841403484344)" 182 | ] 183 | }, 184 | "execution_count": 7, 185 | "metadata": {}, 186 | "output_type": "execute_result" 187 | } 188 | ], 189 | "source": [ 190 | "stats.shapiro(group_a)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 8, 196 | "id": "212ccb52", 197 | "metadata": {}, 198 | "outputs": [ 199 | { 200 | "data": { 201 | "text/plain": [ 202 | "ShapiroResult(statistic=0.9161470532417297, pvalue=0.08355055004358292)" 203 | ] 204 | }, 205 | "execution_count": 8, 206 | "metadata": {}, 207 | "output_type": "execute_result" 208 | } 209 | ], 210 | "source": [ 211 | "stats.shapiro(group_b)" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 9, 217 | "id": "000c02b4", 218 | "metadata": {}, 219 | "outputs": [ 220 | { 221 | "data": { 222 | "text/plain": [ 223 | "MannwhitneyuResult(statistic=267.0, pvalue=0.0720454304673439)" 224 | ] 225 | }, 226 | "execution_count": 9, 227 | "metadata": {}, 228 | "output_type": 
"execute_result" 229 | } 230 | ], 231 | "source": [ 232 | "stats.mannwhitneyu(group_a, group_b)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 10, 238 | "id": "f40c3e62", 239 | "metadata": {}, 240 | "outputs": [ 241 | { 242 | "data": { 243 | "text/plain": [ 244 | "ShapiroResult(statistic=0.8577129244804382, pvalue=0.14439702033996582)" 245 | ] 246 | }, 247 | "execution_count": 10, 248 | "metadata": {}, 249 | "output_type": "execute_result" 250 | } 251 | ], 252 | "source": [ 253 | "stats.shapiro([4,5,6,6,5,4,5]) #example from value list" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 11, 259 | "id": "9386650d", 260 | "metadata": {}, 261 | "outputs": [ 262 | { 263 | "data": { 264 | "text/plain": [ 265 | "MannwhitneyuResult(statistic=1.0, pvalue=0.008167958654692957)" 266 | ] 267 | }, 268 | "execution_count": 11, 269 | "metadata": {}, 270 | "output_type": "execute_result" 271 | } 272 | ], 273 | "source": [ 274 | "stats.mannwhitneyu([1,3,4,3,2], [4,5,6,6,5,4,5]) #example from value list" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "id": "4aec45f5", 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 12, 288 | "id": "f558a08c", 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "#example function that tests distribution then picks test accordingly\n", 293 | "def test_sig(x, y):\n", 294 | " if stats.shapiro(x).pvalue >= 0.05 and stats.shapiro(y).pvalue >= 0.05: #if they are normally distriuted\n", 295 | " print(\"t-test\")\n", 296 | " return stats.ttest_ind(x,y)\n", 297 | " else:\n", 298 | " print(\"mann-whitney\")\n", 299 | " return stats.mannwhitneyu(x, y)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 13, 305 | "id": "752d96d8", 306 | "metadata": {}, 307 | "outputs": [ 308 | { 309 | "name": "stdout", 310 | "output_type": "stream", 311 | 
"text": [ 312 | "t-test\n" 313 | ] 314 | }, 315 | { 316 | "data": { 317 | "text/plain": [ 318 | "Ttest_indResult(statistic=-10.808567545708417, pvalue=8.002021519806372e-18)" 319 | ] 320 | }, 321 | "execution_count": 13, 322 | "metadata": {}, 323 | "output_type": "execute_result" 324 | } 325 | ], 326 | "source": [ 327 | "test_sig(np.random.normal(10, 1, 50), np.random.normal(13, 2, 40))" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 14, 333 | "id": "4540acbf", 334 | "metadata": {}, 335 | "outputs": [ 336 | { 337 | "name": "stdout", 338 | "output_type": "stream", 339 | "text": [ 340 | "mann-whitney\n" 341 | ] 342 | }, 343 | { 344 | "data": { 345 | "text/plain": [ 346 | "MannwhitneyuResult(statistic=7075.0, pvalue=4.0031006370913845e-07)" 347 | ] 348 | }, 349 | "execution_count": 14, 350 | "metadata": {}, 351 | "output_type": "execute_result" 352 | } 353 | ], 354 | "source": [ 355 | "test_sig(np.random.random(100)*1.4,np.random.random(100))" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "id": "5e348816", 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [] 365 | } 366 | ], 367 | "metadata": { 368 | "kernelspec": { 369 | "display_name": "Python 3 (ipykernel)", 370 | "language": "python", 371 | "name": "python3" 372 | }, 373 | "language_info": { 374 | "codemirror_mode": { 375 | "name": "ipython", 376 | "version": 3 377 | }, 378 | "file_extension": ".py", 379 | "mimetype": "text/x-python", 380 | "name": "python", 381 | "nbconvert_exporter": "python", 382 | "pygments_lexer": "ipython3", 383 | "version": "3.10.1" 384 | } 385 | }, 386 | "nbformat": 4, 387 | "nbformat_minor": 5 388 | } 389 | -------------------------------------------------------------------------------- /tutorial_complex_Heatmap.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R Notebook" 3 | output: html_notebook 4 | --- 5 | 6 | ```{r} 7 | if 
(!requireNamespace("BiocManager", quietly = TRUE)) 8 | install.packages("BiocManager") 9 | 10 | BiocManager::install("DESeq2") 11 | ``` 12 | 13 | ```{r} 14 | library(DESeq2) 15 | library(ggplot2) 16 | ``` 17 | 18 | ```{bash} 19 | ls 20 | ``` 21 | 22 | 23 | ```{r} 24 | Counts <- read.delim("count_table.csv", header = TRUE, row.names = 1, sep = ",") 25 | ``` 26 | 27 | ```{r} 28 | Counts 29 | ``` 30 | 31 | ```{r} 32 | Counts <- Counts[which(rowSums(Counts) > 0),] 33 | ``` 34 | 35 | 36 | ```{r} 37 | Counts 38 | ``` 39 | 40 | ```{r} 41 | condition <- factor(c("C","C","C","C", "S","S","S","S")) 42 | ``` 43 | 44 | ```{r} 45 | coldata <- data.frame(row.names = colnames(Counts), condition) 46 | ``` 47 | 48 | ```{r} 49 | coldata 50 | ``` 51 | 52 | ```{r} 53 | dds <- DESeqDataSetFromMatrix(countData = Counts, colData = coldata, design = ~condition) 54 | ``` 55 | 56 | ```{r} 57 | dds <- DESeq(dds) 58 | ``` 59 | 60 | ```{r} 61 | vsdata <- vst(dds, blind=FALSE) 62 | ``` 63 | 64 | ```{r} 65 | plotPCA(vsdata, intgroup = "condition") 66 | ``` 67 | 68 | ```{r} 69 | plotDispEsts(dds) 70 | ``` 71 | 72 | ```{r} 73 | res <- results(dds, contrast = c("condition", "S", "C")) 74 | ``` 75 | 76 | ```{r} 77 | res 78 | ``` 79 | 80 | 81 | ```{r} 82 | sigs <- na.omit(res) 83 | ``` 84 | 85 | ```{r} 86 | sigs <- sigs[sigs$padj < 0.05,] 87 | ``` 88 | 89 | ```{r} 90 | sigs 91 | ``` 92 | 93 | ```{r} 94 | write.csv(sigs, file = "deseq_results.csv") 95 | ``` 96 | 97 | 98 | ```{bash} 99 | ls 100 | ``` 101 | 102 | ```{r} 103 | sigs 104 | ``` 105 | 106 | ```{r} 107 | df <- as.data.frame(sigs) 108 | df 109 | ``` 110 | 111 | 112 | 113 | ```{r} 114 | ensembl_map <- read.csv('ensemble_key_mapper.csv', header = FALSE) 115 | 116 | 117 | keys <- ensembl_map$V1 118 | values <- ensembl_map$V2 119 | #lookup list: column V1 (key) -> column V2 (value); used below to fill df$symbol by row name 120 | l <- list() 121 | for (i in seq_along(keys)){ 122 | l[keys[i]] <- values[i] 123 | } 124 | 125 | 126 | ``` 127 | 128 | 129 | 130 | 131 | 132 | ```{r} 133 | #for non-mapped labels 134 | no_values <- 
setdiff(rownames(df), keys) 135 | for (i in seq_along(no_values)){ 136 | l[no_values[i]] <- 'NA' 137 | } 138 | ``` 139 | 140 | 141 | ```{r} 142 | df$symbol <- unlist(l[rownames(df)], use.names = FALSE) 143 | ``` 144 | 145 | 146 | 147 | ```{r} 148 | df.top <- df[ (df$baseMean > 50) & (abs(df$log2FoldChange) > 2),] 149 | df.top 150 | ``` 151 | 152 | ```{r} 153 | df.top <- df.top[order(df.top$log2FoldChange, decreasing = TRUE),] 154 | ``` 155 | 156 | 157 | ```{r} 158 | rlog_out <- rlog(dds, blind=FALSE) #get normalized count data from dds object 159 | mat<-assay(rlog_out)[rownames(df.top), rownames(coldata)] #sig genes x samples 160 | colnames(mat) <- rownames(coldata) 161 | base_mean <- rowMeans(mat) 162 | mat.scaled <- t(apply(mat, 1, scale)) #center and scale each row (Z-score per gene across samples); apply() returns the rows as columns, so transpose back to genes x samples 163 | colnames(mat.scaled)<-colnames(mat) 164 | ``` 165 | 166 | 167 | ```{r} 168 | num_keep <- 25 169 | #first num_keep and last num_keep rows (df.top is sorted by log2FoldChange); unique() avoids duplicates when there are fewer than 2*num_keep rows 170 | rows_keep <- unique(c(head(seq_len(nrow(mat.scaled)), num_keep), tail(seq_len(nrow(mat.scaled)), num_keep))) 171 | ``` 172 | 173 | 174 | 175 | ```{r} 176 | l2_val <- as.matrix(df.top[rows_keep,]$log2FoldChange) #getting log2 value for each gene we are keeping 177 | colnames(l2_val)<-"logFC" 178 | 179 | mean <- as.matrix(df.top[rows_keep,]$baseMean) #getting mean value for each gene we are keeping 180 | colnames(mean)<-"AveExpr" 181 | ``` 182 | 183 | 184 | 185 | 186 | 187 | ```{r} 188 | if (!requireNamespace("BiocManager", quietly = TRUE)) 189 | install.packages("BiocManager") 190 | 191 | BiocManager::install("ComplexHeatmap") 192 | ``` 193 | 194 | 195 | 196 | ```{r} 197 | library(ComplexHeatmap) 198 | library(RColorBrewer) 199 | library(circlize) 200 | ``` 201 | 202 | 203 | 204 | ```{r} 205 | #maps values between b/w/r for min and max l2 values 206 | col_logFC <- colorRamp2(c(min(l2_val),0, max(l2_val)), c("blue", "white", "red")) 207 | 208 | #maps between 0% quantile, and 75% quantile of mean values --- 0, 25, 50, 75, 100 209 | col_AveExpr <- 
colorRamp2(c(quantile(mean)[1], quantile(mean)[4]), c("white", "red")) 210 | ``` 211 | 212 | 213 | 214 | ```{r} 215 | 216 | ha <- HeatmapAnnotation(summary = anno_summary(gp = gpar(fill = 2), 217 | height = unit(2, "cm"))) 218 | 219 | h1 <- Heatmap(mat.scaled[rows_keep,], cluster_rows = FALSE, 220 | column_labels = colnames(mat.scaled), name="Z-score", 221 | cluster_columns = TRUE) 222 | h2 <- Heatmap(l2_val, row_labels = df.top$symbol[rows_keep], 223 | cluster_rows = FALSE, name="logFC", top_annotation = ha, col = col_logFC, 224 | cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid 225 | grid.text(round(l2_val[i, j],2), x, y) 226 | }) 227 | h3 <- Heatmap(mean, row_labels = df.top$symbol[rows_keep], 228 | cluster_rows = FALSE, name = "AveExpr", col=col_AveExpr, 229 | cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid 230 | grid.text(round(mean[i, j],2), x, y) 231 | }) 232 | 233 | h<-h1+h2+h3 234 | h 235 | ``` 236 | 237 | 238 | ```{r} 239 | png("./heatmap_v1.png", res = 300, width = 3000, height = 5500) 240 | print(h) 241 | dev.off() 242 | ``` 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | --------------------------------------------------------------------------------