├── .gitignore ├── README.md ├── atac ├── archR │ ├── add_motif_annotation │ │ ├── add_motif_annotation_manual.R │ │ ├── archR_add_background_peaks.R │ │ ├── archR_add_motif_annotation.R │ │ └── plot_motif_seqlogo.R │ ├── bigwig │ │ ├── README.txt │ │ └── archR_export_bw.R │ ├── celltype_assignment │ │ └── archR_celltype_assignment.R │ ├── chromvar │ │ ├── cells │ │ │ └── run_chromvar_from_archR.R │ │ ├── pseudobulk │ │ │ └── run_chromvar_pseudobulk.R │ │ └── utils.R │ ├── chromvar_chip │ │ ├── cells │ │ │ └── run_chromvar_chip.R │ │ ├── metacells │ │ │ └── differential │ │ │ │ ├── celltype │ │ │ │ ├── analysis │ │ │ │ │ ├── before_snakemake │ │ │ │ │ │ └── define_markers.R │ │ │ │ │ ├── compare_differential_chromvar_pseudobulk_metacells.R │ │ │ │ │ ├── define_markers.R │ │ │ │ │ ├── old │ │ │ │ │ │ └── load_data.R │ │ │ │ │ └── plot_marker_peaks_archR.R │ │ │ │ └── run_diff_chromvar_chip_celltype_metacells.R │ │ │ │ ├── differential_chromvar_metacells.R │ │ │ │ └── old │ │ │ │ ├── plot_differential_chromvar_chip_pseudobulk.R │ │ │ │ └── run_differential_chromvar_chip_pseudobulk.R │ │ ├── pseudobulk │ │ │ ├── compare_chromvar_chip_cells_vs_pseudobulk.R │ │ │ └── differential │ │ │ │ └── celltype │ │ │ │ ├── analysis │ │ │ │ ├── define_markers.R │ │ │ │ └── old │ │ │ │ │ └── plot_differential_chromvar_chip_pseudobulk.R │ │ │ │ ├── differential_chromvar_pseudobulk.R │ │ │ │ └── parse_differential_results.R │ │ └── run_chromvar_chip.R │ ├── differential │ │ ├── cells │ │ │ ├── archr_differential_accessibility_cells.R │ │ │ ├── celltype │ │ │ │ ├── analysis │ │ │ │ │ ├── define_markers.R │ │ │ │ │ ├── load_data.R │ │ │ │ │ ├── old │ │ │ │ │ │ ├── GeneScoreMatrix │ │ │ │ │ │ │ ├── define_marker_genes.R │ │ │ │ │ │ │ └── plot_marker_genes_archR.R │ │ │ │ │ │ ├── PeakMatrix │ │ │ │ │ │ │ ├── define_marker_peaks.R │ │ │ │ │ │ │ └── plot_marker_peaks_archR.R │ │ │ │ │ │ ├── browser_plot_archR.R │ │ │ │ │ │ ├── define_markers_archR.R │ │ │ │ │ │ └── plot_marker_peaks_archR.R 
│ │ │ │ │ └── volcano_plots_diff_atac.R │ │ │ │ ├── old │ │ │ │ │ └── run_diff_acc_celltype.R │ │ │ │ └── parse_differential_results.R │ │ │ └── genotype │ │ │ │ ├── analysis │ │ │ │ ├── analysis.R │ │ │ │ ├── atac_boxplots_wt_vs_ko_hits.R │ │ │ │ └── load_data.R │ │ │ │ └── run_diff_acc_genotype.R │ │ ├── compare_differential_atac_pseudobulk_metacells_cells.R │ │ ├── metacells │ │ │ ├── celltype │ │ │ │ ├── analysis │ │ │ │ │ ├── before_snakemake │ │ │ │ │ │ └── define_markers.R │ │ │ │ │ ├── old │ │ │ │ │ │ ├── define_markers.R │ │ │ │ │ │ └── load_data.R │ │ │ │ │ └── plot_marker_peaks_archR.R │ │ │ │ ├── parse_differential_results.R │ │ │ │ └── run_diff_acc_celltype_metacells.R │ │ │ ├── differential_accessibility_metacells.R │ │ │ ├── genotype │ │ │ │ ├── analysis │ │ │ │ │ ├── atac_boxplots_wt_vs_ko_hits.R │ │ │ │ │ ├── compare_differential_atac_pseudobulk_metacells_cells.R │ │ │ │ │ ├── diff_atac_genotype_metacells_analysis.R │ │ │ │ │ └── load_data.R │ │ │ │ ├── old │ │ │ │ │ └── analysis │ │ │ │ │ │ ├── analysis.R │ │ │ │ │ │ ├── atac_boxplots_wt_vs_ko_hits.R │ │ │ │ │ │ └── load_data.R │ │ │ │ └── run_diff_acc_genotype_metacells.R │ │ │ └── utils.R │ │ ├── pseudobulk │ │ │ ├── celltype │ │ │ │ ├── analysis │ │ │ │ │ ├── define_markers.R │ │ │ │ │ └── plot_marker_peaks_stats.R │ │ │ │ ├── differential_accessibility_pseudobulk.R │ │ │ │ └── parse_differential_results.R │ │ │ └── celltype_genotype │ │ │ │ ├── analysis │ │ │ │ ├── diff_acc_genotype_pseudobulk_analysis.R │ │ │ │ └── load_data.R │ │ │ │ ├── differential_accessibility_celltype_genotype_pseudobulk.R │ │ │ │ ├── explore_diff_acc_genotype.R │ │ │ │ ├── old │ │ │ │ └── old_stuff.R │ │ │ │ └── parse_differential_results.R │ │ └── utils.R │ ├── dimensionality_reduction │ │ ├── cells │ │ │ ├── archR_dimensionality_reduction.R │ │ │ └── atac_dimensionality_reduction_cells.R │ │ └── metacells │ │ │ └── atac_dimensionality_reduction_metacells.R │ ├── feature_stats │ │ ├── archR_calculate_feature_stats.R │ 
│ └── plot_feature_stats_atac.R │ ├── gene_scores │ │ ├── add_GeneScore_matrices.R │ │ ├── compare_gene_scores.R │ │ └── plot_genes_BrowserTrack_archR.R │ ├── load_archR_project.R │ ├── load_motif_annotation.R │ ├── metacells │ │ ├── aggregate_atac_metacell.R │ │ └── compare_metacell_vs_singlecell_vs_pseudobulk_atac.R │ ├── peak_calling │ │ ├── README.txt │ │ ├── analysis │ │ │ ├── calculate_cpg_density_atac_peaks.R │ │ │ ├── calculate_peak_stats_archR.R │ │ │ ├── link_TFs2genes_motifmatchr.R │ │ │ ├── link_peaks2genes_genomic_distance.R │ │ │ ├── motifmatcher_analysis.R │ │ │ └── plot_peak_calling_stats_archR.R │ │ ├── filter_peaks_archR.R │ │ └── peak_calling_archR.R │ ├── plot_individual_peaks │ │ ├── compare_celltypes │ │ │ └── plot_individual_peaks_compare_celltypes.R │ │ └── compare_genotypes │ │ │ ├── plot_individual_peaks_compare_genotypes_cells_metacells_pseudobulk.R │ │ │ └── pseudobulk_with_replicates │ │ │ └── plot_individual_peaks_genotypes_pseudobulk_with_replicates.R │ ├── processing │ │ ├── 0_create_arrow_files.R │ │ ├── 1_create_archR_project.R │ │ ├── 2_create_archR_metadata.R │ │ ├── 3_qc.R │ │ ├── save_atac_anndata.R │ │ ├── save_atac_matrices.R │ │ └── update_archR_metadata.R │ ├── pseudobulk │ │ ├── 1_archR_add_GroupCoverage.R │ │ ├── 2_archR_pseudobulk.R │ │ └── archR_pseudobulk_with_replicates.R │ └── snakemake │ │ ├── README.txt │ │ ├── Snakefile │ │ ├── config_ricard_babraham.yaml │ │ ├── run_cluster.sh │ │ └── run_cluster_single.sh ├── igv_settings │ ├── igv_session_babraham_celltypes.xml │ └── igv_session_babraham_nmp_metacells.xml └── motifs │ ├── calculate_motif_similarity.R │ └── utils.R ├── gastrulation_multiome_10x.Rproj ├── images ├── igv_screenshot_github.png └── overview_github.png ├── load_paga_graph.R ├── rna ├── PijuanSala2019_comparison │ └── PijuanSala2019_comparison_pseudobulk.R ├── TF2gene_coexpression │ ├── coexpression_TF_vs_gene_metacells.R │ ├── coexpression_TF_vs_gene_pseudobulk.R │ ├── 
coexpression_TF_vs_gene_single_cells.R │ ├── compare_coexpression_TF_vs_gene_cell_vs_metacell_vs_pseudobulk.R │ └── utils.R ├── celltype_proportions │ ├── compare_celltype_proportions.R │ ├── compare_celltype_proportions_paga.R │ └── plot_celltype_proportions.R ├── conversions │ ├── convert_SingleCellExperiment_to_anndata.R │ └── convert_anndata_to_SingleCellExperiment.R ├── differential │ ├── cells │ │ ├── differential.R │ │ └── parse_differential_results.R │ ├── metacells │ │ ├── analysis │ │ │ └── define_marker_genes.R │ │ ├── differential.R │ │ └── parse_differential_results.R │ ├── other │ │ └── extract_TFs_diff.R │ ├── pseudobulk │ │ ├── celltype │ │ │ ├── analysis │ │ │ │ ├── define_marker_TFs.R │ │ │ │ ├── define_marker_genes.R │ │ │ │ ├── old │ │ │ │ │ ├── extract_TFs_diff.R │ │ │ │ │ └── volcano_plots_diff_rna.R │ │ │ │ └── plot_marker_genes_stats.R │ │ │ ├── differential_celltype_pseudobulk.R │ │ │ ├── old │ │ │ │ └── run_diff_expr_celltype_pseudobulk.R │ │ │ └── parse_differential_results.R │ │ └── celltype_genotype │ │ │ ├── analysis │ │ │ └── explore_diff_rna_genotype.R │ │ │ ├── differential_celltype_genotype_pseudobulk.R │ │ │ └── parse_differential_results.R │ └── utils.R ├── dimensionality_reduction │ ├── dimensionality_reduction_sce.R │ ├── dimensionality_reduction_seurat.R │ └── metacells │ │ └── dimensionality_reduction_sce_metacells.R ├── iSEE │ └── iSEE.R ├── mapping │ ├── analysis │ │ ├── plot_mapping_umap.R │ │ ├── plot_mapping_wt_vs_ko.R │ │ └── plot_utils.R │ ├── run │ │ ├── mapping_functions.R │ │ ├── mnn │ │ │ ├── mapping_functions.R │ │ │ └── mapping_mnn.R │ │ └── parse_sample_metadata_after_mapping.R │ └── trajectories │ │ ├── mapping_functions.R │ │ ├── mapping_mnn_trajectory.R │ │ ├── parse_sample_metadata_after_mapping.R │ │ └── plot_mapping_trajectory_wt_vs_ko.R ├── metacells │ ├── SEACell_env.yml │ ├── analysis │ │ ├── compare_expr_umap_metacell_vs_singlecell.R │ │ ├── compare_metacell_vs_singlecell_vs_pseudobulk_expr.R │ │ ├── 
overlay_metacells_atlas_umap.R │ │ └── trajectories │ │ │ ├── overlay_metacells_atlas_trajectory.R │ │ │ └── plot_trajectory_metacells.R │ └── run │ │ ├── aggregate_rna_metacell.R │ │ ├── aggregate_rna_metacell_velocyto.R │ │ ├── run_metacell.py │ │ └── run_metacell_trajectory.py ├── plot_individual_genes │ ├── celltypes │ │ └── plot_individual_genes_cells_metacells_pseudobulk.R │ ├── genotype │ │ └── plot_individual_genes_by_genotype.R │ └── pseudobulk │ │ ├── barplot_individual_genes_pseudobulk.R │ │ └── plot_paga_individual_genes_pseudobulk.R ├── processing │ ├── 1_create_seurat_rna.R │ ├── 2_QC.R │ ├── 3_seurat_to_SCE.R │ ├── 4_doublet_detection.R │ ├── 5_parse_sample_metadata_after_doublets.R │ ├── 6_plot_stats.R │ └── extract_TFs_from_SingleCellExperiment.R ├── pseudobulk │ ├── old │ │ ├── create_pseudobulk_metadata_with_replicates.R │ │ ├── old_code.R │ │ ├── pseudobulk_rna.R │ │ └── pseudobulk_rna_intronic_exonic.R │ ├── pseudobulk_rna.R │ └── pseudobulk_rna_with_replicates.R ├── scanpy │ ├── create_anndata_from_SingleCellExperiment.R │ ├── create_anndata_scvelo.py │ ├── dimensionality_reduction │ │ └── dimensionality_reduction.py │ ├── scvelo │ │ ├── nmp_trajectory │ │ │ └── scvelo_analysis_cells.ipynb │ │ └── run_scvelo.py │ ├── template.ipynb │ └── velocyto │ │ ├── create_anndata_from_loom_files.py │ │ ├── run_velocyto.sh │ │ └── velocyto_env.yml └── snakemake │ ├── README.txt │ ├── Snakefile │ ├── config_ricard_babraham.yaml │ ├── environment.yaml │ └── run_cluster.sh ├── rna_atac ├── gene_regulatory_networks │ └── metacells │ │ └── trajectories │ │ ├── build_GRN_metacells_trajectory.R │ │ ├── cell_oracle │ │ └── celloracle_train.ipynb │ │ └── plot_GRN_metacells_nmp_trajectory.R ├── load_rna_atac_pseudobulk.R ├── mofa │ ├── not_used │ │ └── run.py │ ├── plot_mofa_results.R │ ├── prepare_mofa.R │ └── run_mofa_fast.R ├── rna_vs_acc │ ├── metacells │ │ ├── TFexpr_vs_peakAcc │ │ │ ├── compare_cor_TFexpr_vs_peak_acc_pseudobulk_vs_metacell.R │ │ │ ├── 
plot_TFexpr_vs_peakAcc_individual_examples.R │ │ │ ├── run_cor_TFexpr_vs_peakAcc_metacells.R │ │ │ └── trajectories │ │ │ │ └── compare_cor_TFexpr_vs_peak_acc_pseudobulk_vs_metacell_trajectories.R │ │ ├── gene_expr_vs_peak_acc │ │ │ ├── cor_gene_expr_vs_peak_acc_metacells.R │ │ │ └── plot_gene_expr_vs_peak_acc_metacells.R │ │ └── gene_expr_vs_promoter_acc │ │ │ ├── cor_gene_expr_vs_promoter_acc_metacells.R │ │ │ └── plot_gene_expr_vs_promoter_acc_metacells.R │ └── pseudobulk │ │ ├── TFexpr_vs_peakAcc │ │ ├── README.txt │ │ ├── analysis │ │ │ ├── old │ │ │ │ └── TF_cobinding_analysis.R │ │ │ ├── plot_TFexpr_vs_peakAcc_general_stats.R │ │ │ ├── plot_TFexpr_vs_peakAcc_individual_examples.R │ │ │ ├── plot_cor_TFexpr_vs_peakAcc_stats_per_TF.R │ │ │ └── plot_cor_TFexpr_vs_peakAcc_stats_per_peak.R │ │ └── run_cor_TFexpr_vs_peakAcc_pseudobulk.R │ │ ├── gene_expr_vs_peak_acc │ │ ├── analysis │ │ │ └── plot_gene_expr_vs_peak_acc_general_stats_pseudobulk.R │ │ ├── cor_gene_expr_vs_peak_acc_pseudobulk.R │ │ └── peak_markers_rna_vs_acc.R │ │ ├── gene_expr_vs_promoter_acc │ │ ├── cor_gene_expr_vs_promoter_acc_pseudobulk.R │ │ └── plot_gene_expr_vs_promoter_acc_pseudobulk.R │ │ └── gene_markers_rna_vs_acc │ │ ├── gene_markers_rna_vs_acc.R │ │ └── plot_number_markers.R ├── rna_vs_chromvar │ └── pseudobulk │ │ ├── per_celltype │ │ └── rna_vs_chromvar_pseudobulk_per_celltype.R │ │ └── per_gene │ │ ├── PAGA │ │ └── plot_rna_vs_chromvar_paga.R │ │ ├── cor_rna_vs_chromvar_per_gene_pseudobulk.R │ │ ├── pgc_neural_crest │ │ └── plot_rna_vs_chromvar_per_gene_pseudobulk_pgc_neural_crest.R │ │ └── plot_rna_vs_chromvar_per_gene_pseudobulk.R ├── rna_vs_chromvar_chip │ └── pseudobulk │ │ ├── per_celltype │ │ ├── rna_vs_chromvar_TF_markers_heatmap.R │ │ ├── rna_vs_chromvar_TF_pleiotropy.R │ │ ├── rna_vs_chromvar_pseudobulk_per_celltype.R │ │ └── rna_vs_chromvar_pseudobulk_pgc_neural_crest.R │ │ └── per_gene │ │ ├── PAGA │ │ ├── fig │ │ │ └── plot_rna_vs_chromvar_paga_fig.R │ │ └── 
plot_rna_vs_chromvar_paga.R │ │ ├── fig │ │ ├── plot_rna_vs_chromvar_per_gene_pseudobulk_fig.R │ │ └── rna_vs_chromvar_marker_score_per_gene_fig.R │ │ ├── rna_vs_chromvar_marker_score_per_gene.R │ │ └── rna_vs_chromvar_per_gene_pseudobulk.R ├── snakemake │ ├── Snakefile │ ├── config_ricard_babraham.yaml │ └── run_cluster.sh └── virtual_chipseq_library │ ├── link_TF2genes_virtual_chip.R │ ├── metacells │ ├── analysis │ │ └── virtual_chipseq_metacells_exploration.R │ ├── create_virtual_chipseq_library_metacells.R │ └── virtual_chipseq_compare_pseudobulk_vs_metacells.R │ └── pseudobulk │ ├── analysis │ ├── stats │ │ ├── virtual_chipseq_plot_individual_peaks.R │ │ └── virtual_chipseq_plot_stats.R │ ├── validation │ │ ├── virtual_chipseq_validation.R │ │ └── virtual_chipseq_validation_roc_curves.R │ └── virtual_chipseq_exploration.R │ └── create_virtual_chipseq_library_pseudobulk.R ├── settings.R ├── settings.py ├── utils.R └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Resilio Sync 2 | *.sync 3 | 4 | # MAC 5 | *Icon* 6 | *.DS_Store 7 | 8 | # R stuff 9 | *.Rhistory 10 | *.rds 11 | 12 | # Images 13 | *.pdf 14 | *.tiff 15 | *.ai 16 | *.svg 17 | 18 | # Data files 19 | *.hdf5 20 | *.gz 21 | *.zip 22 | 23 | # output files 24 | *.html 25 | *.log 26 | *.tsv 27 | 28 | # Ignore local folders 29 | **/local-data 30 | *.Rproj.user 31 | 32 | # Jupyter notebooks 33 | *.ipynb_checkpoints 34 | # *.ipynb 35 | .Rproj.user 36 | 37 | *.snakemake* 38 | 39 | 40 | *slurm*out 41 | -------------------------------------------------------------------------------- /atac/archR/add_motif_annotation/archR_add_background_peaks.R: -------------------------------------------------------------------------------- 1 | 2 | suppressPackageStartupMessages(library(ArchR)) 3 | suppressPackageStartupMessages(library(argparse)) 4 | 5 | here::i_am("atac/archR/add_motif_annotation/archR_add_background_peaks.R") 6 | 7 | ###################### 8 | ## 
Define arguments ## 9 | ###################### 10 | 11 | p <- ArgumentParser(description='') 12 | p$add_argument('--method', type="character", default="chromVAR", help='ArchR or chromVAR') 13 | p$add_argument('--number_background_peaks', type="integer", default=50, help='Number of background peaks') 14 | p$add_argument('--threads', type="integer", default=1, help='Number of threads') 15 | 16 | args <- p$parse_args(commandArgs(TRUE)) 17 | # Interactive test values below are kept commented out so that command-line arguments are honoured 18 | ## START TEST ## 19 | # args$number_background_peaks <- 50 20 | # args$method <- "chromVAR" 21 | # args$threads <- 1 22 | ## END TEST ## 23 | 24 | ##################### 25 | ## Define settings ## 26 | ##################### 27 | 28 | source(here::here("settings.R")) 29 | 30 | ######################## 31 | ## Load ArchR Project ## 32 | ######################## 33 | 34 | source(here::here("atac/archR/load_archR_project.R")) 35 | 36 | addArchRThreads(threads = args$threads) 37 | 38 | ########################## 39 | ## Add background peaks ## 40 | ########################## 41 | 42 | # This function will compute background peaks controlling for total accessibility and GC-content 43 | # changes in the ArchR project: (1) it creates Background-Peaks.rds and (2) adds "bgdPeaks" entry to "metadata(getPeakSet(ArchRProject))" 44 | 45 | # Background peaks are chosen by sampling peaks based on similarity in GC content and # of fragments across samples using the Mahalanobis distance. 46 | # - The "w" parameter controls how similar background peaks should be. 47 | # - The "binSize" parameter controls the precision with which the similarity is computed. Increasing "binSize" will make the function run slower. 48 | # Returns a matrix with one row per peak and one column per iteration. 
values in a row represent indices of background peaks for the peak with that index 49 | 50 | ArchRProject <- addBgdPeaks( 51 | ArchRProj = ArchRProject, 52 | nIterations = args$number_background_peaks, 53 | w = 0.1, 54 | binSize = 50, 55 | method = args$method, 56 | seed = 42, 57 | outFile = file.path(getOutputDirectory(ArchRProject), "Background-Peaks.rds"), # default 58 | force = TRUE 59 | ) 60 | 61 | # if (!file.exists(metadata(ArchRProject@peakSet)$bgdPeaks)) { 62 | 63 | ########## 64 | ## TEST ## 65 | ########## 66 | # Interactive debugging leftovers (hard-coded local path): kept commented out so the script ends cleanly after addBgdPeaks() 67 | # R.utils::sourceDirectory("/bi/group/reik/ricard/scripts/git/archR/R", verbose=T, modifiedOnly=FALSE) 68 | 69 | # ArchRProj = ArchRProject 70 | # nIterations = 50 71 | # w = 0.1 72 | # binSize = 50 73 | # seed = 1 74 | # method = "chromVAR" 75 | 76 | -------------------------------------------------------------------------------- /atac/archR/add_motif_annotation/archR_add_motif_annotation.R: -------------------------------------------------------------------------------- 1 | # https://www.archrproject.com/bookdown/calculating-gene-scores-in-archr.html 2 | 3 | suppressPackageStartupMessages(library(ArchR)) 4 | suppressPackageStartupMessages(library(argparse)) 5 | # Declare this script's own location relative to the project root 6 | here::i_am("atac/archR/add_motif_annotation/archR_add_motif_annotation.R") 7 | 8 | ###################### 9 | ## Define arguments ## 10 | ###################### 11 | 12 | p <- ArgumentParser(description='') 13 | p$add_argument('--metadata', type="character", help='metadata file') 14 | # p$add_argument('--outdir', type="character", help='Output directory') 15 | p$add_argument('--threads', type="integer", default=1, help='Number of threads') 16 | 17 | args <- p$parse_args(commandArgs(TRUE)) 18 | 19 | ## START TEST ## 20 | # args$metadata <- "/bi/group/reik/ricard/data/gastrulation_multiome_10x/results/atac/archR/qc/sample_metadata_after_qc.txt.gz" 21 | # args$threads <- 1 22 | ## END TEST ## 23 | 24 | ##################### 25 | ## Define settings ## 26 | ##################### 27 | 28 | 
source(here::here("settings.R")) 29 | source(here::here("utils.R")) 30 | 31 | ######################## 32 | ## Load ArchR Project ## 33 | ######################## 34 | 35 | source(here::here("atac/archR/load_archR_project.R")) 36 | 37 | addArchRThreads(threads = args$threads) 38 | 39 | ########################## 40 | ## Add motif annotation ## 41 | ########################## 42 | 43 | # cisbp (stringent threshold) 44 | ArchRProject <- addMotifAnnotations( 45 | ArchRProject, 46 | motifSet = "cisbp", 47 | name = "Motif_cisbp", 48 | cutOff = 5e-05, 49 | width = 7, 50 | force = FALSE 51 | ) 52 | 53 | # cisbp (lenient threshold) 54 | # ArchRProject <- addMotifAnnotations( 55 | # ArchRProject, 56 | # motifSet = "cisbp", 57 | # name = "Motif_cisbp_lenient", 58 | # cutOff = 1e-04, 59 | # width = 7, 60 | # force = TRUE 61 | # ) 62 | 63 | # homer 64 | # ArchRProject <- addMotifAnnotations( 65 | # ArchRProject, 66 | # motifSet = "homer", 67 | # cutOff = opts$motif.pvalue.cutoff, 68 | # name = "Motif_homer", 69 | # force = TRUE 70 | # ) 71 | 72 | 73 | # JASPAR2020 human (stringent) 74 | ArchRProject <- addMotifAnnotations( 75 | ArchRProject, 76 | motifSet = "JASPAR2020", 77 | collection = "CORE", 78 | species = "Homo sapiens", 79 | cutOff = 5e-05, 80 | name = "Motif_JASPAR2020", 81 | force = FALSE 82 | ) 83 | 84 | # JASPAR2020 human (lenient) 85 | # ArchRProject <- addMotifAnnotations( 86 | # ArchRProject, 87 | # motifSet = "JASPAR2020", 88 | # collection = "CORE", 89 | # species = "Homo sapiens", 90 | # cutOff = 1e-04, 91 | # name = "Motif_JASPAR2020_lenient", 92 | # force = TRUE 93 | # ) 94 | 95 | 96 | ################################ 97 | ## Save peakAnnotation object ## 98 | ################################ 99 | 100 | saveRDS(ArchRProject@peakAnnotation, sprintf("%s/Annotations/peakAnnotation.rds",io$archR.directory)) 101 | -------------------------------------------------------------------------------- /atac/archR/add_motif_annotation/plot_motif_seqlogo.R: 
-------------------------------------------------------------------------------- 1 | here::i_am("atac/archR/add_motif_annotation/plot_motif_seqlogo.R") 2 | 3 | source(here::here("settings.R")) 4 | 5 | suppressPackageStartupMessages(library(TFBSTools)) 6 | suppressPackageStartupMessages(library(ggseqlogo)) 7 | 8 | ###################### 9 | ## Define arguments ## 10 | ###################### 11 | 12 | p <- ArgumentParser(description='') 13 | p$add_argument('--motif_annotation', type="character", help='Motif annotation') 14 | p$add_argument('--peak_annotation_file', type="character", help='Path to the peakAnnotation.rds file') 15 | p$add_argument('--outdir', type="character", help='Output directory') 16 | p$add_argument('--test', action="store_true", help='Test mode') 17 | 18 | args <- p$parse_args(commandArgs(TRUE)) 19 | # Fall back to the former test values only for arguments that were not supplied on the command line 20 | ## START DEFAULTS ## 21 | if (is.null(args$motif_annotation)) args$motif_annotation <- "CISBP" 22 | if (is.null(args$peak_annotation_file)) args$peak_annotation_file <- file.path(io$archR.directory,"Annotations/peakAnnotation.rds") 23 | if (is.null(args$outdir)) args$outdir <- file.path(io$archR.directory,sprintf("Annotations/seqlogo/%s",args$motif_annotation)) 24 | # args$test <- FALSE 25 | ## END DEFAULTS ## 26 | 27 | # Parse arguments 28 | dir.create(args$outdir, showWarnings=F, recursive = T) 29 | 30 | ############### 31 | ## Load data ## 32 | ############### 33 | 34 | # raw position frequency matrix (PFM) 35 | pwm <- readRDS(args$peak_annotation_file)[[args$motif_annotation]][["motifs"]] 36 | # pwm <- readRDS(io$peak_annotation_file)[["Motif_JASPAR2020_human"]][["motifs"]] 37 | 38 | ########## 39 | ## Plot ## 40 | ########## 41 | 42 | motifs.to.plot <- names(pwm) 43 | if (args$test) { 44 | motifs.to.plot <- motifs.to.plot %>% head(n=3) 45 | } 46 | 47 | # postProbs = (PFM + bg * pseudocounts) / (colSums(PFM) + sum(bg) * pseudocounts) 48 | # priorProbs = bg / sum(bg) 49 | # PWM_log2probratio = log2(postProbs / priorProbs) 50 | 51 | grep("YBX",motifs.to.plot,value=T) 52 | 53 | # i <- "YBX2_827" 54 | for (i in motifs.to.plot) { 55 | 56 | # position weight matrix (PWM) 57 | 58 | if 
(args$motif_annotation=="JASPAR") { 59 | tmp <- toPWM(pwm[[i]], type="prob") %>% as.matrix 60 | } else if (args$motif_annotation=="CISBP") { 61 | tmp <- (2**as.matrix(pwm[[i]]))*0.25 # this is not entirely accurate 62 | } else { stop(sprintf("Unsupported motif annotation: %s", args$motif_annotation)) } 63 | # fail early above instead of plotting an undefined or stale matrix 64 | p <- ggseqlogo(tmp) + 65 | theme( 66 | axis.line = element_line(size=rel(0.5), color="black"), 67 | axis.text.x = element_blank(), 68 | axis.text.y = element_text(size=rel(0.75)), 69 | axis.title.y = element_text(size=rel(0.75)) 70 | # axis.title.y = element_blank() 71 | ) 72 | pdf(file.path(args$outdir,sprintf("seqlogo_%s_%s.pdf",args$motif_annotation,i)), width=5, height=2.2) 73 | print(p) 74 | dev.off() 75 | } 76 | 77 | # Completion token 78 | file.create(file.path(args$outdir,"completed.txt")) -------------------------------------------------------------------------------- /atac/archR/bigwig/README.txt: -------------------------------------------------------------------------------- 1 | ############ 2 | ## Signac ## 3 | ############ 4 | 5 | https://timoast.github.io/sinto/basic_usage.html#filter-cell-barcodes-from-bam-file 6 | https://github.com/timoast/sinto 7 | 8 | 9 | Hi, it is not currently possible to create a bigwig for different groups of cells in Signac. 
I'd suggest writing the cell names to a file and then splitting the bam file by cell using the sinto package (https://github.com/timoast/sinto), and then creating bigwig tracks using deeptools (https://deeptools.readthedocs.io/en/develop/content/tools/bamCoverage.html) 10 | 11 | 12 | ########### 13 | ## ArchR ## 14 | ########### 15 | 16 | getGroupBW() -------------------------------------------------------------------------------- /atac/archR/bigwig/archR_export_bw.R: -------------------------------------------------------------------------------- 1 | # https://www.ArchRProject.com/bookdown/how-does-archr-make-pseudo-bulk-replicates.html 2 | here::i_am("atac/archR/bigwig/archR_export_bw.R") 3 | 4 | source(here::here("settings.R")) 5 | source(here::here("utils.R")) 6 | 7 | suppressPackageStartupMessages(library(ArchR)) 8 | 9 | ###################### 10 | ## Define arguments ## 11 | ###################### 12 | 13 | p <- ArgumentParser(description='') 14 | p$add_argument('--archr_directory', type="character", help='ArchR directory') 15 | p$add_argument('--metadata', type="character", help='metadata file') 16 | p$add_argument('--group_by', type="character", help='Metadata column to group by') 17 | p$add_argument('--norm_method', type="character", default="ReadsInTSS", help='Normalisation method') 18 | p$add_argument('--min_cells', type="integer", default=100, help='Minimum number of cells per celltype') 19 | p$add_argument('--tile_size', type="integer", default=100, help='Tile size') 20 | p$add_argument('--threads', type="integer", default=1, help='Number of threads') 21 | 22 | args <- p$parse_args(commandArgs(TRUE)) 23 | 24 | ## START TEST ## 25 | # args$metadata <- file.path(io$basedir,"results/atac/archR/qc/sample_metadata_after_qc.txt.gz") 26 | # args$group_by <- "celltype_genotype" 27 | # args$norm_method <- c("ReadsInTSS") 28 | # args$tile_size <- 100 29 | # args$min_cells <- 100 30 | # args$threads <- 4 31 | ## END TEST ## 32 | 33 | ######################## 34 | 
## Load cell metadata ## 35 | ######################## 36 | 37 | sample_metadata <- fread(args$metadata) %>% 38 | .[pass_atacQC==TRUE & sample%in%opts$samples] %>% 39 | .[,celltype_genotype:=sprintf("%s-%s",celltype,genotype)] 40 | 41 | stopifnot(args$group_by%in%colnames(sample_metadata)) 42 | sample_metadata <- sample_metadata[!is.na(sample_metadata[[args$group_by]])] 43 | 44 | # Filter celltypes by minimum number of cells 45 | sample_metadata <- sample_metadata[,N:=.N,by=c(args$group_by)] %>% .[N>=args$min_cells] %>% .[,N:=NULL] 46 | 47 | table(sample_metadata[[args$group_by]]) 48 | 49 | ######################## 50 | ## Load ArchR project ## 51 | ######################## 52 | 53 | # source(here::here("atac/archR/load_archR_project.R")) 54 | 55 | setwd(args$archr_directory) 56 | 57 | addArchRGenome("mm10") 58 | addArchRThreads(threads = args$threads) 59 | # Subset the project to the cells kept above; all downstream code refers to this object as ArchRProject.filt 60 | ArchRProject.filt <- loadArchRProject(args$archr_directory)[sample_metadata$cell] 61 | 62 | ########################### 63 | ## Update ArchR metadata ## 64 | ########################### 65 | 66 | sample_metadata.to.archr <- sample_metadata %>% 67 | .[cell%in%rownames(ArchRProject.filt)] %>% setkey(cell) %>% .[rownames(ArchRProject.filt)] %>% 68 | as.data.frame() %>% tibble::column_to_rownames("cell") 69 | 70 | stopifnot(all(rownames(sample_metadata.to.archr) == rownames(getCellColData(ArchRProject.filt)))) 71 | ArchRProject.filt <- addCellColData( 72 | ArchRProject.filt, 73 | data = sample_metadata.to.archr[[args$group_by]], 74 | name = args$group_by, 75 | cells = rownames(sample_metadata.to.archr), 76 | force = TRUE 77 | ) 78 | 79 | # print cell numbers 80 | table(getCellColData(ArchRProject.filt,args$group_by)[[1]]) 81 | 82 | 83 | ################### 84 | ## Export bigwig ## 85 | ################### 86 | 87 | # This function will group, summarize and export a bigwig for each group in an ArchRProject. 
88 | getGroupBW( 89 | ArchRProj = ArchRProject.filt, 90 | groupBy = args$group_by, 91 | # groupBy = "Sample", 92 | normMethod = args$norm_method, 93 | tileSize = args$tile_size, 94 | maxCells = 1000, # default 95 | ceiling = 4 96 | ) 97 | 98 | # Create a completion token next to the exported bigwigs (inside the ArchR directory the project was loaded from) 99 | file.create(file.path(args$archr_directory,sprintf("/GroupBigWigs/%s/completed.txt",args$group_by))) 100 | -------------------------------------------------------------------------------- /atac/archR/chromvar_chip/metacells/differential/celltype/analysis/compare_differential_chromvar_pseudobulk_metacells.R: -------------------------------------------------------------------------------- 1 | here::i_am("atac/archR/chromvar_chip/metacells/differential/celltype/analysis/compare_differential_chromvar_pseudobulk_metacells.R") 2 | 3 | # Load default settings 4 | source(here::here("settings.R")) 5 | source(here::here("utils.R")) 6 | 7 | ##################### 8 | ## Define settings ## 9 | ##################### 10 | 11 | # Options 12 | opts$celltypes <- c("NMP","Epiblast","Gut") 13 | 14 | # I/O 15 | io$basedir <- file.path(io$basedir,"test") 16 | io$diff.pseudobulk <- file.path(io$basedir,"results/atac/archR/chromvar_chip/pseudobulk/differential/celltypes/CISBP") 17 | io$diff.metacells <- file.path(io$basedir,"results/atac/archR/chromvar_chip/metacells/differential/celltypes/CISBP") 18 | # io$outdir <- file.path(io$basedir,sprintf("results/atac/archR/differential/comparison/%s/%s",opts$group_variable, opts$matrix)); dir.create(io$outdir, showWarnings = F, recursive = T) 19 | 20 | ########################################## 21 | ## Load results at the pseudobulk level ## 22 | ########################################## 23 | 24 | # i <- "Epiblast"; j <- "Primitive_Streak" 25 | chromvar_diff_pseudobulk.dt <- opts$celltypes %>% map(function(i) { opts$celltypes %>% map(function(j) { 26 | file <- file.path(io$diff.pseudobulk,sprintf("chromVAR_%s_vs_%s.txt.gz",i,j)) 27 | if (file.exists(file)) { 28 | fread(file, 
select=c(1,2)) %>% 29 | .[,c("celltypeA","celltypeB"):=list(as.factor(i),as.factor(j))] %>% 30 | .[,class:="pseudobulk"] %>% 31 | return 32 | } 33 | }) %>% rbindlist }) %>% rbindlist 34 | 35 | 36 | ######################################## 37 | ## Load results at the metacell level ## 38 | ######################################## 39 | 40 | chromvar_diff_metacells.dt <- opts$celltypes %>% map(function(i) { opts$celltypes %>% map(function(j) { 41 | file <- file.path(io$diff.metacells,sprintf("chromVAR_%s_vs_%s.txt.gz",i,j)) 42 | if (file.exists(file)) { 43 | fread(file, select=c(1,2)) %>% 44 | .[,c("celltypeA","celltypeB"):=list(as.factor(i),as.factor(j))] %>% 45 | .[,class:="metacells"] %>% 46 | return 47 | } 48 | }) %>% rbindlist }) %>% rbindlist 49 | 50 | ################### 51 | ## Sanity checks ## 52 | ################### 53 | 54 | # all(sort(unique(atac_diff_cells.dt$peak))==sort(unique(chromvar_diff_metacells.dt$peak))) 55 | # all(sort(unique(atac_diff_cells.dt$peak))==sort(unique(chromvar_diff_pseudobulk.dt$peak))) 56 | # mean(is.na(chromvar_diff_metacells.dt$diff)) 57 | # mean(is.na(atac_diff_cells.dt$diff)) 58 | # mean(is.na(chromvar_diff_pseudobulk.dt$diff)) 59 | 60 | ########### 61 | ## Merge ## 62 | ########### 63 | 64 | stopifnot(colnames(chromvar_diff_pseudobulk.dt)==colnames(chromvar_diff_metacells.dt)) 65 | 66 | chromvar_diff.dt <- rbindlist(list(chromvar_diff_metacells.dt, chromvar_diff_pseudobulk.dt)) %>% 67 | dcast(gene+celltypeA+celltypeB~class, value.var="diff") 68 | 69 | ########## 70 | ## Plot ## 71 | ########## 72 | 73 | to.plot <- chromvar_diff.dt[celltypeA=="Epiblast" & celltypeB=="NMP"] 74 | 75 | ggscatter(to.plot, x="pseudobulk", y="metacells", size=0.5, add="reg.line", add.params = list(color="blue", fill="lightgray"), conf.int=TRUE) + 76 | labs(x="Differential chromvar (pseudobulk)", y="Differential chromvar (metacells)") 77 | -------------------------------------------------------------------------------- 
/atac/archR/chromvar_chip/metacells/differential/celltype/analysis/old/load_data.R: -------------------------------------------------------------------------------- 1 | # i <- opts$celltypes[2]; j <- opts$celltypes[1] 2 | 3 | diff.dt <- opts$celltypes %>% map(function(i) { opts$celltypes %>% map(function(j) { 4 | file <- sprintf("%s/%s_%s_vs_%s.txt.gz", io$diff.dir,opts$matrix,i,j) 5 | if (file.exists(file)) { 6 | fread(file) %>% .[,c("celltypeA","celltypeB"):=list(as.factor(i),as.factor(j))] %>% return 7 | } 8 | }) %>% rbindlist }) %>% rbindlist %>% 9 | .[,MeanDiff:=-MeanDiff] %>% # change sign to keep the groupB - groupA consistency 10 | .[,sig:=FALSE] %>% .[abs(MeanDiff)>=opts$min.MeanDiff & FDR<=opts$fdr,sig:=TRUE] %>% 11 | .[,direction:=c("up","down")[as.numeric(MeanDiff>0)+1]] # up = higher accessibility in celltype A 12 | 13 | # ad hoc 14 | # if ("name"%in%colnames(atac_diff_cells.dt)) { 15 | # atac_diff_cells.dt[,idx:=NULL] %>% setnames("name","idx") 16 | # } 17 | -------------------------------------------------------------------------------- /atac/archR/chromvar_chip/metacells/differential/old/plot_differential_chromvar_chip_pseudobulk.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages(library(argparse)) 2 | # Declare this script's own location relative to the project root 3 | here::i_am("atac/archR/chromvar_chip/metacells/differential/old/plot_differential_chromvar_chip_pseudobulk.R") 4 | 5 | ################################ 6 | ## Initialize argument parser ## 7 | ################################ 8 | 9 | p <- ArgumentParser(description='') 10 | p$add_argument('--motif_annotation', type="character", help='Motif annotation') 11 | p$add_argument('--chromvar_diff_pseudobulk_dir', type="character", help='Directory with the differential chromVAR-ChIP pseudobulk results') 12 | p$add_argument('--outdir', type="character", help='Output directory') 13 | args <- p$parse_args(commandArgs(TRUE)) 14 | 15 | ##################### 16 | ## Define settings ## 17 | ##################### 18 | 19 | # load default settings 
20 | source(here::here("settings.R")) 21 | source(here::here("utils.R")) 22 | 23 | ## START TEST ## 24 | # args$motif_annotation <- "CISBP" 25 | # args$chromvar_diff_pseudobulk_dir <- file.path(io$basedir,sprintf("results_new/atac/archR/chromvar_chip/pseudobulk/differential/%s",args$motif_annotation)) 26 | # args$outdir <- file.path(io$basedir,sprintf("results_new/atac/archR/chromvar_chip/pseudobulk/differential/%s/pdf",args$motif_annotation)) 27 | ## END TEST ## 28 | 29 | dir.create(args$outdir, showWarnings = F, recursive = T) 30 | 31 | ######################################################## 32 | ## Load precomputed differential chromVAR-ChIP scores ## 33 | ######################################################## 34 | # Input files are named <A>_vs_<B>_<motif_annotation>_chromVAR_chip_pseudobulk.txt.gz, as written by run_differential_chromvar_chip_pseudobulk.R 35 | diff.dt <- (1:length(opts$celltypes)) %>% map(function(i) { 36 | (i:length(opts$celltypes)) %>% map(function(j) { 37 | if (i!=j) { 38 | file <- file.path(args$chromvar_diff_pseudobulk_dir,sprintf("%s_vs_%s_%s_chromVAR_chip_pseudobulk.txt.gz",opts$celltypes[[i]],opts$celltypes[[j]],args$motif_annotation)) 39 | if (file.exists(file)) { 40 | fread(file) %>% 41 | .[,groupA:=factor(opts$celltypes[[i]],levels=opts$celltypes)] %>% 42 | .[,groupB:=factor(opts$celltypes[[j]],levels=opts$celltypes)] %>% 43 | return 44 | } 45 | } 46 | }) %>% rbindlist 47 | }) %>% rbindlist 48 | 49 | ########## 50 | ## Plot ## 51 | ########## 52 | 53 | # celltypes.to.plot <- c("Gut","Erythroid3") 54 | # genes.to.plot <- c("TAL1") 55 | 56 | opts$xlim.max <- 3 57 | opts$xlim.min <- -3 58 | 59 | # i <- "Gut"; j <- "Erythroid3" 60 | for (i in opts$celltypes) { 61 | for (j in opts$celltypes) { 62 | 63 | to.plot <- diff.dt %>% 64 | .[groupA==i & groupB==j] %>% 65 | .[,gene:=factor(gene,levels=rev(gene))] %>% 66 | .[diff>=opts$xlim.max,diff:=opts$xlim.max] %>% 67 | .[diff<=opts$xlim.min,diff:=opts$xlim.min] 68 | if (nrow(to.plot)==0) next # only i<j comparisons are loaded above: skip pairs without results 69 | p <- ggplot(to.plot, aes(x=diff, y=gene)) + 70 | geom_jitter(aes(color=abs(diff), alpha=abs(diff)), width = 0.15) + 71 | ggrepel::geom_text_repel(data=head(to.plot[diff>0],n=10), aes(x=diff, y=gene, 
label=gene), size=4, max.overlaps=Inf) + 72 | ggrepel::geom_text_repel(data=head(to.plot[diff<0],n=10), aes(x=diff, y=gene, label=gene), size=4, max.overlaps=Inf) + 73 | scale_color_gradient(low = "gray80", high = "red") + 74 | scale_alpha_continuous(range=c(0.25,1)) + 75 | coord_cartesian(xlim=c(opts$xlim.min,opts$xlim.max)) + 76 | theme_classic() + 77 | labs(x="Differential motif accessibility (chromVAR)", y="") + 78 | # coord_flip() + 79 | annotate("text", x=opts$xlim.min/1.5, y=75, size=4, label=sprintf("(+) %s",i)) + 80 | annotate("text", x=opts$xlim.max/1.5, y=75, size=4, label=sprintf("(+) %s",j)) + 81 | geom_segment(x=0, xend=0, y=0, yend=nrow(to.plot), color="black", size=0.25, linetype="dashed") + 82 | theme( 83 | legend.position = "none", 84 | axis.text.y = element_blank(), 85 | axis.ticks.y = element_blank(), 86 | axis.text.x = element_text(size=rel(1.0), color="black") 87 | ) 88 | 89 | 90 | pdf(file.path(args$outdir,sprintf("%s_vs_%s_%s_chromVAR_chip_pseudobulk_volcano.pdf",i,j,args$motif_annotation)), width=7, height=5) 91 | print(p) 92 | dev.off() 93 | } 94 | } 95 | 96 | -------------------------------------------------------------------------------- /atac/archR/chromvar_chip/metacells/differential/old/run_differential_chromvar_chip_pseudobulk.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages(library(argparse)) 2 | 3 | here::i_am("atac/archR/chromvar_chip/pseudobulk/differential/run_differential_chromvar_chip_pseudobulk.R") 4 | 5 | ################################ 6 | ## Initialize argument parser ## 7 | ################################ 8 | 9 | p <- ArgumentParser(description='') 10 | p$add_argument('--motif_annotation', type="character", help='Motif annotation') 11 | p$add_argument('--chromvar_chip_pseudobulk', type="character", help='Motif annotation') 12 | p$add_argument('--outdir', type="character", help='Motif annotation') 13 | args <- p$parse_args(commandArgs(TRUE)) 14 | 15 | 
##################### 16 | ## Define settings ## 17 | ##################### 18 | 19 | # load default setings 20 | source(here::here("settings.R")) 21 | source(here::here("utils.R")) 22 | 23 | ## START TEST ## 24 | # args$motif_annotation <- "JASPAR" 25 | # args$chromvar_chip_pseudobulk <- file.path(io$basedir,sprintf("results_new/atac/archR/chromvar_chip/pseudobulk/chromVAR_deviations_%s_archr_chip.rds",args$motif_annotation)) 26 | # args$outdir <- file.path(io$basedir,sprintf("results_new/atac/archR/chromvar_chip/pseudobulk/differential/%s",args$motif_annotation)) 27 | ## END TEST ## 28 | 29 | # I/O 30 | dir.create(args$outdir, showWarnings = F) 31 | 32 | ##################################### 33 | ## Load pseudobulk chromVAR scores ## 34 | ##################################### 35 | 36 | chromvar_deviations_pseudobulk.se <- readRDS(args$chromvar_chip_pseudobulk) 37 | 38 | ###################################### 39 | ## Differential motif accessibility ## 40 | ###################################### 41 | 42 | # i <- 1; j <- 2 43 | for (i in 1:length(opts$celltypes)) { 44 | for (j in i:length(opts$celltypes)) { 45 | if (i!=j) { 46 | foo <- assay(chromvar_deviations_pseudobulk.se[,opts$celltypes[[j]]])[,1] 47 | bar <- assay(chromvar_deviations_pseudobulk.se[,opts$celltypes[[i]]])[,1] 48 | 49 | chromvar_diff.dt <- data.table( 50 | gene = names(foo), 51 | diff = round(foo-bar,2) 52 | # groupA = opts$celltypes[[i]], 53 | # groupB = opts$celltypes[[j]] 54 | ) %>% sort.abs("diff") 55 | 56 | # save 57 | outfile <- file.path(args$outdir,sprintf("%s_vs_%s_%s_chromVAR_chip_pseudobulk.txt.gz",opts$celltypes[[i]],opts$celltypes[[j]],args$motif_annotation)) 58 | fwrite(chromvar_diff.dt, outfile, sep="\t") 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /atac/archR/chromvar_chip/pseudobulk/differential/celltype/differential_chromvar_pseudobulk.R: 
-------------------------------------------------------------------------------- 1 | here::i_am("atac/archR/chromvar_chip/pseudobulk/differential/celltype/differential_chromvar_pseudobulk.R") 2 | 3 | source(here::here("settings.R")) 4 | source(here::here("utils.R")) 5 | 6 | ###################### 7 | ## Define arguments ## 8 | ###################### 9 | 10 | p <- ArgumentParser(description='') 11 | p$add_argument('--motif_annotation', type="character", help='') 12 | p$add_argument('--chromvar_chip', type="character", help='') 13 | p$add_argument('--groupA', type="character", help='group A') 14 | p$add_argument('--groupB', type="character", help='group B') 15 | p$add_argument('--outfile', type="character", help='Output file') 16 | 17 | args <- p$parse_args(commandArgs(TRUE)) 18 | 19 | ## START TEST ## 20 | # io$basedir <- file.path(io$basedir,"test") 21 | # args <- list() 22 | # args$motif_annotation <- "JASPAR" 23 | # args$chromvar_chip <- file.path(io$basedir,sprintf("results/atac/archR/chromvar_chip/pseudobulk_with_replicates/chromVAR_chip_%s_archr.rds",args$motif_annotation)) 24 | # args$groupA <- "ExE_ectoderm" 25 | # args$groupB <- "Caudal_neurectoderm" 26 | # args$outfile <- file.path(io$basedir,sprintf("results/atac/archR/chromvar_chip/pseudobulk_with_replicates/differential/celltype/chromVAR_%s_vs_%s.txt.gz",args$groupA,args$groupB)) 27 | ## END TEST ## 28 | 29 | ##################### 30 | ## Define settings ## 31 | ##################### 32 | 33 | # I/O 34 | dir.create(dirname(args$outfile), showWarnings=F, recursive = T) 35 | 36 | # Options 37 | opts$groups <- c(args$groupA,args$groupB) 38 | 39 | # placeholder output (same columns as the real results: gene, diff, padj) so that the snakemake rule still produces its file 40 | if (args$groupA==args$groupB) { 41 | out <- data.table(gene=NA, diff=NA, padj=NA) 42 | fwrite(out, args$outfile, sep="\t", na="NA", quote=F) 43 | warning("groupA and groupB are the same, saving an empty file...") 44 | quit(status=0) 45 | } 46 | 47 | ########################### 48 | ## 
Load chromVAR results ## 49 | ########################### 50 | 51 | print(sprintf("Fetching chromVAR results: '%s'...",args$chromvar_chip)) 52 | 53 | # Load 54 | atac_chromvar.se <- readRDS(args$chromvar_chip) 55 | 56 | # parse 57 | if (!"celltype"%in%colnames(colData(atac_chromvar.se))) { 58 | atac_chromvar.se$celltype <- colnames(atac_chromvar.se) %>% strsplit("_rep") %>% map_chr(1) 59 | } 60 | 61 | atac_chromvar.se <- atac_chromvar.se[,atac_chromvar.se$celltype%in%opts$groups] 62 | atac_chromvar.se$celltype <- factor(atac_chromvar.se$celltype, levels=opts$groups) 63 | 64 | # check that we have pseudobulk replicates (at least two samples, as required by t.test) for both cell types; table() on the factor also counts absent groups as 0 65 | if (any(table(atac_chromvar.se$celltype)<2)) { 66 | warning("fewer than two pseudobulk replicates for at least one group, saving an empty file...") 67 | out <- data.table(gene=NA, diff=NA, padj=NA) 68 | fwrite(out, args$outfile, sep="\t", na="NA", quote=F) 69 | quit(status=0) 70 | } 71 | 72 | # Create data.table 73 | atac_chromvar.dt <- assay(atac_chromvar.se,"z") %>% t %>% 74 | as.data.table(keep.rownames = T) %>% 75 | setnames("rn","sample") %>% 76 | melt(id.vars=c("sample"), variable.name="gene", value.name="chromvar_zscore") 77 | 78 | tmp <- data.table(sample=colnames(atac_chromvar.se), group=atac_chromvar.se$celltype) 79 | atac_chromvar.dt <- atac_chromvar.dt %>% merge(tmp[,c("sample","group")]) 80 | 81 | ########################## 82 | ## Differential testing ## 83 | ########################## 84 | 85 | out <- atac_chromvar.dt %>% .[,.( 86 | diff = mean(.SD[group==opts$groups[2],chromvar_zscore]) - mean(.SD[group==opts$groups[1],chromvar_zscore]), 87 | p.value = t.test(x=.SD[group==opts$groups[1],chromvar_zscore], y=.SD[group==opts$groups[2],chromvar_zscore])[["p.value"]] 88 | ), by="gene"] %>% 89 | .[,padj:=p.adjust(p.value,method="fdr")] %>% .[,p.value:=NULL] %>% 90 | setorder(padj, na.last=T) %>% 91 | .[,c("diff","padj"):=list(round(diff,2),format(padj,digits=3))] 92 | 93 | ################## 94 | ## Save results ## 95 | ################## 96 
| 97 | fwrite(out, args$outfile, sep="\t", na="NA", quote=F) 98 | -------------------------------------------------------------------------------- /atac/archR/chromvar_chip/pseudobulk/differential/celltype/parse_differential_results.R: -------------------------------------------------------------------------------- 1 | here::i_am("atac/archR/chromvar_chip/pseudobulk/differential/celltype/parse_differential_results.R") 2 | 3 | # Load default settings 4 | source(here::here("settings.R")) 5 | 6 | ###################### 7 | ## Define arguments ## 8 | ###################### 9 | 10 | p <- ArgumentParser(description='') 11 | p$add_argument('--diff_results_dir', type="character", help='Directory with the pairwise differential results') 12 | p$add_argument('--outfile', type="character", help='Output file') 13 | args <- p$parse_args(commandArgs(TRUE)) 14 | 15 | ## START TEST ## 16 | # io$basedir <- file.path(io$basedir,"test") 17 | # args <- list() 18 | # args$diff_results_dir <- file.path(io$basedir,"results/atac/archR/chromvar_chip/pseudobulk_with_replicates/differential/celltypes/CISBP") 19 | # args$outfile <- file.path(io$basedir,"results/atac/archR/chromvar_chip/pseudobulk_with_replicates/differential/celltypes/CISBP/diff_results.txt.gz") 20 | ## END TEST ## 21 | 22 | # I/O 23 | dir.create(dirname(args$outfile), showWarnings=F, recursive=T) 24 | 25 | ################################################ 26 | ## Load pairwise differential chromVAR scores ## 27 | ################################################ 28 | 29 | diff_results_list <- list() 30 | 31 | # i <- 1; j <- 2 (i and j are indices into opts$celltypes) 32 | for (i in 1:length(opts$celltypes)) { 33 | for (j in i:length(opts$celltypes)) { 34 | 35 | if (i!=j) { 36 | file <- file.path(args$diff_results_dir,sprintf("%s_vs_%s.txt.gz",opts$celltypes[[i]],opts$celltypes[[j]])) 37 | if (file.exists(file)) { 38 | tmp <- fread(file) %>% .[,c("celltypeA","celltypeB"):=list(opts$celltypes[[i]],opts$celltypes[[j]])] 39 | 
diff_results_list[[sprintf("%s_vs_%s",opts$celltypes[[i]],opts$celltypes[[j]])]] <- tmp 40 | } else { 41 | print(sprintf("%s not found...",file)) 42 | } 43 | } 44 | } 45 | } 46 | 47 | ########## 48 | ## Save ## 49 | ########## 50 | 51 | fwrite(rbindlist(diff_results_list), args$outfile, sep="\t", quote=F, na="NA") 52 | -------------------------------------------------------------------------------- /atac/archR/differential/cells/celltype/analysis/load_data.R: -------------------------------------------------------------------------------- 1 | # i <- opts$celltypes[2]; j <- opts$celltypes[1] 2 | 3 | diff.dt <- opts$celltypes %>% map(function(i) { opts$celltypes %>% map(function(j) { 4 | file <- sprintf("%s/%s_%s_vs_%s.txt.gz", io$diff.dir,opts$matrix,i,j) 5 | if (file.exists(file)) { 6 | fread(file) %>% .[,c("celltypeA","celltypeB"):=list(as.factor(i),as.factor(j))] %>% return 7 | } 8 | }) %>% rbindlist }) %>% rbindlist %>% 9 | .[,MeanDiff:=-MeanDiff] %>% # change sign to keep the groupB - groupA consistency 10 | .[,sig:=FALSE] %>% .[abs(MeanDiff)>=opts$min.MeanDiff & FDR<=opts$fdr,sig:=TRUE] %>% 11 | .[,direction:=c("up","down")[as.numeric(MeanDiff>0)+1]] # up = higher accessibility in celltype A 12 | 13 | # ad hoc 14 | # if ("name"%in%colnames(atac_diff_cells.dt)) { 15 | # atac_diff_cells.dt[,idx:=NULL] %>% setnames("name","idx") 16 | # } 17 | -------------------------------------------------------------------------------- /atac/archR/differential/cells/celltype/analysis/old/GeneScoreMatrix/define_marker_genes.R: -------------------------------------------------------------------------------- 1 | ##################### 2 | ## Define settings ## 3 | ##################### 4 | 5 | # Load default settings 6 | source(here::here("settings.R")) 7 | source(here::here("utils.R")) 8 | 9 | # I/O 10 | io$archR.diff.dir <- file.path(io$basedir,"results_new/atac/archR/differential/GeneScoreMatrix_TSS") 11 | io$outdir <- 
file.path(io$basedir,"results_new/atac/archR/differential/GeneScoreMatrix_TSS/markers"); dir.create(io$outdir, showWarnings = F) 12 | 13 | # Options 14 | # opts$groups <- strsplit(list.files(io$diff.dir, pattern="*.gz"),"_vs_") %>% map(~ .[[1]]) %>% unlist %>% unique 15 | opts$celltypes <- c( 16 | "Epiblast", 17 | "Primitive_Streak", 18 | "Caudal_epiblast", 19 | "PGC", 20 | "Anterior_Primitive_Streak", 21 | "Notochord", 22 | "Def._endoderm", 23 | "Gut", 24 | "Nascent_mesoderm", 25 | "Mixed_mesoderm", 26 | "Intermediate_mesoderm", 27 | "Caudal_Mesoderm", 28 | "Paraxial_mesoderm", 29 | "Somitic_mesoderm", 30 | "Pharyngeal_mesoderm", 31 | "Cardiomyocytes", 32 | "Allantois", 33 | "ExE_mesoderm", 34 | "Mesenchyme", 35 | "Haematoendothelial_progenitors", 36 | "Endothelium", 37 | "Blood_progenitors_1", 38 | "Blood_progenitors_2", 39 | "Erythroid1", 40 | "Erythroid2", 41 | "Erythroid3", 42 | "NMP", 43 | "Rostral_neurectoderm", 44 | "Caudal_neurectoderm", 45 | "Neural_crest", 46 | "Forebrain_Midbrain_Hindbrain", 47 | "Spinal_cord", 48 | "Surface_ectoderm", 49 | "Visceral_endoderm", 50 | "ExE_endoderm", 51 | "ExE_ectoderm", 52 | "Parietal_endoderm" 53 | )# %>% head(n=4) 54 | 55 | opts$min.MeanDiff <- 0.10 56 | opts$fdr <- 0.01 57 | 58 | # Minimum fraction of significant differential pairwise comparisons 59 | opts$score <- 0.75 60 | 61 | ################## 62 | ## Load results ## 63 | ################## 64 | 65 | dt <- opts$celltypes %>% map(function(i) { opts$celltypes %>% map(function(j) { 66 | file <- sprintf("%s/GeneScoreMatrix_TSS_%s_vs_%s.txt.gz", io$archR.diff.dir,i,j) 67 | if (file.exists(file)) { 68 | fread(file) %>% .[,c("celltypeA","celltypeB"):=list(as.factor(i),as.factor(j))] %>% 69 | return 70 | } 71 | }) %>% rbindlist }) %>% rbindlist %>% 72 | .[,sig:=FALSE] %>% .[abs(MeanDiff)>=opts$min.MeanDiff & FDR<=opts$fdr,sig:=TRUE] %>% 73 | .[,direction:=c("up","down")[as.numeric(MeanDiff<0)+1]] # up = higher accessibility in celltype A 74 | 75 | ncelltypes <- 
length(intersect(unique(dt$celltypeA),unique(dt$celltypeB))) 76 | 77 | ######################### 78 | ## Define marker genes ## 79 | ######################### 80 | 81 | foo <- dt[,.(score=sum(sig==T & direction=="up")), by=c("celltypeA","name")] %>% setnames("celltypeA","celltype") 82 | bar <- dt[,.(score=sum(sig==T & direction=="down")), by=c("celltypeB","name")] %>% setnames("celltypeB","celltype") 83 | 84 | markers_peaks.dt <- merge(foo,bar,by=c("celltype","name"), all=TRUE) %>% .[,score:=score.x+score.y] %>% 85 | .[,c("score.x","score.y"):=NULL] %>% 86 | .[,score:=round(score/(ncelltypes+1),2)] %>% 87 | .[score>=opts$score] %>% 88 | setorder(celltype,-score) 89 | rm(foo,bar) 90 | 91 | length(unique(markers_peaks.dt$name)) 92 | 93 | ########## 94 | ## Save ## 95 | ########## 96 | 97 | fwrite(markers_peaks.dt, file.path(io$outdir,"marker_genes.txt.gz")) 98 | 99 | -------------------------------------------------------------------------------- /atac/archR/differential/cells/celltype/analysis/old/PeakMatrix/define_marker_peaks.R: -------------------------------------------------------------------------------- 1 | here::i_am("atac/archR/differential/analysis/PeakMatrix/define_marker_peaks.R") 2 | 3 | source(here::here("settings.R")) 4 | source(here::here("utils.R")) 5 | 6 | ############# 7 | ## Options ## 8 | ############# 9 | 10 | opts$matrix <- "PeakMatrix" 11 | opts$group_variable <- "celltype.mapped" 12 | opts$min.MeanDiff <- 0.10 13 | opts$fdr <- 0.01 14 | opts$score <- 0.75 # Minimum fraction of significant differential pairwise comparisons 15 | 16 | ######### 17 | ## I/O ## 18 | ######### 19 | 20 | io$diff.dir <- file.path(io$basedir,sprintf("results/atac/archR/differential/%s/%s",opts$group_variable,opts$matrix)) 21 | io$outdir <- file.path(io$basedir,sprintf("results/atac/archR/differential/%s/%s/markers",opts$group_variable,opts$matrix)); dir.create(io$outdir, showWarnings = F) 22 | 23 | ################## 24 | ## Load results ## 25 | ################## 26 
| 27 | source(here::here("atac/archR/differential/analysis/load_data.R")) 28 | 29 | ################### 30 | ## Sanity checks ## 31 | ################### 32 | 33 | # Load stats 34 | diff_stats.dt <- fread(file.path(io$diff.dir,"diff_stats.txt")) %>% setnames(c("celltypeA","celltypeB","N_groupA","N_groupB")) 35 | 36 | # check if some DA comparison is missing 37 | tmp <- diff_stats.dt %>% 38 | merge(diff.dt[,c("celltypeA","celltypeB")] %>% unique %>% .[,done:=TRUE], all.x=TRUE, by=c("celltypeA","celltypeB")) %>% 39 | .[is.na(done),done:=FALSE] 40 | stopifnot(tmp$done==TRUE) 41 | 42 | ######################### 43 | ## Define marker genes ## 44 | ######################### 45 | 46 | ncelltypes <- unique(c(as.character(unique(diff.dt$celltypeA)),as.character(unique(diff.dt$celltypeB)))) %>% length 47 | 48 | foo <- diff.dt[,.(score=sum(sig==T & direction=="up")), by=c("celltypeA","idx")] %>% setnames("celltypeA","celltype") 49 | bar <- diff.dt[,.(score=sum(sig==T & direction=="down")), by=c("celltypeB","idx")] %>% setnames("celltypeB","celltype") 50 | 51 | markers_peaks.dt <- merge(foo,bar,by=c("celltype","idx"), all=TRUE) %>% .[,score:=score.x+score.y] %>% 52 | .[,c("score.x","score.y"):=NULL] %>% 53 | # .[,score:=round(score/(ncelltypes+1),2)] %>% 54 | .[,score:=round(score/(ncelltypes-1),2)] %>% 55 | setorder(celltype,-score) 56 | # rm(foo,bar) 57 | 58 | stopifnot(max(markers_peaks.dt$score,na.rm=T)==1) 59 | 60 | 61 | ############################################## 62 | ## Add MeanDiff values from pseudobulk data ## 63 | ############################################## 64 | 65 | diff_pseudobulk.dt <- file.path(io$basedir,"results/atac/archR/differential/pseudobulk/celltype.mapped/PeakMatrix/differential_atac_PeakMatrix_pseudobulk_summary.txt.gz") %>% fread 66 | markers_peaks.dt <- markers_peaks.dt %>% merge(diff_pseudobulk.dt, by=c("celltype","idx")) 67 | 68 | ########## 69 | ## Save ## 70 | ########## 71 | 72 | # Save marker score for all combination of genes and cell 
types 73 | length(unique(markers_peaks.dt$idx)) 74 | length(unique(markers_peaks.dt$celltype)) 75 | fwrite(markers_peaks.dt, file.path(io$outdir,"marker_peaks_upregulated_all.txt.gz"), sep="\t") 76 | 77 | # Save marker score for strong markers 78 | markers_peaks_filt.dt <- markers_peaks.dt %>% .[score>=opts$score & diff>=opts$min.MeanDiff] 79 | length(unique(markers_peaks_filt.dt$idx)) 80 | length(unique(markers_peaks_filt.dt$celltype)) 81 | fwrite(markers_peaks_filt.dt, file.path(io$outdir,"marker_peaks_upregulated_filtered.txt.gz"), sep="\t") 82 | 83 | -------------------------------------------------------------------------------- /atac/archR/differential/cells/celltype/analysis/old/browser_plot_archR.R: -------------------------------------------------------------------------------- 1 | #################### 2 | ## Browser tracks ## 3 | #################### 4 | 5 | # i <- "chr2:39483639-39484239" 6 | 7 | opts$extend.upstream <- 2500 8 | opts$extend.downstream <- 2500 9 | opts$tileSize <- 50 10 | 11 | # Ugly hack 12 | # rename <- paste(1:length(opts$celltypes),opts$celltypes,sep="_") 13 | # names(rename) <- opts$celltypes 14 | # ArchRProject.filt$celltype.predicted <- stringr::str_replace_all(ArchRProject.filt$celltype.predicted,rename) 15 | 16 | for (i in unique(markers_peaks.dt$idx) %>% head(n=5)) { 17 | 18 | # Fetch GRanges 19 | to.plot <- peakset.gr[peakset.gr$idx==i] 20 | start(to.plot) <- start(to.plot) - opts$extend.upstream 21 | end(to.plot) <- end(to.plot) + opts$extend.downstream 22 | 23 | # Plot 24 | p <- plotBrowserTrack( 25 | ArchRProj = ArchRProject.filt, 26 | region = to.plot, 27 | groupBy = "celltype.predicted", 28 | tileSize = opts$tileSize, 29 | pal = opts$celltype.colors, 30 | plotSummary = c("bulkTrack", "featureTrack", "geneTrack"), 31 | sizes = c(10, 1.5, 1.5), 32 | ) 33 | 34 | # grid::grid.newpage() 35 | 36 | pdf(sprintf("%s/%s_BrowserTrack.pdf",io$outdir,gsub(":","-",i)), width = 9, height = 5) 37 | grid::grid.draw(p) 38 | dev.off() 39 | } 
-------------------------------------------------------------------------------- /atac/archR/differential/cells/celltype/parse_differential_results.R: -------------------------------------------------------------------------------- 1 | here::i_am("atac/archR/differential/cells/celltype/parse_differential_results.R") 2 | 3 | # Load default settings 4 | source(here::here("settings.R")) 5 | 6 | ###################### 7 | ## Define arguments ## 8 | ###################### 9 | 10 | p <- ArgumentParser(description='') 11 | p$add_argument('--diff_results_dir', type="character", help='File') 12 | p$add_argument('--min_cells', type="integer", default=5, help='Minimum number of cells per group') 13 | p$add_argument('--outdir', type="character", help='File') 14 | args <- p$parse_args(commandArgs(TRUE)) 15 | 16 | ## START TEST ## 17 | # io$basedir <- file.path(io$basedir,"test") 18 | # args <- list() 19 | # args$diff_results_dir <- file.path(io$basedir,"results/atac/archR/differential/cells/celltype/PeakMatrix") 20 | # args$min_cells <- 5 21 | # args$outdir <- file.path(io$basedir,"results/atac/archR/differential/cells/celltype/PeakMatrix/parsed") 22 | ## END TEST ## 23 | 24 | # I/O 25 | dir.create(args$outdir, showWarnings=F, recursive=T) 26 | 27 | ################################################ 28 | ## Load differential expression and fetch TFs ## 29 | ################################################ 30 | 31 | stats.dt <- data.table(celltypeA=as.character(NA), celltypeB=as.character(NA), groupA_N=as.integer(NA), groupB_N=as.integer(NA), included=as.logical(NA)) 32 | diff_results_list <- list() 33 | 34 | # i <- "Visceral_endoderm"; j <- "Surface_ectoderm" 35 | for (i in 1:length(opts$celltypes)) { 36 | for (j in i:length(opts$celltypes)) { 37 | 38 | if (i!=j) { 39 | file <- file.path(args$diff_results_dir,sprintf("%s_vs_%s.txt.gz",opts$celltypes[[i]],opts$celltypes[[j]])) 40 | if (file.exists(file)) { 41 | tmp <- fread(file) %>% 42 | 
.[,c("celltypeA","celltypeB"):=list(opts$celltypes[[i]],opts$celltypes[[j]])] 43 | 44 | if (tmp[celltypeA==opts$celltypes[[i]],groupA_N][1]>=args$min_cells & tmp[celltypeB==opts$celltypes[[j]],groupB_N][1]>=args$min_cells) { 45 | stats.dt <- rbind(stats.dt,data.table(celltypeA=opts$celltypes[[i]], celltypeB=opts$celltypes[[j]], groupA_N=tmp[celltypeA==opts$celltypes[[i]],groupA_N][1], groupB_N=tmp[celltypeB==opts$celltypes[[j]],groupB_N][1], included=TRUE)) 46 | diff_results_list[[sprintf("%s_vs_%s",opts$celltypes[[i]],opts$celltypes[[j]])]] <- tmp 47 | } else { 48 | stats.dt <- rbind(stats.dt,data.table(celltypeA=opts$celltypes[[i]], celltypeB=opts$celltypes[[j]], groupA_N=tmp[celltypeA==opts$celltypes[[i]],groupA_N][1], groupB_N=tmp[celltypeB==opts$celltypes[[j]],groupB_N][1], included=FALSE)) 49 | } 50 | } else { 51 | print(sprintf("%s not found...",file)) 52 | } 53 | } 54 | } 55 | } 56 | 57 | ########## 58 | ## Save ## 59 | ########## 60 | 61 | fwrite(stats.dt[-1], file.path(args$outdir,"diff_stats.txt.gz"), sep="\t", quote=F, na="NA") 62 | fwrite(rbindlist(diff_results_list), file.path(args$outdir,"diff_results.txt.gz"), sep="\t", quote=F, na="NA") 63 | 64 | -------------------------------------------------------------------------------- /atac/archR/differential/cells/genotype/analysis/load_data.R: -------------------------------------------------------------------------------- 1 | 2 | ############################################# 3 | ## Load results from differential analysis ## 4 | ############################################# 5 | 6 | diff.dt <- opts$celltypes %>% map(function(j) { 7 | file <- sprintf("%s/%s_WT_vs_KO.txt.gz", io$indir,j) 8 | if (file.exists(file)) { 9 | fread(file, select=c(1,2,3)) %>% .[,c("celltype"):=list(j)] 10 | } 11 | }) %>% rbindlist %>% 12 | # .[,MeanDiff:=-MeanDiff] # change sign to keep the groupB - groupA consistency 13 | .[, sign := ifelse(MeanDiff>0,"Upregulated in KO","Downregulated in KO")] %>% 14 | .[, sig := 
(FDR<=opts$threshold_fdr & abs(MeanDiff)>=opts$min.MeanDiff)] 15 | 16 | 17 | # Print stats 18 | print(sprintf("Number of celltypes: %s",length(unique(diff.dt$celltype)))) 19 | print(sprintf("Number of features: %s",length(unique(diff.dt$idx)))) 20 | 21 | -------------------------------------------------------------------------------- /atac/archR/differential/cells/genotype/run_diff_acc_genotype.R: -------------------------------------------------------------------------------- 1 | here::i_am("atac/archR/differential/cells/genotype/run_diff_acc_genotype.R") 2 | 3 | source(here::here("settings.R")) 4 | 5 | ###################### 6 | ## Define arguments ## 7 | ###################### 8 | 9 | p <- ArgumentParser(description='') 10 | p$add_argument('--archr_directory', type="character", help='ArchR directory') 11 | p$add_argument('--metadata', type="character", help='') 12 | p$add_argument('--matrix', type="character", default="PeakMatrix", help='Matrix to use') 13 | # p$add_argument('--group_variable', type="character", help='') 14 | p$add_argument('--outdir', type="character", help='Output directory') 15 | p$add_argument('--min_cells', type="integer", default=50, help='Minimum number of cells per cell type') 16 | p$add_argument('--test_mode', action="store_true", help='Test mode? 
subset data') 17 | 18 | args <- p$parse_args(commandArgs(TRUE)) 19 | 20 | ## START TEST ## 21 | # io$basedir <- file.path(io$basedir,"test") 22 | # args <- list() 23 | # args$archr_directory <- file.path(io$basedir,"processed/atac/archR") 24 | # args$metadata <- file.path(io$basedir,"results/atac/archR/qc/sample_metadata_after_qc.txt.gz") 25 | # args$matrix <- "PeakMatrix" # "GeneScoreMatrix_TSS" 26 | # # args$group_variable <- "celltype_genotype" 27 | # args$min_cells <- 30 28 | # args$outdir <- file.path(io$basedir,sprintf("results/atac/archR/differential/cells/celltype_genotype/%s",args$matrix)) 29 | # args$test_mode <- TRUE 30 | ## END TEST ## 31 | 32 | ##################### 33 | ## Define settings ## 34 | ##################### 35 | 36 | # I/O 37 | io$script <- here::here("atac/archR/differential/cells/archr_differential_accessibility_cells.R") 38 | dir.create(args$outdir, showWarnings=FALSE, recursive=TRUE) 39 | 40 | # Options 41 | opts$statistical.test <- "wilcoxon" 42 | 43 | opts$samples <- c( 44 | "E8.5_CRISPR_T_KO", 45 | "E8.5_CRISPR_T_WT" 46 | ) 47 | 48 | ######################## 49 | ## Load cell metadata ## 50 | ######################## 51 | 52 | cells_metadata.dt <- fread(args$metadata) %>% 53 | .[sample%in%opts$samples & pass_atacQC==TRUE & !is.na(genotype) & !is.na(celltype)] %>% 54 | .[,celltype_genotype:=sprintf("%s_%s",celltype,genotype)] 55 | 56 | # cells_metadata.dt <- cells_metadata.dt %>% 57 | # .[,group:=eval(as.name(args$group_variable))] %>% 58 | 59 | # Only consider cell types with sufficient number of cellls 60 | stats.dt <- cells_metadata.dt[,.N,by=c("celltype","genotype")] %>% dcast(celltype~genotype, value.var="N", fill=0) 61 | celltypes.to.use <- stats.dt[T_KO>=args$min_cells & WT>=args$min_cells,celltype] 62 | stats.dt <- stats.dt[celltype%in%celltypes.to.use] 63 | print(stats.dt) 64 | 65 | ######### 66 | ## Run ## 67 | ######### 68 | 69 | if (args$test_mode) { 70 | print("Test mode activated, running only a few comparisons...") 71 | 
celltypes.to.use <- celltypes.to.use %>% head(n=3) 72 | } 73 | 74 | # j <- "NMP" 75 | for (j in celltypes.to.use) { 76 | outfile <- sprintf("%s/%s_WT_vs_KO.txt.gz", args$outdir,j); dir.create(dirname(outfile), showWarnings = F) 77 | if (!file.exists(outfile)) { 78 | 79 | # Define the job submission prefix (empty string = run the command locally) 80 | if (grepl("BI",Sys.info()['nodename'])) { 81 | lsf <- "" 82 | } else if (grepl("pebble|headstone", Sys.info()['nodename'])) { 83 | lsf <- sprintf("sbatch -n 1 --mem 8G --wrap") 84 | } else { warning("Unrecognised host, running the comparison locally..."); lsf <- "" } 85 | cmd <- sprintf("Rscript %s --archr_directory %s --metadata %s --samples %s --celltypes %s --groupA WT --groupB T_KO --matrix %s --group_variable genotype --statistical_test %s --outfile %s", 86 | io$script, args$archr_directory, args$metadata, paste(opts$samples,collapse=" "), j, args$matrix, opts$statistical.test, outfile) 87 | if (nzchar(lsf)) cmd <- sprintf("%s '%s'", lsf, cmd) # quote the command only as the sbatch --wrap payload; quoted on its own the shell would treat it as a single program name 88 | # Run 89 | print(cmd) 90 | system(cmd) 91 | } 92 | } 93 | 94 | 95 | # Save stats 96 | fwrite(stats.dt, file.path(args$outdir,"diff_stats.txt"), sep="\t", quote=F) 97 | 98 | # Completion token 99 | file.create(file.path(args$outdir,"completed.txt")) -------------------------------------------------------------------------------- /atac/archR/differential/metacells/celltype/analysis/old/load_data.R: -------------------------------------------------------------------------------- 1 | # i <- opts$celltypes[2]; j <- opts$celltypes[1] 2 | 3 | diff.dt <- opts$celltypes %>% map(function(i) { opts$celltypes %>% map(function(j) { 4 | file <- sprintf("%s/%s_%s_vs_%s.txt.gz", io$diff.dir,opts$matrix,i,j) 5 | if (file.exists(file)) { 6 | fread(file) %>% .[,c("celltypeA","celltypeB"):=list(as.factor(i),as.factor(j))] %>% return 7 | } 8 | }) %>% rbindlist }) %>% rbindlist %>% 9 | .[,MeanDiff:=-MeanDiff] %>% # change sign to keep the groupB - groupA consistency 10 | .[,sig:=FALSE] %>% .[abs(MeanDiff)>=opts$min.MeanDiff & FDR<=opts$fdr,sig:=TRUE] %>% 11 | .[,direction:=c("up","down")[as.numeric(MeanDiff>0)+1]] # up = higher accessibility in celltype A 12 | 13 | # ad 
hoc 14 | # if ("name"%in%colnames(atac_diff_cells.dt)) { 15 | # atac_diff_cells.dt[,idx:=NULL] %>% setnames("name","idx") 16 | # } 17 | -------------------------------------------------------------------------------- /atac/archR/differential/metacells/celltype/parse_differential_results.R: -------------------------------------------------------------------------------- 1 | here::i_am("atac/archR/differential/metacells/celltype/parse_differential_results.R") 2 | 3 | # Load default settings 4 | source(here::here("settings.R")) 5 | 6 | ###################### 7 | ## Define arguments ## 8 | ###################### 9 | 10 | p <- ArgumentParser(description='') 11 | p$add_argument('--diff_results_dir', type="character", help='File') 12 | p$add_argument('--min_cells', type="integer", default=5, help='Minimum number of cells per group') 13 | p$add_argument('--outdir', type="character", help='File') 14 | args <- p$parse_args(commandArgs(TRUE)) 15 | 16 | ## START TEST ## 17 | # io$basedir <- file.path(io$basedir,"test") 18 | # args <- list() 19 | # args$diff_results_dir <- file.path(io$basedir,"results/atac/archR/differential/metacells/celltype/PeakMatrix") 20 | # args$min_cells <- 5 21 | # args$outdir <- file.path(io$basedir,"results/atac/archR/differential/metacells/celltype/PeakMatrix/parsed") 22 | ## END TEST ## 23 | 24 | # I/O 25 | dir.create(args$outdir, showWarnings=F, recursive=T) 26 | 27 | ################################################ 28 | ## Load differential expression and fetch TFs ## 29 | ################################################ 30 | 31 | stats.dt <- data.table(celltypeA=as.character(NA), celltypeB=as.character(NA), groupA_N=as.integer(NA), groupB_N=as.integer(NA), included=as.logical(NA)) 32 | diff_results_list <- list() 33 | 34 | # i <- "Visceral_endoderm"; j <- "Surface_ectoderm" 35 | for (i in 1:length(opts$celltypes)) { 36 | for (j in i:length(opts$celltypes)) { 37 | 38 | if (i!=j) { 39 | file <- 
file.path(args$diff_results_dir,sprintf("%s_vs_%s.txt.gz",opts$celltypes[[i]],opts$celltypes[[j]])) 40 | if (file.exists(file)) { 41 | tmp <- fread(file) %>% 42 | .[,c("celltypeA","celltypeB"):=list(opts$celltypes[[i]],opts$celltypes[[j]])] 43 | 44 | if (tmp[celltypeA==opts$celltypes[[i]],groupA_N][1]>=args$min_cells & tmp[celltypeB==opts$celltypes[[j]],groupB_N][1]>=args$min_cells) { 45 | stats.dt <- rbind(stats.dt,data.table(celltypeA=opts$celltypes[[i]], celltypeB=opts$celltypes[[j]], groupA_N=tmp[celltypeA==opts$celltypes[[i]],groupA_N][1], groupB_N=tmp[celltypeB==opts$celltypes[[j]],groupB_N][1], included=TRUE)) 46 | diff_results_list[[sprintf("%s_vs_%s",opts$celltypes[[i]],opts$celltypes[[j]])]] <- tmp 47 | } else { 48 | stats.dt <- rbind(stats.dt,data.table(celltypeA=opts$celltypes[[i]], celltypeB=opts$celltypes[[j]], groupA_N=tmp[celltypeA==opts$celltypes[[i]],groupA_N][1], groupB_N=tmp[celltypeB==opts$celltypes[[j]],groupB_N][1], included=FALSE)) 49 | } 50 | } else { 51 | print(sprintf("%s not found...",file)) 52 | } 53 | } 54 | } 55 | } 56 | 57 | ########## 58 | ## Save ## 59 | ########## 60 | 61 | fwrite(stats.dt[-1], file.path(args$outdir,"diff_stats.txt.gz"), sep="\t", quote=F, na="NA") 62 | fwrite(rbindlist(diff_results_list), file.path(args$outdir,"diff_results.txt.gz"), sep="\t", quote=F, na="NA") 63 | 64 | -------------------------------------------------------------------------------- /atac/archR/differential/metacells/genotype/analysis/load_data.R: -------------------------------------------------------------------------------- 1 | 2 | ############################################# 3 | ## Load results from differential analysis ## 4 | ############################################# 5 | 6 | diff.dt <- opts$celltypes %>% map(function(j) { 7 | file <- sprintf("%s/%s_WT_vs_KO.txt.gz", io$indir,j) 8 | if (file.exists(file)) { 9 | fread(file, select=c(1,2,3)) %>% .[,c("celltype"):=list(j)] 10 | } 11 | }) %>% rbindlist %>% 12 | # .[,MeanDiff:=-MeanDiff] # 
change sign to keep the groupB - groupA consistency 13 | .[, sign := ifelse(logFC>0,"Upregulated in KO","Downregulated in KO")] %>% 14 | .[, sig := (padj_fdr<=opts$threshold_fdr & abs(logFC)>=opts$min.logFC)] 15 | 16 | 17 | # Print stats 18 | print(sprintf("Number of celltypes: %s",length(unique(diff.dt$celltype)))) 19 | print(sprintf("Number of features: %s",length(unique(diff.dt$feature)))) -------------------------------------------------------------------------------- /atac/archR/differential/metacells/genotype/old/analysis/load_data.R: -------------------------------------------------------------------------------- 1 | 2 | ############################################# 3 | ## Load results from differential analysis ## 4 | ############################################# 5 | 6 | diff.dt <- opts$celltypes %>% map(function(j) { 7 | file <- sprintf("%s/%s_WT_vs_KO.txt.gz", io$indir,j) 8 | if (file.exists(file)) { 9 | fread(file, select=c(1,2,3)) %>% .[,c("celltype"):=list(j)] 10 | } 11 | }) %>% rbindlist %>% 12 | # .[,MeanDiff:=-MeanDiff] # change sign to keep the groupB - groupA consistency 13 | .[, sign := ifelse(MeanDiff>0,"Downregulated in KO","Upregulated in KO")] %>% 14 | .[, sig := (FDR<=opts$threshold_fdr & abs(MeanDiff)>=opts$min.MeanDiff)] 15 | 16 | 17 | # Print stats 18 | print(sprintf("Number of celltypes: %s",length(unique(diff.dt$celltype)))) 19 | print(sprintf("Number of features: %s",length(unique(diff.dt$idx)))) 20 | 21 | -------------------------------------------------------------------------------- /atac/archR/differential/metacells/utils.R: -------------------------------------------------------------------------------- 1 | 2 | # Function to differential expression 3 | # - sce: SingleCellExperiment object with the column "group" in the colData 4 | # - groups: the names of the two groups 5 | # - min_detection_rate_per_group: minimum detection rate per group 6 | calculate_diff_acc_edgeR <- function(sce, groups, min_detection_rate_per_group = 0.50) { 
7 | 8 | # Sanity checks 9 | if (!is(sce, "SingleCellExperiment")) stop("'sce' has to be an instance of SingleCellExperiment") 10 | stopifnot(length(groups)==2) 11 | 12 | # Keep features whose detection rate (fraction of cells with logcounts>0) reaches min_detection_rate_per_group in at least one of the two groups 13 | cdr_A <- rowMeans(logcounts(sce[,sce$group==groups[1]])>0) >= min_detection_rate_per_group 14 | cdr_B <- rowMeans(logcounts(sce[,sce$group==groups[2]])>0) >= min_detection_rate_per_group 15 | out <- .edgeR(sce[cdr_B | cdr_A,]) %>% .[,log_padj_fdr:= -log10(padj_fdr)] 16 | 17 | return(out) 18 | } 19 | 20 | 21 | .edgeR <- function(sce) { 22 | 23 | # Convert SCE to DGEList 24 | sce_edger <- scran::convertTo(sce, type="edgeR") 25 | 26 | # Define design matrix (with intercept) 27 | cdr <- colMeans(logcounts(sce)>0) 28 | design <- model.matrix(~cdr+sce$group) 29 | 30 | # Estimate dispersions 31 | sce_edger <- estimateDisp(sce_edger,design) 32 | 33 | # Fit GLM 34 | fit <- glmQLFit(sce_edger,design) 35 | 36 | # Quasi-likelihood F-test via glmQLFTest (not a likelihood ratio test, despite the 'lrt' name) 37 | lrt <- glmQLFTest(fit) 38 | 39 | # Construct output data.table; the test statistic column is renamed to 'LR' and dropped together with 'logCPM' 40 | out <- topTags(lrt, n=nrow(lrt))$table %>% as.data.table(keep.rownames=T) %>% 41 | setnames(c("gene","logFC","logCPM","LR","p.value","padj_fdr")) %>% 42 | .[,c("logCPM","LR"):=NULL] 43 | 44 | return(out) 45 | } 46 | -------------------------------------------------------------------------------- /atac/archR/differential/pseudobulk/celltype/analysis/plot_marker_peaks_stats.R: -------------------------------------------------------------------------------- 1 | # Load default settings 2 | source(here::here("settings.R")) 3 | source(here::here("utils.R")) 4 | 5 | ##################### 6 | ## Define settings ## 7 | ##################### 8 | 9 | io$basedir <- file.path(io$basedir,"test") 10 | io$marker_peaks <- file.path(io$basedir,"results/atac/archR/differential/pseudobulk/celltype/PeakMatrix/parsed/markers_filt.txt.gz") 11 | io$outdir <- file.path(io$basedir,"results/atac/archR/differential/pseudobulk/celltype/PeakMatrix/parsed/pdf"); dir.create(io$outdir, 
showWarnings = F) 12 | 13 | opts$celltypes <- c( 14 | "Epiblast", 15 | "Primitive_Streak", 16 | "Caudal_epiblast", 17 | "PGC", 18 | "Anterior_Primitive_Streak", 19 | "Notochord", 20 | "Def._endoderm", 21 | "Gut", 22 | "Nascent_mesoderm", 23 | "Mixed_mesoderm", 24 | "Intermediate_mesoderm", 25 | "Caudal_Mesoderm", 26 | "Paraxial_mesoderm", 27 | "Somitic_mesoderm", 28 | "Pharyngeal_mesoderm", 29 | "Cardiomyocytes", 30 | "Allantois", 31 | "ExE_mesoderm", 32 | "Mesenchyme", 33 | "Haematoendothelial_progenitors", 34 | "Endothelium", 35 | "Blood_progenitors_1", 36 | "Blood_progenitors_2", 37 | "Erythroid1", 38 | "Erythroid2", 39 | "Erythroid3", 40 | "NMP", 41 | "Rostral_neurectoderm", 42 | # "Caudal_neurectoderm", 43 | "Neural_crest", 44 | "Forebrain_Midbrain_Hindbrain", 45 | "Spinal_cord", 46 | "Surface_ectoderm" 47 | # "Visceral_endoderm" 48 | # "ExE_endoderm", 49 | # "ExE_ectoderm" 50 | # "Parietal_endoderm" 51 | ) 52 | 53 | ############################### 54 | ## Load differential results ## 55 | ############################### 56 | 57 | marker_peaks.dt <- fread(io$marker_peaks) %>% .[celltype%in%opts$celltypes] 58 | 59 | ############################################### 60 | ## Plot number of marker peaks per cell type ## 61 | ############################################### 62 | 63 | to.plot <- marker_peaks.dt %>% .[,.N,by=c("celltype")] 64 | 65 | p <- ggbarplot(to.plot, x="celltype", y="N", fill="celltype") + 66 | scale_fill_manual(values=opts$celltype.colors) + 67 | labs(x="", y="Number of marker peaks") + 68 | theme( 69 | axis.text.y = element_text(size=rel(0.65)), 70 | axis.text.x = element_text(colour="black",size=rel(0.7), angle=90, hjust=1, vjust=0.5), 71 | axis.title = element_text(colour="black",size=rel(0.75)), 72 | axis.ticks.x = element_blank(), 73 | legend.position = "none" 74 | ) 75 | 76 | pdf(file.path(io$outdir,"barplot_number_marker_peaks.pdf"), width = 6, height = 4) 77 | print(p) 78 | dev.off() 79 | 80 | ################################## 81 | ## 
Plot gene marker exclusivity ## 82 | ################################## 83 | 84 | to.plot <- marker_peaks.dt %>% 85 | .[,.(Nx=.N),by="gene"] %>% 86 | .[,Nx:=factor(Nx)] %>% 87 | .[,.(Ny=.N),by="Nx"] 88 | 89 | p <- ggbarplot(to.plot, x="Nx", y="Ny", fill="gray70") + 90 | labs(x="Number of different cell types per marker peak", y="") + 91 | theme( 92 | axis.text = element_text(size=rel(0.75)), 93 | ) 94 | 95 | pdf(file.path(io$outdir,"boxplot_exclusivity_per_gene.pdf"), width = 7, height = 5) 96 | print(p) 97 | dev.off() 98 | 99 | ################################################ 100 | ## Plot gene marker exclusivity per cell type ## 101 | ################################################ 102 | 103 | to.plot <- marker_peaks.dt %>% .[,N:=.N,by="gene"] 104 | 105 | p <- ggboxplot(to.plot, x="celltype", y="N", fill="celltype", color="black") + 106 | scale_fill_manual(values=opts$celltype.colors) + 107 | labs(x="", y="Exclusivity of gene markers\n(the smaller the more exclusive)") + 108 | theme( 109 | axis.text.y = element_text(size=rel(0.75)), 110 | axis.title.y = element_text(size=rel(0.85)), 111 | axis.text.x = element_text(colour="black",size=rel(0.7), angle=90, hjust=1, vjust=0.5), 112 | legend.position = "none" 113 | ) 114 | 115 | pdf(file.path(io$outdir,"boxplot_exclusivity_per_celltype.pdf"), width = 9, height = 5) 116 | print(p) 117 | dev.off() 118 | 119 | 120 | -------------------------------------------------------------------------------- /atac/archR/differential/pseudobulk/celltype/parse_differential_results.R: -------------------------------------------------------------------------------- 1 | here::i_am("atac/archR/differential/pseudobulk/celltype/parse_differential_results.R") 2 | 3 | # Load default settings 4 | source(here::here("settings.R")) 5 | 6 | ###################### 7 | ## Define arguments ## 8 | ###################### 9 | 10 | p <- ArgumentParser(description='') 11 | p$add_argument('--diff_results_dir', type="character", help='File') 12 | 
p$add_argument('--outdir', type="character", help='File') 13 | args <- p$parse_args(commandArgs(TRUE)) 14 | 15 | ## START TEST ## 16 | # io$basedir <- file.path(io$basedir,"test") 17 | # args <- list() 18 | # args$diff_results_dir <- file.path(io$basedir,"results/atac/archR/differential/metacells/celltype/PeakMatrix") 19 | # args$outdir <- file.path(io$basedir,"results/atac/archR/differential/metacells/celltype/PeakMatrix/parsed") 20 | ## END TEST ## 21 | 22 | # I/O 23 | dir.create(args$outdir, showWarnings=F, recursive=T) 24 | 25 | ################################################ 26 | ## Load differential expression and fetch TFs ## 27 | ################################################ 28 | 29 | diff_results_list <- list() 30 | 31 | # i <- "Visceral_endoderm"; j <- "Surface_ectoderm" 32 | for (i in 1:length(opts$celltypes)) { 33 | for (j in i:length(opts$celltypes)) { 34 | 35 | if (i!=j) { 36 | file <- file.path(args$diff_results_dir,sprintf("%s_vs_%s.txt.gz",opts$celltypes[[i]],opts$celltypes[[j]])) 37 | if (file.exists(file)) { 38 | tmp <- fread(file) %>% .[,c("celltypeA","celltypeB"):=list(opts$celltypes[[i]],opts$celltypes[[j]])] 39 | diff_results_list[[sprintf("%s_vs_%s",opts$celltypes[[i]],opts$celltypes[[j]])]] <- tmp 40 | } else { 41 | print(sprintf("%s not found...",file)) 42 | } 43 | } 44 | } 45 | } 46 | 47 | ########## 48 | ## Save ## 49 | ########## 50 | 51 | fwrite(rbindlist(diff_results_list), file.path(args$outdir,"diff_results.txt.gz"), sep="\t", quote=F, na="NA") 52 | 53 | ########## 54 | ## TEST ## 55 | ########## 56 | 57 | tmp <- fread("/bi/group/reik/ricard/data/gastrulation_multiome_10x/test/results/atac/archR/differential/pseudobulk/celltype/PeakMatrix/parsed/diff_results.txt.gz") %>% 58 | .[abs(logFC)>=2 & padj_fdr<=0.01 & (mean_groupA>=2.5 | mean_groupB>=2.5)] %>% .[,.N,by="feature"] 59 | 60 | sum(tmp$N>=2) -------------------------------------------------------------------------------- 
/atac/archR/differential/pseudobulk/celltype_genotype/analysis/load_data.R: -------------------------------------------------------------------------------- 1 | 2 | ############################################# 3 | ## Load results from differential analysis ## 4 | ############################################# 5 | 6 | diff.dt <- opts$celltypes %>% map(function(j) { 7 | file <- sprintf("%s/%s_WT_vs_KO.txt.gz", io$indir,j) 8 | if (file.exists(file)) { 9 | fread(file, select=c(1,2,3)) %>% .[,c("celltype"):=list(j)] 10 | } 11 | }) %>% rbindlist %>% 12 | # .[,MeanDiff:=-MeanDiff] # change sign to keep the groupB - groupA consistency 13 | .[, sign := ifelse(logFC>0,"Upregulated in KO","Downregulated in KO")] %>% 14 | .[, sig := (padj_fdr<=opts$threshold_fdr & abs(logFC)>=opts$min.logFC)] 15 | 16 | 17 | # Print stats 18 | print(sprintf("Number of celltypes: %s",length(unique(diff.dt$celltype)))) 19 | print(sprintf("Number of features: %s",length(unique(diff.dt$feature)))) -------------------------------------------------------------------------------- /atac/archR/differential/pseudobulk/celltype_genotype/old/old_stuff.R: -------------------------------------------------------------------------------- 1 | # i="T_789"; j <- "NMP" 2 | tfs.to.use <- colnames(virtual_chip.mtx) 3 | celltypes.to.use <- "NMP" 4 | # i <- "T" 5 | tf_enrichment_insilico_chip.dt <- tfs.to.use %>% map(function(i) { 6 | print(i) 7 | 8 | celltypes.to.use %>% map(function(j) { 9 | 10 | foreground.peaks <- diff.dt[celltype==j & sign=="Downregulated in KO" & sig==TRUE,feature] 11 | background.peaks <- diff.dt[celltype==j & sig==FALSE,feature] 12 | foreground.nmatches <- virtual_chip_logical.mtx[foreground.peaks,i] %>% sum 13 | background.nmatches <- virtual_chip_logical.mtx[background.peaks,i] %>% sum 14 | 15 | p.value <- phyper(foreground.nmatches-1, background.nmatches, length(background.peaks)-background.nmatches,length(foreground.peaks), lower.tail = F) 16 | 17 | data.table(tf=i, celltype=j, 
pval=p.value) 18 | }) %>% rbindlist }) %>% rbindlist 19 | 20 | 21 | to.plot <- tf_enrichment_insilico_chip.dt[pval<=0.10] %>% 22 | .[,pval:=as.numeric(pval)] %>% 23 | .[,log_pval:=-log10(pval)] %>% 24 | .[,celltype:=factor(celltype,levels=celltypes.to.use)] 25 | 26 | to.plot[,dot_size:=minmax.normalisation(abs(log_pval))] 27 | 28 | to.plot.text <- to.plot[pval<=0.01] 29 | 30 | ggplot(to.plot[pval<=0.01], aes_string(x="log_pval", y="tf", size="dot_size")) + 31 | geom_point(shape=21) + 32 | # scale_x_discrete(drop=F) + 33 | # scale_size_continuous(range = c(0.25,2)) + 34 | # guides(x = guide_axis(angle = 90)) + 35 | theme_classic() + 36 | theme( 37 | axis.text.x = element_text(color="black", size=rel(0.75)), 38 | axis.text.y = element_text(color="black") 39 | ) 40 | -------------------------------------------------------------------------------- /atac/archR/differential/pseudobulk/celltype_genotype/parse_differential_results.R: -------------------------------------------------------------------------------- 1 | here::i_am("atac/archR/differential/pseudobulk/celltype_genotype/parse_differential_results.R") 2 | 3 | # Load default settings 4 | source(here::here("settings.R")) 5 | # source(here::here("utils.R")) 6 | 7 | ###################### 8 | ## Define arguments ## 9 | ###################### 10 | 11 | p <- ArgumentParser(description='') 12 | p$add_argument('--diff_results_dir', type="character", help='File') 13 | p$add_argument('--outfile', type="character", help='File') 14 | args <- p$parse_args(commandArgs(TRUE)) 15 | 16 | ## START TEST ## 17 | # io$basedir <- file.path(io$basedir,"test") 18 | # args <- list() 19 | # args$diff_results_dir <- file.path(io$basedir,"results/atac/archR/differential/pseudobulk/celltype_genotype/PeakMatrix") 20 | # args$outfile <- file.path(io$basedir,"results/atac/archR/differential/pseudobulk/celltype_genotype/PeakMatrix/parsed/diff_results.txt.gz") 21 | ## END TEST ## 22 | 23 | # I/O 24 | dir.create(dirname(args$outfile), showWarnings 
= F, recursive = T) 25 | 26 | ########################################## 27 | ## Load differential expression results ## 28 | ########################################## 29 | 30 | diff_results_list <- list() 31 | 32 | # i <- "Visceral_endoderm"; j <- "Surface_ectoderm" 33 | for (i in 1:length(opts$celltypes)) { 34 | file <- file.path(args$diff_results_dir,sprintf("%s.txt.gz",opts$celltypes[[i]])) 35 | if (file.exists(file)) { 36 | tmp <- fread(file) %>% .[,celltype:=opts$celltypes[[i]]] 37 | if (nrow(tmp)>1) { 38 | diff_results_list[[opts$celltypes[[i]]]] <- tmp 39 | } 40 | } else { 41 | print(sprintf("%s not found...",file)) 42 | } 43 | } 44 | 45 | print(names(diff_results_list)) 46 | 47 | ########## 48 | ## Save ## 49 | ########## 50 | 51 | fwrite(rbindlist(diff_results_list), args$outfile, sep="\t", quote=F, na="NA") 52 | 53 | -------------------------------------------------------------------------------- /atac/archR/differential/utils.R: -------------------------------------------------------------------------------- 1 | gg_volcano_plot <- function(to.plot, top_genes=10, xlim=NULL, ylim=NULL, label_groups = NULL) { 2 | 3 | negative_hits <- to.plot[sig==TRUE & MeanDiff<0,idx] 4 | positive_hits <- to.plot[sig==TRUE & MeanDiff>0,idx] 5 | all <- nrow(to.plot) 6 | 7 | if (is.null(xlim)) 8 | xlim <- max(abs(to.plot$MeanDiff), na.rm=T) 9 | if (is.null(ylim)) 10 | ylim <- max(-log10(to.plot$FDR), na.rm=T) 11 | 12 | p <- ggplot(to.plot, aes(x=MeanDiff, y=-log10(FDR))) + 13 | # ggrastr::geom_point_rast(aes(color=sig), size=1) + 14 | geom_point(aes(color=sig), size=1) + 15 | # geom_hline(yintercept = -log10(opts$threshold_fdr), color="blue") + 16 | geom_segment(aes(x=0, xend=0, y=0, yend=ylim-1), color="orange") + 17 | scale_color_manual(values=c("black","red")) + 18 | # scale_x_continuous(limits=c(-xlim-10,xlim+10)) + 19 | scale_x_continuous(limits=c(-xlim,xlim)) + 20 | scale_y_continuous(limits=c(0,ylim+2.5)) + 21 | labs(x="Accessibility difference (%)", 
y=expression(paste("-log"[10],"(FDR)"))) + 22 | annotate("text", x=0, y=ylim+1, size=7, label=sprintf("(%d)", all)) + # side labels below are placed relative to xlim so they stay inside the x-axis limits c(-xlim,xlim) 23 | annotate("text", x=-0.5*xlim, y=ylim+2, size=7, label=sprintf("%d (-)",length(negative_hits))) + 24 | annotate("text", x=0.5*xlim, y=ylim+2, size=7, label=sprintf("%d (+)",length(positive_hits))) + 25 | # ggrepel::geom_text_repel(data=head(to.plot[sig==T],n=top_genes), aes(x=MeanDiff, y=-log10(FDR), label=symbol), size=5) + 26 | theme_classic() + 27 | theme( 28 | axis.text = element_text(size=rel(1.00), color='black'), 29 | axis.title = element_text(size=rel(1.50), color='black'), 30 | # axis.title = element_text(), 31 | legend.position="none" 32 | ) 33 | 34 | if (length(label_groups)>0) { 35 | p <- p + 36 | annotate("text", x=-0.7*xlim, y=0, size=4.5, label=sprintf("Up in %s",label_groups[2])) + 37 | annotate("text", x=0.7*xlim, y=0, size=4.5, label=sprintf("Up in %s",label_groups[1])) 38 | } 39 | 40 | return(p) 41 | } 42 | -------------------------------------------------------------------------------- /atac/archR/feature_stats/plot_feature_stats_atac.R: -------------------------------------------------------------------------------- 1 | # TO-DO: USE OUTPUT OF SAVE ATAC MATRICES 2 | here::i_am("atac/archR/feature_stats/archR_calculate_feature_stats.R") 3 | 4 | source(here::here("settings.R")) 5 | source(here::here("utils.R")) 6 | 7 | suppressPackageStartupMessages(library(ArchR)) 8 | suppressPackageStartupMessages(library(sparseMatrixStats)) 9 | 10 | 11 | ###################### 12 | ## Define arguments ## 13 | ###################### 14 | 15 | p <- ArgumentParser(description='') 16 | p$add_argument('--feature_stats', type="character", help='feature stats file') 17 | p$add_argument('--outdir', type="character", help='Output directory') 18 | args <- p$parse_args(commandArgs(TRUE)) 19 | 20 | ## START TEST ## 21 | # args$feature_stats <- file.path(io$basedir, "results/atac/archR/feature_stats/PeakMatrix_feature_stats.txt.gz") 22 | # args$outdir <- file.path(io$basedir, 
"results/atac/archR/feature_stats/PeakMatrix/pdf") 23 | ## END TEST ## 24 | 25 | dir.create(args$outdir, showWarnings=F, recursive = T) 26 | 27 | ######################## 28 | ## Load feature stats ## 29 | ######################## 30 | 31 | ########## 32 | ## Plot ## 33 | ########## 34 | 35 | to.plot <- peak_metadata.dt %>% 36 | .[score>=args$min_peak_score] %>% 37 | merge(peak_stats.dt,by="peak") 38 | 39 | p <- ggboxplot(to.plot, x="peakType", y="mean_singlecell", fill="peakType", outlier.shape = NA) + 40 | coord_cartesian(ylim=c(0,1.0)) + 41 | labs(x="", y="Average chromatin accessibility") + 42 | theme( 43 | axis.text = element_text(size=rel(0.75)), 44 | legend.position = "none" 45 | ) 46 | 47 | pdf(file.path(args$outdir,"boxplots_atac_peak_type.pdf"), width = 7, height = 5) 48 | print(p) 49 | dev.off() 50 | 51 | 52 | ########## 53 | ## TEST ## 54 | ########## 55 | 56 | # to.plot <- feature_stats.dt %>% head(n=1e4) 57 | # to.plot <- feature_stats.dt[var_pseudobulk>=0.10 & var_metacells<1] 58 | # 59 | # ggscatter(to.plot, x="var_metacells", y="var_pseudobulk", size=1) + 60 | # geom_abline(slope=1, intercept=0) 61 | # 62 | # ggscatter(to.plot, x="var_cells", y="var_pseudobulk", size=1) + 63 | # geom_abline(slope=1, intercept=0) 64 | # 65 | # ggscatter(to.plot, x="mean_metacells", y="var_metacells", size=1) + 66 | # stat_smooth(method="loess") 67 | # 68 | # ggscatter(to.plot, x="mean_pseudobulk", y="var_pseudobulk", size=0.5) + 69 | # stat_smooth(method="loess") 70 | 71 | -------------------------------------------------------------------------------- /atac/archR/load_archR_project.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages(library(ArchR)) 2 | 3 | ##################### 4 | ## Define settings ## 5 | ##################### 6 | 7 | io$archR.directory <- file.path(io$basedir,"processed/atac/archR") 8 | # io$archR.directory <- file.path(io$basedir,"test/processed/atac/archR") 9 | # 
io$atac.peak.annotation <- file.path(io$basedir,"/original/atac_peak_annotation.tsv") 10 | io$archR.projectMetadata <- file.path(io$archR.directory,"projectMetadata.rds") 11 | io$archR.peakSet.granges <- file.path(io$archR.directory,"PeakSet.rds") 12 | 13 | setwd(io$archR.directory) 14 | 15 | #################### 16 | ## Define options ## 17 | #################### 18 | 19 | addArchRGenome("mm10") 20 | addArchRThreads(threads = 1) 21 | 22 | ######################## 23 | ## Load ArchR project ## 24 | ######################## 25 | 26 | ArchRProject <- loadArchRProject(io$archR.directory) 27 | 28 | # Load ArchR projectMetadata 29 | if (file.exists(io$archR.projectMetadata)) { 30 | ArchRProject@projectMetadata <- readRDS(io$archR.projectMetadata) 31 | } 32 | 33 | # Load peaks 34 | if (file.exists(io$archR.peakSet.granges)) { 35 | ArchRProject <- addPeakSet(ArchRProject, peakSet = readRDS(io$archR.peakSet.granges), force = TRUE) 36 | } 37 | 38 | # Load motif annotations over peaks 39 | if (!is.null(ArchRProject@peakAnnotation)) { 40 | io$archR.peakAnnotation <- file.path(io$archR.directory,"Annotations/peakAnnotation.rds") 41 | if (file.exists(io$archR.peakAnnotation)) { 42 | ArchRProject@peakAnnotation <- readRDS(io$archR.peakAnnotation) 43 | } 44 | } 45 | 46 | # Add background peaks: point the peak set at the local Background-Peaks.rds when no bgdPeaks file is recorded or the recorded file does not exist (the previous check tested the stored value with %in% instead of the metadata names) 47 | if (!is.null(getPeakSet(ArchRProject))) { 48 | io$archR.bgdPeaks <- file.path(io$archR.directory, "Background-Peaks.rds") 49 | if (is.null(metadata(getPeakSet(ArchRProject))$bgdPeaks) || !file.exists(metadata(getPeakSet(ArchRProject))$bgdPeaks)) { 50 | if (file.exists(io$archR.bgdPeaks)) metadata(ArchRProject@peakSet)$bgdPeaks <- io$archR.bgdPeaks 51 | } 52 | } 53 | 54 | ########## 55 | ## TEST ## 56 | ########## 57 | 58 | # ArchRProject@peakSet <- readRDS(io$archR.peakSet.granges) 59 | # seqlevels(ArchRProject@peakSet) <- sort(seqlevels(ArchRProject@peakSet)) 60 | # ArchRProject@peakSet <- sort(ArchRProject@peakSet) 61 | 62 | 63 | # getAvailableMatrices(ArchRProject) 64 | 65 | # io$arrow.files <- opts$samples %>% 66 | # # map_chr(~ 
sprintf("%s/%s.arrow",io$archR.directory,.)) 67 | # map_chr(~ sprintf("%s.arrow",.)) 68 | # 69 | # ArchRProject <- ArchRProject( 70 | # ArrowFiles = io$arrow.files, 71 | # # outputDirectory = "ArchROutput", 72 | # outputDirectory = io$archR.directory, 73 | # copyArrows = FALSE 74 | # ) 75 | # saveArchRProject(ArchRProject) 76 | -------------------------------------------------------------------------------- /atac/archR/load_motif_annotation.R: -------------------------------------------------------------------------------- 1 | # opts$motif_annotation <- "Motif_cisbp" # "Motif_JASPAR2020" 2 | 3 | motif2gene_file <- sprintf("%s/Annotations/%s_TFs.txt.gz",io$archR.directory,opts$motif_annotation) 4 | 5 | if (file.exists(motif2gene_file)) { 6 | 7 | motif2gene.dt <- fread(motif2gene_file) 8 | 9 | } else { 10 | 11 | peakAnnotation <- readRDS(sprintf("%s/Annotations/peakAnnotation.rds",io$archR.directory)) 12 | stopifnot(opts$motif_annotation%in%names(peakAnnotation)) 13 | motif2gene.dt <- peakAnnotation[[opts$motif_annotation]]$motifSummary %>% 14 | as.data.table(keep.rownames = T) %>% setnames("rn","motif") %>% .[,strand:=NULL] %>% setnames("symbol","gene") 15 | 16 | # Rename genes 17 | if (grepl("cisbp",opts$motif_annotation, ignore.case = T)) { 18 | 19 | tf2gene_rename <- c( 20 | "Tcfe"="Tfe", "Nkx1"="Nkx1-", "Nkx2"="Nkx2-", "Nkx3"="Nkx3-", "Nkx4"="Nkx4-", "Nkx5"="Nkx5-", "Nkx6"="Nkx6-", "Foxf1a"="Foxf1", 21 | "Hmga1rs1"="rs1", "Mycl1$"="Mycl", "Dux$"="Duxf3", "Duxbl$"="Duxbl1", "Pit1$"="Prop1", 22 | "ENSMUSG00000079994"="Sox1", "Tcfap"="Tfap" 23 | ) 24 | 25 | motif2gene.dt[,gene:=stringr::str_replace_all(gene,tf2gene_rename)] 26 | 27 | } else if (grepl("JASPAR",opts$motif_annotation, ignore.case = T)) { 28 | 29 | # conflictive motifs: fusion proteins (UN::JUNB) and versions (TFAP2A(var.2)) 30 | # stop("To-do") 31 | tf2gene_rename <- c("TBXT"="T") 32 | motif2gene.dt[,gene:=stringr::str_replace_all(gene,tf2gene_rename)] 33 | 34 | # for JASPAR motifs 35 | 
motif2gene.dt[,motif:=stringr::str_replace(motif,"\\.VAR\\.","\\.var\\."),] # NOTE(review): toupper(motif) below upper-cases the motif again, so this replacement has no lasting effect - confirm which of the two is intended 36 | 37 | } else { 38 | stop("Motif annotation not recognised") 39 | } 40 | 41 | motif2gene.dt[,c("motif","gene"):=list(toupper(motif),toupper(gene))] 42 | 43 | # Save 44 | fwrite(motif2gene.dt, motif2gene_file, sep="\t", quote=F) 45 | } 46 | 47 | 48 | -------------------------------------------------------------------------------- /atac/archR/peak_calling/README.txt: -------------------------------------------------------------------------------- 1 | 2 | PijuanSala: 3 | we called peaks on the pooled sample of high-quality barcodes using macs2 callpeak50 (macs2 2.1.0.20150420) with ‘P = 0.05,–nomodel,–shift 0,–extsize 150’ and discarded peaks falling in blacklisted mm10 genomic regions from the ENCODE Project Consortium51 using bedtools intersect (v2.21.0). The resulting peak summits were extended ±250 bp and subsequently merged with the promoter coordinates of genes from ensembl GRCm38.92 (from TSS to TSS –500 bp) using bedtools merge (v2.21.0). 
-------------------------------------------------------------------------------- /atac/archR/peak_calling/analysis/calculate_cpg_density_atac_peaks.R: -------------------------------------------------------------------------------- 1 | library(BSgenome.Mmusculus.UCSC.mm10) 2 | library(Biostrings) 3 | 4 | ##################### 5 | ## Define settings ## 6 | ##################### 7 | 8 | source(here::here("settings.R")) 9 | source(here::here("utils.R")) 10 | 11 | io$outfile <- file.path(io$basedir,"results_new/atac/archR/peak_calling/cpg_density_peaks.txt.gz") 12 | 13 | ################ 14 | ## Load peaks ## 15 | ################ 16 | 17 | peaks.dt <- fread(io$archR.peak.metadata) %>% 18 | .[,c("chr","start","end")] %>% 19 | .[,idx:=sprintf("%s:%s-%s",chr,start,end)]# %>% 20 | # .[,c("idx")] 21 | 22 | 23 | #################################### 24 | ## Calculate CpG density per peak ## 25 | #################################### 26 | 27 | chr_lengths.dt <- data.table( 28 | chr = unique(peaks.dt$chr), 29 | chr_length = seqlengths(Mmusculus) %>% .[unique(peaks.dt$chr)] 30 | ) 31 | peaks.dt <- merge(peaks.dt, chr_lengths.dt, by="chr") 32 | 33 | # Filter features that exceed chr length 34 | peaks.dt <- peaks.dt[end% 31 | .[,chr:=as.factor(sub("chr","",chr))] %>% 32 | setnames("symbol","gene") %>% 33 | .[, c("chr","start","end","gene")] %>% 34 | setkey(chr,start,end) 35 | 36 | # Load peak metadata 37 | peakSet.dt <- fread(io$archR.peak.metadata) %>% 38 | .[,chr:=as.factor(sub("chr","",chr))] %>% 39 | .[,c("chr","start","end")] %>% 40 | .[,peak:=sprintf("%s_%s_%s",chr,start,end)] %>% 41 | setkey(chr,start,end) 42 | 43 | ########################## 44 | ## Load peak2gene links ## 45 | ########################## 46 | 47 | ################################# 48 | ## Load motifmatcher Matches ## 49 | ################################# 50 | 51 | motifmatcher.se <- readRDS(sprintf("%s/Annotations/Motif_cisbp-Matches-In-Peaks.rds",io$archR.directory)) 52 | 53 | # Rename TFs 54 | 
colnames(motifmatcher.se) <- colnames(motifmatcher.se) %>% toupper %>% stringr::str_split(.,"_") %>% map_chr(1) 55 | motifmatcher.se <- motifmatcher.se[,!duplicated(colnames(motifmatcher.se))] 56 | 57 | # Rename peaks 58 | tmp <- rowRanges(motifmatcher.se) 59 | rownames(motifmatcher.se) <- sprintf("%s:%s-%s",seqnames(tmp), start(tmp), end(tmp)) 60 | 61 | # Subset peaks 62 | # motifmatcher.se <- motifmatcher.se[unique(cor_dt$peak),] 63 | 64 | ############# 65 | ## Overlap ## 66 | ############# 67 | 68 | ov <- foverlaps( 69 | peakSet.dt, 70 | gene_metadata[, c("chr","start","end","gene")], 71 | nomatch = NA 72 | ) %>% 73 | setnames(c("i.start","i.end"),c("peak.start","peak.end")) %>% 74 | setnames(c("start","end"),c("gene.start","gene.end")) %>% 75 | .[,c("gene.start","gene.end") := list (gene.start+opts$gene_window, gene.end-opts$gene_window)] %>% 76 | # .[,c("start_dist","end_dist"):=list( abs(gene.end-peak.start), abs(gene.start-peak.end))] %>% 77 | .[,c("start_dist","end_dist"):=list( gene.end-peak.start, gene.start-peak.end)] %>% 78 | .[,c("start_dist","end_dist"):=list( ifelse(end_dist<0 & start_dist>0,0,start_dist), ifelse(end_dist<0 & start_dist>0,0,end_dist) )] %>% 79 | .[,dist:=ifelse(abs(start_dist)<abs(end_dist),abs(start_dist),abs(end_dist))] %>% .[,c("start_dist","end_dist"):=NULL] # NOTE(review): the ifelse() arguments were lost to extraction garbling and are reconstructed as the smaller absolute distance - confirm against the repository 80 | 81 | # Select nearest gene 82 | ov_nearest <- ov %>% 83 | .[.[,.I[dist==min(dist)], by=c("peak")]$V1] %>% 84 | .[complete.cases(.)] %>% 85 | .[!duplicated(peak)] 86 | 87 | # Sanity check 88 | # ov_nearest$gene[(duplicated(ov_nearest$peak))] 89 | 90 | ########## 91 | ## Save ## 92 | ########## 93 | 94 | fwrite(ov, paste0(io$outdir,"/peaks2genes_all.txt.gz"), sep="\t", na="NA") 95 | fwrite(ov_nearest, paste0(io$outdir,"/peaks2genes_nearest.txt.gz"), sep="\t", na="NA") 96 | -------------------------------------------------------------------------------- /atac/archR/peak_calling/analysis/link_peaks2genes_genomic_distance.R: -------------------------------------------------------------------------------- 1 | 
here::i_am("atac/archR/peak_calling/analysis/link_peaks2genes_genomic_distance.R") 2 | 3 | source(here::here("settings.R")) 4 | source(here::here("utils.R")) 5 | 6 | ################################ 7 | ## Initialize argument parser ## 8 | ################################ 9 | 10 | p <- ArgumentParser(description='') 11 | p$add_argument('--gene_metadata', type="character", help='Gene metadata') 12 | p$add_argument('--peak_metadata', type="character", help='Peak metadata') 13 | p$add_argument('--gene_window', type="integer", default=1e5, help='Genomic window size') 14 | p$add_argument('--outdir', type="character", help='Output directory') 15 | args <- p$parse_args(commandArgs(TRUE)) 16 | 17 | ##################### 18 | ## Define settings ## 19 | ##################### 20 | 21 | ## START TEST ## 22 | # args <- list() 23 | # args$gene_metadata <- io$gene_metadata 24 | # args$peak_metadata <- file.path(io$basedir,"processed_new/atac/archR/PeakCalls/peak_metadata.tsv.gz") 25 | # args$gene_window <- 5e5 # maximum window length for the overlap 26 | # args$outdir <- file.path(io$basedir,"results_new/atac/archR/peak_calling/peaks2genes") 27 | ## END TEST ## 28 | 29 | # I/O 30 | dir.create(args$outdir, showWarnings=F) 31 | 32 | ############### 33 | ## Load data ## 34 | ############### 35 | 36 | # Load gene metadata 37 | gene_metadata <- fread(args$gene_metadata) %>% 38 | .[,chr:=as.factor(sub("chr","",chr))] %>% 39 | setnames("symbol","gene") %>% 40 | .[, c("chr","start","end","gene","ens_id","strand")] 41 | 42 | # Load peak metadata 43 | peakSet.dt <- fread(args$peak_metadata) %>% 44 | .[,chr:=as.factor(sub("chr","",chr))] %>% 45 | .[,c("chr","start","end")] %>% 46 | .[,peak:=sprintf("chr%s:%s-%s",chr,start,end)] %>% 47 | setkey(chr,start,end) 48 | 49 | ############# 50 | ## Overlap ## 51 | ############# 52 | 53 | gene_metadata.ov <- copy(gene_metadata) %>% 54 | .[strand=="+",c("gene.start","gene.end"):=list(start,end)] %>% 55 | 
.[strand=="-",c("gene.start","gene.end"):=list(end,start)] %>% 56 | .[strand=="+",c("start","end"):=list (gene.start-args$gene_window, gene.end+args$gene_window)] %>% 57 | .[strand=="-",c("end","start"):=list (gene.start+args$gene_window, gene.end-args$gene_window)] %>% 58 | # .[,strand:=NULL] %>% 59 | setkey(chr,start,end) 60 | # Sanity check: every extended gene window must have positive length 61 | stopifnot((gene_metadata.ov$end-gene_metadata.ov$start)>0) 62 | 63 | ov <- foverlaps( 64 | peakSet.dt, 65 | gene_metadata.ov, 66 | nomatch = NA 67 | ) %>% .[,c("start","end"):=NULL] %>% 68 | setnames(c("i.start","i.end"),c("peak.start","peak.end")) %>% 69 | .[,peak.mean:=(peak.start+peak.end)/2] %>% 70 | # calculate distance from the peak midpoint to the closest gene boundary (peaks inside the gene body are set to 0 below) 71 | .[,dist:=min(abs(gene.end-peak.mean), abs(gene.start-peak.mean)), by=c("gene","ens_id","peak","strand")] %>% 72 | .[strand=="+" & peak.mean>gene.start & peak.mean<gene.end,dist:=0] %>% 73 | .[strand=="-" & peak.mean<gene.start & peak.mean>gene.end,dist:=0] 74 | # NOTE(review): for "-" strand genes gene.start holds the larger coordinate (see the strand swap above), hence the mirrored condition 75 | 76 | # ov[peak=="18_64485555_64486155"] 77 | # gene_metadata[gene=="Fech"] 78 | # gene_metadata.ov[gene=="Fech"] 79 | # ov[gene=="Fech" & peak=="chr18:64485555-64486155"] 80 | # ov[peak=="chr7:103850833-103851433"] 81 | 82 | # Select nearest gene (ties are broken by keeping the first row per peak) 83 | ov_nearest <- ov %>% 84 | .[.[,.I[dist==min(dist)], by=c("peak")]$V1] %>% 85 | .[complete.cases(.)] %>% 86 | .[!duplicated(peak)] 87 | 88 | # Sanity check 89 | # ov_nearest$gene[(duplicated(ov_nearest$peak))] 90 | 91 | ########## 92 | ## Save ## 93 | ########## 94 | 95 | fwrite(ov, file.path(args$outdir,"peaks2genes_all.txt.gz"), sep="\t", na="NA") 96 | fwrite(ov_nearest, file.path(args$outdir,"peaks2genes_nearest.txt.gz"), sep="\t", na="NA") 97 | -------------------------------------------------------------------------------- /atac/archR/plot_individual_peaks/compare_genotypes/pseudobulk_with_replicates/plot_individual_peaks_genotypes_pseudobulk_with_replicates.R: -------------------------------------------------------------------------------- 1 | # here::i_am("atac/archR/processing/save_archr_matrices.R") 2 | 3 | 
source(here::here("settings.R")) 4 | source(here::here("utils.R")) 5 | 6 | ##################### 7 | ## Define settings ## 8 | ##################### 9 | 10 | # I/O 11 | io$pseudobulk_atac_peak_matrix <- file.path(io$basedir,"results/atac/archR/pseudobulk/celltype_genotype/PeakMatrix/PeakMatrix_pseudobulk_with_replicates.rds") 12 | io$outdir <- file.path(io$basedir,"results/atac/archR/plot_individual_peaks/genotype"); dir.create(io$outdir, showWarnings = F) 13 | 14 | # Options 15 | opts$samples <- c( 16 | "E8.5_CRISPR_T_KO", 17 | "E8.5_CRISPR_T_WT" 18 | ) 19 | 20 | opts$celltypes <- c("Somitic_mesoderm", "NMP", "Spinal_cord") 21 | 22 | #################### 23 | ## Load metadata ## 24 | #################### 25 | 26 | ########################## 27 | ## Load ATAC PeakMatrix ## 28 | ########################## 29 | 30 | atac_peak_matrix_pseudobulk.se <- readRDS(io$pseudobulk_atac_peak_matrix) 31 | 32 | # subset 33 | atac_peak_matrix_pseudobulk.se <- atac_peak_matrix_pseudobulk.se[,atac_peak_matrix_pseudobulk.se$celltype%in%opts$celltypes] 34 | 35 | # Normalise ATAC data 36 | assay(atac_peak_matrix_pseudobulk.se,"logcounts") <- log(1e6*(sweep(assay(atac_peak_matrix_pseudobulk.se),2,colSums(assay(atac_peak_matrix_pseudobulk.se),na.rm=T),"/"))+1) 37 | 38 | #################################################### 39 | ## Boxplots of chromatin accessibility (WT vs KO) ## 40 | #################################################### 41 | 42 | peaks.to.plot <- c("chr7:144884955-144885555","chr7:79789147-79789747","chr7:126785067-126785667") 43 | 44 | # i <- "chr7:144884955-144885555" 45 | for (i in peaks.to.plot) { 46 | 47 | to.plot <- data.table( 48 | acc = assay(atac_peak_matrix_pseudobulk.se,"logcounts")[i,], 49 | sample = colnames(atac_peak_matrix_pseudobulk.se), 50 | celltype = atac_peak_matrix_pseudobulk.se$celltype, 51 | genotype = atac_peak_matrix_pseudobulk.se$genotype 52 | ) %>% .[celltype=="Caudal_Mesoderm",celltype:="Somitic_mesoderm"] %>% 53 | 
.[,celltype_genotype:=sprintf("%s (%s)",celltype,genotype)] 54 | 55 | order <- c("Spinal_cord (WT)","Spinal_cord (T_KO)", "NMP (WT)", "NMP (T_KO)", "Somitic_mesoderm (WT)") 56 | to.plot[,celltype_genotype:=factor(celltype_genotype, levels=order)] 57 | 58 | my_comparisons <- list( c("NMP (WT)", "NMP (T_KO)")) 59 | 60 | to.plot.means <- to.plot[,.(acc=mean(acc),sd=sd(acc)), by=c("celltype_genotype","celltype","genotype")] 61 | 62 | p <- ggplot(to.plot, aes_string(x="celltype_genotype", y="acc", fill="genotype")) + 63 | geom_bar(stat="identity", color="black", alpha=1, data=to.plot.means) + 64 | geom_jitter(size=3, width=0.05, shape=21) + 65 | geom_errorbar(aes(ymin=acc-sd, ymax=acc+sd), width=0.15, alpha=1, size=0.6, data=to.plot.means) + 66 | stat_compare_means(aes(label = paste0("p = ", ..p.format..)), comparisons = my_comparisons, method="t.test") + 67 | # stat_summary(fun.data = give.n, geom = "text", position = position_dodge(width=0.75)) + 68 | scale_fill_manual(values=c("#EE0000","#1C86EE")) + 69 | labs(x="", y="Chromatin accessibility (log normalised counts)") + 70 | # geom_violin(aes(fill=celltype)) + 71 | theme_classic() + 72 | theme( 73 | axis.text.y = element_text(color="black"), 74 | axis.text.x = element_text(color="black"), 75 | legend.title = element_blank(), 76 | legend.position = "none" 77 | ) 78 | 79 | pdf(file.path(io$outdir,sprintf("boxplots_acc_genotype_%s.pdf",gsub(":","-",i))), width=6, height=6) 80 | print(p) 81 | dev.off() 82 | } 83 | -------------------------------------------------------------------------------- /atac/archR/processing/0_create_arrow_files.R: -------------------------------------------------------------------------------- 1 | here::i_am("atac/archR/processing/0_create_arrow_files.R") 2 | 3 | source(here::here("settings.R")) 4 | 5 | suppressPackageStartupMessages(library(ArchR)) 6 | 7 | ###################### 8 | ## Define arguments ## 9 | ###################### 10 | 11 | p <- ArgumentParser(description='') 12 | 
p$add_argument('--samples', type="character", nargs='+', help='Samples') 13 | p$add_argument('--fragments_files', type="character", nargs='+', help='ATAC Fragments files') 14 | p$add_argument('--genome', type="character", default="mm10", help='Genome') 15 | p$add_argument('--min_fragments', type="integer", default=1000, help='Minimum number of ATAC fragments') 16 | p$add_argument('--max_fragments', type="integer", default=1e7, help='Maximum number of ATAC fragments') 17 | p$add_argument('--min_tss_score', type="double", default=2.5, help='Minimum TSS score threshold') 18 | p$add_argument('--threads', type="integer", default=1, help='Number of threads') 19 | p$add_argument('--outdir', type="character", help='Output directory') 20 | 21 | args <- p$parse_args(commandArgs(TRUE)) 22 | 23 | ## START TEST ## 24 | # args$fragments_files <- c( 25 | # "/bi/group/reik/ricard/data/gastrulation_multiome_10x/original/E7.5_rep1/atac_fragments.tsv.gz", 26 | # "/bi/group/reik/ricard/data/gastrulation_multiome_10x/original/E7.5_rep2/atac_fragments.tsv.gz" 27 | # ) 28 | # args$samples <- c("E7.5_rep1","E7.5_rep2") 29 | # args$genome <- "mm10" 30 | # args$min_fragments <- 1000 31 | # args$max_fragments <- 1e7 32 | # args$min_tss_score <- 2.5 33 | # args$threads <- 1 34 | # args$outdir <- "/bi/group/reik/ricard/data/gastrulation_multiome_10x/processed/atac/archR" 35 | ## END TEST ## 36 | 37 | ##################### 38 | ## Define settings ## 39 | ##################### 40 | 41 | setwd(args$outdir) 42 | 43 | # ArchR options 44 | addArchRThreads(threads=args$threads) 45 | addArchRGenome(args$genome) 46 | 47 | rhdf5::h5disableFileLocking() 48 | 49 | ######################## 50 | ## create Arrow Files ## 51 | ######################## 52 | 53 | ArrowFiles <- createArrowFiles( 54 | inputFiles = args$fragments_files, 55 | sampleNames = args$samples, 56 | outputNames = args$samples, 57 | addTileMat = FALSE, 58 | addGeneScoreMat = FALSE, 59 | excludeChr = c("chrM", "chrY"), 60 | 61 | subThreading 
= FALSE, # parallel processing doesn't work well (https://github.com/GreenleafLab/ArchR/issues/248) 62 | force = TRUE, 63 | 64 | # QC metrics 65 | minFrags = args$min_fragments, # The minimum number of fragments per cell 66 | maxFrags = args$max_fragments, # The maximum number of fragments per cell 67 | minTSS = args$min_tss_score # The minimum TSS enrichment score per cell 68 | ) 69 | -------------------------------------------------------------------------------- /atac/archR/processing/1_create_archR_project.R: -------------------------------------------------------------------------------- 1 | here::i_am("atac/archR/processing/1_create_archR_project.R") 2 | 3 | source(here::here("settings.R")) 4 | 5 | suppressPackageStartupMessages(library(ArchR)) 6 | 7 | ###################### 8 | ## Define arguments ## 9 | ###################### 10 | 11 | p <- ArgumentParser(description='') 12 | p$add_argument('--arrow_files', type="character", nargs='+', help='Arrow files') 13 | p$add_argument('--genome', type="character", default="mm10", help='Genome') 14 | p$add_argument('--outdir', type="character", help='Output directory') 15 | 16 | args <- p$parse_args(commandArgs(TRUE)) 17 | 18 | ## START TEST ## 19 | # args$arrow_files <- c( 20 | # "/bi/group/reik/ricard/data/gastrulation_multiome_10x/processed/atac/archR_subset/ArrowFiles/E7.5_rep1.arrow", 21 | # "/bi/group/reik/ricard/data/gastrulation_multiome_10x/processed/atac/archR_subset/ArrowFiles/E7.5_rep2.arrow" 22 | # ) 23 | # args$genome <- "mm10" 24 | # args$threads <- 1 25 | # args$outdir <- "/bi/group/reik/ricard/data/gastrulation_multiome_10x/processed/atac/archR" 26 | ## END TEST ## 27 | 28 | ##################### 29 | ## Define settings ## 30 | ##################### 31 | 32 | # ArchR options 33 | addArchRGenome(args$genome) 34 | 35 | ############################ 36 | ## create an ArchRProject ## 37 | ############################ 38 | 39 | ArchRProject <- ArchRProject( 40 | ArrowFiles = args$arrow_files, 41 | 
outputDirectory = args$outdir, 42 | copyArrows = FALSE 43 | ) 44 | 45 | ########## 46 | ## Save ## 47 | ########## 48 | 49 | saveArchRProject(ArchRProject) -------------------------------------------------------------------------------- /atac/archR/processing/save_atac_anndata.R: -------------------------------------------------------------------------------- 1 | here::i_am("atac/archR/processing/save_atac_anndata.R") 2 | # (the path declared above must be this script's own location relative to the project root) 3 | source(here::here("settings.R")) 4 | source(here::here("utils.R")) 5 | 6 | suppressPackageStartupMessages(library(reticulate)) 7 | 8 | ###################### 9 | ## Define arguments ## 10 | ###################### 11 | 12 | p <- ArgumentParser(description='') 13 | p$add_argument('--python', type="character", help='') 14 | p$add_argument('--atac_matrix', type="character", help='') 15 | p$add_argument('--metadata', type="character", help='Cell metadata file') 16 | p$add_argument('--outfile', type="character", help='Output file') 17 | 18 | args <- p$parse_args(commandArgs(TRUE)) 19 | 20 | ## START TEST ## 21 | # io$basedir <- file.path(io$basedir,"test") 22 | # args <- list() 23 | # args$python = "/Users/argelagr/opt/anaconda3/envs/main/bin/python" # "/bi/group/reik/ricard/software/miniconda3/envs/main/bin/python" 24 | # args$metadata <- file.path(io$basedir,"results/atac/archR/celltype_assignment/sample_metadata_after_celltype_assignment.txt.gz") 25 | # args$atac_matrix <- file.path(io$basedir,"processed/atac/archR/Matrices/PeakMatrix_summarized_experiment.rds") 26 | # args$outfile <- file.path(io$basedir,"processed/atac/anndata/PeakMatrix_anndata.h5ad") 27 | ## END TEST ## 28 | # Create the output directory (and any missing parents) before anything is written 29 | dir.create(dirname(args$outfile), showWarnings = F, recursive = T) 30 | 31 | ##################################### 32 | ## Reticulate connection to scanpy ## 33 | ##################################### 34 | 35 | use_python(args$python, required=TRUE) 36 | sc <- import("scanpy") 37 | 38 | ##################### 39 | ## Define settings ## 40 | ##################### 41 | 
42 | ########################## 43 | ## Load sample metadata ## 44 | ########################## 45 | 46 | metadata.dt <- fread(args$metadata) %>% 47 | .[pass_rnaQC==TRUE & pass_atacQC==TRUE & doublet_call==FALSE & !is.na(celltype)] %>% 48 | .[,c("cell","sample","stage","genotype","celltype","nFrags_atac","nFeature_RNA")]# %>% 49 | # setnames("celltype.predicted","celltype") 50 | 51 | fwrite(metadata.dt, file.path(dirname(args$outfile),"cell_metadata.txt.gz"), sep="\t", quote=F, na="NA") 52 | 53 | ###################### 54 | ## Load atac matrix ## 55 | ###################### 56 | 57 | atac.se <- readRDS(args$atac_matrix)[,metadata.dt$cell] 58 | 59 | ############################################# 60 | ## Convert SingleCellExperiment to AnnData ## 61 | ############################################# 62 | 63 | adata <- sc$AnnData( 64 | X = t(assay(atac.se)), 65 | obs = as.data.frame(colData(atac.se)), 66 | var = as.data.frame(rowData(atac.se)) 67 | ) 68 | print(adata) 69 | print(head(adata$obs)) 70 | print(head(adata$var)) 71 | 72 | ########################## 73 | ## Parse anndata object ## 74 | ########################## 75 | 76 | adata$uns$update(celltype_colors = opts$celltype.colors[sort(unique(as.character(adata$obs$celltype)))]) 77 | adata$uns$update(stage_colors = opts$stage.colors[sort(unique(as.character(adata$obs$stage)))]) 78 | 79 | ########## 80 | ## Save ## 81 | ########## 82 | 83 | adata$write_h5ad(args$outfile) 84 | -------------------------------------------------------------------------------- /atac/archR/processing/save_atac_matrices.R: -------------------------------------------------------------------------------- 1 | here::i_am("atac/archR/processing/save_atac_matrices.R") 2 | 3 | source(here::here("settings.R")) 4 | source(here::here("utils.R")) 5 | 6 | suppressPackageStartupMessages(library(ArchR)) 7 | 8 | ###################### 9 | ## Define arguments ## 10 | ###################### 11 | 12 | p <- ArgumentParser(description='') 13 | 
p$add_argument('--archr_directory', type="character", help='ArchR directory') 14 | p$add_argument('--metadata', type="character", help='Cell metadata file') 15 | p$add_argument('--matrix', type="character", help='Matrix to save') 16 | p$add_argument('--outfile', type="character", help='Output file') 17 | 18 | args <- p$parse_args(commandArgs(TRUE)) 19 | 20 | ## START TEST ## 21 | # args <- list() 22 | # args$archr_directory <- file.path(io$basedir,"processed/atac/archR") 23 | # args$metadata <- file.path(io$basedir,"results/atac/archR/qc/sample_metadata_after_qc.txt.gz") 24 | # args$matrix <- "GeneScoreMatrix_TSS" 25 | # args$outfile <- file.path(io$basedir,sprintf("processed/atac/archR/Matrices/%s_summarized_experiment.rds",args$matrix)) 26 | ## END TEST ## 27 | 28 | print(args) 29 | 30 | # I/O 31 | dir.create(dirname(args$outfile), showWarnings=F) 32 | 33 | ################### 34 | ## Load metadata ## 35 | ################### 36 | 37 | cells_metadata.dt <- fread(args$metadata) %>% 38 | # .[pass_atacQC==TRUE & doublet_call==FALSE] 39 | .[pass_atacQC==TRUE] 40 | 41 | ######################## 42 | ## Load ArchR Project ## 43 | ######################## 44 | 45 | # source(here::here("atac/archR/load_archR_project.R")) 46 | 47 | setwd(args$archr_directory) 48 | 49 | addArchRGenome("mm10") 50 | addArchRThreads(threads = 1) 51 | 52 | ArchRProject <- loadArchRProject(args$archr_directory)[cells_metadata.dt$cell] 53 | 54 | # Sanity checks 55 | # mean(rownames(ArchRProject)%in%cells_metadata.dt$cell) 56 | # mean(cells_metadata.dt$cell%in%rownames(ArchRProject)) 57 | # table(cells_metadata.dt[!cell%in%rownames(ArchRProject),sample]) 58 | stopifnot(args$matrix %in% getAvailableMatrices(ArchRProject)) 59 | 60 | ################ 61 | ## PeakMatrix ## 62 | ################ 63 | 64 | if (args$matrix=="PeakMatrix") { 65 | 66 | atac.se <- getMatrixFromProject(ArchRProject, binarize = FALSE, useMatrix = "PeakMatrix") 67 | 68 | # Define peak names 69 | row_ranges.dt <- 
rowRanges(atac.se) %>% as.data.table %>% 70 | setnames("seqnames","chr") %>% 71 | .[,c("chr","start","end")] %>% 72 | .[,idx:=sprintf("%s:%s-%s",chr,start,end)] 73 | rownames(atac.se) <- row_ranges.dt$idx 74 | 75 | } 76 | 77 | ##################### 78 | ## GeneScoreMatrix ## 79 | ##################### 80 | 81 | if (grepl("GeneScoreMatrix",args$matrix)) { 82 | 83 | atac.se <- getMatrixFromProject(ArchRProject, binarize = FALSE, useMatrix = args$matrix) 84 | 85 | # Define gene names 86 | rownames(atac.se) <- rowData(atac.se)$name 87 | 88 | # Filter genes 89 | # atac.se <- atac.se[grep("^Rik|Rik$|^mt-|^Rps-|^Rpl-|^Gm|^Mir|^Olfr",rownames(atac.se),invert=T),] 90 | } 91 | 92 | ########## 93 | ## Save ## 94 | ########## 95 | 96 | # Sanity checks 97 | stopifnot(sum(duplicated(rownames(atac.se)))==0) 98 | 99 | saveRDS(atac.se, args$outfile) -------------------------------------------------------------------------------- /atac/archR/processing/update_archR_metadata.R: -------------------------------------------------------------------------------- 1 | ##################### 2 | ## Define settings ## 3 | ##################### 4 | 5 | here::i_am("atac/archR/processing/update_archR_metadata.R") 6 | 7 | source(here::here("settings.R")) 8 | source(here::here("utils.R")) 9 | 10 | 11 | ######################## 12 | ## Load ArchR project ## 13 | ######################## 14 | 15 | source(here::here("atac/archR/load_archR_project.R")) 16 | 17 | ############################################ 18 | ## Merge archR metadata with RNA metadata ## 19 | ############################################ 20 | 21 | # Fetch pre-computed archR's metadata 22 | io$archr.metadata <- paste0(io$basedir,"/processed/atac/archR/sample_metadata_after_archR.txt.gz") 23 | archr_metadata <- fread(io$archr.metadata) 24 | stopifnot(all(rownames(ArchRProject) %in% archr_metadata$cell)) 25 | # cols.to.rename <- c("TSSEnrichment","ReadsInTSS","PromoterRatio","NucleosomeRatio","nFrags","BlacklistRatio") 26 | # 
idx.cols.to.rename <- which(colnames(archr_metadata)%in%cols.to.rename) 27 | # colnames(archr_metadata)[idx.cols.to.rename] <- paste0(colnames(archr_metadata)[idx.cols.to.rename], "_atac") 28 | 29 | # Fetch the metadata file of interest 30 | io$updated.metadata <- paste0(io$basedir,"/sample_metadata.txt.gz") 31 | updated_metadata <- fread(io$updated.metadata) 32 | colnames(updated_metadata) 33 | 34 | # remove overlapping columns in the archR metadata (the merge keys sample/cell/barcode are kept) 35 | overlaping.columns <- intersect(colnames(updated_metadata),colnames(archr_metadata)) 36 | overlaping.columns <- overlaping.columns[!overlaping.columns%in%c("sample","cell","barcode")] 37 | archr_metadata <- archr_metadata[,which(!colnames(archr_metadata)%in%overlaping.columns),with=F] 38 | 39 | ########### 40 | ## Merge ## 41 | ########### 42 | 43 | foo <- updated_metadata %>% 44 | merge(archr_metadata, by=c("cell","sample","barcode"), all=TRUE) 45 | 46 | ############################# 47 | ## Update ArchR's metadata ## 48 | ############################# 49 | # Subset to the cells in the ArchR project and order the rows like the project 50 | bar <- foo %>% 51 | .[cell%in%rownames(ArchRProject)] %>% setkey(cell) %>% .[rownames(ArchRProject)] %>% 52 | as.data.frame() %>% tibble::column_to_rownames("cell") 53 | # "cell" now lives in the row names (bar$cell is NULL, which would make the check vacuous), so compare row names 54 | stopifnot(all(rownames(bar) == rownames(getCellColData(ArchRProject)))) 55 | 56 | for (i in colnames(bar)) { 57 | ArchRProject <- addCellColData( 58 | ArchRProject, 59 | data = bar[[i]], 60 | name = i, 61 | cells = rownames(bar), 62 | force = TRUE 63 | ) 64 | } 65 | 66 | colnames(getCellColData(ArchRProject)) 67 | 68 | ########## 69 | ## Save ## 70 | ########## 71 | 72 | io$metadata.out <- paste0(io$basedir,"/processed/atac/archR/sample_metadata_after_archR.txt.gz") 73 | fwrite(foo, io$metadata.out, sep="\t", na="NA", quote=F) 74 | 75 | saveArchRProject(ArchRProject) 76 | -------------------------------------------------------------------------------- /atac/archR/pseudobulk/1_archR_add_GroupCoverage.R: -------------------------------------------------------------------------------- 1 | # 
https://www.ArchRProject.com/bookdown/how-does-archr-make-pseudo-bulk-replicates.html 2 | here::i_am("atac/archR/pseudobulk/1_archR_add_GroupCoverage.R") 3 | 4 | source(here::here("settings.R")) 5 | source(here::here("utils.R")) 6 | 7 | suppressPackageStartupMessages(library(ArchR)) 8 | 9 | ###################### 10 | ## Define arguments ## 11 | ###################### 12 | 13 | p <- ArgumentParser(description='') 14 | p$add_argument('--archr_directory', type="character", help='ArchR directory') 15 | p$add_argument('--metadata', type="character", help='metadata file') 16 | p$add_argument('--group_by', type="character", help='Metadata column to group by') 17 | p$add_argument('--min_cells', type="integer", default=50, help='Minimum number of cells') 18 | p$add_argument('--max_cells', type="integer", default=1000, help='Maximum number of cells') 19 | p$add_argument('--threads', type="integer", default=1, help='Number of threads') 20 | 21 | args <- p$parse_args(commandArgs(TRUE)) 22 | 23 | ##################### 24 | ## Define settings ## 25 | ##################### 26 | 27 | ## START TEST ## 28 | # args$metadata <- file.path(io$basedir,"results_new/atac/archR/qc/sample_metadata_after_qc.txt.gz") 29 | # args$group_by <- "celltype.mapped_mnn" 30 | # args$min_cells <- 100 31 | # args$max_cells <- 5000 32 | # args$threads <- 1 33 | ## END TEST ## 34 | 35 | ######################## 36 | ## Load cell metadata ## 37 | ######################## 38 | 39 | sample_metadata <- fread(args$metadata) %>% 40 | .[pass_atacQC==TRUE & doublet_call==FALSE & genotype=="WT"] 41 | stopifnot(args$group_by%in%colnames(sample_metadata)) 42 | sample_metadata <- sample_metadata[!is.na(sample_metadata[[args$group_by]])] 43 | 44 | # Filter celltypes by minimum number of cells 45 | sample_metadata <- sample_metadata[,N:=.N,by=c(args$group_by)] %>% .[N>=args$min_cells] %>% .[,N:=NULL] 46 | 47 | ######################## 48 | ## Load ArchR project ## 49 | ######################## 50 | 51 | # 
source(here::here("atac/archR/load_archR_project.R")) 52 | 53 | setwd(args$archr_directory) 54 | 55 | addArchRGenome("mm10") 56 | addArchRThreads(threads = args$threads) 57 | 58 | ArchRProject <- loadArchRProject(args$archr_directory)[sample_metadata$cell] 59 | 60 | ########################### 61 | ## Update ArchR metadata ## 62 | ########################### 63 | 64 | sample_metadata.to.archr <- sample_metadata %>% 65 | .[cell%in%rownames(ArchRProject)] %>% setkey(cell) %>% .[rownames(ArchRProject)] %>% 66 | as.data.frame() %>% tibble::column_to_rownames("cell") 67 | 68 | stopifnot(all(rownames(sample_metadata.to.archr) == rownames(getCellColData(ArchRProject)))) 69 | ArchRProject <- addCellColData( 70 | ArchRProject, 71 | data = sample_metadata.to.archr[[args$group_by]], 72 | name = args$group_by, 73 | cells = rownames(sample_metadata.to.archr), 74 | force = TRUE 75 | ) 76 | 77 | # print cell numbers 78 | table(getCellColData(ArchRProject,args$group_by)[[1]]) 79 | 80 | ######################### 81 | ## Add Group Coverages ## 82 | ######################### 83 | 84 | # Check if group Coverages already exist 85 | # ArchRProject@projectMetadata$GroupCoverages 86 | 87 | # This function will merge cells within each designated cell group for the generation of pseudo-bulk replicates 88 | # and then merge these replicates into a single insertion coverage file. 
89 | # Output: creates files in archR/GroupCoverages/celltype: [X]._.Rep[Y].insertions.coverage.h5 90 | ArchRProject <- addGroupCoverages(ArchRProject, 91 | groupBy = args$group_by, 92 | useLabels = FALSE, # do not use sample information 93 | minCells = args$min_cells, 94 | maxCells = args$max_cells, 95 | force = TRUE 96 | ) 97 | 98 | ########## 99 | ## Save ## 100 | ########## 101 | # Write the project metadata into the ArchR directory the project was loaded from (args$archr_directory) 102 | saveRDS(ArchRProject@projectMetadata, file.path(args$archr_directory,"projectMetadata.rds")) -------------------------------------------------------------------------------- /atac/archR/snakemake/README.txt: -------------------------------------------------------------------------------- 1 | snakemake --cores 1 2 | snakemake --cores 1 --dry-run -p 3 | snakemake --cores 10 -j 99 --latency-wait 90 -p --cluster "sbatch -n {threads} --mem {resources.mem_mb}M" 4 | 5 | -------------------------------------------------------------------------------- /atac/archR/snakemake/run_cluster.sh: -------------------------------------------------------------------------------- 1 | snakemake --cores 15 -j 99 --latency-wait 90 -p --cluster "sbatch -n {threads} --mem {resources.mem_mb}M" -------------------------------------------------------------------------------- /atac/archR/snakemake/run_cluster_single.sh: -------------------------------------------------------------------------------- 1 | snakemake --cores 1 -j 1 --latency-wait 90 -p --cluster "sbatch -n {threads} --mem {resources.mem_mb}M" 2 | -------------------------------------------------------------------------------- /gastrulation_multiome_10x.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 
-------------------------------------------------------------------------------- /images/igv_screenshot_github.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rargelaguet/mouse_organogenesis_10x_multiome_publication/3ee0ba0ae5fbdf6817ef1d341ff483b3028c085f/images/igv_screenshot_github.png -------------------------------------------------------------------------------- /images/overview_github.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rargelaguet/mouse_organogenesis_10x_multiome_publication/3ee0ba0ae5fbdf6817ef1d341ff483b3028c085f/images/overview_github.png -------------------------------------------------------------------------------- /load_paga_graph.R: -------------------------------------------------------------------------------- 1 | library(GGally) 2 | library(network) 3 | library(sna) 4 | library(ggraph) 5 | library(igraph) 6 | library(tidygraph) 7 | # Both tables are subset/ordered by opts$celltypes so that rows line up across objects 8 | connectivity.mtx <- fread(io$paga.connectivity) %>% 9 | matrix.please %>% .[opts$celltypes,opts$celltypes] 10 | 11 | df.coordinates <- fread(io$paga.coordinates) %>% 12 | matrix.please %>% .[opts$celltypes,] 13 | 14 | # Parse data: binarise the connectivities at a 0.20 threshold 15 | connectivity.mtx[connectivity.mtx<0.20] <- 0 16 | connectivity.mtx[connectivity.mtx>=0.20] <- 1 17 | 18 | # Create igraph object 19 | igraph.paga <- graph_from_adjacency_matrix(connectivity.mtx, mode = "undirected") 20 | 21 | # Create tbl_graph object 22 | igraph.paga.tbl <- as_tbl_graph(igraph.paga) %>% 23 | activate(nodes) %>% 24 | mutate(celltype=rownames(connectivity.mtx)) %>% 25 | mutate(x=df.coordinates[,1]) %>% mutate(y=df.coordinates[,2]) 26 | 27 | # Create network object; the x/y vertex attributes come from the coordinates table (as for the tbl_graph above), not from the adjacency matrix 28 | net.paga = network(connectivity.mtx) 29 | net.paga %v% "x" = df.coordinates[, 1] 30 | net.paga %v% "y" = df.coordinates[, 2] 31 | 32 | ########## 33 | ## TEST ## 34 | ########## 35 | 36 | # sum(connectivity.mtx==1) 37 | # 
connectivity.mtx["Epiblast","Rostral_neurectoderm"] 38 | # igraph.paga.tbl %>% activate(edges) %>% as.data.table() %>% nrow 39 | # filter(celltype=="Epiblast") 40 | -------------------------------------------------------------------------------- /rna/TF2gene_coexpression/utils.R: -------------------------------------------------------------------------------- 1 | library(Matrix) 2 | library(qlcMatrix) 3 | set.seed(42) 4 | 5 | #' Replace non-zero entries in a sparse matrix with their (shifted) ranks 6 | #' 7 | #' This method creates a rank matrix for a sparse matrix X using the following approach: 8 | #' 1. Use the non-zero entries in a column to calculate the ranks 9 | #' 2. Add (z-1)/2 to the ranks (only non-zero entries are changed). z is the number of zeros 10 | #' in the column 11 | #' Since all the entries are shifted by the same constant (the zeros 12 | #' are already shifted), the covariance matrix of this shifted matrix is 13 | #' the same as the covariance matrix of the rank matrix of the entire matrix (where the zeros would 14 | #' all have a rank = (z+1)/2), where z is the number of zeros 15 | #' 16 | #' This rank matrix can then be used to calculate pearson correlation 17 | 18 | SparsifiedRanks <- function(X) { 19 | X <- as(object = X, Class = "dgCMatrix") 20 | j <- summary(object = X)$j 21 | n_zeros_per_col <- nrow(X) - diff(X@p) 22 | # Loop over the columns that have at least one stored entry and overwrite those entries with their shifted ranks 23 | for (column in unique(x = j)) { 24 | non_zero_element_index <- which(j == column) 25 | elements_along_row <- X@x[non_zero_element_index] 26 | ranks <- rank(elements_along_row) 27 | ranks <- ranks + (n_zeros_per_col[column] - 1) / 2 28 | X@x[non_zero_element_index] <- ranks 29 | } 30 | return(X) 31 | } 32 | # Correlation of the sparsified rank matrices of X (and Y, if given), computed with corSparse 33 | SparseSpearmanCor <- function(X, Y = NULL, cov = FALSE) { 34 | 35 | # Get sparsified ranks 36 | rankX <- SparsifiedRanks(X) 37 | if (is.null(Y)){ 38 | # Calculate pearson correlation on rank matrices 39 | return (corSparse(X=rankX, cov=cov)) 40 | } 41 | rankY <- SparsifiedRanks(Y) 42 | return(corSparse( X = rankX, Y = rankY, cov = cov)) 43 | 
} 44 | # Vectorised variant of SparsifiedRanks: ranks the non-zero entries of each column and adds (z-1)/2, 45 | # where z is the number of zeros in that column. Returns a dgCMatrix with the same sparsity pattern as X. 46 | SparsifiedRanks2 <- function(X) { 47 | if (class(X)[1] != "dgCMatrix") { 48 | X <- as(object = X, Class = "dgCMatrix") 49 | } 50 | non_zeros_per_col <- diff(x = X@p) 51 | n_zeros_per_col <- nrow(x = X) - non_zeros_per_col 52 | offsets <- (n_zeros_per_col - 1) / 2 53 | x <- X@x 54 | ## split entries to columns; the factor keeps all-zero columns as empty groups so that offsets[i] stays aligned with column i 55 | col_lst <- split(x = x, f = factor(rep.int(seq_len(ncol(X)), non_zeros_per_col), levels = seq_len(ncol(X)))) 56 | ## calculate sparsified ranks and do shifting 57 | sparsified_ranks <- unlist(x = lapply(X = seq_along(col_lst), FUN = function(i) rank(x = col_lst[[i]]) + offsets[i])) 58 | ## Create template rank matrix 59 | X.ranks <- X 60 | X.ranks@x <- sparsified_ranks 61 | return(X.ranks) 62 | } 63 | # Correlation of the SparsifiedRanks2 rank matrices of X (and Y, if given), computed with corSparse 64 | 65 | SparseSpearmanCor2 <- function(X, Y = NULL, cov = FALSE) { 66 | 67 | # Get sparsified ranks 68 | rankX <- SparsifiedRanks2(X) 69 | if (is.null(Y)){ 70 | # Calculate pearson correlation on rank matrices 71 | return (corSparse(X=rankX, cov=cov)) 72 | } 73 | rankY <- SparsifiedRanks2(Y) 74 | return(corSparse( X = rankX, Y = rankY, cov = cov)) 75 | } -------------------------------------------------------------------------------- /rna/conversions/convert_anndata_to_SingleCellExperiment.R: -------------------------------------------------------------------------------- 1 | here::i_am("rna/conversions/convert_anndata_to_SingleCellExperiment.R") 2 | 3 | # Load default settings 4 | source(here::here("settings.R")) 5 | source(here::here("utils.R")) 6 | 7 | suppressPackageStartupMessages(library(reticulate)) 8 | suppressPackageStartupMessages(library(scuttle)) 9 | library(zellkonverter) 10 | 11 | ################################ 12 | ## Initialize argument parser ## 13 | ################################ 14 | 15 | p <- ArgumentParser(description='') 16 | p$add_argument('--python_path', type="character", help='Python path for reticulate') 17 | p$add_argument('--anndata', type="character", help='Anndata input file') 18 | p$add_argument('--outfile', type="character", 
help='SingleCellExperiment output file') 19 | args <- p$parse_args(commandArgs(TRUE)) 20 | 21 | ## START TEST ## 22 | # args <- list() 23 | # args$python_path <- "/bi/group/reik/ricard/software/miniconda3/envs/main/bin/python" # "/Users/argelagr/opt/anaconda3/envs/main/bin/python" 24 | # args$anndata <- file.path(io$basedir,"processed/rna/velocyto/anndata_velocyto.h5ad") 25 | # args$outfile <- file.path(io$basedir,"processed/rna/velocyto/SingleCellExperiment_velocyto.rds") 26 | ## END TEST ## 27 | 28 | ################ 29 | ## Reticulate ## 30 | ################ 31 | 32 | reticulate::use_python(args$python_path, required = TRUE) 33 | 34 | sc <- import("scanpy") 35 | 36 | ############################################ 37 | ## Load anndata into SingleCellExperiment ## 38 | ############################################ 39 | 40 | sce <- readH5AD(args$anndata, use_hdf5 = FALSE, reader = "python") 41 | 42 | print("Overview of colData") 43 | head(colData(sce)) 44 | 45 | print("Overview of rowData") 46 | head(rowData(sce)) 47 | 48 | # Set gene names (fall back to the "gene" column of rowData when the object has no row names) 49 | if (is.null(rownames(sce))) { 50 | if ("gene"%in%colnames(rowData(sce))) { 51 | rownames(sce) <- rowData(sce)$gene 52 | } 53 | } 54 | stopifnot(!is.null(rownames(sce))) 55 | print("Overview of gene names") 56 | head(rownames(sce)) 57 | 58 | # Set cell names (fall back to the "cell" column of colData when the object has no column names) 59 | if (is.null(colnames(sce))) { 60 | if ("cell"%in%colnames(colData(sce))) { 61 | colnames(sce) <- colData(sce)$cell 62 | } 63 | } 64 | stopifnot(!is.null(colnames(sce))) 65 | print("Overview of cell names") 66 | head(colnames(sce)) 67 | 68 | # set assay names 69 | # assayNames(sce) <- "counts" 70 | assayNames(sce) <- c("counts","spliced","unspliced") 71 | print("Overview of counts") 72 | counts(sce)[1:10,1:10] 73 | 74 | # reducedDims 75 | 76 | ########## 77 | ## Save ## 78 | ########## 79 | 80 | saveRDS(sce, args$outfile) 81 | -------------------------------------------------------------------------------- /rna/differential/cells/parse_differential_results.R: 
--------------------------------------------------------------------------------
# Collect the per-comparison differential expression files ("<A>_vs_<B>.txt.gz")
# into one results table plus one table of per-comparison statistics.
here::i_am("rna/differential/cells/parse_differential_results.R")

# Load default settings
source(here::here("settings.R"))

######################
## Define arguments ##
######################

p <- ArgumentParser(description='')
p$add_argument('--diff_results_dir', type="character", help='File')
p$add_argument('--min_cells', type="integer", default=5, help='Minimum number of cells per group')
p$add_argument('--outdir', type="character", help='File')
args <- p$parse_args(commandArgs(TRUE))

## START TEST ##
# io$basedir <- file.path(io$basedir,"test")
# args <- list()
# args$diff_results_dir <- file.path(io$basedir,"results/rna/differential/metacells/celltype")
# args$min_cells <- 5
# args$outdir <- file.path(io$basedir,"results/rna/differential/metacells/celltype")
## END TEST ##

# I/O
dir.create(args$outdir, showWarnings = F, recursive=T)

#########################
## Load and parse data ##
#########################

# Zero-row template: keeps the five columns (and their types) in the stats file
# even when no comparison file is found
stats_template <- data.table(celltypeA=character(), celltypeB=character(), groupA_N=integer(), groupB_N=integer(), included=logical())
stats_list <- list()
diff_results_list <- list()

# i <- "Visceral_endoderm"; j <- "Surface_ectoderm"
for (i in seq_along(opts$celltypes)) {
  for (j in seq_along(opts$celltypes)) {

    # Visit each unordered pair once, as (earlier, later) in opts$celltypes
    if (j <= i) next

    name_A <- opts$celltypes[[i]]
    name_B <- opts$celltypes[[j]]
    comparison <- sprintf("%s_vs_%s", name_A, name_B)

    file <- file.path(args$diff_results_dir, sprintf("%s.txt.gz", comparison))
    if (!file.exists(file)) {
      print(sprintf("%s not found...",file))
      next
    }

    tmp <- fread(file) %>%
      # setnames(c("gene", "logFC", "padj_fdr", "groupA_N", "groupB_N", "groupA_detection_rate", "groupB_detection_rate")) %>%
      .[,c("celltypeA","celltypeB"):=list(name_A,name_B)]

    # Empty file (not enough cells to do DE): recorded in the stats, excluded from the results
    has_results <- nrow(tmp)>1
    stats_list[[comparison]] <- data.table(
      celltypeA = name_A,
      celltypeB = name_B,
      groupA_N = tmp[,groupA_N][1],
      groupB_N = tmp[,groupB_N][1],
      included = has_results
    )
    if (has_results) {
      diff_results_list[[comparison]] <- tmp
    }
  }
}

stats.dt <- rbindlist(c(list(stats_template), stats_list))

# if (tmp[celltypeA==opts$celltypes[[i]],groupA_N][1]>=args$min_cells & tmp[celltypeB==opts$celltypes[[j]],groupB_N][1]>=args$min_cells) {

##########
## Save ##
##########

fwrite(stats.dt, file.path(args$outdir,"diff_expr_stats.txt.gz"), sep="\t", quote=F, na="NA")
fwrite(rbindlist(diff_results_list), file.path(args$outdir,"diff_expr_results.txt.gz"), sep="\t", quote=F, na="NA")


--------------------------------------------------------------------------------
/rna/differential/metacells/parse_differential_results.R:
--------------------------------------------------------------------------------
here::i_am("rna/differential/metacells/parse_differential_results.R")

# Load default settings
source(here::here("settings.R"))
# source(here::here("utils.R"))

######################
## Define arguments ##
######################

p <- ArgumentParser(description='')
p$add_argument('--diff_results_dir', type="character", help='File')
p$add_argument('--min_cells', type="integer", default=5, help='Minimum number of cells per group')
p$add_argument('--outdir', type="character", help='File')
args <- p$parse_args(commandArgs(TRUE))

## START TEST ##
# io$basedir <- file.path(io$basedir,"test")
# args <- list()
# args$diff_results_dir <- file.path(io$basedir,"results/rna/differential/metacells/celltype")
# args$min_cells <- 5
# args$outdir <- file.path(io$basedir,"results/rna/differential/metacells/celltype")
## END TEST ##

# I/O
dir.create(args$outdir, showWarnings = F, recursive = T)

#########################
## Load and parse data ##
#########################

# Zero-row template: keeps the five columns (and their types) in the stats file
# even when no comparison file is found
stats_template <- data.table(celltypeA=character(), celltypeB=character(), groupA_N=integer(), groupB_N=integer(), included=logical())
stats_list <- list()
diff_results_list <- list()

# i <- "Visceral_endoderm"; j <- "Surface_ectoderm"
for (i in seq_along(opts$celltypes)) {
  for (j in seq_along(opts$celltypes)) {

    # Visit each unordered pair once, as (earlier, later) in opts$celltypes
    if (j <= i) next

    name_A <- opts$celltypes[[i]]
    name_B <- opts$celltypes[[j]]
    comparison <- sprintf("%s_vs_%s", name_A, name_B)

    file <- file.path(args$diff_results_dir, sprintf("%s.txt.gz", comparison))
    if (!file.exists(file)) {
      print(sprintf("%s not found...",file))
      next
    }

    tmp <- fread(file) %>%
      # setnames(c("gene", "logFC", "padj_fdr", "groupA_N", "groupB_N", "groupA_detection_rate", "groupB_detection_rate")) %>%
      .[,c("celltypeA","celltypeB"):=list(name_A,name_B)]

    # Empty file (not enough cells to do DE): recorded in the stats, excluded from the results
    has_results <- nrow(tmp)>1
    stats_list[[comparison]] <- data.table(
      celltypeA = name_A,
      celltypeB = name_B,
      groupA_N = tmp[,groupA_N][1],
      groupB_N = tmp[,groupB_N][1],
      included = has_results
    )
    if (has_results) {
      diff_results_list[[comparison]] <- tmp
    }
  }
}

stats.dt <- rbindlist(c(list(stats_template), stats_list))

# if (tmp[celltypeA==opts$celltypes[[i]],groupA_N][1]>=args$min_cells & tmp[celltypeB==opts$celltypes[[j]],groupB_N][1]>=args$min_cells) {

##########
## Save ##
##########

fwrite(stats.dt, file.path(args$outdir,"diff_expr_stats.txt.gz"), sep="\t", quote=F, na="NA")
fwrite(rbindlist(diff_results_list), file.path(args$outdir,"diff_expr_results.txt.gz"), sep="\t", quote=F, na="NA")


--------------------------------------------------------------------------------
/rna/differential/other/extract_TFs_diff.R:
--------------------------------------------------------------------------------
# Subset a parsed differential expression table to transcription factors
here::i_am("rna/differential/other/extract_TFs_diff.R")

# Load default settings
source(here::here("settings.R"))

######################
## Define arguments ##
######################

p <- ArgumentParser(description='')
p$add_argument('--TFs', type="character", help='Cell metadata file')
p$add_argument('--diff_results', type="character", help='File')
p$add_argument('--outfile', type="character", help='File')
args <- p$parse_args(commandArgs(TRUE))

## START TEST ##
# io$basedir <- file.path(io$basedir,"test")
# args <- list()
# args$TFs <- "/Users/argelagr/data/mm10_regulation/TFs/TFs.txt"
# args$diff_results <- file.path(io$basedir,"results/rna/differential/metacells/celltype/parsed/diff_expr_results.txt.gz")
# args$outfile <- file.path(io$basedir,"results/rna/differential/metacells/celltype/parsed/diff_expr_results_tfs.txt.gz")
## END TEST ##

# recursive: nested output directories would otherwise fail silently (warnings are off)
dir.create(dirname(args$outfile), showWarnings = F, recursive = T)

##############
## Load TFs ##
##############

# The first column of the TF file is converted to Title case before matching against "gene"
# TFs <- fread(args$TFs)[["gene"]]
TFs <- fread(args$TFs)[[1]] %>% str_to_title

################################################
## Load differential expression and fetch TFs ##
################################################

# Keep TF rows only, then store the gene names in upper case
diff_results.dt <- fread(args$diff_results) %>%
  .[gene%in%TFs] %>%
  .[,gene:=toupper(gene)]

# diff_tf.dt <- opts$celltypes %>% map(function(i) {
#   opts$celltypes %>% map(function(j) {
#     if (i!=j) {
#       file <- file.path(args$diff_results_dir,sprintf("%s_vs_%s.txt.gz",i,j))
#       if (file.exists(file)) {
#         fread(file, select=c(1,2,4,6,7,8,9)) %>%
#           setnames(c("gene", "logFC", "padj_fdr", "groupA_N", "groupB_N", "groupA_detection_rate", "groupB_detection_rate")) %>%
#           .[gene%in%TFs] %>% .[,c("celltypeA","celltypeB"):=list(i,j)]
#       }
#     }
#   }) %>% rbindlist
# }) %>% rbindlist %>% .[,gene:=toupper(gene)]

print(sprintf("Number of TFs in the differential expression results: %s",length(unique(diff_results.dt$gene))))

##########
## Save ##
##########

fwrite(diff_results.dt, args$outfile, sep="\t", quote=F, na="NA")


--------------------------------------------------------------------------------
/rna/differential/pseudobulk/celltype/analysis/old/extract_TFs_diff.R:
--------------------------------------------------------------------------------
# Subset a parsed differential expression table to transcription factors
here::i_am("rna/differential/pseudobulk/celltype/analysis/old/extract_TFs_diff.R")

# Load default settings
source(here::here("settings.R"))

######################
## Define arguments ##
######################

p <- ArgumentParser(description='')
p$add_argument('--TFs', type="character", help='Cell metadata file')
p$add_argument('--diff_results', type="character", help='File')
p$add_argument('--outfile', type="character", help='File')
args <- p$parse_args(commandArgs(TRUE))

## START TEST ##
# io$basedir <- file.path(io$basedir,"test")
# args <- list()
# args$TFs <- "/Users/argelagr/data/mm10_regulation/TFs/TFs.txt"
# args$diff_results <- file.path(io$basedir,"results/rna/differential/metacells/celltype/parsed/diff_expr_results.txt.gz")
# args$outfile <- file.path(io$basedir,"results/rna/differential/metacells/celltype/parsed/diff_expr_results_tfs.txt.gz")
## END TEST ##

# recursive: nested output directories would otherwise fail silently (warnings are off)
dir.create(dirname(args$outfile), showWarnings = F, recursive = T)

##############
## Load TFs ##
##############

# The first column of the TF file is converted to Title case before matching against "gene"
# TFs <- fread(args$TFs)[["gene"]]
TFs <- fread(args$TFs)[[1]] %>% str_to_title

################################################
## Load differential expression and fetch TFs ##
################################################

# The command-line flag is --diff_results, so the parsed value lives in args$diff_results
diff_results.dt <- fread(args$diff_results) %>%
  .[gene%in%TFs] %>% .[,gene:=toupper(gene)]

# diff_tf.dt <- opts$celltypes %>% map(function(i) {
#   opts$celltypes %>% map(function(j) {
#     if (i!=j) {
#       file <- file.path(args$diff_results_dir,sprintf("%s_vs_%s.txt.gz",i,j))
#       if (file.exists(file)) {
#         fread(file, select=c(1,2,4,6,7,8,9)) %>%
#           setnames(c("gene", "logFC", "padj_fdr", "groupA_N", "groupB_N", "groupA_detection_rate", "groupB_detection_rate")) %>%
#           .[gene%in%TFs] %>% .[,c("celltypeA","celltypeB"):=list(i,j)]
#       }
#     }
#   }) %>% rbindlist
# }) %>% rbindlist %>% .[,gene:=toupper(gene)]

print(sprintf("Number of TFs in the differential expression results: %s",length(unique(diff_results.dt$gene))))

##########
## Save ##
##########

fwrite(diff_results.dt, args$outfile, sep="\t", quote=F, na="NA")


--------------------------------------------------------------------------------
/rna/differential/pseudobulk/celltype/analysis/plot_marker_genes_stats.R:
--------------------------------------------------------------------------------
# Summary plots for the filtered marker gene table: markers per cell type and
# how many cell types share each marker gene.

# Load default settings
source(here::here("settings.R"))
source(here::here("utils.R"))

#####################
## Define settings ##
#####################

io$basedir <- file.path(io$basedir,"test")
io$marker_genes <- file.path(io$basedir,"results/rna/differential/pseudobulk/celltype/parsed/marker_genes_filtered.txt.gz")
io$outdir <- file.path(io$basedir,"results/rna/differential/pseudobulk/celltype/parsed/pdf"); dir.create(io$outdir, showWarnings = F, recursive = T)

opts$celltypes <- c(
  "Epiblast",
  "Primitive_Streak",
  "Caudal_epiblast",
  "PGC",
  "Anterior_Primitive_Streak",
  "Notochord",
  "Def._endoderm",
  "Gut",
  "Nascent_mesoderm",
  "Mixed_mesoderm",
  "Intermediate_mesoderm",
  "Caudal_Mesoderm",
  "Paraxial_mesoderm",
  "Somitic_mesoderm",
  "Pharyngeal_mesoderm",
  "Cardiomyocytes",
  "Allantois",
  "ExE_mesoderm",
  "Mesenchyme",
  "Haematoendothelial_progenitors",
  "Endothelium",
  "Blood_progenitors_1",
  "Blood_progenitors_2",
  "Erythroid1",
  "Erythroid2",
  "Erythroid3",
  "NMP",
  "Rostral_neurectoderm",
  # "Caudal_neurectoderm",
  "Neural_crest",
  "Forebrain_Midbrain_Hindbrain",
  "Spinal_cord",
  "Surface_ectoderm"
  # "Visceral_endoderm"
  # "ExE_endoderm",
  # "ExE_ectoderm"
  # "Parietal_endoderm"
)

###############################
## Load differential results ##
###############################

markers_genes.dt <- fread(io$marker_genes) %>% .[celltype%in%opts$celltypes]

################################################
## Plot number of marker genes per cell types ##
################################################

to.plot <- markers_genes.dt %>% .[,.N,by=c("celltype")]

p <- ggbarplot(to.plot, x="celltype", y="N", fill="celltype") +
  scale_fill_manual(values=opts$celltype.colors) +
  labs(x="", y="Number of marker genes") +
  theme(
    axis.text.y = element_text(size=rel(0.65)),
    axis.text.x = element_text(colour="black",size=rel(0.7), angle=90, hjust=1, vjust=0.5),
    axis.title = element_text(colour="black",size=rel(0.75)),
    axis.ticks.x = element_blank(),
    legend.position = "none"
  )

pdf(file.path(io$outdir,"barplot_number_marker_genes.pdf"), width = 6, height = 4)
print(p)
dev.off()

##################################
## Plot gene marker exclusivity ##
##################################

# Nx: number of cell types in which a gene is a marker; Ny: number of genes with that Nx
to.plot <- markers_genes.dt %>%
  .[,.(Nx=.N),by="gene"] %>%
  .[,Nx:=factor(Nx)] %>%
  .[,.(Ny=.N),by="Nx"]

p <- ggbarplot(to.plot, x="Nx", y="Ny", fill="gray70") +
  labs(x="Number of different cell types per marker gene", y="") +
  theme(
    axis.text = element_text(size=rel(0.75))
  )

pdf(file.path(io$outdir,"boxplot_exclusivity_per_gene.pdf"), width = 7, height = 5)
print(p)
dev.off()

################################################
## Plot gene marker exclusivity per cell type ##
################################################

# N is added by reference: number of cell types sharing each marker gene
to.plot <- markers_genes.dt %>% .[,N:=.N,by="gene"]

p <- ggboxplot(to.plot, x="celltype", y="N", fill="celltype", color="black") +
  scale_fill_manual(values=opts$celltype.colors) +
  labs(x="", y="Exclusivity of gene markers\n(the smaller the more exclusive)") +
  theme(
    axis.text.y = element_text(size=rel(0.75)),
    axis.title.y = element_text(size=rel(0.85)),
    axis.text.x = element_text(colour="black",size=rel(0.7), angle=90, hjust=1, vjust=0.5),
    legend.position = "none"
  )

pdf(file.path(io$outdir,"boxplot_exclusivity_per_celltype.pdf"), width = 9, height = 5)
print(p)
dev.off()



--------------------------------------------------------------------------------
/rna/differential/pseudobulk/celltype/differential_celltype_pseudobulk.R:
--------------------------------------------------------------------------------
here::i_am("rna/differential/pseudobulk/celltype/differential_celltype_pseudobulk.R")

# Load default settings
source(here::here("settings.R"))
source(here::here("utils.R"))

suppressMessages(library(edgeR))
suppressMessages(library(scater))

######################
## Define arguments ##
######################

p <- ArgumentParser(description='')
p$add_argument('--sce', type="character", help='SingleCellExperiment file')
p$add_argument('--groupA', type="character", help='group A')
p$add_argument('--groupB', type="character", help='group B')
p$add_argument('--outfile', type="character", help='Output file')
args <- p$parse_args(commandArgs(TRUE))

## START TEST
# io$basedir <- file.path(io$basedir,"test")
# args$sce <- file.path(io$basedir,"results/rna/pseudobulk/celltype/SingleCellExperiment_pseudobulk_with_replicates.rds")
# args$groupA <- "Epiblast"
# args$groupB <- "Erythroid2"
# args$outfile <- NULL
## END TEST

# recursive: nested output directories would otherwise fail silently (warnings are off)
dir.create(dirname(args$outfile), showWarnings = F, recursive = T)

#####################
## Define settings ##
#####################

# Define groups
opts$groups <- c(args$groupA,args$groupB)

# Self-comparisons are still requested by the snakemake pipeline: write a
# one-row placeholder file and exit successfully instead of failing
if (args$groupA==args$groupB) {
  out <- data.table(feature=NA, logFC=NA, padj_fdr=NA)
  fwrite(out, args$outfile, sep="\t", na="NA", quote=F)
  warning("groupA and groupB are the same, saving an empty file...")
  quit(status=0)
}

#########################
## Load RNA expression ##
#########################

# Load SingleCellExperiment object
sce <- readRDS(args$sce)

# temporary: derive the cell type from the column names (text before "_rep")
# when the colData has no "celltype" column
if (!"celltype" %in% colnames(colData(sce))) {
  sce$celltype <- colnames(sce) %>% strsplit("_rep") %>% map_chr(1)
}

sce <- sce[,sce$celltype %in% opts$groups]

# groupA is the first factor level, i.e. the reference level of the design matrix below
sce$celltype <- factor(sce$celltype, levels=opts$groups)
table(sce$celltype)

#########################################
## Calculate average expression levels ##
#########################################

expr.dt <- data.table(
  gene = rownames(sce),
  mean_groupA = rowMeans(logcounts(sce[,sce$celltype==args$groupA])) %>% round(2),
  mean_groupB = rowMeans(logcounts(sce[,sce$celltype==args$groupB])) %>% round(2)
)

#######################
## Feature selection ##
#######################

# Keep genes whose mean logcounts reach the threshold in at least one of the two groups
# NOTE(review): 2**4 = 16, so this presumably corresponds to ~16 normalised counts
# (assuming logcounts are log2-scaled); the previous comment said 8 - confirm.
opts$min.expr <- 4

genes.to.use <- expr.dt[mean_groupA>=opts$min.expr | mean_groupB>=opts$min.expr,gene]

################################################
## Differential expression testing with edgeR ##
################################################

# Convert SCE to DGEList
sce_edger <- scran::convertTo(sce[genes.to.use,], type="edgeR")

# Define design matrix (with intercept)
design <- model.matrix(~sce$celltype)

# Estimate dispersions
sce_edger <- estimateDisp(sce_edger,design)

# Fit quasi-likelihood GLM
fit <- glmQLFit(sce_edger,design)

# Quasi-likelihood F-test (no coefficient given, so edgeR's default is used)
lrt <- glmQLFTest(fit)

# Construct output data.frame
# ("LR" is only a placeholder name for the test statistic column, which is dropped right after)
# all.y=TRUE keeps the genes that were filtered out before testing, with NA statistics
out <- topTags(lrt, n=nrow(lrt))$table %>% as.data.table(keep.rownames=T) %>%
  setnames(c("gene","logFC","logCPM","LR","p.value","padj_fdr")) %>%
  .[,c("logCPM","LR","p.value"):=NULL] %>%
  .[,c("padj_fdr","logFC"):=list(signif(padj_fdr,digits=3), round(logFC,3))] %>%
  merge(expr.dt, by="gene", all.y=TRUE) %>%
  setorder(padj_fdr, na.last=T)

##################
## Save results ##
##################

fwrite(out, args$outfile, sep="\t", na="NA", quote=F)

--------------------------------------------------------------------------------
/rna/differential/pseudobulk/celltype/old/run_diff_expr_celltype_pseudobulk.R:
--------------------------------------------------------------------------------
here::i_am("rna/differential/metacells/celltype/run_diff_expr_celltype.R") 2 | 3 | # Load default settings 4 | source(here::here("settings.R")) 5 | 6 | ###################### 7 | ## Define arguments ## 8 | ###################### 9 | 10 | p <- ArgumentParser(description='') 11 | p$add_argument('--sce', type="character", help='SingleCellExperiment file') 12 | p$add_argument('--outdir', type="character", help='Output directory') 13 | p$add_argument('--test_mode', action="store_true", help='Test mode? subset data') 14 | 15 | args <- p$parse_args(commandArgs(TRUE)) 16 | 17 | ## START TEST ## 18 | # io$basedir <- file.path(io$basedir,"test") 19 | # args <- list() 20 | # args$sce <- file.path(io$basedir,"results/rna/pseudobulk/celltype/SingleCellExperiment_pseudobulk_with_replicates.rds") 21 | # args$outdir <- file.path(io$basedir,"results/rna/differential/pseudobulk/with_replicates/celltype") 22 | # args$test_mode <- TRUE 23 | ## END TEST ## 24 | 25 | ##################### 26 | ## Define settings ## 27 | ##################### 28 | 29 | io$script <- here::here("rna/differential/pseudobulk/with_replicates/celltype/differential_rna_celltype_pseudobulk.R") 30 | dir.create(args$outdir, showWarnings=FALSE, recursive=TRUE) 31 | 32 | ######### 33 | ## Run ## 34 | ######### 35 | 36 | if (args$test_mode) { 37 | print("Test mode activated, running only a few comparisons...") 38 | opts$celltypes <- opts$celltypes %>% head(n=3) 39 | } 40 | 41 | for (i in 1:length(opts$celltypes)) { 42 | for (j in i:length(opts$celltypes)) { 43 | if (i!=j) { 44 | groupA <- opts$celltypes[[i]] 45 | groupB <- opts$celltypes[[j]] 46 | 47 | outfile <- sprintf("%s/%s_vs_%s.txt.gz", args$outdir,groupA,groupB) 48 | 49 | # Define LSF command 50 | if (grepl("BI",Sys.info()['nodename'])) { 51 | lsf <- "" 52 | } else if (grepl("pebble|headstone", Sys.info()['nodename'])) { 53 | lsf <- sprintf("sbatch -n 1 --mem 7G --wrap") 54 | } 55 | cmd <- sprintf("%s 'Rscript %s --sce %s --groupA %s --groupB %s --outfile 
%s'", 56 | lsf, io$script, args$sce, groupA, groupB, outfile) 57 | # if (isTRUE(opts$test_mode)) cmd <- paste0(cmd, " --test_mode") 58 | 59 | # Run 60 | print(cmd) 61 | system(cmd) 62 | } 63 | } 64 | } 65 | 66 | 67 | # Completion token 68 | file.create(file.path(args$outdir,"completed.txt")) -------------------------------------------------------------------------------- /rna/differential/pseudobulk/celltype/parse_differential_results.R: -------------------------------------------------------------------------------- 1 | here::i_am("rna/differential/pseudobulk/celltype/parse_differential_results.R") 2 | 3 | # Load default settings 4 | source(here::here("settings.R")) 5 | # source(here::here("utils.R")) 6 | 7 | ###################### 8 | ## Define arguments ## 9 | ###################### 10 | 11 | p <- ArgumentParser(description='') 12 | p$add_argument('--diff_results_dir', type="character", help='File') 13 | p$add_argument('--outdir', type="character", help='File') 14 | args <- p$parse_args(commandArgs(TRUE)) 15 | 16 | ## START TEST ## 17 | # io$basedir <- file.path(io$basedir,"test") 18 | # args <- list() 19 | # args$diff_results_dir <- file.path(io$basedir,"results/rna/differential/pseudobulk/celltype") 20 | # args$outdir <- file.path(io$basedir,"results/rna/differential/pseudobulk/celltype") 21 | ## END TEST ## 22 | 23 | # I/O 24 | dir.create(args$outdir, showWarnings = F, recursive = T) 25 | 26 | ################################################ 27 | ## Load differential expression and fetch TFs ## 28 | ################################################ 29 | 30 | diff_results_list <- list() 31 | 32 | # i <- "Visceral_endoderm"; j <- "Surface_ectoderm" 33 | for (i in 1:length(opts$celltypes)) { 34 | for (j in i:length(opts$celltypes)) { 35 | 36 | if (i!=j) { 37 | file <- file.path(args$diff_results_dir,sprintf("%s_vs_%s.txt.gz",opts$celltypes[[i]],opts$celltypes[[j]])) 38 | if (file.exists(file)) { 39 | tmp <- fread(file) %>% 
.[,c("celltypeA","celltypeB"):=list(opts$celltypes[[i]],opts$celltypes[[j]])] 40 | diff_results_list[[sprintf("%s_vs_%s",opts$celltypes[[i]],opts$celltypes[[j]])]] <- tmp 41 | } else { 42 | print(sprintf("%s not found...",file)) 43 | } 44 | } 45 | } 46 | } 47 | 48 | ########## 49 | ## Save ## 50 | ########## 51 | 52 | fwrite(rbindlist(diff_results_list), file.path(args$outdir,"diff_expr_results.txt.gz"), sep="\t", quote=F, na="NA") 53 | 54 | -------------------------------------------------------------------------------- /rna/differential/pseudobulk/celltype_genotype/parse_differential_results.R: -------------------------------------------------------------------------------- 1 | here::i_am("rna/differential/pseudobulk/celltype_genotype/parse_differential_results.R") 2 | 3 | # Load default settings 4 | source(here::here("settings.R")) 5 | # source(here::here("utils.R")) 6 | 7 | ###################### 8 | ## Define arguments ## 9 | ###################### 10 | 11 | p <- ArgumentParser(description='') 12 | p$add_argument('--diff_results_dir', type="character", help='File') 13 | p$add_argument('--outfile', type="character", help='File') 14 | args <- p$parse_args(commandArgs(TRUE)) 15 | 16 | ## START TEST ## 17 | # io$basedir <- file.path(io$basedir,"test") 18 | # args <- list() 19 | # args$diff_results_dir <- file.path(io$basedir,"results/rna/differential/pseudobulk/celltype_genotype") 20 | # args$outfile <- file.path(io$basedir,"results/rna/differential/pseudobulk/celltype_genotype/parsed/diff_expr_results.txt.gz") 21 | ## END TEST ## 22 | 23 | # I/O 24 | dir.create(dirname(args$outfile), showWarnings = F, recursive = T) 25 | 26 | ########################################## 27 | ## Load differential expression results ## 28 | ########################################## 29 | 30 | diff_results_list <- list() 31 | 32 | # i <- "Visceral_endoderm"; j <- "Surface_ectoderm" 33 | for (i in 1:length(opts$celltypes)) { 34 | file <- 
file.path(args$diff_results_dir,sprintf("%s.txt.gz",opts$celltypes[[i]])) 35 | if (file.exists(file)) { 36 | tmp <- fread(file) %>% .[,celltype:=opts$celltypes[[i]]] 37 | if (nrow(tmp)>1) { 38 | diff_results_list[[opts$celltypes[[i]]]] <- tmp 39 | } 40 | } else { 41 | print(sprintf("%s not found...",file)) 42 | } 43 | } 44 | 45 | print(names(diff_results_list)) 46 | 47 | ########## 48 | ## Save ## 49 | ########## 50 | 51 | fwrite(rbindlist(diff_results_list), args$outfile, sep="\t", quote=F, na="NA") 52 | 53 | -------------------------------------------------------------------------------- /rna/differential/utils.R: -------------------------------------------------------------------------------- 1 | 2 | # Function to differential expression 3 | # - sce: SingleCellExperiment object with the column "group" in the colData 4 | # - groups: the names of the two groups 5 | # - min_detection_rate_per_group: minimum detection rate per group 6 | doDiffExpr <- function(sce, groups, min_detection_rate_per_group = 0.50) { 7 | 8 | # Sanity checks 9 | if (!is(sce, "SingleCellExperiment")) stop("'sce' has to be an instance of SingleCellExperiment") 10 | stopifnot(length(groups)==2) 11 | 12 | # Filter genes by detection rate per group 13 | cdr_A <- rowMeans(logcounts(sce[,sce$group==groups[1]])>0) >= min_detection_rate_per_group 14 | cdr_B <- rowMeans(logcounts(sce[,sce$group==groups[2]])>0) >= min_detection_rate_per_group 15 | out <- .edgeR(sce[cdr_B | cdr_A,]) 16 | 17 | return(out) 18 | } 19 | 20 | 21 | .edgeR <- function(sce) { 22 | 23 | # Convert SCE to DGEList 24 | sce_edger <- scran::convertTo(sce, type="edgeR") 25 | 26 | # Define design matrix (with intercept) 27 | cdr <- colMeans(logcounts(sce)>0) 28 | design <- model.matrix(~cdr+sce$group) 29 | 30 | # Estimate dispersions 31 | sce_edger <- estimateDisp(sce_edger,design) 32 | 33 | # Fit GLM 34 | fit <- glmQLFit(sce_edger,design) 35 | 36 | # Likelihood ratio test 37 | lrt <- glmQLFTest(fit) 38 | 39 | # Construct output 
data.frame 40 | out <- topTags(lrt, n=nrow(lrt))$table %>% as.data.table(keep.rownames=T) %>% 41 | setnames(c("gene","logFC","logCPM","LR","p.value","padj_fdr")) %>% 42 | .[,c("logCPM","LR","p.value"):=NULL] 43 | 44 | return(out) 45 | } 46 | 47 | ################ 48 | ## Plot utils ## 49 | ################ 50 | 51 | 52 | gg_volcano_plot <- function(to.plot, top_genes=10, xlim=NULL, ylim=NULL, label_groups = NULL) { 53 | 54 | negative_hits <- to.plot[sig==TRUE & logFC<0,gene] 55 | positive_hits <- to.plot[sig==TRUE & logFC>0,gene] 56 | all <- nrow(to.plot) 57 | 58 | # if (is.null(xlim)) 59 | # xlim <- max(abs(to.plot$logFC), na.rm=T) 60 | # if (is.null(ylim)) 61 | # ylim <- max(-log10(to.plot$padj_fdr+1e-100), na.rm=T) 62 | 63 | to.plot <- to.plot[!is.na(logFC) & !is.na(padj_fdr)] 64 | 65 | p <- ggplot(to.plot, aes(x=logFC, y=-log10(padj_fdr+1e-100))) + 66 | labs(x="Log fold change", y=expression(paste("-log"[10],"(q.value)"))) + 67 | ggrastr::geom_point_rast(aes(color=sig, size=sig)) + 68 | # geom_hline(yintercept = -log10(opts$threshold_fdr), color="blue") + 69 | geom_segment(aes(x=0, xend=0, y=0, yend=105), color="orange", size=0.5) + 70 | scale_color_manual(values=c("black","red")) + 71 | scale_size_manual(values=c(0.5,1)) + 72 | scale_x_continuous(limits=c(-6,6)) + 73 | scale_y_continuous(limits=c(0,115)) + 74 | annotate("text", x=0, y=115, size=4, label=sprintf("(%d)", all)) + 75 | annotate("text", x=-5, y=115, size=4, label=sprintf("%d (-)",length(negative_hits))) + 76 | annotate("text", x=5, y=115, size=4, label=sprintf("%d (+)",length(positive_hits))) + 77 | ggrepel::geom_text_repel(data=head(to.plot[sig==T],n=top_genes), aes(x=logFC, y=-log10(padj_fdr+1e-100), label=gene), max.overlaps=Inf, size=4) + 78 | theme_classic() + 79 | theme( 80 | axis.text = element_text(size=rel(0.75), color='black'), 81 | axis.title = element_text(size=rel(1.0), color='black'), 82 | legend.position="none" 83 | ) 84 | 85 | 86 | if (length(label_groups)>0) { 87 | p <- p + 88 | 
annotate("text", x=-4, y=0, size=4, label=sprintf("Up in %s",label_groups[2])) + 89 | annotate("text", x=4, y=0, size=4, label=sprintf("Up in %s",label_groups[1])) 90 | } 91 | 92 | return(p) 93 | } 94 | 95 | -------------------------------------------------------------------------------- /rna/mapping/analysis/plot_utils.R: -------------------------------------------------------------------------------- 1 | 2 | plot.dimred <- function(plot_df, query.label, atlas.label = "Atlas") { 3 | 4 | # Define dot size 5 | size.values <- c(opts$size.mapped, opts$size.nomapped) 6 | names(size.values) <- c(query.label, atlas.label) 7 | 8 | # Define dot alpha 9 | alpha.values <- c(opts$alpha.mapped, opts$alpha.nomapped) 10 | names(alpha.values) <- c(query.label, atlas.label) 11 | 12 | # Define dot colours 13 | colour.values <- c("red", "lightgrey") 14 | names(colour.values) <- c(query.label, atlas.label) 15 | 16 | # Plot 17 | ggplot(plot_df, aes(x=V1, y=V2)) + 18 | ggrastr::geom_point_rast(aes(size=mapped, alpha=mapped, colour=mapped)) + 19 | scale_size_manual(values = size.values) + 20 | scale_alpha_manual(values = alpha.values) + 21 | scale_colour_manual(values = colour.values) + 22 | # labs(x="UMAP Dimension 1", y="UMAP Dimension 2") + 23 | guides(colour = guide_legend(override.aes = list(size=6))) + 24 | theme_classic() + 25 | theme( 26 | legend.position = "top", 27 | legend.title = element_blank(), 28 | axis.line = element_blank(), 29 | axis.text = element_blank(), 30 | axis.title = element_blank(), 31 | axis.ticks = element_blank() 32 | ) 33 | } 34 | 35 | plot.dimred.wtko <- function(plot_df, wt.label = "WT", ko.label = "KO", nomapped.label = "-") { 36 | 37 | # Define dot size 38 | size.values <- c(opts$size.mapped, opts$size.mapped, opts$size.nomapped) 39 | names(size.values) <- c(wt.label, ko.label, nomapped.label) 40 | 41 | # Define dot alpha 42 | alpha.values <- c(opts$alpha.mapped, opts$alpha.mapped, opts$alpha.nomapped) 43 | names(alpha.values) <- c(wt.label, ko.label, 
nomapped.label) 44 | 45 | # Define dot colours 46 | colour.values <- c("red", "blue", "lightgrey") 47 | names(colour.values) <- c(wt.label, ko.label, nomapped.label) 48 | 49 | # Plot 50 | ggplot(plot_df, aes(x=V1, y=V2)) + 51 | ggrastr::geom_point_rast(aes(size=mapped, alpha=mapped, colour=mapped)) + 52 | scale_size_manual(values = size.values) + 53 | scale_alpha_manual(values = alpha.values) + 54 | scale_colour_manual(values = colour.values) + 55 | guides(colour = guide_legend(override.aes = list(size=6))) + 56 | theme_classic() + 57 | theme( 58 | legend.position = "top", 59 | legend.title = element_blank(), 60 | axis.text = element_blank(), 61 | axis.title = element_blank(), 62 | axis.ticks = element_blank() 63 | ) 64 | } 65 | -------------------------------------------------------------------------------- /rna/mapping/run/parse_sample_metadata_after_mapping.R: -------------------------------------------------------------------------------- 1 | here::i_am("rna/mapping/run/parse_sample_metadata_after_mapping.R") 2 | 3 | source(here::here("settings.R")) 4 | 5 | ###################### 6 | ## Define arguments ## 7 | ###################### 8 | 9 | p <- ArgumentParser(description='') 10 | # p$add_argument('--query_samples', type="character", nargs='+', help='Query samples') 11 | p$add_argument('--metadata', type="character", help='Metadata file to use as input') 12 | # p$add_argument('--mapping_seurat', type="character", nargs="+", help='Results of the Seurat mapping') 13 | p$add_argument('--mapping_mnn', type="character", nargs="+", help='Results of the MNN mapping') 14 | p$add_argument('--outfile', type="character", help='Output file') 15 | args <- p$parse_args(commandArgs(TRUE)) 16 | 17 | ################### 18 | ## Load settings ## 19 | ################### 20 | 21 | 22 | ## START TEST ## 23 | # args$query_samples <- opts$samples 24 | # args$metadata <- file.path(io$basedir,"results/rna/qc/sample_metadata_after_qc.txt.gz") 25 | # # args$mapping_dir <- 
# file.path(io$basedir,"results/rna/mapping")
# args$mapping_mnn <- file.path(io$basedir,"results/rna/mapping(..)")
# args$mapping_seurat <- file.path(io$basedir,"results/rna/mapping/(..)")
# args$outfile <- file.path(io$basedir,"results/rna/mapping/sample_metadata_after_mapping.txt.gz")
## END TEST ##


###################
## Load metadata ##
###################

sample_metadata <- fread(args$metadata)

##########################
## Load mapping results ##
##########################

# MNN
# One file per --mapping_mnn argument; the tables are stacked row-wise
mapping_mnn.dt <- args$mapping_mnn %>% map(~ fread(.)) %>% rbindlist
# Every mapped cell must be present in the input metadata
stopifnot(mapping_mnn.dt$cell%in%sample_metadata$cell)

# Seurat
# mapping_seurat.dt <- args$mapping_seurat %>% map(~ fread(.)) %>% rbindlist
# stopifnot(mapping_seurat.dt$cell%in%sample_metadata$cell)

###########
## Merge ##
###########

# mapping.dt <- merge(mapping_mnn.dt, mapping_seurat.dt, by="cell", suffixes=c("_mnn","_seurat"))
# to.save <- sample_metadata %>% merge(mapping.dt,by="cell",all.x=TRUE)

# Left join: cells without a mapping result are kept, with NA in the mapping columns
to.save <- sample_metadata %>% merge(mapping_mnn.dt, by="cell", all.x=TRUE)

# .[,celltype_genotype:=sprintf("%s-%s",celltype,genotype)] %>%

#################
## Save output ##
#################

fwrite(to.save, args$outfile, sep="\t", na="NA", quote=F)

######################
## Compare mappings ##
######################

# mapping_mnn.dt <- readRDS(sprintf("%s/mapping_mnn_%s.rds",io$mapping.dir,paste(opts$samples,collapse="-")))$mapping %>% .[,c("cell","celltype.mapped","celltype.score","closest.cell")] %>% as.data.table
# mapping_seurat.dt <- fread(sprintf("%s/mapping_seurat_%s.txt.gz",io$mapping.dir,paste(opts$samples,collapse="-"))) %>% .[,c("predicted.id")] %>% as.data.table
#
# foo <- merge(
#   mapping_mnn.dt[,c("cell","celltype.mapped")] %>%
#   setnames("celltype.mapped","celltype_mnn"),
#   mapping_seurat.dt[,c("cell","predicted.id")] %>% setnames("predicted.id","celltype_seurat"),
#   by = c("cell")
# )

--------------------------------------------------------------------------------
/rna/mapping/trajectories/parse_sample_metadata_after_mapping.R:
--------------------------------------------------------------------------------
# Merge the trajectory-specific MNN mapping results into the cell metadata.

# The path must be relative to the project root (note the leading "rna/"),
# otherwise here::here("settings.R") below does not resolve to the root settings file
here::i_am("rna/mapping/trajectories/parse_sample_metadata_after_mapping.R")

source(here::here("settings.R"))

######################
## Define arguments ##
######################

p <- ArgumentParser(description='')
p$add_argument('--metadata', type="character", help='Metadata file to use as input')
# p$add_argument('--mapping_seurat', type="character", nargs="+", help='Results of the Seurat mapping')
p$add_argument('--mapping_mnn', type="character", nargs="+", help='Results of the MNN mapping')
p$add_argument('--outfile', type="character", help='Output file')
args <- p$parse_args(commandArgs(TRUE))

###################
## Load settings ##
###################

## START TEST ##
# args$metadata <- file.path(io$basedir,"results/mapping/sample_metadata_after_mapping.txt.gz")
# args$mapping_mnn <- file.path(io$basedir,sprintf("results/mapping/trajectories/NMP/mapping_mnn_%s.txt.gz",opts$samples))
# args$outfile <- file.path(io$basedir,"results/mapping/trajectories/NMP/sample_metadata_after_mapping.txt.gz")
## END TEST ##

# All mapping files must exist (stopifnot checks every element of the vector)
stopifnot(file.exists(args$mapping_mnn))

###################
## Load metadata ##
###################

# The mapping stored in the input metadata is kept as "global_mapping", so that it
# does not clash with the celltype.mapped column of the trajectory mapping results
sample_metadata <- fread(args$metadata) %>%
  .[,c("cell","sample","class","alias","celltype.mapped")] %>%
  setnames("celltype.mapped","global_mapping")

##########################
## Load mapping results ##
##########################

mapping_mnn.dt <- args$mapping_mnn %>% map(~ fread(.)) %>% rbindlist
stopifnot(mapping_mnn.dt$cell%in%sample_metadata$cell)

###########
## Merge ##
###########

# Inner join: only cells with a trajectory mapping result are kept
to.save <- sample_metadata %>%
  merge(mapping_mnn.dt, by=c("cell","sample","class"))

#################
## Save output ##
#################

fwrite(to.save, args$outfile, sep="\t", na="NA", quote=F)

--------------------------------------------------------------------------------
/rna/mapping/trajectories/plot_mapping_trajectory_wt_vs_ko.R:
--------------------------------------------------------------------------------
# Highlight, on the atlas trajectory embedding, the atlas cells that are the
# closest match of WT and T_KO query cells.

# here::i_am() must point to this very file
here::i_am("rna/mapping/trajectories/plot_mapping_trajectory_wt_vs_ko.R")

source(here::here("settings.R"))
source(here::here("rna/mapping/analysis/plot_utils.R"))

######################
## Define arguments ##
######################

p <- ArgumentParser(description='')
p$add_argument('--query_metadata', type="character", help='Cell metadata (after mapping)')
p$add_argument('--atlas_metadata', type="character", help='Atlas trajectory coordinates (three columns: cell and the two dimensions)')
p$add_argument('--outdir', type="character", help='Output directory')

args <- p$parse_args(commandArgs(TRUE))

## START TEST ##
# Development defaults. They are only used for arguments that were NOT given on the
# command line, so that user-supplied values are no longer silently overwritten.
if (is.null(args$query_metadata)) {
  args$query_metadata <- file.path(io$basedir,"results/rna/mapping/trajectories/nmp_somitic_spinal/sample_metadata_after_mapping.txt.gz")
}
if (is.null(args$atlas_metadata)) {
  args$atlas_metadata <- file.path(io$atlas.basedir,"results/trajectories/nmp_somitic_spinal/nmp_trajectory.txt.gz")
}
if (is.null(args$outdir)) {
  args$outdir <- file.path(io$basedir,"results/rna/mapping/trajectories/nmp_somitic_spinal/pdf")
}
## END TEST ##

dir.create(args$outdir, showWarnings = F, recursive = T)

#####################
## Define settings ##
#####################

# Options

# Dot size
opts$size.mapped <- 1.20
opts$size.nomapped <- 0.1

# Transparency
opts$alpha.mapped <- 0.80
opts$alpha.nomapped <- 0.35

#########################
## Load query metadata ##
#########################

sample_metadata <- fread(args$query_metadata)

# Check the required columns BEFORE they are used, to get a clear error message
stopifnot(c("closest.cell","genotype")%in%colnames(sample_metadata))

# Keep only the query cells that were mapped to an atlas cell
sample_metadata <- sample_metadata[!is.na(closest.cell)]

###########################
## Load atlas trajectory ##
###########################

meta_atlas <- fread(args$atlas_metadata) %>%
  setnames(c("cell","V1","V2"))

##########
## Plot ##
##########

# i <- "E7.5"
# Encoding: WT match = -10, T_KO match = +10, no match = 0. The sum of both codes
# identifies the group of every atlas cell.
# ifelse() is used instead of indexing by as.numeric(as.factor(.)): the latter gives
# wrong codes when the logical vector has a single level (e.g. all atlas cells matched).
# NOTE(review): an atlas cell that is the closest cell of BOTH a WT and a T_KO query
# cell sums to 0 and is therefore drawn as "Atlas" - confirm that this is intended.
to.plot <- meta_atlas %>% copy %>%
  .[,index.wt:=match(cell, sample_metadata[genotype=="WT",closest.cell] )] %>%
  .[,index.ko:=match(cell, sample_metadata[genotype=="T_KO",closest.cell] )] %>%
  .[,mapped.wt:=ifelse(is.na(index.wt), 0, -10)] %>%
  .[,mapped.ko:=ifelse(is.na(index.ko), 0, 10)] %>%
  .[,mapped:=factor(mapped.wt + mapped.ko, levels=c("0","-10","10"))] %>%
  .[,mapped:=plyr::mapvalues(mapped, from = c("0","-10","10"), to = c("Atlas","WT","T_KO"))] %>% setorder(mapped)

p <- plot.dimred.wtko(to.plot, wt.label = "WT", ko.label = "T_KO", nomapped.label = "Atlas") +
  theme(legend.position = "none", axis.line = element_blank())

pdf(file.path(args$outdir,"umap_mapped_trajectory_WT_and_KO.pdf"), width=4.5, height=5)
print(p)
dev.off()

--------------------------------------------------------------------------------
/rna/metacells/SEACell_env.yml:
--------------------------------------------------------------------------------
# https://github.com/dpeerlab/SEACells

channels:
  - conda-forge
  - bioconda
dependencies:
  - scanpy=1.8.2
  - loompy=3.0.6
  - jupyter
  - python-igraph
  # louvain used to be listed twice; only the constrained entry is kept
  - louvain>=0.6,!=0.6.2
  - leidenalg
  # - harmonypy
  # - scanorama
  - seaborn
  - cython
  - pyranges
  - pip
  - pip:
    - dfply
    # - palantir
    # - PhenoGraph


# commands:
# conda
#   create -n metacells python==3.9 --yes
# conda activate metacells
# conda install mamba --yes
# mamba env update -n metacells --file SEACell_env.yml
# pip install git+https://github.com/settylab/Palantir@removeTSNE
# python setup.py develop # inside SEACells


# mamba remove -n metacells --all
--------------------------------------------------------------------------------
/rna/metacells/analysis/overlay_metacells_atlas_umap.R:
--------------------------------------------------------------------------------
# Highlight, on the precomputed atlas UMAP, the atlas cells that are the closest
# match of the metacells.

# here::i_am("rna/metacells/analysis/overlay_metacells_atlas_umap.R")

source(here::here("settings.R"))
source(here::here("utils.R"))

source(here::here("rna/mapping/analysis/plot_utils.R"))

#####################
## Define settings ##
#####################

## I/O
io$metacell_metadata <- file.path(io$basedir,"results/rna/metacells/metacells_metadata.txt.gz")
io$metacell_sce <- file.path(io$basedir,"results/rna/metacells/SingleCellExperiment_metacells.rds")
# io$umap <- file.path(io$basedir,"results/rna/dimensionality_reduction/sce/batch_correction_by_sample_remove_ExE_cells_False/umap_features2500_pcs50_neigh25_dist0.5.txt.gz")
# recursive = T: with showWarnings = F a missing parent directory would otherwise make
# dir.create() fail silently, and the error would only show up when opening the pdf device
io$outdir <- file.path(io$basedir,"results/rna/metacells/pdf"); dir.create(io$outdir, showWarnings = F, recursive = T)

# Dot size
opts$size.mapped <- 0.30
opts$size.nomapped <- 0.1

# Dot transparency
opts$alpha.mapped <- 0.75
opts$alpha.nomapped <- 0.35

###################
## Load metadata ##
###################

metacell_metadata.dt <- fread(io$metacell_metadata)
# sample_metadata.dt <- fread(io$metadata)

###############################
## Load SingleCellExperiment ##
###############################

sce <- readRDS(io$metacell_sce)

###########################
## Load precomputed UMAP ##
###########################

# umap.dt <-
# fread(io$umap, select=c(3,1,2)) %>% setnames(c("cell","V1","V2"))

# Atlas UMAP coordinates, excluding stripped nuclei and doublets
umap.dt <- fread(io$rna.atlas.metadata) %>%
  .[stripped==F & doublet==F] %>%
  .[,c("cell","umapX","umapY","celltype")] %>%
  setnames(c("umapX","umapY"),c("V1","V2"))

#########################################################
## Plot dimensionality reduction: one sample at a time ##
#########################################################

# An atlas cell is labelled "Metacell" when it is the closest cell of at least one metacell.
# Sorting by `mapped` draws the highlighted cells last, i.e. on top of the others
to.plot <- umap.dt %>% copy %>%
  .[,index:=match(cell, metacell_metadata.dt$closest.cell)] %>%
  .[,mapped:=as.factor(!is.na(index))] %>%
  .[,mapped:=plyr::mapvalues(mapped, from = c("FALSE","TRUE"), to = c("Atlas","Metacell"))] %>%
  setorder(mapped)

p <- plot.dimred(to.plot, query.label = "Metacell", atlas.label = "Atlas")

pdf(file.path(io$outdir,"umap_metacell.pdf"), width=8, height=6.5)
print(p)
dev.off()

--------------------------------------------------------------------------------
/rna/metacells/analysis/trajectories/overlay_metacells_atlas_trajectory.R:
--------------------------------------------------------------------------------
# Highlight the metacells on top of the trajectory embedding.

# here::i_am("rna/metacells/analysis/trajectories/overlay_metacells_atlas_trajectory.R")

source(here::here("settings.R"))
source(here::here("utils.R"))

source(here::here("rna/mapping/analysis/plot_utils.R"))

#####################
## Define settings ##
#####################

## I/O
io$metacell_metadata <- file.path(io$basedir,"results/rna/metacells/trajectories/nmp/metacells_metadata.txt.gz")
io$metacell_sce <- file.path(io$basedir,"results/rna/metacells/trajectories/nmp/SingleCellExperiment_metacells.rds")
io$trajectory <- file.path(io$basedir,"results/rna/trajectories/nmp/nmp_trajectory.txt.gz")
io$atlas_trajectory <- file.path(io$atlas.basedir,"results/trajectories/nmp_somitic_spinal/nmp_trajectory.txt.gz")
# recursive = T: otherwise a missing parent directory makes dir.create() fail silently
io$outdir <- file.path(io$basedir,"results/rna/metacells/trajectories/nmp/pdf"); dir.create(io$outdir, showWarnings = F, recursive = T)

# Dot size
opts$size.mapped <- 1
opts$size.nomapped <- 0.1

# Dot transparency
opts$alpha.mapped <- 0.85
opts$alpha.nomapped <- 0.35

###################
## Load metadata ##
###################

metacell_metadata.dt <- fread(io$metacell_metadata)
# sample_metadata.dt <- fread(io$metadata)

###############################
## Load SingleCellExperiment ##
###############################

sce <- readRDS(io$metacell_sce)

#####################
## Load trajectory ##
#####################

# trajectory.dt <- fread(io$atlas_trajectory) %>% setnames(c("cell","V1","V2"))
trajectory.dt <- fread(io$trajectory) %>% setnames(c("cell","V1","V2"))

#################################################
## Plot mapping of metacells to the trajectory ##
#################################################

# Cells of the trajectory whose identifier is listed in the `metacell` column are highlighted
to.plot <- trajectory.dt %>% copy %>%
  # .[,index:=match(cell, metacell_metadata.dt$closest.cell)] %>%
  .[,index:=match(cell, metacell_metadata.dt$metacell)] %>%
  .[,mapped:=as.factor(!is.na(index))] %>%
  .[,mapped:=plyr::mapvalues(mapped, from = c("FALSE","TRUE"), to = c("Atlas","Metacell"))] %>%
  setorder(mapped)

p <- plot.dimred(to.plot, query.label = "Metacell", atlas.label = "Atlas")

pdf(file.path(io$outdir,"trajectory_highlight_metacells.pdf"), width=8, height=6.5)
print(p)
dev.off()

--------------------------------------------------------------------------------
/rna/plot_individual_genes/pseudobulk/plot_paga_individual_genes_pseudobulk.R:
--------------------------------------------------------------------------------
# Colour the nodes of the PAGA graph by the pseudobulk RNA expression of
# individual genes. One png file is written per gene.

#####################
## Define settings ##
#####################

if (grepl("ricard",Sys.info()['nodename'])) {
  source("/Users/ricard/gastrulation_multiome_10x/settings.R")
  source("/Users/ricard/gastrulation_multiome_10x/utils.R")
} else if (grepl("ebi",Sys.info()['nodename'])) {
  source("/homes/ricard/gastrulation_multiome_10x/settings.R")
  source("/homes/ricard/gastrulation_multiome_10x/utils.R")
} else {
  # Same behaviour as the PAGA loading step below: fail here rather than with an
  # unrelated "object 'io' not found" error further down
  stop("Computer not recognised")
}

# I/O
io$outdir <- paste0(io$basedir,"/results/rna/individual_genes/pseudobulk"); dir.create(io$outdir, showWarnings = F, recursive = T)

# Options
opts$celltypes = c(
  "Epiblast",
  "Primitive_Streak",
  "Caudal_epiblast",
  "PGC",
  "Anterior_Primitive_Streak",
  "Notochord",
  "Def._endoderm",
  "Gut",
  "Nascent_mesoderm",
  "Mixed_mesoderm",
  "Intermediate_mesoderm",
  "Caudal_Mesoderm",
  "Paraxial_mesoderm",
  "Somitic_mesoderm",
  "Pharyngeal_mesoderm",
  "Cardiomyocytes",
  "Allantois",
  "ExE_mesoderm",
  "Mesenchyme",
  "Haematoendothelial_progenitors",
  "Endothelium",
  "Blood_progenitors_1",
  "Blood_progenitors_2",
  "Erythroid1",
  "Erythroid2",
  "Erythroid3",
  "NMP",
  "Rostral_neurectoderm",
  "Caudal_neurectoderm",
  "Neural_crest",
  "Forebrain_Midbrain_Hindbrain",
  "Spinal_cord",
  "Surface_ectoderm",
  "Visceral_endoderm",
  "ExE_endoderm",
  "Parietal_endoderm",
  "ExE_ectoderm"
)

###############
## Load data ##
###############

# Load SingleCellExperiment object
# Columns are selected (and ordered) by cell type name
rna.sce <- readRDS(io$rna.pseudobulk.sce)[,opts$celltypes]

# Load gene metadata
gene_metadata <- fread(io$gene_metadata) %>%
  .[symbol%in%rownames(rna.sce)]

###############
## Load PAGA ##
###############

if (grepl("ricard",Sys.info()['nodename'])) {
  source("/Users/ricard/gastrulation_multiome_10x/load_paga_graph.R")
} else if (grepl("ebi",Sys.info()['nodename'])) {
  source("/homes/ricard/gastrulation_multiome_10x/load_paga_graph.R")
} else {
  stop("Computer not recognised")
}

# Plot graph structure
# Only the edges are drawn here (node.size = 0); the nodes are added per gene below
p <- ggnet2(
  net = net.paga,
  mode = c("x", "y"),
  node.size = 0,
  edge.size = 0.15,
  edge.color = "grey",
  label = FALSE,
  label.size = 2.3
)


##########
## Plot ##
##########

# Define color scale
# Eleven colour bins, for expression values 0, 0.1, ..., 1
rna.col.seq <- chromvar.col.seq <- round(seq(0,1,0.1), 2)
rna.colors <- colorRampPalette(c("gray92", "darkgreen"))(length(rna.col.seq))

# Define genes to plot
# genes.to.plot <- rownames(rna.sce)[grep("Gata",rownames(rna.sce))]
genes.to.plot <- c("Foxa2","Tfap2a","Mesp1")

for (i in seq_along(genes.to.plot)) {
  gene <- genes.to.plot[i]
  print(sprintf("%s/%s: %s",i,length(genes.to.plot),gene))

  # Skip (with a warning) genes that are not in the expression matrix instead of aborting the loop
  if (!gene%in%rownames(rna.sce)) {
    warning(sprintf("%s not found in the SingleCellExperiment, skipping...",gene))
    next
  }

  expr.values <- logcounts(rna.sce[gene,])[1,] %>% minmax.normalisation()
  # match() keeps one colour per node even when a value has no bin (the colour is then NA),
  # whereas which() silently dropped such values and left the vector shorter than the number of nodes
  expr.colors <- setNames(rna.colors[match(round(expr.values,1), rna.col.seq)], names(expr.values))

  p.rna <- p + geom_text(label = "\u25D0", aes(x=x, y=y), color=expr.colors, size=20, family = "Arial Unicode MS",
      data = p$data[,c("x","y")] %>% dplyr::mutate(expr=expr.colors)) +
    scale_colour_manual(values=expr.colors) +
    labs(title=gene) +
    theme(
      plot.title = element_text(hjust = 0.5)
    )


  png(sprintf("%s/%s_rna_expression_paga.png",io$outdir,gene), width = 350, height = 400)
  # pdf(sprintf("%s/%s_rna_expression_paga.pdf",io$outdir,i), width=5, height=3.5)
  print(p.rna)
  dev.off()
}



--------------------------------------------------------------------------------
/rna/processing/3_seurat_to_SCE.R:
--------------------------------------------------------------------------------
here::i_am("rna/processing/3_seurat_to_SCE.R")

# Load default settings
source(here::here("settings.R"))

suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(scater))
suppressPackageStartupMessages(library(scran))

######################
## Define arguments ##
######################

p <- ArgumentParser(description='')
p$add_argument('--test', action="store_true", help='Testing mode')
p$add_argument('--normalise', action="store_true", help='Log-Normalise?')
p$add_argument('--samples', type="character", nargs="+", help='Samples')
p$add_argument('--seurat', type="character", help='Seurat object (input)')
p$add_argument('--metadata', type="character", help='Metadata file')
p$add_argument('--outfile', type="character", help='Output file')
args <- p$parse_args(commandArgs(TRUE))

#####################
## Define settings ##
#####################

## START TEST ##
# args <- list()
# args$outfile <- io$rna.sce
# args$outfile <- paste0(io$basedir,"/processed/rna_new/SingleCellExperiment.rds")
# args$samples <- opts$samples
# args$metadata <- paste0(io$basedir,"/results/rna_new/qc/sample_metadata_after_qc.txt.gz")
# args$seurat <- paste0(io$basedir,"/processed/rna_new/seurat.rds")
# args$test <- FALSE
# args$normalise <- FALSE
## END TEST ##

# Sanity checks
# Every requested sample must be one of the samples defined in opts$samples
stopifnot(args$samples%in%opts$samples)
# Testing mode: only the first two samples are processed
if (args$test) args$samples <- head(args$samples,n=2)

###############
## Load data ##
###############

# Load sample metadata
# Only cells that pass RNA QC and belong to the requested samples
sample_metadata <- fread(args$metadata) %>% .[pass_rnaQC==TRUE & sample%in%args$samples]
table(sample_metadata$sample)

# Load seurat
# The object is subset to the cells selected above
seurat <- readRDS(args$seurat)[,sample_metadata$cell]

#####################################
## Convert to SingleCellExperiment ##
#####################################

sce <- as.SingleCellExperiment(seurat)

# remove logcounts assays
sce@assays@data[["logcounts"]] <- NULL

# Add metadata
#
# stopifnot(sample_metadata$cell%in%colnames(sce))
# stopifnot(colnames(sce)%in%sample_metadata$cell)
# Reorder the metadata rows to follow the column order of the SingleCellExperiment
sample_metadata <- sample_metadata %>% .[cell%in%colnames(sce)] %>% setkey(cell) %>% .[colnames(sce)]
stopifnot(sample_metadata$cell == colnames(sce))
colData(sce) <- sample_metadata %>% as.data.frame %>% tibble::column_to_rownames("cell") %>%
  .[colnames(sce),] %>% DataFrame()

##########################
## Compute size factors ##
##########################

# NOTE(review): `mcparam` is not defined in this script; presumably it comes from settings.R - confirm
clusts <- as.numeric(quickCluster(sce, method = "igraph", min.size = 100, BPPARAM = mcparam))
# clusts <- as.numeric(quickCluster(sce))
# Pool sizes are derived from (half) the size of the smallest cluster
min.clust <- min(table(clusts))/2
new_sizes <- c(floor(min.clust/3), floor(min.clust/2), floor(min.clust))
sce <- computeSumFactors(sce, clusters = clusts, sizes = new_sizes, max.cluster.size = 3000)

###################
## Log Normalise ##
###################

# Optional (--normalise flag)
if (args$normalise) {
  sce <- logNormCounts(sce)
}

##########
## Plot ##
##########

# to.plot <- data.frame(X = Matrix::colSums(counts(sce)), Y = sizeFactors(sce))
# ggplot(to.plot, mapping = aes(x = X, y = Y)) +
#   geom_point() +
#   labs(x = "Number of UMIs", y = "Size Factor") +
#   theme_classic()

##########
## Save ##
##########

saveRDS(sce, args$outfile)

--------------------------------------------------------------------------------
/rna/processing/4_doublet_detection.R:
--------------------------------------------------------------------------------
# Compute per-cell doublet scores and call doublets using a score threshold.

here::i_am("rna/processing/4_doublet_detection.R")

source(here::here("settings.R"))
source(here::here("utils.R"))

suppressPackageStartupMessages(library(SingleCellExperiment))
suppressPackageStartupMessages(library(scds))
suppressPackageStartupMessages(library(scran))
suppressPackageStartupMessages(library(scater))

######################
## Define arguments ##
######################

p <- ArgumentParser(description='')
p$add_argument('--sce', type="character", help='SingleCellExperiment file')
p$add_argument('--metadata', type="character", help='metadata file')
p$add_argument('--samples', type="character", nargs='+', help='Sample(s)')
p$add_argument('--doublet_score_threshold', type="double", default=1.25, help='Doublet score threshold')
p$add_argument('--test', action = "store_true", help='Testing mode')
p$add_argument('--outfile', type="character", help='Output file')
args <- p$parse_args(commandArgs(TRUE))

## START TEST ##
# args <- list()
# args$sce <- io$rna.sce
# args$metadata <- paste0(io$basedir,"/results/rna/mapping/sample_metadata_after_mapping.txt.gz") # .io$metadata
# args$samples <- c("E8.5_rep2")
# args$doublet_score_threshold <- 1.0
# args$test <- TRUE
## END TEST ##

# Parse arguments
# recursive = T creates missing parent directories; showWarnings = F avoids a spurious
# warning when the directory already exists (e.g. when running one job per sample)
dir.create(dirname(args$outfile), showWarnings = F, recursive = T)
if (isTRUE(args$test)) print("Test mode activated...")

##########################
## Load sample metadata ##
##########################

sample_metadata <- fread(args$metadata) %>%
  .[pass_rnaQC==TRUE & sample%in%args$samples]
table(sample_metadata$sample)

###############
## Load data ##
###############

# Load SingleCellExperiment object
sce <- load_SingleCellExperiment(args$sce, cells=sample_metadata$cell, normalise = TRUE)
dim(sce)

# Add sample metadata as colData
# The metadata rows are explicitly aligned to the columns of the object (as done in
# 3_seurat_to_SCE.R), so that the result does not depend on the order in which
# load_SingleCellExperiment() returns the cells
stopifnot(colnames(sce)%in%sample_metadata$cell)
colData(sce) <- sample_metadata %>% as.data.frame %>% tibble::column_to_rownames("cell") %>%
  .[colnames(sce),,drop=FALSE] %>% DataFrame

#############################
## Calculate doublet score ##
#############################

sce <- cxds_bcds_hybrid(sce, estNdbl=TRUE)

# Extract the three scores (rounded to two decimals) as a long table, one row per cell
dt <- colData(sce) %>%
  .[,c("sample","cxds_score", "bcds_score", "hybrid_score")] %>%
  as.data.frame %>% tibble::rownames_to_column("cell") %>% as.data.table %>%
  .[,c("cxds_score","bcds_score","hybrid_score"):=list(round(cxds_score,2),round(bcds_score,2),round(hybrid_score,2))]

# Call doublets
# A cell is called a doublet when its hybrid score is above the threshold
dt[,doublet_call:=hybrid_score>args$doublet_score_threshold]
table(dt$doublet_call)

# Save
# io$outfile <- sprintf("%s/doublets_%s_%s.txt.gz",args$outdir, paste(args$samples,collapse="-"),round(args$doublet_score_threshold,2))
fwrite(dt, args$outfile, sep="\t", na="NA", quote=F)



--------------------------------------------------------------------------------
/rna/processing/5_parse_sample_metadata_after_doublets.R:
--------------------------------------------------------------------------------
# Merge the doublet detection results into the cell metadata.

suppressPackageStartupMessages(library(argparse))

here::i_am("rna/processing/5_parse_sample_metadata_after_doublets.R")
source(here::here("settings.R"))


######################
## Define arguments ##
######################

p <- ArgumentParser(description='')
p$add_argument('--metadata', type="character", help='Metadata file to use as input')
p$add_argument('--doublet_files', type="character", nargs="+", help='Results of the doublet score detection algorithm')
p$add_argument('--outfile', type="character", help='Output file')
args <- p$parse_args(commandArgs(TRUE))

###################
## Load settings ##
###################


## START TEST ##
# args$metadata <- file.path(io$basedir,"results/rna/qc/sample_metadata_after_qc.txt.gz")
# args$doublet_files <- file.path(io$basedir,"results/rna/doublet_detection/doublets_AGTCAA_R7_L001_mm10_sorted_merged_rmdup_mtx2_1.25.txt.gz")
# args$outfile <- file.path(io$basedir,"results/rna/doublet_detection/sample_metadata_after_doublets.txt.gz")
## END TEST ##

##########################
## Load doublet results ##
##########################

# One file per --doublet_files argument; the tables are stacked row-wise
doublet.dt <- args$doublet_files %>% map(~ fread(.)) %>% rbindlist

# stopifnot(mapping_mnn.dt$cell%in%sample_metadata$cell)
# stopifnot(mapping_seurat.dt$cell%in%sample_metadata$cell)

####################
## Merge and save ##
####################

# Left join: cells without a doublet score are kept, with NA in the doublet columns
to.save <- fread(args$metadata) %>%
  merge(doublet.dt[,c("cell","hybrid_score","doublet_call")] %>% setnames("hybrid_score","doublet_score"), by="cell", all.x=TRUE)
fwrite(to.save, args$outfile, sep="\t", na="NA", quote=F)

# to.save[pass_rnaQC==T & is.na(celltype.mapped_mnn)]
# stopifnot(to.save[pass_rnaQC==T & is.na(celltype.mapped_mnn),.N]==0)
# stopifnot(to.save[pass_rnaQC==T & is.na(celltype.mapped_seurat),.N]==0)

--------------------------------------------------------------------------------
/rna/processing/extract_TFs_from_SingleCellExperiment.R:
--------------------------------------------------------------------------------
here::i_am("rna/processing/extract_TFs_from_SingleCellExperiment.R")

source(here::here("settings.R"))
source(here::here("utils.R"))

################################
## Initialize argument parser ##
################################

p <- ArgumentParser(description='')
p$add_argument('--sce', type="character", help='')
p$add_argument('--motif_annotation', type="character", help='')
# p$add_argument('--motif2gene', type="character", help='')
p$add_argument('--TF_file', type="character", help='')
p$add_argument('--outfile', type="character", help='Output file')
args <- p$parse_args(commandArgs(TRUE))

## START TEST
# io$basedir <- file.path(io$basedir,"test")
# args$sce <- file.path(io$basedir,"results/rna/pseudobulk/celltype/SingleCellExperiment_pseudobulk.rds")
# args$motif_annotation <- "JASPAR"
# # args$motif2gene <- file.path(io$basedir,sprintf("processed/atac/archR/Annotations/%s_motif2gene.txt.gz",args$motif_annotation))
#
# args$TF_file <- "/Users/argelagr/data/mm10_regulation/TFs/TFs.txt"
# args$outfile <- file.path(io$basedir,sprintf("processed/rna/SingleCellExperiment_TFs_%s.rds",args$motif_annotation))
## END TEST

#########################
## Load RNA expression ##
#########################

rna.sce <- readRDS(args$sce)

################
## Subset TFs ##
################

# Load TF annotation
# motif2gene.dt <- fread(args$motif2gene) %>%
#   .[gene%in%toupper(rownames(rna.sce))]
# rna_tf.sce <- rna.sce[str_to_title(motif2gene.dt$gene),]

# The first column of the TF file holds the (upper case) TF names
TFs <- fread(args$TF_file)[[1]]
TFs <- TFs[TFs%in%toupper(rownames(rna.sce))]

# Subset TFs
# Rows are located by matching against the upper-cased row names. Title-casing the TF
# name does not always give back the original gene symbol, which made the subsetting
# fail with a "subscript out of bounds" error for such genes
rna_tf.sce <- rna.sce[match(TFs, toupper(rownames(rna.sce))),]
rownames(rna_tf.sce) <- toupper(rownames(rna_tf.sce))

##########
## Save ##
##########

saveRDS(rna_tf.sce, args$outfile)

--------------------------------------------------------------------------------
/rna/pseudobulk/old/create_pseudobulk_metadata_with_replicates.R:
--------------------------------------------------------------------------------
# Assign cells to pseudobulk replicates: for every group, `nrep` random subsets of
# cells are drawn, and the cell-to-replicate table is saved.

# The script lives in the old/ subfolder
here::i_am("rna/pseudobulk/old/create_pseudobulk_metadata_with_replicates.R")


source(here::here("settings.R"))
source(here::here("utils.R"))

######################
## Define arguments ##
######################

p <- ArgumentParser(description='')
p$add_argument('--metadata', type="character", help='metadata file')
p$add_argument('--group_by', type="character", help='')
p$add_argument('--nrep', type="integer", default=5, help='Number of replicates per group (cells sampled with replacement)')
p$add_argument('--min_cells', type="integer", default=5, help='Minimum number of cells per replicate')
p$add_argument('--percentage_cells_per_replicate', type="double", default=0.3, help='Percentage of cells per replicate')
p$add_argument('--outdir', type="character", help='Output directory')
# The script writes to args$outfile, which could not be set from the command line before
p$add_argument('--outfile', type="character", help='Output file (default: <outdir>/cell2replicate.txt.gz)')

args <- p$parse_args(commandArgs(TRUE))

## START TEST ##
# Development settings. They are only applied when the script is run without
# --metadata, so that command-line values are no longer silently overwritten
if (is.null(args$metadata)) {
  io$basedir <- file.path(io$basedir,"test")
  args$metadata <- file.path(io$basedir,"results/rna/mapping/sample_metadata_after_mapping.txt.gz")
  args$sce <- file.path(io$basedir,"processed/rna/SingleCellExperiment.rds")
  args$group_by <- "celltype"
  args$nrep <- 5
  args$min_cells <- 25
  args$percentage_cells_per_replicate <- 0.30
  args$outfile <- file.path(io$basedir,sprintf("results/rna/pseudobulk/%s/cell2replicate.txt.gz",args$group_by))
}
## END TEST ##

# Backward compatibility: derive the output file from --outdir when --outfile is not given
if (is.null(args$outfile)) {
  if (is.null(args$outdir)) stop("Please provide --outfile (or --outdir)")
  args$outfile <- file.path(args$outdir,"cell2replicate.txt.gz")
}

dir.create(dirname(args$outfile), showWarnings = F, recursive = T)

###################
## Load metadata ##
###################

# Load cell metadata
# Keep cells that pass QC, are not doublets and have a value for the grouping variable.
# The grouping column is renamed to "group"
cell_metadata.dt <- fread(args$metadata) %>%
  .[,celltype_genotype:=sprintf("%s-%s",celltype,genotype)] %>%
  .[pass_rnaQC==TRUE & doublet_call==FALSE & !is.na(eval(as.name(args$group_by)))] %>%
  setnames(args$group_by,"group")

print(table(cell_metadata.dt$group))

##################################
## Create pseudobulk replicates ##
##################################

cell2group.dt <- unique(cell_metadata.dt$group) %>% map(function(i) {
  tmp <- cell_metadata.dt[group==i]

  # Number of cells per replicate: a fraction of the group, but at least min_cells...
  if ((args$percentage_cells_per_replicate*nrow(tmp))<=args$min_cells) {
    ncells_per_replicate <- args$min_cells
  } else {
    ncells_per_replicate <- round(args$percentage_cells_per_replicate*nrow(tmp))
  }
  # ...and never more than the group size: sample.int() draws without replacement
  # within a replicate and fails when asked for more cells than available
  ncells_per_replicate <- min(ncells_per_replicate, nrow(tmp))

  # Replicates are drawn independently, hence a cell can belong to several replicates
  seq(1,args$nrep) %>% map(function(j) {
    tmp[sample.int(nrow(tmp),ncells_per_replicate)] %>%
      .[,replicate:=sprintf("%s_rep%s",i,j)] %>%
      .[,c("cell","group","replicate")] %>%
      return
  }) %>% rbindlist %>% return
}) %>% rbindlist


stats.dt <- cell2group.dt[,.(ncells=.N),c("group","replicate")]
print(stats.dt)

##########
## Save ##
##########

fwrite(cell2group.dt, args$outfile, sep="\t", quote = F)


--------------------------------------------------------------------------------
/rna/pseudobulk/old/old_code.R:
--------------------------------------------------------------------------------

###########################################################################
## Calculate average expression as the average of log-transformed values ##
###########################################################################

# expr.dt <- unique(sce$celltype.mapped) %>% map(function(i) {
#   dt <- logcounts(sce[,sce$celltype.mapped==i]) %>% as.matrix %>% as.data.table(keep.rownames = T) %>%
#     melt(id.vars="rn") %>% setnames(c("symbol","cell","value")) %>%
#     .[,.(mean_expr=round(mean(value),3)),by="symbol"] %>%
#     .[,celltype:=i]
#   return(dt)
# }) %>% rbindlist
#
# length(unique(expr.dt$symbol))
# length(unique(expr.dt$celltype))

#################################################################
## Calculate average expression using the average count values ##
#################################################################

# NOTE: NOT WORKING
# expr.dt <- unique(sce$celltype.mapped) %>% map(function(i) {
#   dt <- counts(sce[,sce$celltype.mapped==i]) %>% as.matrix %>% as.data.table(keep.rownames = T) %>%
#     melt(id.vars="rn") %>% setnames(c("symbol","cell","value")) %>%
#     .[,.(counts=sum(value), mean_counts=round(mean(value),3)),by="symbol"] %>%
#     .[,celltype:=i]
#   return(dt)
# }) %>% rbindlist
#
# foo <-expr.dt %>%
#   .[,sum_counts:=sum(counts),by="celltype"] %>%
#   .[,.(mean_counts=unique(mean_counts), mean_counts2=counts/unique(sum_counts)),by="symbol"]
#
# length(unique(expr.dt$symbol))
# length(unique(expr.dt$celltype))

##########
## Save ##
##########

# to.save <- expr.dt %>%
#   merge(gene_metadata[,c("symbol","ens_id")], all.x=T) %>%
43 | # setnames("symbol","gene") 44 | # fwrite(to.save, paste0(io$outdir,"/avg_expr_per_celltype_and_gene.txt.gz"), sep="\t") 45 | -------------------------------------------------------------------------------- /rna/pseudobulk/old/pseudobulk_rna.R: -------------------------------------------------------------------------------- 1 | library(muscat) 2 | library(DESeq2) 3 | 4 | ##################### 5 | ## Define settings ## 6 | ##################### 7 | 8 | if (grepl("ricard",Sys.info()['nodename'])) { 9 | source("/Users/ricard/gastrulation_multiome_10x/settings.R") 10 | } else if (grepl("ebi",Sys.info()['nodename'])) { 11 | source("/homes/ricard/gastrulation_multiome_10x/settings.R") 12 | } else { 13 | stop("Computer not recognised") 14 | } 15 | 16 | # I/O 17 | io$outdir <- paste0(io$basedir,"/results/rna/pseudobulk") 18 | 19 | # Options 20 | opts$samples <- c( 21 | "E7.5_rep1", 22 | "E7.5_rep2", 23 | "E8.0_rep1", 24 | "E8.0_rep2", 25 | "E8.5_rep1", 26 | "E8.5_rep2" 27 | ) 28 | 29 | ############### 30 | ## Load data ## 31 | ############### 32 | 33 | # Load cell metadata 34 | # io$metadata <- "/Users/ricard/data/gastrulation_multiome_10x/results/rna/doublets/sample_metadata_after_doublets.txt.gz" 35 | sample_metadata <- fread(io$metadata) %>% 36 | .[pass_rnaQC==TRUE & doublet_call==FALSE & sample%in%opts$samples & !is.na(celltype.mapped)] 37 | 38 | # Load SingleCellExperiment 39 | sce <- load_SingleCellExperiment(io$rna.sce, cells=sample_metadata$cell) 40 | colData(sce) <- sample_metadata %>% tibble::column_to_rownames("cell") %>% DataFrame 41 | 42 | ################################### 43 | ## Aggregate counts per celltype ## 44 | ################################### 45 | 46 | # assays(sce)$cpm <- edgeR::cpm(assay(sce), normalized.lib.sizes = FALSE, log = FALSE) 47 | 48 | sce_pseudobulk <- aggregateData( 49 | sce, 50 | assay = "counts", 51 | by = c("celltype.mapped"), 52 | fun = c("sum"), 53 | scale = FALSE # Should pseudo-bulks be scaled with the effective library 
size & multiplied by 1M? 54 | ) 55 | 56 | assayNames(sce_pseudobulk) <- "counts" 57 | 58 | ############### 59 | ## Normalise ## 60 | ############### 61 | 62 | # create DESeq object 63 | dds <- DESeqDataSet(sce_pseudobulk, design=~1) 64 | 65 | # This function calculates a variance stabilizing transformation (VST) from the fitted dispersion-mean relation(s) 66 | # and then transforms the count data (normalized by division by the size factors or normalization factors), 67 | # yielding a matrix of values which are now approximately homoskedastic 68 | dds <- varianceStabilizingTransformation(dds) 69 | 70 | logcounts(sce_pseudobulk) <- assay(dds) 71 | 72 | ################### 73 | ## Sanity checks ## 74 | ################### 75 | 76 | # cor( 77 | # colMeans(logcounts(sce_pseudobulk)), 78 | # metadata(sce_pseudobulk)$n_cells 79 | # ) 80 | 81 | ########## 82 | ## Save ## 83 | ########## 84 | 85 | saveRDS(sce_pseudobulk, paste0(io$outdir,"/SingleCellExperiment.rds")) 86 | -------------------------------------------------------------------------------- /rna/pseudobulk/old/pseudobulk_rna_intronic_exonic.R: -------------------------------------------------------------------------------- 1 | library(muscat) 2 | library(DESeq2) 3 | 4 | ##################### 5 | ## Define settings ## 6 | ##################### 7 | 8 | if (grepl("ricard",Sys.info()['nodename'])) { 9 | source("/Users/ricard/gastrulation_multiome_10x/settings.R") 10 | } else if (grepl("ebi",Sys.info()['nodename'])) { 11 | source("/homes/ricard/gastrulation_multiome_10x/settings.R") 12 | } else { 13 | stop("Computer not recognised") 14 | } 15 | 16 | # I/O 17 | io$rna.sce <- paste0(io$basedir,"/processed/rna/SingleCellExperiment_velocyto.rds") 18 | io$outdir <- paste0(io$basedir,"/results/rna/pseudobulk") 19 | 20 | # Options 21 | opts$samples <- c( 22 | "E7.5_rep1", 23 | "E7.5_rep2", 24 | "E8.0_rep1", 25 | "E8.0_rep2", 26 | "E8.5_rep1", 27 | "E8.5_rep2" 28 | ) 29 | 30 | ############### 31 | ## Load data ## 32 | 
############### 33 | 34 | # Load cell metadata 35 | sample_metadata <- fread(io$metadata) %>% 36 | .[pass_rnaQC==TRUE & doublet_call==FALSE & sample%in%opts$samples & !is.na(celltype.mapped)] 37 | 38 | # Load velocyto SingleCellExperiment 39 | sce <- load_SingleCellExperiment(io$rna.sce, cells=sample_metadata$cell) 40 | 41 | # Filter genes 42 | # gene_metadata <- fread(io$gene_metadata) 43 | # genes <- unique(gene_metadata$symbol) 44 | # sce <- sce[rownames(sce)%in%genes,] 45 | sce <- sce[!duplicated(rownames(sce)),] 46 | 47 | ################################### 48 | ## Aggregate counts per celltype ## 49 | ################################### 50 | 51 | sce_pseudobulk_unspliced <- aggregateData( 52 | sce, 53 | assay = "unspliced", 54 | by = c("celltype.mapped"), 55 | fun = c("sum"), 56 | scale = FALSE # Should pseudo-bulks be scaled with the effective library size & multiplied by 1M? 57 | ); assayNames(sce_pseudobulk_unspliced) <- "counts" 58 | 59 | sce_pseudobulk_spliced <- aggregateData( 60 | sce, 61 | assay = "spliced", 62 | by = c("celltype.mapped"), 63 | fun = c("sum"), 64 | scale = FALSE # Should pseudo-bulks be scaled with the effective library size & multiplied by 1M? 
65 | ); assayNames(sce_pseudobulk_spliced) <- "counts" 66 | 67 | 68 | ############### 69 | ## Normalise ## 70 | ############### 71 | 72 | # create DESeq object 73 | dds.unspliced <- DESeqDataSet(sce_pseudobulk_unspliced, design=~1) 74 | dds.spliced <- DESeqDataSet(sce_pseudobulk_spliced, design=~1) 75 | 76 | # This function calculates a variance stabilizing transformation (VST) from the fitted dispersion-mean relation(s) 77 | # and then transforms the count data (normalized by division by the size factors or normalization factors), 78 | # yielding a matrix of values which are now approximately homoskedastic 79 | dds.unspliced <- varianceStabilizingTransformation(dds.unspliced) 80 | dds.spliced <- varianceStabilizingTransformation(dds.spliced) 81 | 82 | sce_pseudobulk <- SingleCellExperiment( 83 | assays = list( 84 | "spliced" = assay(sce_pseudobulk_spliced), 85 | "unspliced" = assay(sce_pseudobulk_unspliced), 86 | "unspliced_log" = assay(dds.unspliced), 87 | "spliced_log" = assay(dds.spliced) 88 | ) 89 | ) 90 | 91 | colnames(sce_pseudobulk) %>% head 92 | rownames(sce_pseudobulk) %>% head 93 | 94 | ########## 95 | ## Save ## 96 | ########## 97 | 98 | saveRDS(sce_pseudobulk, paste0(io$outdir,"/SingleCellExperiment_velocyto.rds")) 99 | -------------------------------------------------------------------------------- /rna/scanpy/create_anndata_from_SingleCellExperiment.R: -------------------------------------------------------------------------------- 1 | here::i_am("rna/scanpy/create_anndata_from_SingleCellExperiment.R") 2 | 3 | # Load default settings 4 | source(here::here("settings.R")) 5 | source(here::here("utils.R")) 6 | 7 | # Load libraries 8 | suppressPackageStartupMessages({ 9 | library("reticulate") 10 | library("SingleCellExperiment") 11 | }) 12 | 13 | ##################### 14 | ## Define settings ## 15 | ##################### 16 | 17 | io$outfile <- file.path(io$basedir,"processed/rna/anndata.h5ad") 18 | 19 | ##################################### 20 | ## 
Reticulate connection to scanpy ## 21 | ##################################### 22 | 23 | sc <- import("scanpy") 24 | 25 | ########################## 26 | ## Load sample metadata ## 27 | ########################## 28 | 29 | sample_metadata <- fread(io$metadata) %>% 30 | .[pass_rnaQC==TRUE & doublet_call==FALSE & !is.na(celltype.mapped)] 31 | 32 | ############### 33 | ## Load data ## 34 | ############### 35 | 36 | # Load RNA expression data as SingleCellExperiment object 37 | sce <- load_SingleCellExperiment(io$rna.sce, cells=sample_metadata$cell, normalise = FALSE) 38 | 39 | # Add sample metadata as colData 40 | colData(sce) <- sample_metadata %>% tibble::column_to_rownames("cell") %>% DataFrame 41 | 42 | 43 | ############################################# 44 | ## Convert SingleCellExperiment to AnnData ## 45 | ############################################# 46 | 47 | adata_sce <- sc$AnnData( 48 | X = t(counts(sce)), 49 | obs = as.data.frame(colData(sce)), 50 | var = data.frame(gene=rownames(sce), row.names=rownames(sce)) 51 | ) 52 | # adata_sce$obsm$update(umap = reducedDim(sce, "umap")) 53 | 54 | adata_sce 55 | 56 | # Add cell type colors 57 | # colPalette_celltypes = [opts["celltype_colors"][i.replace(" ","_").replace("/","_")] for i in sorted(np.unique(adata.obs['celltype']))] 58 | # adata.uns['celltype'] = colPalette_celltypes 59 | # colPalette_stages = [opts["stage_colors"][i.replace(" ","_").replace("/","_")] for i in sorted(np.unique(adata.obs['stage']))] 60 | # adata.uns['stage_colors'] = colPalette_stages 61 | adata_sce$uns$update(celltype.mapped_colors = opts$celltype.colors[sort(unique(as.character(adata_sce$obs$celltype.mapped)))]) 62 | adata_sce$uns$update(stage_colors = opts$stage.colors[sort(unique(as.character(adata_sce$obs$stage)))]) 63 | adata_sce$uns["celltype.mapped_colors"] 64 | adata_sce$uns["stage_colors"] 65 | 66 | ########## 67 | ## Save ## 68 | ########## 69 | 70 | adata_sce$write_h5ad(io$outfile) 71 | 
-------------------------------------------------------------------------------- /rna/scanpy/create_anndata_scvelo.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import scvelo as scv 3 | 4 | ###################### 5 | ## Define arguments ## 6 | ###################### 7 | 8 | p = argparse.ArgumentParser( description='' ) 9 | p.add_argument( '--loom_directory', type=str, help='Input directory for the loom files (after velocyto)' ) 10 | p.add_argument( '--anndata', type=str, help='Anndata object' ) 11 | p.add_argument( '--outfile', type=str, help='Output file (anndata)' ) 12 | p.add_argument( '--metadata', type=str, help='Metadata file' ) 13 | p.add_argument( '--samples', type=str, nargs="+", help='Samples' ) 14 | args = p.parse_args() 15 | 16 | ##################### 17 | ## Define settings ## 18 | ##################### 19 | 20 | exec(open('../../settings.py').read()) 21 | exec(open('../../utils.py').read()) 22 | 23 | ## START TEST ## 24 | args.outfile = io["basedir"]+"/processed/rna/anndata_scvelo.h5ad" 25 | args.anndata = io["basedir"]+"/processed/rna/anndata.h5ad" 26 | args.loom_directory = io["basedir"]+"/processed/rna/loom" 27 | args.metadata = io["basedir"]+"/results_new2/rna/mapping/sample_metadata_after_mapping.txt.gz" 28 | args.samples = ["E7.5_rep1", "E7.5_rep2"] 29 | ## END TEST ## 30 | 31 | 32 | ################### 33 | ## Load metadata ## 34 | ################### 35 | 36 | print("Loading metadata...") 37 | 38 | metadata = (pd.read_table(args.metadata) >> 39 | mask(X.pass_rnaQC==True, X.doublet_call==False) >> 40 | mask(X["sample"].isin(args.samples)) 41 | ).set_index("cell", drop=False) 42 | print(metadata.shape) 43 | 44 | ######################### 45 | ## Load anndata object ## 46 | ######################### 47 | 48 | print("Loading anndata...") 49 | 50 | adata = load_adata(adata_file = args.anndata, metadata_file = args.metadata, normalise = False, cells = metadata.index.values) 51 | 52 | 
############################################################### 53 | ## Load spliced and unspliced count matrices from loom files ## 54 | ############################################################### 55 | 56 | print("Loading loom files...") 57 | 58 | looms = [None for i in range(len(args.samples))] 59 | 60 | for i in range(len(args.samples)): 61 | loom_file = args.loom_directory + "/" + args.samples[i] + ".loom" 62 | looms[i] = sc.read_loom(loom_file, sparse=True, X_name='spliced', obs_names='CellID', obsm_names=None, var_names='Gene') 63 | # looms[i].var_names_make_unique() 64 | # looms[i].obs.index = looms[i].obs.index.str.replace(rename_dict[args.samples[i]]+":",args.samples[i]+"_").str.replace("x","-1") 65 | # print(looms[i].shape) 66 | # print(looms[i].obs.head()) 67 | 68 | #################### 69 | ## Create anndata ## 70 | #################### 71 | 72 | print("Creating anndata file...") 73 | 74 | # Concatenate 75 | adata_loom = anndata.AnnData.concatenate(*looms, join='inner', batch_key=None, index_unique=None) 76 | del looms 77 | 78 | # Remove non-used layers to save memory 79 | del adata_loom.layers["ambiguous"] 80 | del adata_loom.layers["matrix"] 81 | 82 | # Merge anndata objects 83 | adata_final = scv.utils.merge(adata, adata_loom) 84 | del adata_loom 85 | del adata 86 | adata_final 87 | 88 | adata_final.obs.index.name = None 89 | 90 | ########## 91 | ## Save ## 92 | ########## 93 | 94 | print("Saving anndata object...") 95 | 96 | adata.write_h5ad(args.outfile) -------------------------------------------------------------------------------- /rna/scanpy/velocyto/run_velocyto.sh: -------------------------------------------------------------------------------- 1 | 2 | # velocyto run10x -m repeat_msk.gtf mypath/sample01 somepath/refdata-cellranger-mm10-1.2.0/genes/genes.gtf 3 | 4 | indir="/bi/group/reik/ricard/data/gastrulation_multiome_10x/original" 5 | 6 | # samples=( "E7.5_rep1" "E7.5_rep2" "E8.0_rep1" "E8.0_rep2" "E8.5_rep1" "E8.5_rep2" "E8.75_rep1" 
"E8.75_rep2" ) 7 | # samples=( "E7.5_rep2" "E8.5_rep1" "E8.5_rep2" "E8.75_rep1" "E8.75_rep2" ) 8 | samples=( "E7.75_rep1" "E8.5_CRISPR_T_KO" "E8.5_CRISPR_T_WT" ) 9 | 10 | threads=1 11 | # mem=1000 12 | 13 | mask_file="/bi/group/reik/ricard/data/mm10_sequence/repeats/mm10_rmsk.gtf" 14 | 15 | for i in "${samples[@]}"; do 16 | echo "$i" 17 | # cmd="velocyto run10x -m ${mask_file} --samtools-threads $threads --samtools-memory 40000 ${indir}/${i} /bi/scratch/Stephen_Clark/annotations/gtf/Mus_musculus.GRCm38.98.gtf" 18 | cmd="velocyto run10x -m ${mask_file} ${indir}/${i} /bi/scratch/Stephen_Clark/annotations/gtf/Mus_musculus.GRCm38.98.gtf" 19 | echo $cmd 20 | sbatch -n $threads --mem 90G --wrap $cmd 21 | done 22 | -------------------------------------------------------------------------------- /rna/scanpy/velocyto/velocyto_env.yml: -------------------------------------------------------------------------------- 1 | name: velocyto 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | dependencies: 6 | - samtools 7 | - velocyto.py 8 | 9 | 10 | # add scvelo et al 11 | # WARNING: Unable to create progress bar. 
Consider installing `tqdm` as `pip install tqdm` and `ipywidgets` as `pip install ipywidgets`, -------------------------------------------------------------------------------- /rna/snakemake/README.txt: -------------------------------------------------------------------------------- 1 | ################# 2 | ## Run locally ## 3 | ################# 4 | 5 | snakemake --use-conda --cores 1 6 | snakemake --forceall --use-conda --cores 1 7 | snakemake --forceall --use-conda --cores 1 --dry-run 8 | 9 | ################################# 10 | ## Run on the Babraham cluster ## 11 | ################################# 12 | 13 | sbatch -n 1 --mem 5G snakemake --forceall -j 4 --use-conda --latency-wait 90 --cluster "sbatch -n 1 --mem 5G" 14 | snakemake --forceall -j 4 --use-conda --latency-wait 90 --cluster "sbatch -n 1 --mem 12G" 15 | snakemake -j 4 --use-conda --latency-wait 90 --cluster "sbatch -n 1 --mem {params.memory}G" -------------------------------------------------------------------------------- /rna/snakemake/environment.yaml: -------------------------------------------------------------------------------- 1 | name: gastrulation_multiome_10x_rna_snakemake 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | dependencies: 6 | # - python=3.9.10 7 | - python=3.8 8 | - cython 9 | - ipython 10 | # - jupyter 11 | - ipywidgets 12 | - bioconductor-scater 13 | - bioconductor-scran 14 | - bioconductor-singlecellexperiment 15 | - bioconductor-batchelor 16 | - bioconductor-scds 17 | - bioconductor-edger 18 | - bioconductor-deseq2 19 | # - bioconductor-destiny = 3.4 # THIS IS DEPRECATED, DEPENDS ON bioconductor-singlecellexperiment 1.12 20 | # - r-xml2 =1.3.2 21 | - r-r.utils 22 | - r-matrix 23 | - r-future 24 | - r-argparse 25 | - r-ggpubr 26 | - r-data.table 27 | - r-purrr 28 | - r-furrr 29 | - r-argparse 30 | - r-seurat=4.1.0 31 | - r-pheatmap 32 | - r-ggrastr 33 | - scanpy=1.8.2 34 | - loompy=3.0.6 35 | - velocyto.py 36 | # - h5py=2.10.0 37 | - python-igraph 38 | - 
louvain>=0.6,!=0.6.2 39 | - fa2 40 | - leidenalg 41 | - harmonypy 42 | - scanorama 43 | - seaborn 44 | - samtools=1.15 45 | - snakemake 46 | - pip 47 | - pip: 48 | - dfply 49 | - scvelo 50 | - git+https://github.com/settylab/Palantir@removeTSNE 51 | - git+https://github.com/dpeerlab/SEACells 52 | 53 | 54 | # conda create -n gastrulation_multiome_10x_rna_snakemake python==3.9 --yes 55 | # conda activate gastrulation_multiome_10x_rna_snakemake 56 | # conda install mamba --yes 57 | # mamba env update -n gastrulation_multiome_10x_rna_snakemake --file environment.yaml -------------------------------------------------------------------------------- /rna/snakemake/run_cluster.sh: -------------------------------------------------------------------------------- 1 | # snakemake --use-conda --cores 15 -j 99 --latency-wait 90 -p --cluster "sbatch -n {threads} --mem {resources.mem_mb}M" 2 | snakemake --cores 15 -j 99 --latency-wait 90 -p --cluster "sbatch -n {threads} --mem {resources.mem_mb}M" -------------------------------------------------------------------------------- /rna_atac/rna_vs_acc/metacells/gene_expr_vs_promoter_acc/cor_gene_expr_vs_promoter_acc_metacells.R: -------------------------------------------------------------------------------- 1 | here::i_am("rna_atac/rna_vs_acc/metacells/gene_expr_vs_promoter_acc/cor_gene_expr_vs_promoter_acc_metacells.R") 2 | 3 | source(here::here("settings.R")) 4 | source(here::here("utils.R")) 5 | 6 | ################################ 7 | ## Initialize argument parser ## 8 | ################################ 9 | # NOTE: --outfile is a file path (its parent directory is created below), not a directory 10 | p <- ArgumentParser(description='') 11 | p$add_argument('--sce', type="character", help='RNA SingleCellExperiment (pseudobulk)') 12 | p$add_argument('--gene_score_matrix', type="character", help='ATAC Gene score matrix (pseudobulk)') 13 | p$add_argument('--outfile', type="character", help='Output file') 14 | args <- p$parse_args(commandArgs(TRUE)) 15 | 16 | ## START TEST ## 17 | # io$basedir <- 
file.path(io$basedir,"test") 18 | # args <- list() 19 | # args$sce <- file.path(io$basedir,"results/rna/metacells/all_cells/SingleCellExperiment_metacells.rds") 20 | # args$gene_score_matrix <- file.path(io$basedir,"results/atac/archR/metacells/all_cells/GeneScoreMatrix_TSS/GeneScoreMatrix_TSS_summarized_experiment_metacells.rds") 21 | # args$outfile <- file.path(io$basedir,"results/rna_atac/rna_vs_acc/metacells/gene_expr_vs_promoter_acc/cor_gene_expr_vs_promoter_acc_metacells.txt.gz") 22 | ## END TEST ## 23 | 24 | ##################### 25 | ## Define settings ## 26 | ##################### 27 | 28 | # I/O 29 | dir.create(dirname(args$outfile), showWarnings=FALSE, recursive=TRUE) 30 | 31 | # Options 32 | 33 | ####################################### 34 | ## Load pseudobulk RNA and ATAC data ## 35 | ####################################### 36 | 37 | # Load SingleCellExperiment 38 | rna_metacells.sce <- readRDS(args$sce) 39 | 40 | # Load ATAC SummarizedExperiment 41 | atac_GeneScoreMatrix_metacells.se <- readRDS(args$gene_score_matrix) 42 | 43 | # Normalise ATAC data 44 | assayNames(atac_GeneScoreMatrix_metacells.se) <- "counts" 45 | assay(atac_GeneScoreMatrix_metacells.se,"logcounts") <- log(1e6*(sweep(assay(atac_GeneScoreMatrix_metacells.se),2,colSums(assay(atac_GeneScoreMatrix_metacells.se),na.rm=T),"/"))+1) 46 | 47 | # hist(assay(atac_GeneScoreMatrix_metacells.se,"logcounts")[1:1000,]) 48 | 49 | ########################################### 50 | ## Convert to long data.tables and merge ## 51 | ########################################### 52 | 53 | rna_metacells.dt <- logcounts(rna_metacells.sce) %>% 54 | as.data.table(keep.rownames = T) %>% 55 | setnames("rn","gene") %>% 56 | melt(id.vars="gene", variable.name="celltype", value.name="expr") 57 | 58 | atac_gene_scores_metacells.dt <- as.matrix(assay(atac_GeneScoreMatrix_metacells.se,"logcounts")) %>% t %>% 59 | as.data.table(keep.rownames = T) %>% 60 | setnames("rn","celltype") %>% 61 | melt(id.vars=c("celltype"), 
variable.name="gene", value.name="acc") 62 | 63 | 64 | # Merge 65 | rna_atac.dt <- merge(rna_metacells.dt, atac_gene_scores_metacells.dt, by = c("gene","celltype")) 66 | 67 | ########################## 68 | ## Correlation analysis ## 69 | ########################## 70 | 71 | cor.dt <- rna_atac.dt %>% copy %>% 72 | .[,c("acc","expr"):=list(acc + rnorm(n=.N,mean=0,sd=1e-5), expr + rnorm(n=.N,mean=0,sd=1e-5))] %>% 73 | .[, .(V1 = unlist(cor.test(acc, expr)[c("estimate", "p.value")])), by = c("gene")] %>% 74 | .[, para := rep(c("r","p"), .N/2)] %>% 75 | data.table::dcast(gene ~ para, value.var = "V1") %>% 76 | .[,"padj_fdr" := list(p.adjust(p, method="fdr"))] %>% 77 | # .[, sig := padj_fdr<=0.10] %>% 78 | setorder(padj_fdr, na.last = T) 79 | 80 | cor.dt[,c("p","r","padj_fdr"):=list(format(p,digits=3),round(r,3), format(padj_fdr,digits=3))] 81 | 82 | # Save 83 | fwrite(cor.dt, args$outfile, sep="\t", quote=F) 84 | -------------------------------------------------------------------------------- /rna_atac/rna_vs_acc/pseudobulk/TFexpr_vs_peakAcc/README.txt: -------------------------------------------------------------------------------- 1 | Fix duplicated TF-motif pairs JASPAR: 2 | ASCL1, BACH2, BHLHE22, CEBPG, CREB3L4, HNF4A, JDP2, JUN, JUNB, JUND, MEIS1, MEIS2, MZF1, NEUROG2, NFIC, NFIX, NR2C2, NR2F1, PAX3, POU6F1, RARA, RORA, RXRB, RXRG, SREBF1, SREBF2, TFAP2A, TFAP2B, TFAP2C, TFAP4, THRB -------------------------------------------------------------------------------- /rna_atac/rna_vs_acc/pseudobulk/TFexpr_vs_peakAcc/analysis/plot_TFexpr_vs_peakAcc_general_stats.R: -------------------------------------------------------------------------------- 1 | here::i_am("rna_atac/rna_vs_acc/pseudobulk/TFexpr_vs_peakAcc/analysis/plot_TFexpr_vs_peakAcc_general_stats.R") 2 | 3 | source(here::here("settings.R")) 4 | source(here::here("utils.R")) 5 | 6 | ################################ 7 | ## Initialize argument parser ## 8 | ################################ 9 | 10 | p <- 
ArgumentParser(description='') 11 | p$add_argument('--sce', type="character", help='RNA SingleCellExperiment (pseudobulk)') 12 | p$add_argument('--atac_peak_matrix', type="character", help='ATAC Peak matrix (pseudobulk)') 13 | p$add_argument('--tf2peak_cor', type="character", help='Correlations between TF RNA expression and peak accessibility') 14 | p$add_argument('--outdir', type="character", help='Output directory') 15 | args <- p$parse_args(commandArgs(TRUE)) 16 | 17 | ##################### 18 | ## Define settings ## 19 | ##################### 20 | # Test overrides are kept commented out so that the parsed command-line arguments are honoured 21 | ## START TEST ## 22 | # io$basedir <- file.path(io$basedir,"test") 23 | # args <- list() 24 | # args$sce <- file.path(io$basedir,"results/rna/pseudobulk/celltype/SingleCellExperiment_pseudobulk.rds") 25 | # args$atac_peak_matrix <- file.path(io$basedir,"results/atac/archR/pseudobulk/celltype/PeakMatrix/pseudobulk_PeakMatrix_summarized_experiment.rds") 26 | # args$tf2peak_cor <- file.path(io$basedir,"results/rna_atac/rna_vs_acc/pseudobulk/TFexpr_vs_peakAcc/CISBP_cor_TFexpr_vs_peakAcc.rds") 27 | # args$outdir <- file.path(io$basedir,"results/rna_atac/rna_vs_acc/pseudobulk/TFexpr_vs_peakAcc/stats") 28 | ## END TEST ## 29 | 30 | # I/O (recursive so that missing parent directories are created too) 31 | dir.create(args$outdir, showWarnings = F, recursive = TRUE) 32 | 33 | ############### 34 | ## Load data ## 35 | ############### 36 | 37 | # Load SingleCellExperiment 38 | rna_pseudobulk.sce <- readRDS(args$sce) 39 | 40 | # Load ATAC SummarizedExperiment 41 | atac_peakMatrix_pseudobulk.se <- readRDS(args$atac_peak_matrix) 42 | 43 | # Normalise ATAC data 44 | assayNames(atac_peakMatrix_pseudobulk.se) <- "counts" 45 | assay(atac_peakMatrix_pseudobulk.se,"logcounts") <- log(1e6*(sweep(assay(atac_peakMatrix_pseudobulk.se),2,colSums(assay(atac_peakMatrix_pseudobulk.se),na.rm=T),"/"))+1) 46 | 47 | ############################### 48 | ## Load TF2peak correlations ## 49 | ############################### 50 | 51 | tf2peak_cor.se <- readRDS(args$tf2peak_cor) 52 | 53 | ############ 54 | ## Filter ## 55 | ############ 56 | 
57 | TFs <- colnames(tf2peak_cor.se) 58 | peaks <- rownames(tf2peak_cor.se) 59 | 60 | ########## 61 | ## Plot ## 62 | ########## 63 | 64 | # tmp <- assay(tf2peak_cor.se[,i],"cor")[,1] 65 | i <- "T" 66 | j <- "chr5:4894870-4895470" 67 | 68 | to.plot <- data.table( 69 | atac = assay(atac_peakMatrix_pseudobulk.se[j,],"logcounts")[1,], 70 | rna = logcounts(rna_pseudobulk.sce[i,])[1,], 71 | celltype = colnames(rna_pseudobulk.sce) 72 | ) 73 | 74 | p <- ggplot(to.plot, aes(x=rna, y=atac, fill=celltype)) + 75 | geom_point(color="black", size=4, shape=21) + 76 | # geom_smooth(method="lm") + 77 | stat_cor(method = "pearson") + 78 | scale_fill_manual(values=opts$celltype.colors) + 79 | ggrepel::geom_text_repel(aes(label=celltype), size=3, data=to.plot[rna>5 & atac>0.3]) + 80 | labs(x="RNA expression", y="Peak accessibility", title=sprintf("%s expression vs %s accessibility",i,j)) + 81 | theme_classic() + 82 | theme( 83 | plot.title = element_text(hjust=0.5, size=rel(0.8)), 84 | axis.text = element_text(color="black"), 85 | legend.position = "none" 86 | ) 87 | 88 | pdf(file.path(args$outdir,sprintf("%s_vs_%s_rna_vs_acc_pseudobulk.pdf",i,gsub("[:_]","-",j))), width = 8, height = 5) 89 | print(p) 90 | dev.off() -------------------------------------------------------------------------------- /rna_atac/rna_vs_acc/pseudobulk/TFexpr_vs_peakAcc/analysis/plot_TFexpr_vs_peakAcc_individual_examples.R: -------------------------------------------------------------------------------- 1 | here::i_am("rna_atac/rna_vs_acc/pseudobulk/TFexpr_vs_peakAcc/analysis/plot_TFexpr_vs_peakAcc_individual_examples.R") 2 | # NOTE: the path above must match this script's location relative to the project root 3 | source(here::here("settings.R")) 4 | source(here::here("utils.R")) 5 | 6 | ################################ 7 | ## Initialize argument parser ## 8 | ################################ 9 | 10 | p <- ArgumentParser(description='') 11 | p$add_argument('--sce', type="character", help='RNA SingleCellExperiment (pseudobulk)') 12 | p$add_argument('--atac_peak_matrix', type="character", help='ATAC Peak 
matrix (pseudobulk)') 13 | p$add_argument('--tf2peak_cor', type="character", help='Correlations between TF RNA expression and peak accessibility') 14 | p$add_argument('--outdir', type="character", help='Output directory') 15 | args <- p$parse_args(commandArgs(TRUE)) 16 | 17 | ##################### 18 | ## Define settings ## 19 | ##################### 20 | 21 | ## START TEST ## 22 | # io$basedir <- file.path(io$basedir,"test") 23 | # args <- list() 24 | # args$sce <- file.path(io$basedir,"results/rna/pseudobulk/celltype/SingleCellExperiment_pseudobulk.rds") 25 | # args$atac_peak_matrix <- file.path(io$basedir,"results/atac/archR/pseudobulk/celltype/PeakMatrix/pseudobulk_PeakMatrix_summarized_experiment.rds") 26 | # args$tf2peak_cor <- file.path(io$basedir,"results/rna_atac/rna_vs_acc/pseudobulk/TFexpr_vs_peakAcc/CISBP_cor_TFexpr_vs_peakAcc.rds") 27 | # args$outdir <- file.path(io$basedir,"results/rna_atac/rna_vs_acc/pseudobulk/TFexpr_vs_peakAcc/individual_examples") 28 | ## END TEST ## 29 | 30 | # I/O (recursive so that missing parent directories are created too) 31 | dir.create(args$outdir, showWarnings = F, recursive = TRUE) 32 | 33 | ############### 34 | ## Load data ## 35 | ############### 36 | 37 | # Load SingleCellExperiment 38 | rna_pseudobulk.sce <- readRDS(args$sce) 39 | 40 | # Load ATAC SummarizedExperiment 41 | atac_peakMatrix_pseudobulk.se <- readRDS(args$atac_peak_matrix) 42 | 43 | # Normalise ATAC data 44 | assayNames(atac_peakMatrix_pseudobulk.se) <- "counts" 45 | assay(atac_peakMatrix_pseudobulk.se,"logcounts") <- log(1e6*(sweep(assay(atac_peakMatrix_pseudobulk.se),2,colSums(assay(atac_peakMatrix_pseudobulk.se),na.rm=T),"/"))+1) 46 | 47 | ############################### 48 | ## Load TF2peak correlations ## 49 | ############################### 50 | 51 | tf2peak_cor.se <- readRDS(args$tf2peak_cor) 52 | 53 | ############ 54 | ## Filter ## 55 | ############ 56 | 57 | TFs <- colnames(tf2peak_cor.se) 58 | peaks <- rownames(tf2peak_cor.se) 59 | 60 | ########## 61 | ## Plot ## 62 | ########## 63 | 64 | # tmp <- 
assay(tf2peak_cor.se[,i],"cor")[,1] 65 | i <- "T" 66 | j <- "chr5:4894870-4895470" 67 | 68 | to.plot <- data.table( 69 | atac = assay(atac_peakMatrix_pseudobulk.se[j,],"logcounts")[1,], 70 | rna = logcounts(rna_pseudobulk.sce[i,])[1,], 71 | celltype = colnames(rna_pseudobulk.sce) 72 | ) 73 | 74 | p <- ggplot(to.plot, aes(x=rna, y=atac, fill=celltype)) + 75 | geom_point(color="black", size=4, shape=21) + 76 | # geom_smooth(method="lm") + 77 | stat_cor(method = "pearson") + 78 | scale_fill_manual(values=opts$celltype.colors) + 79 | ggrepel::geom_text_repel(aes(label=celltype), size=3, data=to.plot[rna>5 & atac>0.3]) + 80 | labs(x="RNA expression", y="Peak accessibility", title=sprintf("%s expression vs %s accessibility",i,j)) + 81 | theme_classic() + 82 | theme( 83 | plot.title = element_text(hjust=0.5, size=rel(0.8)), 84 | axis.text = element_text(color="black"), 85 | legend.position = "none" 86 | ) 87 | 88 | pdf(file.path(args$outdir,sprintf("%s_vs_%s_rna_vs_acc_pseudobulk.pdf",i,gsub("[:_]","-",j))), width = 8, height = 5) 89 | print(p) 90 | dev.off() -------------------------------------------------------------------------------- /rna_atac/rna_vs_acc/pseudobulk/gene_expr_vs_promoter_acc/cor_gene_expr_vs_promoter_acc_pseudobulk.R: -------------------------------------------------------------------------------- 1 | here::i_am("rna_atac/rna_vs_acc/pseudobulk/gene_expr_vs_promoter_acc/cor_gene_expr_vs_promoter_acc_pseudobulk.R") 2 | 3 | source(here::here("settings.R")) 4 | source(here::here("utils.R")) 5 | 6 | ################################ 7 | ## Initialize argument parser ## 8 | ################################ 9 | # NOTE: --outfile is a file path (its parent directory is created further down), not a directory 10 | p <- ArgumentParser(description='') 11 | p$add_argument('--sce', type="character", help='RNA SingleCellExperiment (pseudobulk)') 12 | p$add_argument('--gene_score_matrix', type="character", help='ATAC Gene score matrix (pseudobulk)') 13 | p$add_argument('--outfile', type="character", help='Output file') 14 | args <- 
p$parse_args(commandArgs(TRUE)) 15 | 16 | ## START TEST ## 17 | # args <- list() 18 | # args$sce <- file.path(io$basedir,"results/rna/pseudobulk/SingleCellExperiment_pseudobulk_celltype.mapped.rds") 19 | # args$gene_score_matrix <- file.path(io$basedir,"results/atac/archR/pseudobulk/celltype.mapped/pseudobulk_GeneScoreMatrix_TSS_summarized_experiment.rds") # io$archR.pseudobulk.GeneMatrix.se 20 | # args$outfile <- file.path(io$basedir,"results/rna_atac/gene_expr_vs_promoter_acc/pseudobulk/cor_gene_expr_vs_promoter_acc_pseudobulk.txt.gz") 21 | ## END TEST ## 22 | 23 | ##################### 24 | ## Define settings ## 25 | ##################### 26 | 27 | # I/O 28 | dir.create(dirname(args$outfile), showWarnings=FALSE, recursive=TRUE) 29 | 30 | # Options 31 | 32 | ####################################### 33 | ## Load pseudobulk RNA and ATAC data ## 34 | ####################################### 35 | 36 | # Load SingleCellExperiment 37 | rna_pseudobulk.sce <- readRDS(args$sce) 38 | 39 | # Load ATAC SummarizedExperiment 40 | atac_pseudobulk_GeneScoreMatrix.se <- readRDS(args$gene_score_matrix) 41 | assayNames(atac_pseudobulk_GeneScoreMatrix.se) <- "counts" 42 | 43 | # Normalise ATAC data 44 | assay(atac_pseudobulk_GeneScoreMatrix.se,"logcounts") <- log(1e6*(sweep(assay(atac_pseudobulk_GeneScoreMatrix.se),2,colSums(assay(atac_pseudobulk_GeneScoreMatrix.se),na.rm=T),"/"))+1) 45 | 46 | ########################################### 47 | ## Convert to long data.tables and merge ## 48 | ########################################### 49 | 50 | rna_pseudobulk.dt <- logcounts(rna_pseudobulk.sce) %>% 51 | as.data.table(keep.rownames = T) %>% 52 | setnames("rn","gene") %>% 53 | melt(id.vars="gene", variable.name="celltype", value.name="expr") 54 | # Use the normalised "logcounts" assay computed above; assay() without a name returns the first assay (raw "counts") 55 | atac_gene_scores_pseudobulk.dt <- as.matrix(assay(atac_pseudobulk_GeneScoreMatrix.se,"logcounts")) %>% t %>% 56 | as.data.table(keep.rownames = T) %>% 57 | setnames("rn","celltype") %>% 58 | melt(id.vars=c("celltype"), variable.name="gene", 
value.name="acc") 59 | 60 | # Merge 61 | rna_atac.dt <- merge(rna_pseudobulk.dt, atac_gene_scores_pseudobulk.dt, by = c("gene","celltype")) 62 | 63 | ########################## 64 | ## Correlation analysis ## 65 | ########################## 66 | 67 | cor.dt <- rna_atac.dt %>% copy %>% 68 | .[,c("acc","expr"):=list(acc + rnorm(n=.N,mean=0,sd=1e-5), expr + rnorm(n=.N,mean=0,sd=1e-5))] %>% 69 | .[, .(V1 = unlist(cor.test(acc, expr)[c("estimate", "p.value")])), by = c("gene")] %>% 70 | .[, para := rep(c("r","p"), .N/2)] %>% 71 | data.table::dcast(gene ~ para, value.var = "V1") %>% 72 | .[,"padj_fdr" := list(p.adjust(p, method="fdr"))] %>% 73 | # .[, sig := padj_fdr<=0.10] %>% 74 | setorder(padj_fdr, na.last = T) 75 | 76 | cor.dt[,c("p","r","padj_fdr"):=list(format(p,digits=3),round(r,3), format(padj_fdr,digits=3))] 77 | 78 | # Save 79 | fwrite(cor.dt, args$outfile, sep="\t", quote=F) 80 | -------------------------------------------------------------------------------- /rna_atac/rna_vs_acc/pseudobulk/gene_markers_rna_vs_acc/plot_number_markers.R: -------------------------------------------------------------------------------- 1 | ##################### 2 | ## Define settings ## 3 | ##################### 4 | 5 | source(here::here("settings.R")) 6 | source(here::here("utils.R")) 7 | 8 | # I/O 9 | io$basedir <- file.path(io$basedir,"test") 10 | io$marker_genes_rna <- file.path(io$basedir,"results/rna/differential/pseudobulk/celltype/parsed/marker_genes_filtered.txt.gz") 11 | io$marker_peaks_atac <- file.path(io$basedir,"results/atac/archR/differential/pseudobulk/celltype/PeakMatrix/parsed/markers_filt.txt.gz") 12 | io$outdir <- file.path(io$basedir,"results/rna_atac/rna_vs_acc/pseudobulk/gene_markers_rna_vs_acc") 13 | 14 | # Options 15 | opts$celltypes <- c( 16 | "Epiblast", 17 | "Primitive_Streak", 18 | "Caudal_epiblast", 19 | "PGC", 20 | # "Anterior_Primitive_Streak", 21 | "Notochord", 22 | "Def._endoderm", 23 | "Gut", 24 | "Nascent_mesoderm", 25 | # "Mixed_mesoderm", 26 
| "Intermediate_mesoderm", 27 | "Caudal_Mesoderm", 28 | "Paraxial_mesoderm", 29 | "Somitic_mesoderm", 30 | "Pharyngeal_mesoderm", 31 | "Cardiomyocytes", 32 | "Allantois", 33 | "ExE_mesoderm", 34 | # "Mesenchyme", 35 | "Haematoendothelial_progenitors", 36 | "Endothelium", 37 | "Blood_progenitors_1", 38 | "Blood_progenitors_2", 39 | "Erythroid1", 40 | "Erythroid2", 41 | "Erythroid3", 42 | "NMP", 43 | "Rostral_neurectoderm", 44 | # "Caudal_neurectoderm", 45 | "Neural_crest", 46 | "Forebrain_Midbrain_Hindbrain", 47 | "Spinal_cord", 48 | "Surface_ectoderm" 49 | # "Visceral_endoderm", 50 | # "ExE_endoderm", 51 | # "ExE_ectoderm", 52 | # "Parietal_endoderm" 53 | ) 54 | 55 | 56 | ############### 57 | ## Load data ## 58 | ############### 59 | 60 | markers_genes_rna.dt <- fread(io$marker_genes_rna) %>% .[celltype%in%opts$celltypes] 61 | marker_peaks_atac.dt <- fread(io$marker_peaks_atac) %>% .[celltype%in%opts$celltypes] 62 | 63 | ########## 64 | ## Plot ## 65 | ########## 66 | 67 | # Plot number of marker genes per cell types 68 | to.plot <- rbind( 69 | markers_genes_rna.dt %>% .[,.N,by=c("celltype")] %>% .[,class:="Genes"], 70 | marker_peaks_atac.dt %>% .[,.N,by=c("celltype")] %>% .[,class:="ATAC peaks"] 71 | ) %>% .[,class:=factor(class, levels=c("Genes","ATAC peaks"))] 72 | 73 | # Rename celltypes 74 | opts$rename.celltypes <- c( 75 | "Forebrain_Midbrain_Hindbrain" = "Brain", 76 | "Haematoendothelial_progenitors" = "Haematoend. progenitors" 77 | ) 78 | to.plot %>% .[,celltype:=stringr::str_replace_all(celltype,opts$rename.celltypes)] %>% .[,celltype:=gsub("_"," ",celltype)] 79 | opts$celltype.colors["Haematoend. 
progenitors"] <- opts$celltype.colors["Haematoendothelial_progenitors"] 80 | names(opts$celltype.colors) <- gsub("_"," ",names(opts$celltype.colors)) 81 | 82 | # Plot 83 | p <- ggbarplot(to.plot, x="celltype", y="N", fill="celltype") + 84 | facet_wrap(~class, ncol=2, scales="free_y") + 85 | scale_fill_manual(values=opts$celltype.colors) + 86 | labs(x="", y="Number of markers") + 87 | theme( 88 | strip.background = element_rect(colour="black", fill=NA), 89 | axis.text.y = element_text(size=rel(0.65)), 90 | axis.text.x = element_text(colour="black",size=rel(0.6), angle=90, hjust=1, vjust=0.5), 91 | axis.title = element_text(colour="black",size=rel(0.75)), 92 | axis.ticks.x = element_blank(), 93 | legend.position = "none" 94 | ) 95 | 96 | pdf(file.path(io$outdir,"barplot_number_markers.pdf"), width = 7, height = 4) 97 | print(p) 98 | dev.off() 99 | 100 | -------------------------------------------------------------------------------- /rna_atac/rna_vs_chromvar_chip/pseudobulk/per_gene/fig/plot_rna_vs_chromvar_per_gene_pseudobulk_fig.R: -------------------------------------------------------------------------------- 1 | source(here::here("settings.R")) 2 | source(here::here("utils.R")) 3 | 4 | ##################### 5 | ## Define settings ## 6 | ##################### 7 | 8 | # Options 9 | opts$motif_annotation <- "CISBP" 10 | 11 | # I/O 12 | io$rna_sce_pseudobulk_file <- file.path(io$basedir,"results/rna/pseudobulk/celltype/SingleCellExperiment_TFs_pseudobulk.rds") 13 | io$atac_chromvar_chip_pseudobulk_file <- file.path(io$basedir,sprintf("results/atac/archR/chromvar_chip/pseudobulk/chromVAR_chip_%s_archr.rds",opts$motif_annotation)) 14 | io$outdir <- file.path(io$basedir,"results/rna_atac/rna_vs_chromvar_chip/pseudobulk/per_gene/fig"); dir.create(io$outdir, showWarnings=F, recursive = T) 15 | 16 | ####################################### 17 | ## Load pseudobulk RNA and ATAC data ## 18 | ####################################### 19 | 20 | # Load pseudobulk RNA expression 21 
rna_pseudobulk_tf.se <- readRDS(io$rna_sce_pseudobulk_file)

# Load chromVAR matrix
atac_chromvar_pseudobulk.se <- readRDS(io$atac_chromvar_chip_pseudobulk_file)

# Select TFs (keep only the TFs present as rows in both objects)
TFs <- intersect(rownames(rna_pseudobulk_tf.se),rownames(atac_chromvar_pseudobulk.se))
rna_pseudobulk_tf.se <- rna_pseudobulk_tf.se[TFs,]
atac_chromvar_pseudobulk.se <- atac_chromvar_pseudobulk.se[TFs,]

########################
## Prepare data table ##
########################

# Long format: one row per (celltype, gene) with the chromVAR value
atac_chromvar_pseudobulk.dt <- assay(atac_chromvar_pseudobulk.se) %>% t %>%
  as.data.table(keep.rownames = T) %>%
  setnames("rn","celltype") %>%
  melt(id.vars=c("celltype"), variable.name="gene", value.name="chromvar_zscore")

# Long format: one row per (gene, celltype) with the RNA logcounts
rna_tf_pseudobulk.dt <- logcounts(rna_pseudobulk_tf.se) %>%
  as.data.table(keep.rownames = T) %>%
  setnames("rn","gene") %>%
  data.table::melt(id.vars="gene", variable.name="celltype", value.name="expr")

###########
## Merge ##
###########

rna_chromvar.dt <- merge(
  rna_tf_pseudobulk.dt,
  atac_chromvar_pseudobulk.dt,
  by = c("celltype","gene")
)

######################################
## Scatter plot of individual genes ##
######################################

# The full gene list is immediately overridden by a hand-picked set of TFs
genes.to.plot <- unique(rna_chromvar.dt$gene)
genes.to.plot <- c("FOXA2","FOXB1","FOXC2")

# i <- "FOXA2"
for (i in genes.to.plot) {

  to.plot <- rna_chromvar.dt[gene==i]

  # Celltypes to label: the 7 rows with highest expression plus the 7 rows with
  # highest chromVAR value (duplicates removed)
  to.plot.text <- rbind(
    to.plot %>% setorder(-expr) %>% head(n=7),
    to.plot %>% setorder(-chromvar_zscore) %>% head(n=7)
  ) %>% unique

  p <- ggscatter(to.plot, x="expr", y="chromvar_zscore", fill="celltype", size=5.5, shape=21,
    add="reg.line", add.params = list(color="black", fill="lightgray"), conf.int=TRUE) +
    stat_cor(method = "pearson", label.x.npc = "middle", label.y.npc = "bottom") +
    ggrepel::geom_text_repel(data=to.plot.text, aes(label=gsub("_"," ",celltype)), size=3.5) +
    scale_fill_manual(values=opts$celltype.colors) +
    # labs(x=sprintf("%s expression",i), y=sprintf("Accessibility of %s targets (z-score)",i)) +
    labs(x="RNA expression", y="chromVAR-ChIP") +
    guides(fill="none") +
    theme(
      axis.text = element_text(size=rel(0.85))
    )

  # One PDF per TF, named after the TF and the motif annotation
  pdf(file.path(io$outdir,sprintf("%s_%s_rna_vs_chromvar_chip_pseudobulk.pdf",i,opts$motif_annotation)), width = 5.5, height = 4)
  print(p)
  dev.off()
}


--------------------------------------------------------------------------------
/rna_atac/snakemake/run_cluster.sh:
--------------------------------------------------------------------------------
# Run the Snakemake workflow, submitting each job through sbatch with the
# rule's thread count and memory request.
# NOTE(review): both --cores and -j are given; depending on the Snakemake version
# these may be aliases of the same option. Confirm which value takes effect.
snakemake --cores 15 -j 99 --latency-wait 90 -p --cluster "sbatch -n {threads} --mem {resources.mem_mb}M"
--------------------------------------------------------------------------------
/rna_atac/virtual_chipseq_library/metacells/analysis/virtual_chipseq_metacells_exploration.R:
--------------------------------------------------------------------------------

source(here::here("settings.R"))
source(here::here("utils.R"))

#####################
## Define settings ##
#####################

# Options
opts$motif_annotation <- "CISBP"
opts$trajectory <- "nmp"

# I/O (all inputs are read from the hard-coded "nmp" trajectory directories)
io$rna_metacells.sce <- file.path(io$basedir, 'results/rna/metacells/trajectories/nmp/SingleCellExperiment_metacells.rds')
io$metacell_metadata <- file.path(io$basedir, 'results/atac/archR/metacells/trajectories/nmp/PeakMatrix/metacells_metadata.txt.gz')
io$archR.peakMatrix.metacells <- file.path(io$basedir,"results/atac/archR/metacells/trajectories/nmp/PeakMatrix/PeakMatrix_summarized_experiment_metacells.rds")
io$virtual_chip.mtx <- file.path(io$basedir,sprintf("results/rna_atac/virtual_chipseq/metacells/trajectories/nmp/%s/virtual_chip_matrix.rds",opts$motif_annotation))
io$trajectory <- "nmp"
io$trajectory_file <- file.path(io$basedir,"results/rna/metacells/trajectories/nmp/metacell_trajectory.txt.gz")
# NOTE(review): the placeholder is filled with opts$trajectory, giving ".../nmp/nmp/pdf",
# whereas the virtual ChIP matrix path uses opts$motif_annotation in the same
# position. Confirm which one is intended.
# recursive=T so the nested output directory is created even when its parents are missing.
io$outdir <- file.path(io$basedir,sprintf("results/rna_atac/virtual_chipseq/metacells/trajectories/nmp/%s/pdf",opts$trajectory)); dir.create(io$outdir, showWarnings = F, recursive = T)


if (io$trajectory=="nmp") {
  celltypes.to.plot <- c("Caudal_Mesoderm", "Somitic_mesoderm", "NMP", "Spinal_cord")
}

###################
## Load metadata ##
###################

metadata.dt <- fread(io$metacell_metadata) %>% .[celltype%in%celltypes.to.plot]

#####################
## Load trajectory ##
#####################

# The trajectory file is read as three columns: metacell id and two layout coordinates
trajectory.dt <- fread(io$trajectory_file) %>% setnames(c("metacell","V1","V2"))

##################################
## Load virtual ChIP-seq matrix ##
##################################

virtual_chip.mtx <- readRDS(io$virtual_chip.mtx)

##################################
## Load chromatin accessibility ##
##################################

atac_peakMatrix_metacells.se <- readRDS(io$archR.peakMatrix.metacells)

metacells <- intersect(trajectory.dt$metacell,colnames(atac_peakMatrix_metacells.se))

# Normalise ATAC data: per-column scaling to 1e6 followed by log(x+1)
assayNames(atac_peakMatrix_metacells.se) <- "counts"
assay(atac_peakMatrix_metacells.se,"logcounts") <- log(1e6*(sweep(assay(atac_peakMatrix_metacells.se),2,colSums(assay(atac_peakMatrix_metacells.se),na.rm=T),"/"))+1)

##########
## Plot ##
##########

# TF whose predicted binding sites are plotted. It is assigned here because the
# output file name below is built from `i`, which was not assigned anywhere in
# this script.
i <- "T"

# Peaks whose virtual ChIP-seq score for the TF is at least 0.40
brachyury_binding_sites <- virtual_chip.mtx[,i][virtual_chip.mtx[,i]>=0.40] %>% sort

# drop=FALSE keeps a matrix even if a single peak passes the threshold, so colMeans() still works
tmp <- assay(atac_peakMatrix_metacells.se,"logcounts")[names(brachyury_binding_sites),,drop=FALSE]

# Mean accessibility of the selected peaks per metacell, joined with the layout coordinates
to.plot <- data.table(
  acc = colMeans(tmp),
  metacell = colnames(tmp)
) %>% merge(trajectory.dt,by="metacell")

p <- ggplot(to.plot, aes(x=V1, y=V2)) +
  geom_point(aes(fill=acc), size=2.5, shape=21, stroke=0.25) +
  # facet_wrap(~gene) +
  scale_fill_gradient(low = "gray95", high = "darkgreen") +
  labs(x="Force-directed layout (Dim 1)", y="Force-directed layout (Dim 2)") +
  theme_classic() +
  ggplot_theme_NoAxes() +
  theme(
    legend.position = "right"
  )

# NOTE(review): the file name says "expr" although the fill is mean peak accessibility.
# It is kept unchanged so existing output paths stay the same.
pdf(file.path(io$outdir,sprintf("network_coloured_by_%s_expr.pdf",i)), width = 5, height = 5.5)
print(p)
dev.off()



--------------------------------------------------------------------------------
/rna_atac/virtual_chipseq_library/pseudobulk/analysis/virtual_chipseq_exploration.R:
--------------------------------------------------------------------------------
# here::i_am("rna_atac/virtual_chipseq_library/virtual_chipseq_plot_stats.R")

# Load default settings
source(here::here("settings.R"))
source(here::here("utils.R"))

#####################
## Define settings ##
#####################

# Options
opts$motif_annotation <- "CISBP"

# The longer TF list is immediately overridden: only "T" is analysed
opts$TFs <- c("TAL1", "GATA1", "RUNX1", "FOXA2", "GATA4", "CDX2","NKX2-5","TBX5", "SOX10")
opts$TFs <- c("T")

# I/O
io$virtual_chip.dir <- file.path(io$basedir,sprintf("results/rna_atac/virtual_chipseq/pseudobulk/%s",opts$motif_annotation))
io$virtual_chip.mtx <- file.path(io$virtual_chip.dir,"virtual_chip_matrix.rds")
io$outdir <- file.path(io$basedir,sprintf("results/rna_atac/virtual_chipseq/pseudobulk/%s/test",opts$motif_annotation)); dir.create(io$outdir, showWarnings = F)

###################################
## Load virtual ChIP-seq library ##
###################################

# Load detailed data.tables (one <TF>.txt.gz file per TF, tagged with a `tf` column and row-bound)
virtual_chip.dt <- opts$TFs %>% map(function(i) {
  fread(sprintf("%s/%s.txt.gz",io$virtual_chip.dir,i)) %>%
    # .[,c("chr","start","end"):=NULL] %>%
    .[,tf:=i] %>%
    return
}) %>% rbindlist

# Load matrix
virtual_chip.mtx <- readRDS(io$virtual_chip.mtx) 36 | 37 | 38 | ####################### 39 | ## Explore Brachyury ## 40 | ####################### 41 | 42 | to.plot <- virtual_chip.dt[abs(score)>=0.20 & motif_score>=0.30] %>% 43 | .[,sign:=as.factor(c("Repressor","Activator")[(correlation_score>0)+1])] 44 | 45 | ggbarplot(to.plot[,.N,by=c("sign")], x="sign", y="N", fill="gray70") + 46 | labs(x="", y="Number of in silico T binding events") 47 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import anndata 2 | import scanpy as sc 3 | import scipy as s 4 | from scipy.sparse import csr_matrix, issparse 5 | 6 | def load_adata(adata_file, metadata_file = None, normalise = False, cells = None, cell_column = "cell", features = None, filter_lowly_expressed_genes = False, set_colors = False, keep_counts=False): 7 | 8 | adata = sc.read(adata_file) 9 | 10 | # Convert to sparse matrices 11 | if not s.sparse.issparse(adata.X): 12 | adata.X = csr_matrix(adata.X) 13 | if len(adata.layers.keys())>0: 14 | for i in list(adata.layers.keys()): 15 | if not issparse(adata.layers[i]): 16 | adata.layers[i] = csr_matrix(adata.layers[i]) 17 | 18 | if cells is not None: 19 | tmp = np.mean(np.isin(cells,adata.obs.index.values)==False) 20 | if tmp<1: print("%.2f%% of cells provided are not observed in the adata, taking the intersect..." 
% (100*tmp)) 21 | cells = np.intersect1d(cells,adata.obs.index.values) 22 | adata = adata[cells,:] 23 | 24 | if features is not None: 25 | adata = adata[:,features] 26 | 27 | if metadata_file is not None: 28 | metadata = pd.read_table(metadata_file, delimiter="\t", header=0).set_index(cell_column, drop=False) 29 | metadata = metadata.loc[cells] 30 | assert np.all(adata.obs.index.isin(metadata[cell_column])) 31 | # assert np.all(metadata.cell.isin(adata.obs.index)) 32 | assert metadata.shape[0] == adata.shape[0] 33 | adata.obs = metadata#.reindex(adata.obs.index) 34 | 35 | if filter_lowly_expressed_genes: 36 | sc.pp.filter_genes(adata, min_counts=10) 37 | 38 | if keep_counts: 39 | adata.layers["raw"] = adata.X.copy() 40 | 41 | if normalise: 42 | sc.pp.normalize_total(adata, target_sum=None, exclude_highly_expressed=False) 43 | sc.pp.log1p(adata) 44 | 45 | if set_colors: 46 | colPalette_celltypes = [opts["celltype_colors"][i.replace(" ","_").replace("/","_")] for i in sorted(np.unique(adata.obs['celltype']))] 47 | adata.uns['celltype_colors'] = colPalette_celltypes 48 | colPalette_stages = [opts["stage_colors"][i.replace(" ","_").replace("/","_")] for i in sorted(np.unique(adata.obs['stage']))] 49 | adata.uns['stage_colors'] = colPalette_stages 50 | 51 | return adata 52 | 53 | def scale(X, x_min, x_max): 54 | nom = (X - X.min(axis=0)) * (x_max - x_min) 55 | denom = X.max(axis=0) - X.min(axis=0) 56 | denom[denom == 0] = 1 57 | return x_min + nom / denom 58 | 59 | 60 | # cmap = custom_div_cmap(11, mincol='g', midcol='0.9' ,maxcol='CornflowerBlue') 61 | def custom_div_cmap(numcolors=11, name='custom_div_cmap', 62 | mincol='blue', midcol='white', maxcol='red'): 63 | """ 64 | Default is blue to white to red with 11 colors. 
65 | Colors can be specified in any way understandable by matplotlib.colors.ColorConverter.to_rgb() 66 | """ 67 | 68 | from matplotlib.colors import LinearSegmentedColormap 69 | cmap = LinearSegmentedColormap.from_list(name=name, colors =[mincol, midcol, maxcol], N=numcolors) 70 | return cmap --------------------------------------------------------------------------------