├── .gitignore
├── README.md
├── atac
    ├── archR
    │   ├── add_motif_annotation
    │   │   ├── add_motif_annotation_manual.R
    │   │   ├── archR_add_background_peaks.R
    │   │   ├── archR_add_motif_annotation.R
    │   │   └── plot_motif_seqlogo.R
    │   ├── bigwig
    │   │   ├── README.txt
    │   │   └── archR_export_bw.R
    │   ├── celltype_assignment
    │   │   └── archR_celltype_assignment.R
    │   ├── chromvar
    │   │   ├── cells
    │   │   │   └── run_chromvar_from_archR.R
    │   │   ├── pseudobulk
    │   │   │   └── run_chromvar_pseudobulk.R
    │   │   └── utils.R
    │   ├── chromvar_chip
    │   │   ├── cells
    │   │   │   └── run_chromvar_chip.R
    │   │   ├── metacells
    │   │   │   └── differential
    │   │   │   │   ├── celltype
    │   │   │   │       ├── analysis
    │   │   │   │       │   ├── before_snakemake
    │   │   │   │       │   │   └── define_markers.R
    │   │   │   │       │   ├── compare_differential_chromvar_pseudobulk_metacells.R
    │   │   │   │       │   ├── define_markers.R
    │   │   │   │       │   ├── old
    │   │   │   │       │   │   └── load_data.R
    │   │   │   │       │   └── plot_marker_peaks_archR.R
    │   │   │   │       └── run_diff_chromvar_chip_celltype_metacells.R
    │   │   │   │   ├── differential_chromvar_metacells.R
    │   │   │   │   └── old
    │   │   │   │       ├── plot_differential_chromvar_chip_pseudobulk.R
    │   │   │   │       └── run_differential_chromvar_chip_pseudobulk.R
    │   │   ├── pseudobulk
    │   │   │   ├── compare_chromvar_chip_cells_vs_pseudobulk.R
    │   │   │   └── differential
    │   │   │   │   └── celltype
    │   │   │   │       ├── analysis
    │   │   │   │           ├── define_markers.R
    │   │   │   │           └── old
    │   │   │   │           │   └── plot_differential_chromvar_chip_pseudobulk.R
    │   │   │   │       ├── differential_chromvar_pseudobulk.R
    │   │   │   │       └── parse_differential_results.R
    │   │   └── run_chromvar_chip.R
    │   ├── differential
    │   │   ├── cells
    │   │   │   ├── archr_differential_accessibility_cells.R
    │   │   │   ├── celltype
    │   │   │   │   ├── analysis
    │   │   │   │   │   ├── define_markers.R
    │   │   │   │   │   ├── load_data.R
    │   │   │   │   │   ├── old
    │   │   │   │   │   │   ├── GeneScoreMatrix
    │   │   │   │   │   │   │   ├── define_marker_genes.R
    │   │   │   │   │   │   │   └── plot_marker_genes_archR.R
    │   │   │   │   │   │   ├── PeakMatrix
    │   │   │   │   │   │   │   ├── define_marker_peaks.R
    │   │   │   │   │   │   │   └── plot_marker_peaks_archR.R
    │   │   │   │   │   │   ├── browser_plot_archR.R
    │   │   │   │   │   │   ├── define_markers_archR.R
    │   │   │   │   │   │   └── plot_marker_peaks_archR.R
    │   │   │   │   │   └── volcano_plots_diff_atac.R
    │   │   │   │   ├── old
    │   │   │   │   │   └── run_diff_acc_celltype.R
    │   │   │   │   └── parse_differential_results.R
    │   │   │   └── genotype
    │   │   │   │   ├── analysis
    │   │   │   │       ├── analysis.R
    │   │   │   │       ├── atac_boxplots_wt_vs_ko_hits.R
    │   │   │   │       └── load_data.R
    │   │   │   │   └── run_diff_acc_genotype.R
    │   │   ├── compare_differential_atac_pseudobulk_metacells_cells.R
    │   │   ├── metacells
    │   │   │   ├── celltype
    │   │   │   │   ├── analysis
    │   │   │   │   │   ├── before_snakemake
    │   │   │   │   │   │   └── define_markers.R
    │   │   │   │   │   ├── old
    │   │   │   │   │   │   ├── define_markers.R
    │   │   │   │   │   │   └── load_data.R
    │   │   │   │   │   └── plot_marker_peaks_archR.R
    │   │   │   │   ├── parse_differential_results.R
    │   │   │   │   └── run_diff_acc_celltype_metacells.R
    │   │   │   ├── differential_accessibility_metacells.R
    │   │   │   ├── genotype
    │   │   │   │   ├── analysis
    │   │   │   │   │   ├── atac_boxplots_wt_vs_ko_hits.R
    │   │   │   │   │   ├── compare_differential_atac_pseudobulk_metacells_cells.R
    │   │   │   │   │   ├── diff_atac_genotype_metacells_analysis.R
    │   │   │   │   │   └── load_data.R
    │   │   │   │   ├── old
    │   │   │   │   │   └── analysis
    │   │   │   │   │   │   ├── analysis.R
    │   │   │   │   │   │   ├── atac_boxplots_wt_vs_ko_hits.R
    │   │   │   │   │   │   └── load_data.R
    │   │   │   │   └── run_diff_acc_genotype_metacells.R
    │   │   │   └── utils.R
    │   │   ├── pseudobulk
    │   │   │   ├── celltype
    │   │   │   │   ├── analysis
    │   │   │   │   │   ├── define_markers.R
    │   │   │   │   │   └── plot_marker_peaks_stats.R
    │   │   │   │   ├── differential_accessibility_pseudobulk.R
    │   │   │   │   └── parse_differential_results.R
    │   │   │   └── celltype_genotype
    │   │   │   │   ├── analysis
    │   │   │   │       ├── diff_acc_genotype_pseudobulk_analysis.R
    │   │   │   │       └── load_data.R
    │   │   │   │   ├── differential_accessibility_celltype_genotype_pseudobulk.R
    │   │   │   │   ├── explore_diff_acc_genotype.R
    │   │   │   │   ├── old
    │   │   │   │       └── old_stuff.R
    │   │   │   │   └── parse_differential_results.R
    │   │   └── utils.R
    │   ├── dimensionality_reduction
    │   │   ├── cells
    │   │   │   ├── archR_dimensionality_reduction.R
    │   │   │   └── atac_dimensionality_reduction_cells.R
    │   │   └── metacells
    │   │   │   └── atac_dimensionality_reduction_metacells.R
    │   ├── feature_stats
    │   │   ├── archR_calculate_feature_stats.R
    │   │   └── plot_feature_stats_atac.R
    │   ├── gene_scores
    │   │   ├── add_GeneScore_matrices.R
    │   │   ├── compare_gene_scores.R
    │   │   └── plot_genes_BrowserTrack_archR.R
    │   ├── load_archR_project.R
    │   ├── load_motif_annotation.R
    │   ├── metacells
    │   │   ├── aggregate_atac_metacell.R
    │   │   └── compare_metacell_vs_singlecell_vs_pseudobulk_atac.R
    │   ├── peak_calling
    │   │   ├── README.txt
    │   │   ├── analysis
    │   │   │   ├── calculate_cpg_density_atac_peaks.R
    │   │   │   ├── calculate_peak_stats_archR.R
    │   │   │   ├── link_TFs2genes_motifmatchr.R
    │   │   │   ├── link_peaks2genes_genomic_distance.R
    │   │   │   ├── motifmatcher_analysis.R
    │   │   │   └── plot_peak_calling_stats_archR.R
    │   │   ├── filter_peaks_archR.R
    │   │   └── peak_calling_archR.R
    │   ├── plot_individual_peaks
    │   │   ├── compare_celltypes
    │   │   │   └── plot_individual_peaks_compare_celltypes.R
    │   │   └── compare_genotypes
    │   │   │   ├── plot_individual_peaks_compare_genotypes_cells_metacells_pseudobulk.R
    │   │   │   └── pseudobulk_with_replicates
    │   │   │       └── plot_individual_peaks_genotypes_pseudobulk_with_replicates.R
    │   ├── processing
    │   │   ├── 0_create_arrow_files.R
    │   │   ├── 1_create_archR_project.R
    │   │   ├── 2_create_archR_metadata.R
    │   │   ├── 3_qc.R
    │   │   ├── save_atac_anndata.R
    │   │   ├── save_atac_matrices.R
    │   │   └── update_archR_metadata.R
    │   ├── pseudobulk
    │   │   ├── 1_archR_add_GroupCoverage.R
    │   │   ├── 2_archR_pseudobulk.R
    │   │   └── archR_pseudobulk_with_replicates.R
    │   └── snakemake
    │   │   ├── README.txt
    │   │   ├── Snakefile
    │   │   ├── config_ricard_babraham.yaml
    │   │   ├── run_cluster.sh
    │   │   └── run_cluster_single.sh
    ├── igv_settings
    │   ├── igv_session_babraham_celltypes.xml
    │   └── igv_session_babraham_nmp_metacells.xml
    └── motifs
    │   ├── calculate_motif_similarity.R
    │   └── utils.R
├── gastrulation_multiome_10x.Rproj
├── images
    ├── igv_screenshot_github.png
    └── overview_github.png
├── load_paga_graph.R
├── rna
    ├── PijuanSala2019_comparison
    │   └── PijuanSala2019_comparison_pseudobulk.R
    ├── TF2gene_coexpression
    │   ├── coexpression_TF_vs_gene_metacells.R
    │   ├── coexpression_TF_vs_gene_pseudobulk.R
    │   ├── coexpression_TF_vs_gene_single_cells.R
    │   ├── compare_coexpression_TF_vs_gene_cell_vs_metacell_vs_pseudobulk.R
    │   └── utils.R
    ├── celltype_proportions
    │   ├── compare_celltype_proportions.R
    │   ├── compare_celltype_proportions_paga.R
    │   └── plot_celltype_proportions.R
    ├── conversions
    │   ├── convert_SingleCellExperiment_to_anndata.R
    │   └── convert_anndata_to_SingleCellExperiment.R
    ├── differential
    │   ├── cells
    │   │   ├── differential.R
    │   │   └── parse_differential_results.R
    │   ├── metacells
    │   │   ├── analysis
    │   │   │   └── define_marker_genes.R
    │   │   ├── differential.R
    │   │   └── parse_differential_results.R
    │   ├── other
    │   │   └── extract_TFs_diff.R
    │   ├── pseudobulk
    │   │   ├── celltype
    │   │   │   ├── analysis
    │   │   │   │   ├── define_marker_TFs.R
    │   │   │   │   ├── define_marker_genes.R
    │   │   │   │   ├── old
    │   │   │   │   │   ├── extract_TFs_diff.R
    │   │   │   │   │   └── volcano_plots_diff_rna.R
    │   │   │   │   └── plot_marker_genes_stats.R
    │   │   │   ├── differential_celltype_pseudobulk.R
    │   │   │   ├── old
    │   │   │   │   └── run_diff_expr_celltype_pseudobulk.R
    │   │   │   └── parse_differential_results.R
    │   │   └── celltype_genotype
    │   │   │   ├── analysis
    │   │   │       └── explore_diff_rna_genotype.R
    │   │   │   ├── differential_celltype_genotype_pseudobulk.R
    │   │   │   └── parse_differential_results.R
    │   └── utils.R
    ├── dimensionality_reduction
    │   ├── dimensionality_reduction_sce.R
    │   ├── dimensionality_reduction_seurat.R
    │   └── metacells
    │   │   └── dimensionality_reduction_sce_metacells.R
    ├── iSEE
    │   └── iSEE.R
    ├── mapping
    │   ├── analysis
    │   │   ├── plot_mapping_umap.R
    │   │   ├── plot_mapping_wt_vs_ko.R
    │   │   └── plot_utils.R
    │   ├── run
    │   │   ├── mapping_functions.R
    │   │   ├── mnn
    │   │   │   ├── mapping_functions.R
    │   │   │   └── mapping_mnn.R
    │   │   └── parse_sample_metadata_after_mapping.R
    │   └── trajectories
    │   │   ├── mapping_functions.R
    │   │   ├── mapping_mnn_trajectory.R
    │   │   ├── parse_sample_metadata_after_mapping.R
    │   │   └── plot_mapping_trajectory_wt_vs_ko.R
    ├── metacells
    │   ├── SEACell_env.yml
    │   ├── analysis
    │   │   ├── compare_expr_umap_metacell_vs_singlecell.R
    │   │   ├── compare_metacell_vs_singlecell_vs_pseudobulk_expr.R
    │   │   ├── overlay_metacells_atlas_umap.R
    │   │   └── trajectories
    │   │   │   ├── overlay_metacells_atlas_trajectory.R
    │   │   │   └── plot_trajectory_metacells.R
    │   └── run
    │   │   ├── aggregate_rna_metacell.R
    │   │   ├── aggregate_rna_metacell_velocyto.R
    │   │   ├── run_metacell.py
    │   │   └── run_metacell_trajectory.py
    ├── plot_individual_genes
    │   ├── celltypes
    │   │   └── plot_individual_genes_cells_metacells_pseudobulk.R
    │   ├── genotype
    │   │   └── plot_individual_genes_by_genotype.R
    │   └── pseudobulk
    │   │   ├── barplot_individual_genes_pseudobulk.R
    │   │   └── plot_paga_individual_genes_pseudobulk.R
    ├── processing
    │   ├── 1_create_seurat_rna.R
    │   ├── 2_QC.R
    │   ├── 3_seurat_to_SCE.R
    │   ├── 4_doublet_detection.R
    │   ├── 5_parse_sample_metadata_after_doublets.R
    │   ├── 6_plot_stats.R
    │   └── extract_TFs_from_SingleCellExperiment.R
    ├── pseudobulk
    │   ├── old
    │   │   ├── create_pseudobulk_metadata_with_replicates.R
    │   │   ├── old_code.R
    │   │   ├── pseudobulk_rna.R
    │   │   └── pseudobulk_rna_intronic_exonic.R
    │   ├── pseudobulk_rna.R
    │   └── pseudobulk_rna_with_replicates.R
    ├── scanpy
    │   ├── create_anndata_from_SingleCellExperiment.R
    │   ├── create_anndata_scvelo.py
    │   ├── dimensionality_reduction
    │   │   └── dimensionality_reduction.py
    │   ├── scvelo
    │   │   ├── nmp_trajectory
    │   │   │   └── scvelo_analysis_cells.ipynb
    │   │   └── run_scvelo.py
    │   ├── template.ipynb
    │   └── velocyto
    │   │   ├── create_anndata_from_loom_files.py
    │   │   ├── run_velocyto.sh
    │   │   └── velocyto_env.yml
    └── snakemake
    │   ├── README.txt
    │   ├── Snakefile
    │   ├── config_ricard_babraham.yaml
    │   ├── environment.yaml
    │   └── run_cluster.sh
├── rna_atac
    ├── gene_regulatory_networks
    │   └── metacells
    │   │   └── trajectories
    │   │       ├── build_GRN_metacells_trajectory.R
    │   │       ├── cell_oracle
    │   │           └── celloracle_train.ipynb
    │   │       └── plot_GRN_metacells_nmp_trajectory.R
    ├── load_rna_atac_pseudobulk.R
    ├── mofa
    │   ├── not_used
    │   │   └── run.py
    │   ├── plot_mofa_results.R
    │   ├── prepare_mofa.R
    │   └── run_mofa_fast.R
    ├── rna_vs_acc
    │   ├── metacells
    │   │   ├── TFexpr_vs_peakAcc
    │   │   │   ├── compare_cor_TFexpr_vs_peak_acc_pseudobulk_vs_metacell.R
    │   │   │   ├── plot_TFexpr_vs_peakAcc_individual_examples.R
    │   │   │   ├── run_cor_TFexpr_vs_peakAcc_metacells.R
    │   │   │   └── trajectories
    │   │   │   │   └── compare_cor_TFexpr_vs_peak_acc_pseudobulk_vs_metacell_trajectories.R
    │   │   ├── gene_expr_vs_peak_acc
    │   │   │   ├── cor_gene_expr_vs_peak_acc_metacells.R
    │   │   │   └── plot_gene_expr_vs_peak_acc_metacells.R
    │   │   └── gene_expr_vs_promoter_acc
    │   │   │   ├── cor_gene_expr_vs_promoter_acc_metacells.R
    │   │   │   └── plot_gene_expr_vs_promoter_acc_metacells.R
    │   └── pseudobulk
    │   │   ├── TFexpr_vs_peakAcc
    │   │       ├── README.txt
    │   │       ├── analysis
    │   │       │   ├── old
    │   │       │   │   └── TF_cobinding_analysis.R
    │   │       │   ├── plot_TFexpr_vs_peakAcc_general_stats.R
    │   │       │   ├── plot_TFexpr_vs_peakAcc_individual_examples.R
    │   │       │   ├── plot_cor_TFexpr_vs_peakAcc_stats_per_TF.R
    │   │       │   └── plot_cor_TFexpr_vs_peakAcc_stats_per_peak.R
    │   │       └── run_cor_TFexpr_vs_peakAcc_pseudobulk.R
    │   │   ├── gene_expr_vs_peak_acc
    │   │       ├── analysis
    │   │       │   └── plot_gene_expr_vs_peak_acc_general_stats_pseudobulk.R
    │   │       ├── cor_gene_expr_vs_peak_acc_pseudobulk.R
    │   │       └── peak_markers_rna_vs_acc.R
    │   │   ├── gene_expr_vs_promoter_acc
    │   │       ├── cor_gene_expr_vs_promoter_acc_pseudobulk.R
    │   │       └── plot_gene_expr_vs_promoter_acc_pseudobulk.R
    │   │   └── gene_markers_rna_vs_acc
    │   │       ├── gene_markers_rna_vs_acc.R
    │   │       └── plot_number_markers.R
    ├── rna_vs_chromvar
    │   └── pseudobulk
    │   │   ├── per_celltype
    │   │       └── rna_vs_chromvar_pseudobulk_per_celltype.R
    │   │   └── per_gene
    │   │       ├── PAGA
    │   │           └── plot_rna_vs_chromvar_paga.R
    │   │       ├── cor_rna_vs_chromvar_per_gene_pseudobulk.R
    │   │       ├── pgc_neural_crest
    │   │           └── plot_rna_vs_chromvar_per_gene_pseudobulk_pgc_neural_crest.R
    │   │       └── plot_rna_vs_chromvar_per_gene_pseudobulk.R
    ├── rna_vs_chromvar_chip
    │   └── pseudobulk
    │   │   ├── per_celltype
    │   │       ├── rna_vs_chromvar_TF_markers_heatmap.R
    │   │       ├── rna_vs_chromvar_TF_pleiotropy.R
    │   │       ├── rna_vs_chromvar_pseudobulk_per_celltype.R
    │   │       └── rna_vs_chromvar_pseudobulk_pgc_neural_crest.R
    │   │   └── per_gene
    │   │       ├── PAGA
    │   │           ├── fig
    │   │           │   └── plot_rna_vs_chromvar_paga_fig.R
    │   │           └── plot_rna_vs_chromvar_paga.R
    │   │       ├── fig
    │   │           ├── plot_rna_vs_chromvar_per_gene_pseudobulk_fig.R
    │   │           └── rna_vs_chromvar_marker_score_per_gene_fig.R
    │   │       ├── rna_vs_chromvar_marker_score_per_gene.R
    │   │       └── rna_vs_chromvar_per_gene_pseudobulk.R
    ├── snakemake
    │   ├── Snakefile
    │   ├── config_ricard_babraham.yaml
    │   └── run_cluster.sh
    └── virtual_chipseq_library
    │   ├── link_TF2genes_virtual_chip.R
    │   ├── metacells
    │       ├── analysis
    │       │   └── virtual_chipseq_metacells_exploration.R
    │       ├── create_virtual_chipseq_library_metacells.R
    │       └── virtual_chipseq_compare_pseudobulk_vs_metacells.R
    │   └── pseudobulk
    │       ├── analysis
    │           ├── stats
    │           │   ├── virtual_chipseq_plot_individual_peaks.R
    │           │   └── virtual_chipseq_plot_stats.R
    │           ├── validation
    │           │   ├── virtual_chipseq_validation.R
    │           │   └── virtual_chipseq_validation_roc_curves.R
    │           └── virtual_chipseq_exploration.R
    │       └── create_virtual_chipseq_library_pseudobulk.R
├── settings.R
├── settings.py
├── utils.R
└── utils.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Resilio Sync
 2 | *.sync
 3 | 
 4 | # MAC
 5 | *Icon*
 6 | *.DS_Store
 7 | 
 8 | # R stuff
 9 | *.Rhistory
10 | *.rds
11 | 
12 | # Images
13 | *.pdf
14 | *.tiff
15 | *.ai
16 | *.svg
17 | 
18 | # Data files
19 | *.hdf5
20 | *.gz
21 | *.zip
22 | 
23 | # output files
24 | *.html
25 | *.log
26 | *.tsv
27 | 
28 | # Ignore local folders
29 | **/local-data
30 | *.Rproj.user
31 | 
32 | # Jupyter notebooks
33 | *.ipynb_checkpoints
34 | # *.ipynb
35 | .Rproj.user
36 | 
37 | *.snakemake*
38 | 
39 | 
40 | *slurm*out
41 | 


--------------------------------------------------------------------------------
/atac/archR/add_motif_annotation/archR_add_background_peaks.R:
--------------------------------------------------------------------------------
 1 | # Compute background peaks for the ArchR project by calling addBgdPeaks() with the command-line settings.
 2 | suppressPackageStartupMessages(library(ArchR))
 3 | suppressPackageStartupMessages(library(argparse))
 4 | 
 5 | here::i_am("atac/archR/add_motif_annotation/archR_add_background_peaks.R")
 6 | 
 7 | ######################
 8 | ## Define arguments ##
 9 | ######################
10 | 
11 | p <- ArgumentParser(description='')
12 | p$add_argument('--method',        type="character", default="chromVAR",                              help='ArchR or chromVAR')
13 | p$add_argument('--number_background_peaks',     type="integer",    default=50,    help='Number of background peaks')
14 | p$add_argument('--threads',     type="integer",    default=1,    help='Number of threads')
15 | 
16 | args <- p$parse_args(commandArgs(TRUE))
17 | 
18 | ## START TEST ##
19 | # Interactive overrides only. Keep them commented out: when active they silently
20 | # replace whatever was passed on the command line.
21 | # args$number_background_peaks <- 50
22 | # args$method <- "chromVAR"
23 | # args$threads <- 1
24 | ## END TEST ##
25 | 
26 | #####################
27 | ## Define settings ##
28 | #####################
29 | 
30 | source(here::here("settings.R"))
31 | 
32 | ########################
33 | ## Load ArchR Project ##
34 | ########################
35 | 
36 | # Presumably defines the `ArchRProject` object used below (it is not created in this file) - TODO confirm
37 | source(here::here("atac/archR/load_archR_project.R"))
38 | 
39 | addArchRThreads(threads = args$threads)
40 | 
41 | ##########################
42 | ## Add background peaks ##
43 | ##########################
44 | 
45 | # This function will compute background peaks controlling for total accessibility and GC-content
46 | # changes in the ArchR project: (1) it creates Background-Peaks.rds and (2) adds "bgdPeaks" entry to "metadata(getPeakSet(ArchRProject))"
47 | 
48 | # Background peaks are chosen by sampling peaks based on similarity in GC content and # of fragments across samples using the Mahalanobis distance. 
49 | # - The "w" parameter controls how similar background peaks should be. 
50 | # - The "binSize" parameter controls the precision with which the similarity is computed. Increasing "binSize" will make the function run slower.
51 | # Returns a matrix with one row per peak and one column per iteration. values in a row represent indices of background peaks for the peak with that index
52 | 
53 | ArchRProject <- addBgdPeaks(
54 |   ArchRProj = ArchRProject,
55 |   nIterations = args$number_background_peaks,   # one set of background peaks per iteration
56 |   w = 0.1,
57 |   binSize = 50,
58 |   method = args$method,
59 |   seed = 42,   # fixed seed so that the sampled background is reproducible
60 |   outFile = file.path(getOutputDirectory(ArchRProject), "Background-Peaks.rds"),   # default
61 |   force = TRUE
62 | )
63 | 
64 | # if (!file.exists(metadata(ArchRProject@peakSet)$bgdPeaks)) {
65 | 
66 | ##########
67 | ## TEST ##
68 | ##########
69 | 
70 | # Debugging scratchpad for stepping through the ArchR sources interactively.
71 | # Commented out: it sources a machine-specific directory and only assigns scratch variables,
72 | # so it has no place in a non-interactive run.
73 | 
74 | # R.utils::sourceDirectory("/bi/group/reik/ricard/scripts/git/archR/R", verbose=T, modifiedOnly=FALSE)
75 | 
76 | # ArchRProj = ArchRProject
77 | # nIterations = 50
78 | # w = 0.1
79 | # binSize = 50
80 | # seed = 1
81 | # method = "chromVAR"
82 | 
83 | 


--------------------------------------------------------------------------------
/atac/archR/add_motif_annotation/archR_add_motif_annotation.R:
--------------------------------------------------------------------------------
  1 | # https://www.archrproject.com/bookdown/calculating-gene-scores-in-archr.html
  2 | 
  3 | # Add motif annotations (cisbp and JASPAR2020) to the ArchR project and save the
  4 | # resulting peakAnnotation object as an .rds file inside the ArchR directory.
  5 | 
  6 | suppressPackageStartupMessages(library(ArchR))
  7 | suppressPackageStartupMessages(library(argparse))
  8 | 
  9 | # Declare this script's own location relative to the project root
 10 | here::i_am("atac/archR/add_motif_annotation/archR_add_motif_annotation.R")
 11 | 
 12 | ######################
 13 | ## Define arguments ##
 14 | ######################
 15 | 
 16 | p <- ArgumentParser(description='')
 17 | # NOTE(review): --metadata is not referenced directly in this file; it may be consumed by the
 18 | # sourced load_archR_project.R - confirm before removing
 19 | p$add_argument('--metadata',    type="character",    help='metadata file')
 20 | # p$add_argument('--outdir',    type="character",    help='Output directory')
 21 | p$add_argument('--threads',     type="integer",    default=1,    help='Number of threads')
 22 | 
 23 | args <- p$parse_args(commandArgs(TRUE))
 24 | 
 25 | ## START TEST ##
 26 | # args$metadata <- "/bi/group/reik/ricard/data/gastrulation_multiome_10x/results/atac/archR/qc/sample_metadata_after_qc.txt.gz"
 27 | # args$threads <- 1
 28 | ## END TEST ##
 29 | 
 30 | #####################
 31 | ## Define settings ##
 32 | #####################
 33 | 
 34 | source(here::here("settings.R"))
 35 | source(here::here("utils.R"))
 36 | 
 37 | ########################
 38 | ## Load ArchR Project ##
 39 | ########################
 40 | 
 41 | # Presumably defines the `ArchRProject` object used below (it is not created in this file) - TODO confirm
 42 | source(here::here("atac/archR/load_archR_project.R"))
 43 | 
 44 | addArchRThreads(threads = args$threads)
 45 | 
 46 | ##########################
 47 | ## Add motif annotation ##
 48 | ##########################
 49 | 
 50 | # cisbp (stringent threshold)
 51 | ArchRProject <- addMotifAnnotations(
 52 |   ArchRProject,
 53 |   motifSet = "cisbp",
 54 |   name = "Motif_cisbp",
 55 |   cutOff = 5e-05,
 56 |   width = 7,
 57 |   force = FALSE
 58 | )
 59 | 
 60 | # cisbp (lenient threshold)
 61 | # ArchRProject <- addMotifAnnotations(
 62 | #   ArchRProject,
 63 | #   motifSet = "cisbp",
 64 | #   name = "Motif_cisbp_lenient",
 65 | #   cutOff = 1e-04,
 66 | #   width = 7,
 67 | #   force = TRUE
 68 | # )
 69 | 
 70 | # homer
 71 | # ArchRProject <- addMotifAnnotations(
 72 | #   ArchRProject,
 73 | #   motifSet = "homer",
 74 | #   cutOff = opts$motif.pvalue.cutoff,
 75 | #   name = "Motif_homer",
 76 | #   force = TRUE
 77 | # )
 78 | 
 79 | 
 80 | # JASPAR2020 human (stringent)
 81 | ArchRProject <- addMotifAnnotations(
 82 |   ArchRProject, 
 83 |   motifSet = "JASPAR2020",      
 84 |   collection = "CORE",  
 85 |   species = "Homo sapiens",
 86 |   cutOff = 5e-05,   
 87 |   name = "Motif_JASPAR2020",
 88 |   force = FALSE
 89 | )
 90 | 
 91 | # JASPAR2020 human (lenient)
 92 | # ArchRProject <- addMotifAnnotations(
 93 | #   ArchRProject, 
 94 | #   motifSet = "JASPAR2020",      
 95 | #   collection = "CORE",  
 96 | #   species = "Homo sapiens",
 97 | #   cutOff = 1e-04,   
 98 | #   name = "Motif_JASPAR2020_lenient",
 99 | #   force = TRUE
100 | # )
101 | 
102 | 
103 | ################################
104 | ## Save peakAnnotation object ##
105 | ################################
106 | 
107 | # Only the peakAnnotation slot is written out here; the project itself is not re-saved by this script
108 | saveRDS(ArchRProject@peakAnnotation, sprintf("%s/Annotations/peakAnnotation.rds",io$archR.directory))
109 | 


--------------------------------------------------------------------------------
/atac/archR/add_motif_annotation/plot_motif_seqlogo.R:
--------------------------------------------------------------------------------
 1 | # Plot one sequence logo (PDF) per motif stored in a saved ArchR peakAnnotation object.
 2 | here::i_am("atac/archR/add_motif_annotation/plot_motif_seqlogo.R")
 3 | 
 4 | # NOTE(review): ArgumentParser and %>% are not loaded in this file; presumably settings.R attaches them - confirm
 5 | source(here::here("settings.R"))
 6 | 
 7 | suppressPackageStartupMessages(library(TFBSTools))
 8 | suppressPackageStartupMessages(library(ggseqlogo))
 9 | 
10 | ######################
11 | ## Define arguments ##
12 | ######################
13 | 
14 | p <- ArgumentParser(description='')
15 | p$add_argument('--motif_annotation',             type="character",            help='Motif annotation')
16 | p$add_argument('--peak_annotation_file',          type="character",                               help='Path to the peakAnnotation .rds file')
17 | p$add_argument('--outdir',          type="character",                               help='Output directory')
18 | p$add_argument('--test',    action="store_true",             help='Test mode')
19 | 
20 | args <- p$parse_args(commandArgs(TRUE))
21 | 
22 | # Fall back to the former test values only for arguments that were not supplied,
23 | # so that values given on the command line are no longer overwritten.
24 | if (is.null(args$motif_annotation)) args$motif_annotation <- "CISBP"
25 | if (is.null(args$peak_annotation_file)) args$peak_annotation_file <- file.path(io$archR.directory,"Annotations/peakAnnotation.rds")
26 | if (is.null(args$outdir)) args$outdir <- file.path(io$archR.directory,sprintf("Annotations/seqlogo/%s",args$motif_annotation))
27 | 
28 | # Only these two annotations have a matrix conversion in the plotting loop below; fail early otherwise
29 | stopifnot(args$motif_annotation %in% c("JASPAR","CISBP"))
30 | 
31 | # Parse arguments
32 | dir.create(args$outdir, showWarnings=F, recursive = T)
33 | 
34 | ###############
35 | ## Load data ##
36 | ###############
37 | 
38 | # raw position frequency matrix (PFM)
39 | pwm <- readRDS(args$peak_annotation_file)[[args$motif_annotation]][["motifs"]]
40 | # pwm <- readRDS(io$peak_annotation_file)[["Motif_JASPAR2020_human"]][["motifs"]]
41 | 
42 | ##########
43 | ## Plot ##
44 | ##########
45 | 
46 | motifs.to.plot <- names(pwm)
47 | if (args$test) {
48 |   # Test mode: restrict to the first three motifs
49 |   motifs.to.plot <- motifs.to.plot %>% head(n=3)
50 | }
51 | 
52 | # postProbs = (PFM + bg * pseudocounts) / (colSums(PFM) + sum(bg) * pseudocounts)
53 | # priorProbs = bg / sum(bg)
54 | # PWM_log2probratio = log2(postProbs / priorProbs)
55 | 
56 | # Debugging leftover (printed the YBX motif names):
57 | # grep("YBX",motifs.to.plot,value=T)
58 | 
59 | # i <- "YBX2_827"
60 | for (i in motifs.to.plot) {
61 | 
62 |   # position weight matrix (PWM)
63 |   
64 |   if (args$motif_annotation=="JASPAR") {
65 |     tmp <- toPWM(pwm[[i]], type="prob") %>% as.matrix
66 |   } else if (args$motif_annotation=="CISBP") {
67 |     # presumably inverts the log2 ratio above assuming a uniform prior of 0.25 per base - TODO confirm
68 |     tmp <- (2**as.matrix(pwm[[i]]))*0.25  # this is not entirely accurate
69 |   }
70 |   
71 |   # Named p.logo so that the ArgumentParser object `p` is not overwritten
72 |   p.logo <- ggseqlogo(tmp) + 
73 |     theme(
74 |       axis.line = element_line(size=rel(0.5), color="black"),
75 |       axis.text.x = element_blank(),
76 |       axis.text.y = element_text(size=rel(0.75)),
77 |       axis.title.y = element_text(size=rel(0.75))
78 |       # axis.title.y = element_blank()
79 |     )
80 |   pdf(file.path(args$outdir,sprintf("seqlogo_%s_%s.pdf",args$motif_annotation,i)), width=5, height=2.2)
81 |   print(p.logo)
82 |   dev.off()
83 | }
84 | 
85 | # Completion token
86 | file.create(file.path(args$outdir,"completed.txt"))


--------------------------------------------------------------------------------
/atac/archR/bigwig/README.txt:
--------------------------------------------------------------------------------
 1 | ############
 2 | ## Signac ##
 3 | ############
 4 | 
 5 | https://timoast.github.io/sinto/basic_usage.html#filter-cell-barcodes-from-bam-file
 6 | https://github.com/timoast/sinto
 7 | 
 8 | 
 9 | Hi, it is not currently possible to create a bigwig for different groups of cells in Signac. I'd suggest writing the cell names to a file and then splitting the bam file by cell using the sinto package (https://github.com/timoast/sinto), and then creating bigwig tracks using deeptools (https://deeptools.readthedocs.io/en/develop/content/tools/bamCoverage.html)
10 | 
11 | 
12 | ###########
13 | ## ArchR ##
14 | ###########
15 | 
16 | getGroupBW()


--------------------------------------------------------------------------------
/atac/archR/bigwig/archR_export_bw.R:
--------------------------------------------------------------------------------
  1 | # https://www.ArchRProject.com/bookdown/how-does-archr-make-pseudo-bulk-replicates.html
  2 | 
  3 | # Export one bigwig track per group of cells (column given by --group_by) from an ArchR project.
  4 | here::i_am("atac/archR/bigwig/archR_export_bw.R")
  5 | 
  6 | # NOTE(review): ArgumentParser, fread and %>% are not loaded in this file; presumably settings.R attaches them - confirm
  7 | source(here::here("settings.R"))
  8 | source(here::here("utils.R"))
  9 | 
 10 | suppressPackageStartupMessages(library(ArchR))
 11 | 
 12 | ######################
 13 | ## Define arguments ##
 14 | ######################
 15 | 
 16 | p <- ArgumentParser(description='')
 17 | p$add_argument('--archr_directory',    type="character",    help='ArchR directory')
 18 | p$add_argument('--metadata',    type="character",    help='metadata file')
 19 | p$add_argument('--group_by',     type="character",    help='Metadata column to group by')
 20 | p$add_argument('--norm_method',     type="character", default="ReadsInTSS",    help='Normalisation method')
 21 | p$add_argument('--min_cells',     type="integer", default=100,    help='Minimum number of cells per celltype')
 22 | p$add_argument('--tile_size',     type="integer", default=100,    help='Tile size')
 23 | p$add_argument('--threads',     type="integer",    default=1,    help='Number of threads')
 24 | 
 25 | args <- p$parse_args(commandArgs(TRUE))
 26 | 
 27 | ## START TEST ##
 28 | # args$metadata <- file.path(io$basedir,"results/atac/archR/qc/sample_metadata_after_qc.txt.gz")
 29 | # args$group_by <- "celltype_genotype"
 30 | # args$norm_method <- c("ReadsInTSS")
 31 | # args$tile_size <- 100
 32 | # args$min_cells <- 100
 33 | # args$threads <- 4
 34 | ## END TEST ##
 35 | 
 36 | ########################
 37 | ## Load cell metadata ##
 38 | ########################
 39 | 
 40 | # Keep cells that passed ATAC QC and belong to the selected samples, then derive the
 41 | # combined celltype-genotype label so that it can be used as a grouping column
 42 | sample_metadata <- fread(args$metadata) %>%
 43 |   .[pass_atacQC==TRUE & sample%in%opts$samples] %>%
 44 |   .[,celltype_genotype:=sprintf("%s-%s",celltype,genotype)]
 45 | 
 46 | stopifnot(args$group_by%in%colnames(sample_metadata))
 47 | # Drop cells with no value for the grouping column
 48 | sample_metadata <- sample_metadata[!is.na(sample_metadata[[args$group_by]])]
 49 | 
 50 | # Filter celltypes by minimum number of cells
 51 | sample_metadata <- sample_metadata[,N:=.N,by=c(args$group_by)] %>% .[N>=args$min_cells] %>% .[,N:=NULL]
 52 | 
 53 | table(sample_metadata[[args$group_by]])
 54 | 
 55 | ########################
 56 | ## Load ArchR project ##
 57 | ########################
 58 | 
 59 | # source(here::here("atac/archR/load_archR_project.R"))
 60 | 
 61 | setwd(args$archr_directory)
 62 | 
 63 | addArchRGenome("mm10")
 64 | addArchRThreads(threads = args$threads)
 65 | 
 66 | # Subset the project to the cells retained in the metadata. The object must be called
 67 | # ArchRProject.filt because every statement below refers to it under that name.
 68 | ArchRProject.filt <- loadArchRProject(args$archr_directory)[sample_metadata$cell]
 69 | 
 70 | ###########################
 71 | ## Update ArchR metadata ##
 72 | ###########################
 73 | 
 74 | # Reorder the metadata rows to follow the cell order of the ArchR project
 75 | sample_metadata.to.archr <- sample_metadata %>% 
 76 |   .[cell%in%rownames(ArchRProject.filt)] %>% setkey(cell) %>% .[rownames(ArchRProject.filt)] %>%
 77 |   as.data.frame() %>% tibble::column_to_rownames("cell")
 78 | 
 79 | stopifnot(all(rownames(sample_metadata.to.archr) == rownames(getCellColData(ArchRProject.filt))))
 80 | ArchRProject.filt <- addCellColData(
 81 |   ArchRProject.filt,
 82 |   data = sample_metadata.to.archr[[args$group_by]],
 83 |   name = args$group_by,
 84 |   cells = rownames(sample_metadata.to.archr),
 85 |   force = TRUE
 86 | )
 87 | 
 88 | # print cell numbers
 89 | table(getCellColData(ArchRProject.filt,args$group_by)[[1]])
 90 | 
 91 | 
 92 | ###################
 93 | ## Export bigwig ##
 94 | ###################
 95 | 
 96 | # This function will group, summarize and export a bigwig for each group in an ArchRProject.
 97 | getGroupBW(
 98 |   ArchRProj = ArchRProject.filt,
 99 |   groupBy = args$group_by,
100 |   # groupBy = "Sample",
101 |   normMethod = args$norm_method,
102 |   tileSize = args$tile_size,
103 |   maxCells = 1000, # default
104 |   ceiling = 4
105 | )
106 | 
107 | # Create a completion token
108 | # NOTE(review): the token path is built from io$archR.directory, not args$archr_directory - confirm both point to the same folder
109 | file.create(file.path(io$archR.directory,sprintf("/GroupBigWigs/%s/completed.txt",args$group_by)))
110 | 


--------------------------------------------------------------------------------
/atac/archR/chromvar_chip/metacells/differential/celltype/analysis/compare_differential_chromvar_pseudobulk_metacells.R:
--------------------------------------------------------------------------------
 1 | here::i_am("atac/archR/chromvar_chip/metacells/differential/celltype/analysis/compare_differential_chromvar_pseudobulk_metacells.R")
 2 | 
 3 | # Load default settings
 4 | source(here::here("settings.R"))
 5 | source(here::here("utils.R"))
 6 | 
 7 | #####################
 8 | ## Define settings ##
 9 | #####################
10 | 
11 | # Options
12 | opts$celltypes <- c("NMP","Epiblast","Gut")
13 | 
14 | # I/O
15 | io$basedir <- file.path(io$basedir,"test")
16 | io$diff.pseudobulk <- file.path(io$basedir,"results/atac/archR/chromvar_chip/pseudobulk/differential/celltypes/CISBP")
17 | io$diff.metacells <- file.path(io$basedir,"results/atac/archR/chromvar_chip/metacells/differential/celltypes/CISBP")
18 | # io$outdir <- file.path(io$basedir,sprintf("results/atac/archR/differential/comparison/%s/%s",opts$group_variable, opts$matrix)); dir.create(io$outdir, showWarnings = F, recursive = T)
19 | 
20 | ##########################################
21 | ## Load results at the pseudobulk level ##
22 | ##########################################
23 | 
24 | # i <- "Epiblast"; j <- "Primitive_Streak"
25 | chromvar_diff_pseudobulk.dt <- opts$celltypes %>% map(function(i) { opts$celltypes %>% map(function(j) {
26 |   file <- file.path(io$diff.pseudobulk,sprintf("chromVAR_%s_vs_%s.txt.gz",i,j))
27 |   if (file.exists(file)) {
28 |     fread(file, select=c(1,2)) %>% 
29 |       .[,c("celltypeA","celltypeB"):=list(as.factor(i),as.factor(j))] %>% 
30 |       .[,class:="pseudobulk"] %>%
31 |       return
32 |   }
33 | }) %>% rbindlist }) %>% rbindlist 
34 | 
35 | 
36 | ########################################
37 | ## Load results at the metacell level ##
38 | ########################################
39 | 
40 | chromvar_diff_metacells.dt <- opts$celltypes %>% map(function(i) { opts$celltypes %>% map(function(j) {
41 |   file <- file.path(io$diff.metacells,sprintf("chromVAR_%s_vs_%s.txt.gz",i,j))
42 |   if (file.exists(file)) {
43 |     fread(file, select=c(1,2)) %>% 
44 |       .[,c("celltypeA","celltypeB"):=list(as.factor(i),as.factor(j))] %>% 
45 |       .[,class:="metacells"] %>%
46 |       return
47 |   }
48 | }) %>% rbindlist }) %>% rbindlist 
49 |   
50 | ###################
51 | ## Sanity checks ##
52 | ###################
53 | 
54 | # all(sort(unique(atac_diff_cells.dt$peak))==sort(unique(chromvar_diff_metacells.dt$peak)))
55 | # all(sort(unique(atac_diff_cells.dt$peak))==sort(unique(chromvar_diff_pseudobulk.dt$peak)))
56 | # mean(is.na(chromvar_diff_metacells.dt$diff))
57 | # mean(is.na(atac_diff_cells.dt$diff))
58 | # mean(is.na(chromvar_diff_pseudobulk.dt$diff))
59 | 
60 | ###########
61 | ## Merge ##
62 | ###########
63 | 
64 | stopifnot(colnames(chromvar_diff_pseudobulk.dt)==colnames(chromvar_diff_metacells.dt))
65 | 
66 | chromvar_diff.dt <- rbindlist(list(chromvar_diff_metacells.dt, chromvar_diff_pseudobulk.dt)) %>%
67 |   dcast(gene+celltypeA+celltypeB~class, value.var="diff")
68 | 
69 | ##########
70 | ## Plot ##
71 | ##########
72 | 
73 | to.plot <- chromvar_diff.dt[celltypeA=="Epiblast" & celltypeB=="NMP"]
74 | 
75 | ggscatter(to.plot, x="pseudobulk", y="metacells", size=0.5, add="reg.line", add.params = list(color="blue", fill="lightgray"), conf.int=TRUE) +
76 |   labs(x="Differential chromvar (pseudobulk)", y="Differential chromvar (metacells)")
77 | 


--------------------------------------------------------------------------------
/atac/archR/chromvar_chip/metacells/differential/celltype/analysis/old/load_data.R:
--------------------------------------------------------------------------------
 1 | # i <- opts$celltypes[2]; j <- opts$celltypes[1]
 2 | 
 3 | diff.dt <- opts$celltypes %>% map(function(i) { opts$celltypes %>% map(function(j) {
 4 |   file <- sprintf("%s/%s_%s_vs_%s.txt.gz", io$diff.dir,opts$matrix,i,j)
 5 |   if (file.exists(file)) {
 6 |     fread(file) %>% .[,c("celltypeA","celltypeB"):=list(as.factor(i),as.factor(j))] %>% return
 7 |   }
 8 | }) %>% rbindlist }) %>% rbindlist %>% 
 9 |   .[,MeanDiff:=-MeanDiff] %>% # change sign to keep the groupB - groupA consistency
10 |   .[,sig:=FALSE] %>% .[abs(MeanDiff)>=opts$min.MeanDiff & FDR<=opts$fdr,sig:=TRUE] %>%
11 |   .[,direction:=c("up","down")[as.numeric(MeanDiff>0)+1]]  # up = higher accessibility in celltype A
12 | 
13 | # ad hoc
14 | # if ("name"%in%colnames(atac_diff_cells.dt)) {
15 | #   atac_diff_cells.dt[,idx:=NULL] %>% setnames("name","idx")
16 | # }
17 | 


--------------------------------------------------------------------------------
/atac/archR/chromvar_chip/metacells/differential/old/plot_differential_chromvar_chip_pseudobulk.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages(library(argparse))
 2 | 
 3 | here::i_am("atac/archR/chromvar_chip/pseudobulk/differential/plot_differential_chromvar_chip_pseudobulk.R")
 4 | 
 5 | ################################
 6 | ## Initialize argument parser ##
 7 | ################################
 8 | 
 9 | p <- ArgumentParser(description='')
10 | p$add_argument('--motif_annotation',  type="character",              help='Motif annotation') 
11 | p$add_argument('--chromvar_diff_pseudobulk_dir',  type="character",              help='Motif annotation') 
12 | p$add_argument('--outdir',  type="character",              help='Motif annotation') 
13 | args <- p$parse_args(commandArgs(TRUE))
14 | 
15 | #####################
16 | ## Define settings ##
17 | #####################
18 | 
19 | # load default setings
20 | source(here::here("settings.R"))
21 | source(here::here("utils.R"))
22 | 
23 | ## START TEST ##
24 | args$motif_annotation <- "CISBP"
25 | args$chromvar_diff_pseudobulk_dir <- file.path(io$basedir,sprintf("results_new/atac/archR/chromvar_chip/pseudobulk/differential/%s",args$motif_annotation))
26 | args$outdir <- file.path(io$basedir,sprintf("results_new/atac/archR/chromvar_chip/pseudobulk/differential/%s/pdf",args$motif_annotation))
27 | ## END TEST ##
28 | 
29 | dir.create(args$outdir, showWarnings = F)
30 | 
31 | ########################################################
32 | ## Load precomputed differential chromVAR-ChIP scores ##
33 | ########################################################
34 | 
35 | diff.dt <- (1:length(opts$celltypes)) %>% map(function(i) {
36 |   (i:length(opts$celltypes)) %>% map(function(j) {
37 |     if (i!=j) {
38 |       file <- file.path(args$chromvar_diff_pseudobulk_dir,sprintf("%s_vs_%s_chromVAR_chip_pseudobulk.txt.gz",opts$celltypes[[i]],opts$celltypes[[j]]))
39 |       if (file.exists(file)) {
40 |         fread(file) %>% 
41 |           .[,groupA:=factor(opts$celltypes[[i]],levels=opts$celltypes)] %>% 
42 |           .[,groupB:=factor(opts$celltypes[[j]],levels=opts$celltypes)] %>%
43 |           return
44 |       }
45 |     }
46 |   }) %>% rbindlist
47 | }) %>% rbindlist
48 | 
49 | ##########
50 | ## Plot ##
51 | ##########
52 | 
53 | # celltypes.to.plot <- c("Gut","Erythroid3")
54 | # genes.to.plot <- c("TAL1")
55 | 
56 | opts$xlim.max <- 3
57 | opts$xlim.min <- -3
58 | 
59 | # i <- "Gut"; j <- "Erythroid3"
60 | for (i in opts$celltypes) {
61 |   for (j in opts$celltypes) {
62 |     
63 |     to.plot <- diff.dt %>%
64 |       .[groupA==i & groupB==j] %>% 
65 |       .[,gene:=factor(gene,levels=rev(gene))] %>%
66 |       .[diff>=opts$xlim.max,diff:=opts$xlim.max] %>%
67 |       .[diff<=opts$xlim.min,diff:=opts$xlim.min]
68 |     
69 |     p <- ggplot(to.plot, aes(x=diff, y=gene)) +
70 |       geom_jitter(aes(color=abs(diff), alpha=abs(diff)), width = 0.15) +
71 |       ggrepel::geom_text_repel(data=head(to.plot[diff>0],n=10), aes(x=diff, y=gene, label=gene), size=4, max.overlaps=Inf) +
72 |       ggrepel::geom_text_repel(data=head(to.plot[diff<0],n=10), aes(x=diff, y=gene, label=gene), size=4, max.overlaps=Inf) +
73 |       scale_color_gradient(low = "gray80", high = "red") +
74 |       scale_alpha_continuous(range=c(0.25,1)) +
75 |       coord_cartesian(xlim=c(opts$xlim.min,opts$xlim.max)) +
76 |       theme_classic() +
77 |       labs(x="Differential motif accessibility (chromVAR)", y="") +
78 |       # coord_flip() +
79 |       annotate("text", x=opts$xlim.min/1.5, y=75, size=4, label=sprintf("(+) %s",i)) +
80 |       annotate("text", x=opts$xlim.max/1.5, y=75, size=4, label=sprintf("(+) %s",j)) +
81 |       geom_segment(x=0, xend=0, y=0, yend=nrow(to.plot), color="black", size=0.25, linetype="dashed") +
82 |       theme(
83 |         legend.position = "none",
84 |         axis.text.y = element_blank(),
85 |         axis.ticks.y = element_blank(),
86 |         axis.text.x = element_text(size=rel(1.0), color="black")
87 |       )
88 |     
89 |     
90 |     pdf(file.path(args$outdir,sprintf("%s_vs_%s_%s_chromVAR_chip_pseudobulk_volcano.pdf",i,j,args$motif_annotation)), width=7, height=5)
91 |     print(p)
92 |     dev.off()
93 |   }
94 | }
95 | 
96 | 


--------------------------------------------------------------------------------
/atac/archR/chromvar_chip/metacells/differential/old/run_differential_chromvar_chip_pseudobulk.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages(library(argparse))
 2 | 
 3 | here::i_am("atac/archR/chromvar_chip/pseudobulk/differential/run_differential_chromvar_chip_pseudobulk.R")
 4 | 
 5 | ################################
 6 | ## Initialize argument parser ##
 7 | ################################
 8 | 
 9 | p <- ArgumentParser(description='')
10 | p$add_argument('--motif_annotation',  type="character",              help='Motif annotation') 
11 | p$add_argument('--chromvar_chip_pseudobulk',  type="character",              help='Motif annotation') 
12 | p$add_argument('--outdir',  type="character",              help='Motif annotation') 
13 | args <- p$parse_args(commandArgs(TRUE))
14 | 
15 | #####################
16 | ## Define settings ##
17 | #####################
18 | 
19 | # load default setings
20 | source(here::here("settings.R"))
21 | source(here::here("utils.R"))
22 | 
23 | ## START TEST ##
24 | # args$motif_annotation <- "JASPAR"
25 | # args$chromvar_chip_pseudobulk <- file.path(io$basedir,sprintf("results_new/atac/archR/chromvar_chip/pseudobulk/chromVAR_deviations_%s_archr_chip.rds",args$motif_annotation))
26 | # args$outdir <- file.path(io$basedir,sprintf("results_new/atac/archR/chromvar_chip/pseudobulk/differential/%s",args$motif_annotation))
27 | ## END TEST ##
28 | 
29 | # I/O
30 | dir.create(args$outdir, showWarnings = F)
31 | 
32 | #####################################
33 | ## Load pseudobulk chromVAR scores ##
34 | #####################################
35 | 
36 | chromvar_deviations_pseudobulk.se <- readRDS(args$chromvar_chip_pseudobulk)
37 |   
38 | ######################################
39 | ## Differential motif accessibility ##
40 | ######################################
41 | 
42 | # i <- 1; j <- 2
43 | for (i in 1:length(opts$celltypes)) {
44 |   for (j in i:length(opts$celltypes)) {
45 |     if (i!=j) {
46 |       foo <- assay(chromvar_deviations_pseudobulk.se[,opts$celltypes[[j]]])[,1]
47 |       bar <- assay(chromvar_deviations_pseudobulk.se[,opts$celltypes[[i]]])[,1]
48 |       
49 |       chromvar_diff.dt <- data.table(
50 |         gene = names(foo), 
51 |         diff = round(foo-bar,2) 
52 |         # groupA = opts$celltypes[[i]], 
53 |         # groupB = opts$celltypes[[j]]
54 |       ) %>% sort.abs("diff") 
55 |       
56 |       # save      
57 |       outfile <- file.path(args$outdir,sprintf("%s_vs_%s_%s_chromVAR_chip_pseudobulk.txt.gz",opts$celltypes[[i]],opts$celltypes[[j]],args$motif_annotation))
58 |       fwrite(chromvar_diff.dt, outfile, sep="\t")
59 |     }
60 |   }
61 | }
62 | 


--------------------------------------------------------------------------------
/atac/archR/chromvar_chip/pseudobulk/differential/celltype/differential_chromvar_pseudobulk.R:
--------------------------------------------------------------------------------
 1 | here::i_am("atac/archR/chromvar_chip/pseudobulk/differential/celltype/differential_chromvar_pseudobulk.R")
 2 | 
 3 | source(here::here("settings.R"))
 4 | source(here::here("utils.R"))
 5 | 
 6 | ######################
 7 | ## Define arguments ##
 8 | ######################
 9 | 
10 | p <- ArgumentParser(description='')
11 | p$add_argument('--motif_annotation',        type="character",                               help='')
12 | p$add_argument('--chromvar_chip',        type="character",                               help='')
13 | p$add_argument('--groupA',    type="character",    help='group A')
14 | p$add_argument('--groupB',    type="character",    help='group B')
15 | p$add_argument('--outfile',          type="character",                               help='Output directory')
16 | 
17 | args <- p$parse_args(commandArgs(TRUE))
18 | 
19 | ## START TEST ##
20 | # io$basedir <- file.path(io$basedir,"test")
21 | # args <- list()
22 | # args$motif_annotation <- "JASPAR"
23 | # args$chromvar_chip <- file.path(io$basedir,sprintf("results/atac/archR/chromvar_chip/pseudobulk_with_replicates/chromVAR_chip_%s_archr.rds",args$motif_annotation))
24 | # args$groupA <- "ExE_ectoderm"
25 | # args$groupB <- "Caudal_neurectoderm"
26 | # args$outfile <- file.path(io$basedir,sprintf("results/atac/archR/chromvar_chip/pseudobulk_with_replicates/differential/celltype/chromVAR_%s_vs_%s.txt.gz",args$groupA,args$groupB))
27 | ## END TEST ##
28 | 
29 | #####################
30 | ## Define settings ##
31 | #####################
32 | 
33 | # I/O
34 | dir.create(dirname(args$outfile), showWarnings=F, recursive = T)
35 | 
36 | # Options
37 | opts$groups <- c(args$groupA,args$groupB)
38 | 
39 | # stupid stuff but otherwise the snakemake pipeline doesn't work
40 | if (args$groupA==args$groupB) {
41 |   out <- data.table(feature=NA, diff=NA, padj=NA)
42 |   fwrite(out, args$outfile, sep="\t", na="NA", quote=F)
43 |   warning("groupA and groupB are the same, saving an empty file...")
44 |   quit(status=0)
45 | }
46 | 
47 | ###########################
48 | ## Load chromVAR results ##
49 | ###########################
50 | 
51 | print(sprintf("Fetching chromVAR results: '%s'...",args$chromvar_chip))
52 | 
53 | # Load 
54 | atac_chromvar.se <- readRDS(args$chromvar_chip)
55 | 
56 | # parse
57 | if (!"celltype"%in%colnames(colData(atac_chromvar.se))) {
58 |   atac_chromvar.se$celltype <- colnames(atac_chromvar.se) %>% strsplit("_rep") %>% map_chr(1)
59 | }
60 | 
61 | atac_chromvar.se <- atac_chromvar.se[,atac_chromvar.se$celltype%in%opts$groups]
62 | atac_chromvar.se$celltype <- factor(atac_chromvar.se$celltype, levels=opts$groups)
63 | 
64 | # check that we have pseudobulk replicates for both cell types
65 | if (any(!opts$groups%in%unique(atac_chromvar.se$celltype))) {
66 |   warning("groups not found, saving an empty file...")
67 |   out <- data.table(feature=NA, diff=NA, padj=NA)
68 |   fwrite(out, args$outfile, sep="\t", na="NA", quote=F)
69 |   quit(status=0)
70 | }
71 | 
72 | # Create data.table
73 | atac_chromvar.dt <- assay(atac_chromvar.se,"z") %>% t %>%
74 |   as.data.table(keep.rownames = T) %>%
75 |   setnames("rn","sample") %>%
76 |   melt(id.vars=c("sample"), variable.name="gene", value.name="chromvar_zscore")
77 |   
78 | tmp <- data.table(sample=colnames(atac_chromvar.se), group=atac_chromvar.se$celltype)
79 | atac_chromvar.dt <- atac_chromvar.dt %>% merge(tmp[,c("sample","group")])
80 | 
81 | ##########################
82 | ## Differential testing ##
83 | ##########################
84 | 
85 | out <- atac_chromvar.dt %>% .[,.(
86 |     diff = mean(.SD[group==opts$groups[2],chromvar_zscore]) - mean(.SD[group==opts$groups[1],chromvar_zscore]),
87 |     p.value = t.test(x=.SD[group==opts$groups[1],chromvar_zscore], y=.SD[group==opts$groups[2],chromvar_zscore])[["p.value"]]
88 |   ), by="gene"] %>%
89 |   .[,padj:=p.adjust(p.value,method="fdr")] %>% .[,p.value:=NULL] %>%
90 |   setorder(padj, na.last=T) %>%
91 |   .[,c("diff","padj"):=list(round(diff,2),format(padj,digits=3))]
92 | 
93 | ##################
94 | ## Save results ##
95 | ##################
96 | 
97 | fwrite(out, args$outfile, sep="\t", na="NA", quote=F)
98 | 


--------------------------------------------------------------------------------
/atac/archR/chromvar_chip/pseudobulk/differential/celltype/parse_differential_results.R:
--------------------------------------------------------------------------------
 1 | here::i_am("atac/archR/differential/pseudobulk/celltype/parse_differential_results.R")
 2 | 
 3 | # Load default settings
 4 | source(here::here("settings.R"))
 5 | 
 6 | ######################
 7 | ## Define arguments ##
 8 | ######################
 9 | 
10 | p <- ArgumentParser(description='')
11 | p$add_argument('--diff_results_dir',   type="character",     help='File')
12 | p$add_argument('--outfile',             type="character",     help='File')
13 | args <- p$parse_args(commandArgs(TRUE))
14 | 
15 | ## START TEST ##
16 | # io$basedir <- file.path(io$basedir,"test")
17 | # args <- list()
18 | # args$diff_results_dir <- file.path(io$basedir,"results/atac/archR/chromvar_chip/pseudobulk_with_replicates/differential/celltypes/CISBP")
19 | # args$outfile <- file.path(io$basedir,"results/atac/archR/chromvar_chip/pseudobulk_with_replicates/differential/celltypes/CISBP/diff_results.txt.gz")
20 | ## END TEST ##
21 | 
22 | # I/O
23 | dir.create(dirname(args$outfile), showWarnings=F, recursive=T)
24 | 
25 | ################################################
26 | ## Load differential expression and fetch TFs ##
27 | ################################################
28 | 
29 | diff_results_list <- list()
30 | 
31 | # i <- "Visceral_endoderm"; j <- "Surface_ectoderm"
32 | for (i in 1:length(opts$celltypes)) {
33 |   for (j in i:length(opts$celltypes)) {
34 |     
35 |     if (i!=j) {
36 |       file <- file.path(args$diff_results_dir,sprintf("%s_vs_%s.txt.gz",opts$celltypes[[i]],opts$celltypes[[j]]))
37 |       if (file.exists(file)) {
38 |         tmp <- fread(file) %>% .[,c("celltypeA","celltypeB"):=list(opts$celltypes[[i]],opts$celltypes[[j]])]
39 |         diff_results_list[[sprintf("%s_vs_%s",opts$celltypes[[i]],opts$celltypes[[j]])]] <- tmp
40 |       } else {
41 |         print(sprintf("%s not found...",file))
42 |       }
43 |     }
44 |   }
45 | }
46 |  
47 | ##########
48 | ## Save ##
49 | ##########
50 | 
51 | fwrite(rbindlist(diff_results_list), args$outfile, sep="\t", quote=F, na="NA")
52 | 


--------------------------------------------------------------------------------
/atac/archR/differential/cells/celltype/analysis/load_data.R:
--------------------------------------------------------------------------------
 1 | # i <- opts$celltypes[2]; j <- opts$celltypes[1]
 2 | 
 3 | diff.dt <- opts$celltypes %>% map(function(i) { opts$celltypes %>% map(function(j) {
 4 |   file <- sprintf("%s/%s_%s_vs_%s.txt.gz", io$diff.dir,opts$matrix,i,j)
 5 |   if (file.exists(file)) {
 6 |     fread(file) %>% .[,c("celltypeA","celltypeB"):=list(as.factor(i),as.factor(j))] %>% return
 7 |   }
 8 | }) %>% rbindlist }) %>% rbindlist %>% 
 9 |   .[,MeanDiff:=-MeanDiff] %>% # change sign to keep the groupB - groupA consistency
10 |   .[,sig:=FALSE] %>% .[abs(MeanDiff)>=opts$min.MeanDiff & FDR<=opts$fdr,sig:=TRUE] %>%
11 |   .[,direction:=c("up","down")[as.numeric(MeanDiff>0)+1]]  # up = higher accessibility in celltype A
12 | 
13 | # ad hoc
14 | # if ("name"%in%colnames(atac_diff_cells.dt)) {
15 | #   atac_diff_cells.dt[,idx:=NULL] %>% setnames("name","idx")
16 | # }
17 | 


--------------------------------------------------------------------------------
/atac/archR/differential/cells/celltype/analysis/old/GeneScoreMatrix/define_marker_genes.R:
--------------------------------------------------------------------------------
 1 | #####################
 2 | ## Define settings ##
 3 | #####################
 4 | 
 5 | # Load default settings
 6 | source(here::here("settings.R"))
 7 | source(here::here("utils.R"))
 8 | 
 9 | # I/O
10 | io$archR.diff.dir <- file.path(io$basedir,"results_new/atac/archR/differential/GeneScoreMatrix_TSS")
11 | io$outdir <- file.path(io$basedir,"results_new/atac/archR/differential/GeneScoreMatrix_TSS/markers"); dir.create(io$outdir, showWarnings = F)
12 | 
13 | # Options
14 | # opts$groups <- strsplit(list.files(io$diff.dir, pattern="*.gz"),"_vs_") %>% map(~ .[[1]]) %>% unlist %>% unique
15 | opts$celltypes <- c(
16 |   "Epiblast",
17 |   "Primitive_Streak",
18 |   "Caudal_epiblast",
19 |   "PGC",
20 |   "Anterior_Primitive_Streak",
21 |   "Notochord",
22 |   "Def._endoderm",
23 |   "Gut",
24 |   "Nascent_mesoderm",
25 |   "Mixed_mesoderm",
26 |   "Intermediate_mesoderm",
27 |   "Caudal_Mesoderm",
28 |   "Paraxial_mesoderm",
29 |   "Somitic_mesoderm",
30 |   "Pharyngeal_mesoderm",
31 |   "Cardiomyocytes",
32 |   "Allantois",
33 |   "ExE_mesoderm",
34 |   "Mesenchyme",
35 |   "Haematoendothelial_progenitors",
36 |   "Endothelium",
37 |   "Blood_progenitors_1",
38 |   "Blood_progenitors_2",
39 |   "Erythroid1",
40 |   "Erythroid2",
41 |   "Erythroid3",
42 |   "NMP",
43 |   "Rostral_neurectoderm",
44 |   "Caudal_neurectoderm",
45 |   "Neural_crest",
46 |   "Forebrain_Midbrain_Hindbrain",
47 |   "Spinal_cord",
48 |   "Surface_ectoderm",
49 |   "Visceral_endoderm",
50 |   "ExE_endoderm",
51 |   "ExE_ectoderm",
52 |   "Parietal_endoderm"
53 | )# %>% head(n=4)
54 | 
55 | opts$min.MeanDiff <- 0.10
56 | opts$fdr <- 0.01
57 | 
58 | # Minimum fraction of significant differential pairwise comparisons
59 | opts$score <- 0.75
60 | 
61 | ##################
62 | ## Load results ##
63 | ##################
64 | 
65 | dt <- opts$celltypes %>% map(function(i) { opts$celltypes %>% map(function(j) {
66 |   file <- sprintf("%s/GeneScoreMatrix_TSS_%s_vs_%s.txt.gz", io$archR.diff.dir,i,j)
67 |   if (file.exists(file)) {
68 |     fread(file) %>% .[,c("celltypeA","celltypeB"):=list(as.factor(i),as.factor(j))] %>%
69 |       return
70 |   }
71 | }) %>% rbindlist }) %>% rbindlist %>% 
72 |   .[,sig:=FALSE] %>% .[abs(MeanDiff)>=opts$min.MeanDiff & FDR<=opts$fdr,sig:=TRUE] %>%
73 |   .[,direction:=c("up","down")[as.numeric(MeanDiff<0)+1]]  # up = higher accessibility in celltype A
74 | 
75 | ncelltypes <- length(intersect(unique(dt$celltypeA),unique(dt$celltypeB)))
76 | 
77 | #########################
78 | ## Define marker genes ##
79 | #########################
80 | 
81 | foo <- dt[,.(score=sum(sig==T & direction=="up")), by=c("celltypeA","name")] %>% setnames("celltypeA","celltype")
82 | bar <- dt[,.(score=sum(sig==T & direction=="down")), by=c("celltypeB","name")] %>% setnames("celltypeB","celltype")
83 |   
84 | markers_peaks.dt <- merge(foo,bar,by=c("celltype","name"), all=TRUE) %>% .[,score:=score.x+score.y] %>%
85 |   .[,c("score.x","score.y"):=NULL] %>%
86 |   .[,score:=round(score/(ncelltypes+1),2)] %>%
87 |   .[score>=opts$score] %>%
88 |   setorder(celltype,-score)
89 | rm(foo,bar)
90 | 
91 | length(unique(markers_peaks.dt$name))
92 | 
93 | ##########
94 | ## Save ##
95 | ##########
96 | 
97 | fwrite(markers_peaks.dt, file.path(io$outdir,"marker_genes.txt.gz"))
98 | 
99 | 


--------------------------------------------------------------------------------
/atac/archR/differential/cells/celltype/analysis/old/PeakMatrix/define_marker_peaks.R:
--------------------------------------------------------------------------------
 1 | here::i_am("atac/archR/differential/analysis/PeakMatrix/define_marker_peaks.R")
 2 | 
 3 | source(here::here("settings.R"))
 4 | source(here::here("utils.R"))
 5 | 
 6 | #############
 7 | ## Options ##
 8 | #############
 9 | 
10 | opts$matrix <- "PeakMatrix"
11 | opts$group_variable <- "celltype.mapped"
12 | opts$min.MeanDiff <- 0.10
13 | opts$fdr <- 0.01
14 | opts$score <- 0.75 # Minimum fraction of significant differential pairwise comparisons
15 | 
16 | #########
17 | ## I/O ##
18 | #########
19 | 
20 | io$diff.dir <- file.path(io$basedir,sprintf("results/atac/archR/differential/%s/%s",opts$group_variable,opts$matrix))
21 | io$outdir <- file.path(io$basedir,sprintf("results/atac/archR/differential/%s/%s/markers",opts$group_variable,opts$matrix)); dir.create(io$outdir, showWarnings = F)
22 | 
23 | ##################
24 | ## Load results ##
25 | ##################
26 | 
27 | source(here::here("atac/archR/differential/analysis/load_data.R"))
28 | 
29 | ###################
30 | ## Sanity checks ##
31 | ###################
32 | 
33 | # Load stats
34 | diff_stats.dt <- fread(file.path(io$diff.dir,"diff_stats.txt")) %>% setnames(c("celltypeA","celltypeB","N_groupA","N_groupB"))
35 | 
36 | # check if some DA comparison is missing
37 | tmp <- diff_stats.dt %>% 
38 |   merge(diff.dt[,c("celltypeA","celltypeB")] %>% unique %>% .[,done:=TRUE], all.x=TRUE, by=c("celltypeA","celltypeB")) %>%
39 |   .[is.na(done),done:=FALSE]
40 | stopifnot(tmp$done==TRUE)
41 | 
42 | #########################
43 | ## Define marker genes ##
44 | #########################
45 | 
46 | ncelltypes <- unique(c(as.character(unique(diff.dt$celltypeA)),as.character(unique(diff.dt$celltypeB)))) %>% length
47 | 
48 | foo <- diff.dt[,.(score=sum(sig==T & direction=="up")), by=c("celltypeA","idx")] %>% setnames("celltypeA","celltype")
49 | bar <- diff.dt[,.(score=sum(sig==T & direction=="down")), by=c("celltypeB","idx")] %>% setnames("celltypeB","celltype")
50 |   
51 | markers_peaks.dt <- merge(foo,bar,by=c("celltype","idx"), all=TRUE) %>% .[,score:=score.x+score.y] %>%
52 |   .[,c("score.x","score.y"):=NULL] %>%
53 |   # .[,score:=round(score/(ncelltypes+1),2)] %>%
54 |   .[,score:=round(score/(ncelltypes-1),2)] %>%
55 |   setorder(celltype,-score)
56 | # rm(foo,bar)
57 | 
58 | stopifnot(max(markers_peaks.dt$score,na.rm=T)==1)
59 | 
60 | 
61 | ##############################################
62 | ## Add MeanDiff values from pseudobulk data ##
63 | ##############################################
64 | 
65 | diff_pseudobulk.dt <- file.path(io$basedir,"results/atac/archR/differential/pseudobulk/celltype.mapped/PeakMatrix/differential_atac_PeakMatrix_pseudobulk_summary.txt.gz") %>% fread
66 | markers_peaks.dt <- markers_peaks.dt %>% merge(diff_pseudobulk.dt, by=c("celltype","idx"))
67 | 
68 | ##########
69 | ## Save ##
70 | ##########
71 | 
72 | # Save marker score for all combination of genes and cell types
73 | length(unique(markers_peaks.dt$idx))
74 | length(unique(markers_peaks.dt$celltype))
75 | fwrite(markers_peaks.dt, file.path(io$outdir,"marker_peaks_upregulated_all.txt.gz"), sep="\t")
76 | 
77 | # Save marker score for strong markers
78 | markers_peaks_filt.dt <- markers_peaks.dt %>% .[score>=opts$score & diff>=opts$min.MeanDiff]
79 | length(unique(markers_peaks_filt.dt$idx))
80 | length(unique(markers_peaks_filt.dt$celltype))
81 | fwrite(markers_peaks_filt.dt, file.path(io$outdir,"marker_peaks_upregulated_filtered.txt.gz"), sep="\t")
82 | 
83 | 


--------------------------------------------------------------------------------
/atac/archR/differential/cells/celltype/analysis/old/browser_plot_archR.R:
--------------------------------------------------------------------------------
 1 | ####################
 2 | ## Browser tracks ##
 3 | ####################
 4 | 
 5 | # i <- "chr2:39483639-39484239"
 6 | 
 7 | opts$extend.upstream <- 2500
 8 | opts$extend.downstream <- 2500
 9 | opts$tileSize <- 50
10 | 
11 | # Ugly hack
12 | # rename <- paste(1:length(opts$celltypes),opts$celltypes,sep="_")
13 | # names(rename) <- opts$celltypes
14 | # ArchRProject.filt$celltype.predicted <- stringr::str_replace_all(ArchRProject.filt$celltype.predicted,rename)
15 | 
16 | for (i in unique(markers_peaks.dt$idx) %>% head(n=5)) {
17 |   
18 |   # Fetch GRanges
19 |   to.plot <- peakset.gr[peakset.gr$idx==i]
20 |   start(to.plot) <- start(to.plot) - opts$extend.upstream
21 |   end(to.plot) <- end(to.plot) + opts$extend.downstream
22 |   
23 |   # Plot
24 |   p <- plotBrowserTrack(
25 |     ArchRProj = ArchRProject.filt, 
26 |     region = to.plot,
27 |     groupBy = "celltype.predicted", 
28 |     tileSize = opts$tileSize,
29 |     pal = opts$celltype.colors,
30 |     plotSummary = c("bulkTrack", "featureTrack", "geneTrack"),
31 |     sizes = c(10, 1.5, 1.5),
32 |   )
33 |   
34 |   # grid::grid.newpage()
35 |   
36 |   pdf(sprintf("%s/%s_BrowserTrack.pdf",io$outdir,gsub(":","-",i)), width = 9, height = 5)
37 |   grid::grid.draw(p)
38 |   dev.off()
39 | }


--------------------------------------------------------------------------------
/atac/archR/differential/cells/celltype/parse_differential_results.R:
--------------------------------------------------------------------------------
 1 | here::i_am("atac/archR/differential/cells/celltype/parse_differential_results.R")
 2 | 
 3 | # Load default settings
 4 | source(here::here("settings.R"))
 5 | 
 6 | ######################
 7 | ## Define arguments ##
 8 | ######################
 9 | 
10 | p <- ArgumentParser(description='')
11 | p$add_argument('--diff_results_dir',   type="character",     help='File')
12 | p$add_argument('--min_cells',       type="integer",       default=5,      help='Minimum number of cells per group')
13 | p$add_argument('--outdir',             type="character",     help='File')
14 | args <- p$parse_args(commandArgs(TRUE))
15 | 
16 | ## START TEST ##
17 | # io$basedir <- file.path(io$basedir,"test")
18 | # args <- list()
19 | # args$diff_results_dir <- file.path(io$basedir,"results/atac/archR/differential/cells/celltype/PeakMatrix")
20 | # args$min_cells <- 5
21 | # args$outdir <- file.path(io$basedir,"results/atac/archR/differential/cells/celltype/PeakMatrix/parsed")
22 | ## END TEST ##
23 | 
24 | # I/O
25 | dir.create(args$outdir, showWarnings=F, recursive=T)
26 | 
27 | ################################################
28 | ## Load differential expression and fetch TFs ##
29 | ################################################
30 | 
31 | stats.dt <- data.table(celltypeA=as.character(NA), celltypeB=as.character(NA), groupA_N=as.integer(NA), groupB_N=as.integer(NA), included=as.logical(NA))
32 | diff_results_list <- list()
33 | 
34 | # i <- "Visceral_endoderm"; j <- "Surface_ectoderm"
35 | for (i in 1:length(opts$celltypes)) {
36 |   for (j in i:length(opts$celltypes)) {
37 |     
38 |     if (i!=j) {
39 |       file <- file.path(args$diff_results_dir,sprintf("%s_vs_%s.txt.gz",opts$celltypes[[i]],opts$celltypes[[j]]))
40 |       if (file.exists(file)) {
41 |         tmp <- fread(file) %>% 
42 |           .[,c("celltypeA","celltypeB"):=list(opts$celltypes[[i]],opts$celltypes[[j]])]
43 |         
44 |         if (tmp[celltypeA==opts$celltypes[[i]],groupA_N][1]>=args$min_cells & tmp[celltypeB==opts$celltypes[[j]],groupB_N][1]>=args$min_cells) {
45 |           stats.dt <- rbind(stats.dt,data.table(celltypeA=opts$celltypes[[i]], celltypeB=opts$celltypes[[j]], groupA_N=tmp[celltypeA==opts$celltypes[[i]],groupA_N][1], groupB_N=tmp[celltypeB==opts$celltypes[[j]],groupB_N][1], included=TRUE))
46 |           diff_results_list[[sprintf("%s_vs_%s",opts$celltypes[[i]],opts$celltypes[[j]])]] <- tmp
47 |         } else {
48 |           stats.dt <- rbind(stats.dt,data.table(celltypeA=opts$celltypes[[i]], celltypeB=opts$celltypes[[j]], groupA_N=tmp[celltypeA==opts$celltypes[[i]],groupA_N][1], groupB_N=tmp[celltypeB==opts$celltypes[[j]],groupB_N][1], included=FALSE)) 
49 |         }
50 |       } else {
51 |         print(sprintf("%s not found...",file))
52 |       }
53 |     }
54 |   }
55 | }
56 |  
57 | ##########
58 | ## Save ##
59 | ##########
60 | 
61 | fwrite(stats.dt[-1], file.path(args$outdir,"diff_stats.txt.gz"), sep="\t", quote=F, na="NA")
62 | fwrite(rbindlist(diff_results_list), file.path(args$outdir,"diff_results.txt.gz"), sep="\t", quote=F, na="NA")
63 | 
64 | 


--------------------------------------------------------------------------------
/atac/archR/differential/cells/genotype/analysis/load_data.R:
--------------------------------------------------------------------------------
 1 | 
 2 | #############################################
 3 | ## Load results from differential analysis ##
 4 | #############################################
 5 | 
 6 | diff.dt <- opts$celltypes %>% map(function(j) {
 7 |   file <- sprintf("%s/%s_WT_vs_KO.txt.gz", io$indir,j)
 8 |   if (file.exists(file)) {
 9 |     fread(file, select=c(1,2,3)) %>% .[,c("celltype"):=list(j)]
10 |   }
11 | }) %>% rbindlist %>%
12 |   # .[,MeanDiff:=-MeanDiff] # change sign to keep the groupB - groupA consistency
13 |   .[, sign := ifelse(MeanDiff>0,"Upregulated in KO","Downregulated in KO")] %>%
14 |   .[, sig := (FDR<=opts$threshold_fdr & abs(MeanDiff)>=opts$min.MeanDiff)]
15 | 
16 | 
17 | # Print stats
18 | print(sprintf("Number of celltypes: %s",length(unique(diff.dt$celltype))))
19 | print(sprintf("Number of features: %s",length(unique(diff.dt$idx))))
20 | 
21 | 


--------------------------------------------------------------------------------
/atac/archR/differential/cells/genotype/run_diff_acc_genotype.R:
--------------------------------------------------------------------------------
# Launcher script: for every cell type with enough WT and T_KO cells, runs (or submits)
# one differential accessibility comparison via the script stored in io$script.

# Declare where this script lives so that here::here() resolves paths from the project root
here::i_am("atac/archR/differential/cells/genotype/run_diff_acc_genotype.R")

# Project-wide settings (presumably defines `io` and `opts` and attaches the packages used below -- TODO confirm)
source(here::here("settings.R"))

######################
## Define arguments ##
######################

p <- ArgumentParser(description='')
p$add_argument('--archr_directory',    type="character",    help='ArchR directory')
# Path of the cell metadata table read with fread() further down
p$add_argument('--metadata',          type="character",   help='')
p$add_argument('--matrix',          type="character",  default="PeakMatrix",   help='Matrix to use')
# p$add_argument('--group_variable',          type="character",   help='')
p$add_argument('--outdir',          type="character",                               help='Output directory')
# Applied to both genotypes: a cell type is used only if WT and T_KO each reach this count
p$add_argument('--min_cells',       type="integer",       default=50,      help='Minimum number of cells per cell type')
# When set, only the first three eligible cell types are processed
p$add_argument('--test_mode',    action="store_true",             help='Test mode? subset data')

args <- p$parse_args(commandArgs(TRUE))

## START TEST ##
# io$basedir <- file.path(io$basedir,"test")
# args <- list()
# args$archr_directory <- file.path(io$basedir,"processed/atac/archR")
# args$metadata <- file.path(io$basedir,"results/atac/archR/qc/sample_metadata_after_qc.txt.gz")
# args$matrix <- "PeakMatrix" # "GeneScoreMatrix_TSS"
# # args$group_variable <- "celltype_genotype"
# args$min_cells <- 30
# args$outdir <- file.path(io$basedir,sprintf("results/atac/archR/differential/cells/celltype_genotype/%s",args$matrix))
# args$test_mode <- TRUE
## END TEST ##

#####################
## Define settings ##
#####################

# I/O
# Worker script that performs a single comparison; it is invoked once per cell type
io$script <- here::here("atac/archR/differential/cells/archr_differential_accessibility_cells.R")
dir.create(args$outdir, showWarnings=FALSE, recursive=TRUE)

# Options
# Forwarded to the worker script as --statistical_test
opts$statistical.test <- "wilcoxon"

# Samples considered: cells from any other sample are filtered out of the metadata
opts$samples <- c(
  "E8.5_CRISPR_T_KO",
  "E8.5_CRISPR_T_WT"
)
47 | 
48 | ########################
49 | ## Load cell metadata ##
50 | ########################
51 | 
# Keep cells from the selected samples that passed ATAC QC and have both genotype and celltype assigned
cells_metadata.dt <- fread(args$metadata)
cells_metadata.dt <- cells_metadata.dt[sample%in%opts$samples & pass_atacQC==TRUE & !is.na(genotype) & !is.na(celltype)]
cells_metadata.dt[,celltype_genotype:=sprintf("%s_%s",celltype,genotype)]

# cells_metadata.dt <- cells_metadata.dt %>%
#   .[,group:=eval(as.name(args$group_variable))] %>%

# Only consider cell types with sufficient number of cells:
# count cells per (celltype, genotype), reshape to one row per celltype and one column per genotype,
# then keep the rows where both the T_KO and the WT counts reach the minimum
stats.dt <- dcast(cells_metadata.dt[,.N,by=c("celltype","genotype")], celltype~genotype, value.var="N", fill=0)
stats.dt <- stats.dt[T_KO>=args$min_cells & WT>=args$min_cells]
celltypes.to.use <- stats.dt[,celltype]
print(stats.dt)
64 | 
65 | #########
66 | ## Run ##
67 | #########
68 | 
if (args$test_mode) {
  print("Test mode activated, running only a few comparisons...")
  celltypes.to.use <- celltypes.to.use %>% head(n=3)
}

# Decide how each comparison is launched, based on the host name:
# - hosts matching "BI": run directly
# - hosts matching "pebble" or "headstone": submit through sbatch --wrap
# - any other host: run directly (previously `lsf` was left undefined there and the loop failed)
nodename <- Sys.info()[["nodename"]]
if (grepl("BI",nodename)) {
  lsf <- ""
} else if (grepl("pebble|headstone", nodename)) {
  lsf <- "sbatch -n 1 --mem 8G --wrap"
} else {
  print(sprintf("Unrecognised host '%s', running the comparisons locally...", nodename))
  lsf <- ""
}

# j <- "NMP"
for (j in celltypes.to.use) {
  outfile <- sprintf("%s/%s_WT_vs_KO.txt.gz", args$outdir,j); dir.create(dirname(outfile), showWarnings = F, recursive = T)

  # Comparisons whose output already exists are not recomputed
  if (!file.exists(outfile)) {

    # Command line of the worker script (paths are interpolated unquoted, so they must not contain spaces)
    rscript.cmd <- sprintf("Rscript %s --archr_directory %s --metadata %s --samples %s --celltypes %s --groupA WT --groupB T_KO --matrix %s --group_variable genotype --statistical_test %s --outfile %s",
      io$script, args$archr_directory, args$metadata, paste(opts$samples,collapse=" "), j, args$matrix, opts$statistical.test, outfile)

    # Only wrap the command in single quotes when it is handed to the scheduler:
    # a quoted command with an empty prefix is parsed by the shell as a single (non-existent) program name
    if (nzchar(lsf)) {
      cmd <- sprintf("%s '%s'", lsf, rscript.cmd)
    } else {
      cmd <- rscript.cmd
    }

    # Run
    print(cmd)
    status <- system(cmd)
    if (status != 0) {
      warning(sprintf("Command for celltype '%s' exited with status %s", j, status))
    }
  }
}


# Save stats
fwrite(stats.dt, file.path(args$outdir,"diff_stats.txt"), sep="\t", quote=F)

# Completion token
# NOTE: when jobs are submitted through sbatch this only marks that submission finished, not the jobs themselves
file.create(file.path(args$outdir,"completed.txt"))


--------------------------------------------------------------------------------
/atac/archR/differential/metacells/celltype/analysis/old/load_data.R:
--------------------------------------------------------------------------------
 1 | # i <- opts$celltypes[2]; j <- opts$celltypes[1]
 2 | 
 3 | diff.dt <- opts$celltypes %>% map(function(i) { opts$celltypes %>% map(function(j) {
 4 |   file <- sprintf("%s/%s_%s_vs_%s.txt.gz", io$diff.dir,opts$matrix,i,j)
 5 |   if (file.exists(file)) {
 6 |     fread(file) %>% .[,c("celltypeA","celltypeB"):=list(as.factor(i),as.factor(j))] %>% return
 7 |   }
 8 | }) %>% rbindlist }) %>% rbindlist %>% 
 9 |   .[,MeanDiff:=-MeanDiff] %>% # change sign to keep the groupB - groupA consistency
10 |   .[,sig:=FALSE] %>% .[abs(MeanDiff)>=opts$min.MeanDiff & FDR<=opts$fdr,sig:=TRUE] %>%
11 |   .[,direction:=c("up","down")[as.numeric(MeanDiff>0)+1]]  # up = higher accessibility in celltype A
12 | 
13 | # ad hoc
14 | # if ("name"%in%colnames(atac_diff_cells.dt)) {
15 | #   atac_diff_cells.dt[,idx:=NULL] %>% setnames("name","idx")
16 | # }
17 | 


--------------------------------------------------------------------------------
/atac/archR/differential/metacells/celltype/parse_differential_results.R:
--------------------------------------------------------------------------------
 1 | here::i_am("atac/archR/differential/metacells/celltype/parse_differential_results.R")
 2 | 
 3 | # Load default settings
 4 | source(here::here("settings.R"))
 5 | 
 6 | ######################
 7 | ## Define arguments ##
 8 | ######################
 9 | 
p <- ArgumentParser(description='')
p$add_argument('--diff_results_dir',   type="character",     help='File')
p$add_argument('--min_cells',       type="integer",       default=5,      help='Minimum number of cells per group')
p$add_argument('--outdir',             type="character",     help='File')
args <- p$parse_args(commandArgs(TRUE))

## START TEST ##
# io$basedir <- file.path(io$basedir,"test")
# args <- list()
# args$diff_results_dir <- file.path(io$basedir,"results/atac/archR/differential/metacells/celltype/PeakMatrix")
# args$min_cells <- 5
# args$outdir <- file.path(io$basedir,"results/atac/archR/differential/metacells/celltype/PeakMatrix/parsed")
## END TEST ##

# I/O
dir.create(args$outdir, showWarnings=F, recursive=T)

###############################
## Load differential results ##
###############################

# Per-comparison summary rows are collected in a list and stacked once at the end.
# The typed zero-row table guarantees the output keeps its five columns even if no comparison is found.
stats_list <- list(data.table(celltypeA=character(0), celltypeB=character(0), groupA_N=integer(0), groupB_N=integer(0), included=logical(0)))
diff_results_list <- list()

# i <- "Visceral_endoderm"; j <- "Surface_ectoderm"
# Visit every unordered pair (i < j); seq_along() is safe when opts$celltypes is empty
celltype_idx <- seq_along(opts$celltypes)
for (i in celltype_idx) {
  for (j in celltype_idx[-seq_len(i)]) {

    celltypeA.name <- opts$celltypes[[i]]
    celltypeB.name <- opts$celltypes[[j]]

    file <- file.path(args$diff_results_dir,sprintf("%s_vs_%s.txt.gz",celltypeA.name,celltypeB.name))
    if (!file.exists(file)) {
      print(sprintf("%s not found...",file))
      next
    }

    tmp <- fread(file) %>%
      .[,c("celltypeA","celltypeB"):=list(celltypeA.name,celltypeB.name)]

    # Group sizes are read from the first row (NA if the table has no rows)
    groupA.n <- tmp[,groupA_N][1]
    groupB.n <- tmp[,groupB_N][1]

    # Keep the comparison only if both groups reach the minimum size.
    # isTRUE() turns a missing group size into "not included" instead of an error.
    included <- isTRUE(groupA.n>=args$min_cells && groupB.n>=args$min_cells)

    stats_list[[length(stats_list)+1]] <- data.table(celltypeA=celltypeA.name, celltypeB=celltypeB.name, groupA_N=groupA.n, groupB_N=groupB.n, included=included)
    if (included) {
      diff_results_list[[sprintf("%s_vs_%s",celltypeA.name,celltypeB.name)]] <- tmp
    }
  }
}

stats.dt <- rbindlist(stats_list)

##########
## Save ##
##########

fwrite(stats.dt, file.path(args$outdir,"diff_stats.txt.gz"), sep="\t", quote=F, na="NA")
fwrite(rbindlist(diff_results_list), file.path(args$outdir,"diff_results.txt.gz"), sep="\t", quote=F, na="NA")
63 | 
64 | 


--------------------------------------------------------------------------------
/atac/archR/differential/metacells/genotype/analysis/load_data.R:
--------------------------------------------------------------------------------
 1 | 
 2 | #############################################
 3 | ## Load results from differential analysis ##
 4 | #############################################
 5 | 
 6 | diff.dt <- opts$celltypes %>% map(function(j) {
 7 |   file <- sprintf("%s/%s_WT_vs_KO.txt.gz", io$indir,j)
 8 |   if (file.exists(file)) {
 9 |     fread(file, select=c(1,2,3)) %>% .[,c("celltype"):=list(j)]
10 |   }
11 | }) %>% rbindlist %>%
12 |   # .[,MeanDiff:=-MeanDiff] # change sign to keep the groupB - groupA consistency
13 |   .[, sign := ifelse(logFC>0,"Upregulated in KO","Downregulated in KO")] %>%
14 |   .[, sig := (padj_fdr<=opts$threshold_fdr & abs(logFC)>=opts$min.logFC)]
15 | 
16 | 
17 | # Print stats
18 | print(sprintf("Number of celltypes: %s",length(unique(diff.dt$celltype))))
19 | print(sprintf("Number of features: %s",length(unique(diff.dt$feature))))


--------------------------------------------------------------------------------
/atac/archR/differential/metacells/genotype/old/analysis/load_data.R:
--------------------------------------------------------------------------------
 1 | 
 2 | #############################################
 3 | ## Load results from differential analysis ##
 4 | #############################################
 5 | 
 6 | diff.dt <- opts$celltypes %>% map(function(j) {
 7 |   file <- sprintf("%s/%s_WT_vs_KO.txt.gz", io$indir,j)
 8 |   if (file.exists(file)) {
 9 |     fread(file, select=c(1,2,3)) %>% .[,c("celltype"):=list(j)]
10 |   }
11 | }) %>% rbindlist %>%
12 |   # .[,MeanDiff:=-MeanDiff] # change sign to keep the groupB - groupA consistency
13 |   .[, sign := ifelse(MeanDiff>0,"Downregulated in KO","Upregulated in KO")] %>%
14 |   .[, sig := (FDR<=opts$threshold_fdr & abs(MeanDiff)>=opts$min.MeanDiff)]
15 | 
16 | 
17 | # Print stats
18 | print(sprintf("Number of celltypes: %s",length(unique(diff.dt$celltype))))
19 | print(sprintf("Number of features: %s",length(unique(diff.dt$idx))))
20 | 
21 | 


--------------------------------------------------------------------------------
/atac/archR/differential/metacells/utils.R:
--------------------------------------------------------------------------------
 1 | 
 2 | # Function to differential expression
 3 | # - sce: SingleCellExperiment object with the column "group" in the colData
 4 | # - groups: the names of the two groups
 5 | # - min_detection_rate_per_group: minimum detection rate per group
 6 | calculate_diff_acc_edgeR <- function(sce, groups, min_detection_rate_per_group = 0.50) {
 7 |     
 8 |   # Sanity checks
 9 |   if (!is(sce, "SingleCellExperiment")) stop("'sce' has to be an instance of SingleCellExperiment")
10 |   stopifnot(length(groups)==2)
11 | 
12 |   # Filter genes by detection rate per group
13 |   cdr_A <- rowMeans(logcounts(sce[,sce$group==groups[1]])>0) >= min_detection_rate_per_group
14 |   cdr_B <- rowMeans(logcounts(sce[,sce$group==groups[2]])>0) >= min_detection_rate_per_group
15 |   out <- .edgeR(sce[cdr_B | cdr_A,]) %>% .[,log_padj_fdr:= -log10(padj_fdr)]
16 |   
17 |   return(out)
18 | }
19 | 
20 | 
# Fit an edgeR quasi-likelihood model on `sce` and test the last coefficient of the design
# - sce: SingleCellExperiment object with the column "group" in the colData
# Returns a data.table with the columns gene, logFC, p.value and padj_fdr
.edgeR <- function(sce) {

  # edgeR representation of the SingleCellExperiment
  dge <- scran::convertTo(sce, type="edgeR")

  # Design matrix (with intercept): per-column detection rate as covariate, plus the group
  cdr <- colMeans(logcounts(sce)>0)
  design <- model.matrix(~cdr+sce$group)

  # Dispersion estimation followed by the quasi-likelihood GLM fit
  dge <- estimateDisp(dge,design)
  qlf_fit <- glmQLFit(dge,design)

  # Quasi-likelihood F-test; with no coefficient specified glmQLFTest uses its default
  qlf_test <- glmQLFTest(qlf_fit)

  # Full results table, renamed positionally; the fourth column is labelled "LR" and,
  # together with logCPM, dropped from the output
  res <- as.data.table(topTags(qlf_test, n=nrow(qlf_test))$table, keep.rownames=T)
  setnames(res, c("gene","logFC","logCPM","LR","p.value","padj_fdr"))
  res[,c("logCPM","LR"):=NULL]

  return(res)
}
46 | 


--------------------------------------------------------------------------------
/atac/archR/differential/pseudobulk/celltype/analysis/plot_marker_peaks_stats.R:
--------------------------------------------------------------------------------
# Summary plots of the marker peaks table: number of markers per cell type and marker exclusivity.

# Load default settings
source(here::here("settings.R"))
source(here::here("utils.R"))

#####################
## Define settings ##
#####################

# NOTE: all paths below are resolved inside the "test" subdirectory of the base directory
io$basedir <- file.path(io$basedir,"test")
io$marker_peaks <- file.path(io$basedir,"results/atac/archR/differential/pseudobulk/celltype/PeakMatrix/parsed/markers_filt.txt.gz")
# dir.create() is not recursive here, so the parent directory must already exist
io$outdir <- file.path(io$basedir,"results/atac/archR/differential/pseudobulk/celltype/PeakMatrix/parsed/pdf"); dir.create(io$outdir, showWarnings = F)

# Cell types kept for plotting; markers of any other cell type (including the commented-out ones) are filtered out
opts$celltypes <- c(
  "Epiblast",
  "Primitive_Streak",
  "Caudal_epiblast",
  "PGC",
  "Anterior_Primitive_Streak",
  "Notochord",
  "Def._endoderm",
  "Gut",
  "Nascent_mesoderm",
  "Mixed_mesoderm",
  "Intermediate_mesoderm",
  "Caudal_Mesoderm",
  "Paraxial_mesoderm",
  "Somitic_mesoderm",
  "Pharyngeal_mesoderm",
  "Cardiomyocytes",
  "Allantois",
  "ExE_mesoderm",
  "Mesenchyme",
  "Haematoendothelial_progenitors",
  "Endothelium",
  "Blood_progenitors_1",
  "Blood_progenitors_2",
  "Erythroid1",
  "Erythroid2",
  "Erythroid3",
  "NMP",
  "Rostral_neurectoderm",
  # "Caudal_neurectoderm",
  "Neural_crest",
  "Forebrain_Midbrain_Hindbrain",
  "Spinal_cord",
  "Surface_ectoderm"
  # "Visceral_endoderm"
  # "ExE_endoderm",
  # "ExE_ectoderm"
  # "Parietal_endoderm"
)
 52 | 
 53 | ###############################
 54 | ## Load differential results ##
 55 | ###############################
 56 | 
 57 | marker_peaks.dt <- fread(io$marker_peaks) %>% .[celltype%in%opts$celltypes]
 58 | 
 59 | ###############################################
 60 | ## Plot number of marker peaks per cell type ##
 61 | ###############################################
 62 | 
 63 | to.plot <- marker_peaks.dt %>% .[,.N,by=c("celltype")]
 64 | 
 65 | p <- ggbarplot(to.plot, x="celltype", y="N", fill="celltype") +
 66 |   scale_fill_manual(values=opts$celltype.colors) +
 67 |   labs(x="", y="Number of marker peaks") +
 68 |   theme(
 69 |     axis.text.y = element_text(size=rel(0.65)),
 70 |     axis.text.x = element_text(colour="black",size=rel(0.7), angle=90, hjust=1, vjust=0.5),
 71 |     axis.title = element_text(colour="black",size=rel(0.75)),
 72 |     axis.ticks.x = element_blank(),
 73 |     legend.position = "none"
 74 | )
 75 | 
 76 | pdf(file.path(io$outdir,"barplot_number_marker_peaks.pdf"), width = 6, height = 4)
 77 | print(p)
 78 | dev.off()
 79 | 
 80 | ##################################
 81 | ## Plot gene marker exclusivity ##
 82 | ##################################
 83 | 
 84 | to.plot <- marker_peaks.dt %>%
 85 |   .[,.(Nx=.N),by="gene"] %>%
 86 |   .[,Nx:=factor(Nx)] %>%
 87 |   .[,.(Ny=.N),by="Nx"]
 88 | 
 89 | p <- ggbarplot(to.plot, x="Nx", y="Ny", fill="gray70") +
 90 |   labs(x="Number of different cell types per marker peak", y="") +
 91 |   theme(
 92 |     axis.text = element_text(size=rel(0.75)),
 93 |   )
 94 | 
 95 | pdf(file.path(io$outdir,"boxplot_exclusivity_per_gene.pdf"), width = 7, height = 5)
 96 | print(p)
 97 | dev.off()
 98 | 
 99 | ################################################
100 | ## Plot gene marker exclusivity per cell type ##
101 | ################################################
102 | 
# Annotate every marker row with N, the number of rows sharing its "gene" value.
# The column is added by reference, so marker_peaks.dt itself gains the column N.
marker_peaks.dt[,N:=.N,by="gene"]
to.plot <- marker_peaks.dt

boxplot.theme <- theme(
  axis.text.y = element_text(size=rel(0.75)),
  axis.title.y = element_text(size=rel(0.85)),
  axis.text.x = element_text(colour="black",size=rel(0.7), angle=90, hjust=1, vjust=0.5),
  legend.position = "none"
)

p <- ggboxplot(to.plot, x="celltype", y="N", fill="celltype", color="black")
p <- p + scale_fill_manual(values=opts$celltype.colors)
p <- p + labs(x="", y="Exclusivity of gene markers\n(the smaller the more exclusive)")
p <- p + boxplot.theme

pdf(file.path(io$outdir,"boxplot_exclusivity_per_celltype.pdf"), width = 9, height = 5)
print(p)
dev.off()
118 | 
119 | 
120 | 


--------------------------------------------------------------------------------
/atac/archR/differential/pseudobulk/celltype/parse_differential_results.R:
--------------------------------------------------------------------------------
 1 | here::i_am("atac/archR/differential/pseudobulk/celltype/parse_differential_results.R")
 2 | 
 3 | # Load default settings
 4 | source(here::here("settings.R"))
 5 | 
 6 | ######################
 7 | ## Define arguments ##
 8 | ######################
 9 | 
p <- ArgumentParser(description='')
p$add_argument('--diff_results_dir',   type="character",     help='File')
p$add_argument('--outdir',             type="character",     help='File')
args <- p$parse_args(commandArgs(TRUE))

## START TEST ##
# io$basedir <- file.path(io$basedir,"test")
# args <- list()
# args$diff_results_dir <- file.path(io$basedir,"results/atac/archR/differential/metacells/celltype/PeakMatrix")
# args$outdir <- file.path(io$basedir,"results/atac/archR/differential/metacells/celltype/PeakMatrix/parsed")
## END TEST ##

# I/O
dir.create(args$outdir, showWarnings=F, recursive=T)

###############################
## Load differential results ##
###############################

diff_results_list <- list()

# i <- "Visceral_endoderm"; j <- "Surface_ectoderm"
# Visit every unordered pair (i < j); seq_along() is safe when opts$celltypes is empty
celltype_idx <- seq_along(opts$celltypes)
for (i in celltype_idx) {
  for (j in celltype_idx[-seq_len(i)]) {

    celltypeA.name <- opts$celltypes[[i]]
    celltypeB.name <- opts$celltypes[[j]]

    file <- file.path(args$diff_results_dir,sprintf("%s_vs_%s.txt.gz",celltypeA.name,celltypeB.name))
    if (!file.exists(file)) {
      print(sprintf("%s not found...",file))
      next
    }

    # Tag every row with the pair of cell types it belongs to
    tmp <- fread(file) %>% .[,c("celltypeA","celltypeB"):=list(celltypeA.name,celltypeB.name)]
    diff_results_list[[sprintf("%s_vs_%s",celltypeA.name,celltypeB.name)]] <- tmp
  }
}

##########
## Save ##
##########

fwrite(rbindlist(diff_results_list), file.path(args$outdir,"diff_results.txt.gz"), sep="\t", quote=F, na="NA")

##########
## TEST ##
##########

# Ad hoc sanity check on a fixed results file: number of features that pass the thresholds
# in at least two comparisons. The path is machine-specific, so the check is skipped when the
# file is not there instead of failing the script after the results have been saved.
test.file <- "/bi/group/reik/ricard/data/gastrulation_multiome_10x/test/results/atac/archR/differential/pseudobulk/celltype/PeakMatrix/parsed/diff_results.txt.gz"
if (file.exists(test.file)) {
  tmp <- fread(test.file) %>%
    .[abs(logFC)>=2 & padj_fdr<=0.01 & (mean_groupA>=2.5 | mean_groupB>=2.5)] %>% .[,.N,by="feature"]

  print(sum(tmp$N>=2))
}


--------------------------------------------------------------------------------
/atac/archR/differential/pseudobulk/celltype_genotype/analysis/load_data.R:
--------------------------------------------------------------------------------
 1 | 
 2 | #############################################
 3 | ## Load results from differential analysis ##
 4 | #############################################
 5 | 
 6 | diff.dt <- opts$celltypes %>% map(function(j) {
 7 |   file <- sprintf("%s/%s_WT_vs_KO.txt.gz", io$indir,j)
 8 |   if (file.exists(file)) {
 9 |     fread(file, select=c(1,2,3)) %>% .[,c("celltype"):=list(j)]
10 |   }
11 | }) %>% rbindlist %>%
12 |   # .[,MeanDiff:=-MeanDiff] # change sign to keep the groupB - groupA consistency
13 |   .[, sign := ifelse(logFC>0,"Upregulated in KO","Downregulated in KO")] %>%
14 |   .[, sig := (padj_fdr<=opts$threshold_fdr & abs(logFC)>=opts$min.logFC)]
15 | 
16 | 
17 | # Print stats
18 | print(sprintf("Number of celltypes: %s",length(unique(diff.dt$celltype))))
19 | print(sprintf("Number of features: %s",length(unique(diff.dt$feature))))


--------------------------------------------------------------------------------
/atac/archR/differential/pseudobulk/celltype_genotype/old/old_stuff.R:
--------------------------------------------------------------------------------
 1 | # i="T_789"; j <- "NMP"
 2 | tfs.to.use <- colnames(virtual_chip.mtx)
 3 | celltypes.to.use <- "NMP"
 4 | # i <- "T"
 5 | tf_enrichment_insilico_chip.dt <- tfs.to.use %>% map(function(i) { 
 6 |   print(i)
 7 |   
 8 |   celltypes.to.use %>% map(function(j) {
 9 |     
10 |     foreground.peaks <- diff.dt[celltype==j & sign=="Downregulated in KO" & sig==TRUE,feature]
11 |     background.peaks <- diff.dt[celltype==j & sig==FALSE,feature]
12 |     foreground.nmatches <- virtual_chip_logical.mtx[foreground.peaks,i] %>% sum
13 |     background.nmatches <- virtual_chip_logical.mtx[background.peaks,i] %>% sum
14 |     
15 |     p.value <- phyper(foreground.nmatches-1, background.nmatches, length(background.peaks)-background.nmatches,length(foreground.peaks), lower.tail = F)
16 | 
17 |     data.table(tf=i, celltype=j, pval=p.value)
18 |   }) %>% rbindlist }) %>% rbindlist
19 | 
20 | 
21 | to.plot <- tf_enrichment_insilico_chip.dt[pval<=0.10] %>% 
22 |   .[,pval:=as.numeric(pval)] %>%
23 |   .[,log_pval:=-log10(pval)] %>%
24 |   .[,celltype:=factor(celltype,levels=celltypes.to.use)]
25 | 
26 | to.plot[,dot_size:=minmax.normalisation(abs(log_pval))]
27 | 
28 | to.plot.text <- to.plot[pval<=0.01]
29 | 
30 | ggplot(to.plot[pval<=0.01], aes_string(x="log_pval", y="tf", size="dot_size")) +
31 |   geom_point(shape=21) +
32 |   # scale_x_discrete(drop=F) +
33 |   # scale_size_continuous(range = c(0.25,2)) +
34 |   # guides(x = guide_axis(angle = 90)) +
35 |   theme_classic() +
36 |   theme(
37 |     axis.text.x = element_text(color="black", size=rel(0.75)),
38 |     axis.text.y = element_text(color="black")
39 |   )
40 | 


--------------------------------------------------------------------------------
/atac/archR/differential/pseudobulk/celltype_genotype/parse_differential_results.R:
--------------------------------------------------------------------------------
 1 | here::i_am("atac/archR/differential/pseudobulk/celltype_genotype/parse_differential_results.R")
 2 | 
 3 | # Load default settings
 4 | source(here::here("settings.R"))
 5 | # source(here::here("utils.R"))
 6 | 
 7 | ######################
 8 | ## Define arguments ##
 9 | ######################
10 | 
p <- ArgumentParser(description='')
p$add_argument('--diff_results_dir',   type="character",     help='File')
p$add_argument('--outfile',             type="character",     help='File')
args <- p$parse_args(commandArgs(TRUE))

## START TEST ##
# io$basedir <- file.path(io$basedir,"test")
# args <- list()
# args$diff_results_dir <- file.path(io$basedir,"results/atac/archR/differential/pseudobulk/celltype_genotype/PeakMatrix")
# args$outfile <- file.path(io$basedir,"results/atac/archR/differential/pseudobulk/celltype_genotype/PeakMatrix/parsed/diff_results.txt.gz")
## END TEST ##

# I/O
dir.create(dirname(args$outfile), showWarnings = F, recursive = T)

###############################
## Load differential results ##
###############################

diff_results_list <- list()

# i <- "Visceral_endoderm"; j <- "Surface_ectoderm"
# seq_along() yields no iterations when opts$celltypes is empty
# (1:length() would iterate over c(1,0) and fail on the first subscript)
for (i in seq_along(opts$celltypes)) {
  celltype.name <- opts$celltypes[[i]]

  file <- file.path(args$diff_results_dir,sprintf("%s.txt.gz",celltype.name))
  if (!file.exists(file)) {
    print(sprintf("%s not found...",file))
    next
  }

  tmp <- fread(file) %>% .[,celltype:=celltype.name]

  # Tables with at most one row are left out of the merged output
  if (nrow(tmp)>1) {
    diff_results_list[[celltype.name]] <- tmp
  }
}

# Cell types that made it into the merged table
print(names(diff_results_list))

##########
## Save ##
##########

fwrite(rbindlist(diff_results_list), args$outfile, sep="\t", quote=F, na="NA")
52 | 
53 | 


--------------------------------------------------------------------------------
/atac/archR/differential/utils.R:
--------------------------------------------------------------------------------
 1 | gg_volcano_plot <- function(to.plot, top_genes=10, xlim=NULL, ylim=NULL, label_groups = NULL) {
 2 |   
 3 |   negative_hits <- to.plot[sig==TRUE & MeanDiff<0,idx]
 4 |   positive_hits <- to.plot[sig==TRUE & MeanDiff>0,idx]
 5 |   all <- nrow(to.plot)
 6 |   
 7 |   if (is.null(xlim))
 8 |     xlim <- max(abs(to.plot$MeanDiff), na.rm=T)
 9 |   if (is.null(ylim))
10 |     ylim <- max(-log10(to.plot$FDR), na.rm=T)
11 |   
12 |   p <- ggplot(to.plot, aes(x=MeanDiff, y=-log10(FDR))) +
13 |     # ggrastr::geom_point_rast(aes(color=sig), size=1) +
14 |     geom_point(aes(color=sig), size=1) +
15 |     # geom_hline(yintercept = -log10(opts$threshold_fdr), color="blue") +
16 |     geom_segment(aes(x=0, xend=0, y=0, yend=ylim-1), color="orange") +
17 |     scale_color_manual(values=c("black","red")) +
18 |     # scale_x_continuous(limits=c(-xlim-10,xlim+10)) +
19 |     scale_x_continuous(limits=c(-xlim,xlim)) +
20 |     scale_y_continuous(limits=c(0,ylim+2.5)) +
21 |     labs(x="Accessibility difference (%)", y=expression(paste("-log"[10],"(FDR)"))) +
22 |     annotate("text", x=0, y=ylim+1, size=7, label=sprintf("(%d)", all)) +
23 |     annotate("text", x=-50, y=ylim+2, size=7, label=sprintf("%d (-)",length(negative_hits))) +
24 |     annotate("text", x=50, y=ylim+2, size=7, label=sprintf("%d (+)",length(positive_hits))) +
25 |     # ggrepel::geom_text_repel(data=head(to.plot[sig==T],n=top_genes), aes(x=MeanDiff, y=-log10(FDR), label=symbol), size=5) +
26 |     theme_classic() +
27 |     theme(
28 |       axis.text = element_text(size=rel(1.00), color='black'),
29 |       axis.title = element_text(size=rel(1.50), color='black'),
30 |       # axis.title = element_text(),
31 |       legend.position="none"
32 |     )
33 | 
34 |   if (length(label_groups)>0) {
35 |     p <- p +
36 |       annotate("text", x=-70, y=0, size=4.5, label=sprintf("Up in %s",label_groups[2])) +
37 |       annotate("text", x=70, y=0, size=4.5, label=sprintf("Up in %s",label_groups[1]))
38 |   }
39 | 
40 |   return(p)
41 | }
42 | 


--------------------------------------------------------------------------------
/atac/archR/feature_stats/plot_feature_stats_atac.R:
--------------------------------------------------------------------------------
# Boxplots of average chromatin accessibility per peak type.
# TO-DO: USE OUTPUT OF SAVE ATAC MATRICES
# NOTE(review): the path declared below names archR_calculate_feature_stats.R, not this file
# (plot_feature_stats_atac.R) -- confirm it is intentional
here::i_am("atac/archR/feature_stats/archR_calculate_feature_stats.R")

source(here::here("settings.R"))
source(here::here("utils.R"))

suppressPackageStartupMessages(library(ArchR))
suppressPackageStartupMessages(library(sparseMatrixStats))


######################
## Define arguments ##
######################

p <- ArgumentParser(description='')
p$add_argument('--feature_stats',    type="character",    help='feature stats file')
p$add_argument('--outdir',     type="character",    help='Output directory')
args <- p$parse_args(commandArgs(TRUE))

## START TEST ##
# args$feature_stats <- file.path(io$basedir, "results/atac/archR/feature_stats/PeakMatrix_feature_stats.txt.gz")
# args$outdir <- file.path(io$basedir, "results/atac/archR/feature_stats/PeakMatrix/pdf")
## END TEST ##

dir.create(args$outdir, showWarnings=F, recursive = T)

########################
## Load feature stats ##
########################

# NOTE(review): nothing is loaded in this section; args$feature_stats is declared above but never read

##########
## Plot ##
##########

# NOTE(review): peak_metadata.dt and peak_stats.dt are not created anywhere in this script, and no
# --min_peak_score argument is declared, so this block only works if they are supplied elsewhere
# (e.g. by settings.R/utils.R or an interactive session) -- TODO confirm
# Peaks above the score threshold, joined with their statistics on the "peak" column
to.plot <- peak_metadata.dt %>% 
  .[score>=args$min_peak_score] %>%
  merge(peak_stats.dt,by="peak")

# One box per peak type; outliers are hidden and the y axis is restricted to [0, 1]
p <- ggboxplot(to.plot, x="peakType", y="mean_singlecell", fill="peakType", outlier.shape = NA) +
  coord_cartesian(ylim=c(0,1.0)) +
  labs(x="", y="Average chromatin accessibility") +
  theme(
    axis.text = element_text(size=rel(0.75)),
    legend.position = "none"
  )

pdf(file.path(args$outdir,"boxplots_atac_peak_type.pdf"), width = 7, height = 5)
print(p)
dev.off()
50 | 
51 | 
52 | ##########
53 | ## TEST ##
54 | ##########
55 | 
56 | # to.plot <- feature_stats.dt %>% head(n=1e4)
57 | # to.plot <- feature_stats.dt[var_pseudobulk>=0.10 & var_metacells<1]
58 | # 
59 | # ggscatter(to.plot, x="var_metacells", y="var_pseudobulk", size=1) + 
60 | #   geom_abline(slope=1, intercept=0)
61 | # 
62 | # ggscatter(to.plot, x="var_cells", y="var_pseudobulk", size=1) + 
63 | #   geom_abline(slope=1, intercept=0)
64 | # 
65 | # ggscatter(to.plot, x="mean_metacells", y="var_metacells", size=1) + 
66 | #   stat_smooth(method="loess")
67 | # 
68 | # ggscatter(to.plot, x="mean_pseudobulk", y="var_pseudobulk", size=0.5) + 
69 | #   stat_smooth(method="loess")
70 | 
71 | 


--------------------------------------------------------------------------------
/atac/archR/load_archR_project.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages(library(ArchR))
 2 | 
 3 | #####################
 4 | ## Define settings ##
 5 | #####################
 6 | 
 7 | io$archR.directory <- file.path(io$basedir,"processed/atac/archR")
 8 | # io$archR.directory <- file.path(io$basedir,"test/processed/atac/archR")
 9 | # io$atac.peak.annotation <- file.path(io$basedir,"/original/atac_peak_annotation.tsv")
10 | io$archR.projectMetadata <- file.path(io$archR.directory,"projectMetadata.rds")
11 | io$archR.peakSet.granges <- file.path(io$archR.directory,"PeakSet.rds")
12 | 
13 | setwd(io$archR.directory)
14 | 
15 | ####################
16 | ## Define options ##
17 | ####################
18 | 
19 | addArchRGenome("mm10")
20 | addArchRThreads(threads = 1) 
21 | 
22 | ########################
23 | ## Load ArchR project ##
24 | ########################
25 | 
26 | ArchRProject <- loadArchRProject(io$archR.directory)
27 | 
28 | # Load ArchR projectMetadata: if projectMetadata.rds exists, it replaces the slot of the loaded project
29 | if (file.exists(io$archR.projectMetadata)) {
30 | 	ArchRProject@projectMetadata <- readRDS(io$archR.projectMetadata)
31 | }
32 | 
33 | # Load peaks: if PeakSet.rds exists, it is set as the project's peak set (force = TRUE)
34 | if (file.exists(io$archR.peakSet.granges)) {
35 | 	ArchRProject <- addPeakSet(ArchRProject, peakSet = readRDS(io$archR.peakSet.granges), force = TRUE)
36 | }
37 | 
38 | # Load motif annotations over peaks: only attempted when the peakAnnotation slot is not NULL
39 | if (!is.null(ArchRProject@peakAnnotation)) {
40 | 	io$archR.peakAnnotation <- file.path(io$archR.directory,"Annotations/peakAnnotation.rds")
41 | 	if (file.exists(io$archR.peakAnnotation)) { # when the file is absent the slot is left as loaded
42 | 		ArchRProject@peakAnnotation <- readRDS(io$archR.peakAnnotation)
43 | 	}
44 | }
45 | 
46 | # Add background peaks: register Background-Peaks.rds only when the peak set has no bgdPeaks entry pointing to an existing file
47 | if (!is.null(getPeakSet(ArchRProject))) {
48 | 	io$archR.bgdPeaks <- file.path(io$archR.directory, "Background-Peaks.rds")
49 | 	bgdPeaks.current <- metadata(getPeakSet(ArchRProject))$bgdPeaks # NULL when no entry is stored in the peak set metadata
50 | 	bgdPeaks.missing <- is.null(bgdPeaks.current) || !file.exists(bgdPeaks.current)
51 | 	if (bgdPeaks.missing && file.exists(io$archR.bgdPeaks)) metadata(ArchRProject@peakSet)$bgdPeaks <- io$archR.bgdPeaks
52 | }
53 | 
54 | ##########
55 | ## TEST ##
56 | ##########
57 | 
58 | # ArchRProject@peakSet <- readRDS(io$archR.peakSet.granges)
59 | # seqlevels(ArchRProject@peakSet) <- sort(seqlevels(ArchRProject@peakSet))
60 | # ArchRProject@peakSet <- sort(ArchRProject@peakSet)
61 | 
62 | 
63 | # getAvailableMatrices(ArchRProject)
64 | 
65 | # io$arrow.files <- opts$samples %>% 
66 | #   # map_chr(~ sprintf("%s/%s.arrow",io$archR.directory,.))
67 | #   map_chr(~ sprintf("%s.arrow",.))
68 | # 
69 | # ArchRProject <- ArchRProject(
70 | #   ArrowFiles = io$arrow.files,
71 | #   # outputDirectory = "ArchROutput",
72 | #   outputDirectory = io$archR.directory,
73 | #   copyArrows = FALSE
74 | # )
75 | # saveArchRProject(ArchRProject)
76 | 


--------------------------------------------------------------------------------
/atac/archR/load_motif_annotation.R:
--------------------------------------------------------------------------------
 1 | # opts$motif_annotation <- "Motif_cisbp" # "Motif_JASPAR2020" 
 2 | # Builds `motif2gene.dt` (one row per motif, with the gene it is assigned to) for opts$motif_annotation, cached on disk
 3 | motif2gene_file <- sprintf("%s/Annotations/%s_TFs.txt.gz",io$archR.directory,opts$motif_annotation)
 4 | 
 5 | if (file.exists(motif2gene_file)) {
 6 | 
 7 |   motif2gene.dt <- fread(motif2gene_file) # reuse the table written by a previous run
 8 | 
 9 | } else {
10 | 
11 |   peakAnnotation <- readRDS(sprintf("%s/Annotations/peakAnnotation.rds",io$archR.directory))
12 |   stopifnot(opts$motif_annotation%in%names(peakAnnotation))
13 |   motif2gene.dt <- peakAnnotation[[opts$motif_annotation]]$motifSummary %>%
14 |     as.data.table(keep.rownames = TRUE) %>% setnames("rn","motif") %>% .[,strand:=NULL] %>% setnames("symbol","gene")
15 | 
16 |   # Rename genes (the names of tf2gene_rename are regular expressions, the values their replacements)
17 |   if (grepl("cisbp",opts$motif_annotation, ignore.case = TRUE)) {
18 |     
19 |     tf2gene_rename <- c(
20 |       "Tcfe"="Tfe", "Nkx1"="Nkx1-", "Nkx2"="Nkx2-", "Nkx3"="Nkx3-", "Nkx4"="Nkx4-", "Nkx5"="Nkx5-", "Nkx6"="Nkx6-", "Foxf1a"="Foxf1",
21 |       "Hmga1rs1"="rs1", "Mycl1$"="Mycl", "Dux$"="Duxf3", "Duxbl$"="Duxbl1", "Pit1$"="Prop1",
22 |       "ENSMUSG00000079994"="Sox1", "Tcfap"="Tfap"
23 |     )
24 |     
25 |     motif2gene.dt[,gene:=stringr::str_replace_all(gene,tf2gene_rename)]
26 |     
27 |   } else if (grepl("JASPAR",opts$motif_annotation, ignore.case = TRUE)) {
28 |     
29 |     # conflictive motifs: fusion proteins (UN::JUNB) and versions (TFAP2A(var.2))
30 |     # stop("To-do")
31 |     tf2gene_rename <- c("TBXT"="T")
32 |     motif2gene.dt[,gene:=stringr::str_replace_all(gene,tf2gene_rename)]
33 |     
34 |     # for JASPAR motifs (stringr:: prefix used as in the calls above, so stringr does not need to be attached)
35 |     motif2gene.dt[,motif:=stringr::str_replace(motif,"\\.VAR\\.","\\.var\\.")]
36 |     # NOTE(review): toupper(motif) below turns ".var." back into ".VAR." - confirm which casing is wanted
37 |   } else {
38 |     stop("Motif annotation not recognised")
39 |   }
40 |   # Motif and gene names are stored in upper case
41 |   motif2gene.dt[,c("motif","gene"):=list(toupper(motif),toupper(gene))]
42 | 
43 |   # Save (tab-separated) so that the next run takes the file.exists() branch above
44 |   fwrite(motif2gene.dt, motif2gene_file, sep="\t", quote=FALSE)
45 | }
46 | 
47 | 
48 | 


--------------------------------------------------------------------------------
/atac/archR/peak_calling/README.txt:
--------------------------------------------------------------------------------
1 | 
2 | PijuanSala:
3 | We called peaks on the pooled sample of high-quality barcodes using macs2 callpeak (ref. 50; macs2 2.1.0.20150420) with 'P = 0.05, --nomodel, --shift 0, --extsize 150' and discarded peaks falling in blacklisted mm10 genomic regions from the ENCODE Project Consortium (ref. 51) using bedtools intersect (v2.21.0). The resulting peak summits were extended ±250 bp and subsequently merged with the promoter coordinates of genes from Ensembl GRCm38.92 (from TSS to TSS − 500 bp) using bedtools merge (v2.21.0).


--------------------------------------------------------------------------------
/atac/archR/peak_calling/analysis/calculate_cpg_density_atac_peaks.R:
--------------------------------------------------------------------------------
 1 | library(BSgenome.Mmusculus.UCSC.mm10)
 2 | library(Biostrings)
 3 | 
 4 | #####################
 5 | ## Define settings ##
 6 | #####################
 7 | # settings.R / utils.R are expected to provide `io` (io$basedir and io$archR.peak.metadata are read below)
 8 | source(here::here("settings.R"))
 9 | source(here::here("utils.R"))
10 | # Output table: one row per peak (idx, cpg_density); its directory is not created by this script
11 | io$outfile <- file.path(io$basedir,"results_new/atac/archR/peak_calling/cpg_density_peaks.txt.gz")
12 | 
13 | ################
14 | ## Load peaks ##
15 | ################
16 | # Keep the peak coordinates and build the identifier "chr:start-end" in column idx
17 | peaks.dt <- fread(io$archR.peak.metadata) %>%
18 |   .[,c("chr","start","end")] %>%
19 |   .[,idx:=sprintf("%s:%s-%s",chr,start,end)]# %>%
20 |   # .[,c("idx")]
21 | 
22 | 
23 | ####################################
24 | ## Calculate CpG density per peak ##
25 | ####################################
26 | 
27 | # Length of every chromosome that carries at least one peak, joined onto the peak table
28 | chromosomes <- unique(peaks.dt$chr)
29 | chr_lengths.dt <- data.table(chr = chromosomes, chr_length = seqlengths(Mmusculus)[chromosomes])
30 | peaks.dt <- merge(peaks.dt, chr_lengths.dt, by="chr")
31 | 
32 | # Keep only peaks that end strictly before the chromosome end, so that end+1 is still a valid position
33 | peaks.dt <- peaks.dt[end<chr_length]
34 | 
35 | # Sequence of every peak, extended by one base after its end
36 | peak_seqs <- getSeq(Mmusculus, peaks.dt$chr, peaks.dt$start, peaks.dt$end+1)
37 | 
38 | # CpG density: number of CG dinucleotides divided by the sequence width, rounded to 4 decimals
39 | n_cpg <- dinucleotideFrequency(peak_seqs)[,"CG"]
40 | peaks.dt$cpg_density <- round(n_cpg/width(peak_seqs),4)
41 | 
42 | ##################
43 | ## Save results ##
44 | ##################
45 | # Only the peak identifier and its CpG density are written (tab-separated, with header)
46 | fwrite(peaks.dt[,c("idx","cpg_density")], io$outfile, col.names=T, quote=F, sep="\t")
47 | 


--------------------------------------------------------------------------------
/atac/archR/peak_calling/analysis/link_TFs2genes_motifmatchr.R:
--------------------------------------------------------------------------------
 1 | # NOTE: unfinished script - the stop() below aborts execution before any of the following code is reached
 2 | stop("TO FINISH")
 3 | 
 4 | #####################
 5 | ## Define settings ##
 6 | #####################
 7 | # settings.R / utils.R are sourced from hard-coded paths, chosen by matching the machine's node name
 8 | if (grepl("ricard",Sys.info()['nodename'])) {
 9 |   source("/Users/ricard/gastrulation_multiome_10x/settings.R")
10 |   source("/Users/ricard/gastrulation_multiome_10x/utils.R")
11 | } else if (grepl("ebi",Sys.info()['nodename'])) {
12 |   source("/homes/ricard/gastrulation_multiome_10x/settings.R")
13 |   source("/homes/ricard/gastrulation_multiome_10x/utils.R")
14 | } else {
15 |   stop("Computer not recognised")
16 | }
17 | 
18 | # I/O
19 | io$outdir <- paste0(io$basedir,"/results/atac/archR/peak_calling/TF2genes")
20 | 
21 | # Options (`opts` is re-created here as an empty list, discarding any value it held before)
22 | opts <- list()
23 | opts$gene_window <- 1e5 # window length for the overlap
24 | 
25 | ###############
26 | ## Load data ##
27 | ###############
28 | 
29 | # Load gene metadata: drop the first "chr" from chromosome names, rename symbol -> gene, key by (chr,start,end)
30 | gene_metadata <- fread(io$gene_metadata) %>% 
31 |   .[,chr:=as.factor(sub("chr","",chr))] %>%
32 |   setnames("symbol","gene") %>%
33 |   .[, c("chr","start","end","gene")] %>%
34 |   setkey(chr,start,end)
35 | 
36 | # Load peak metadata: same chromosome naming and key as the genes; peak identifier is "chr_start_end" without the "chr" prefix
37 | peakSet.dt <- fread(io$archR.peak.metadata) %>%
38 |   .[,chr:=as.factor(sub("chr","",chr))] %>%
39 |   .[,c("chr","start","end")] %>%
40 |   .[,peak:=sprintf("%s_%s_%s",chr,start,end)] %>%
41 |   setkey(chr,start,end)
42 | 
43 | ##########################
44 | ## Load peak2gene links ##
45 | ##########################
46 | # (nothing is loaded in this section yet)
47 | #################################
48 | ## Load motifmatcher Matches ##
49 | #################################
50 | 
51 | motifmatcher.se <- readRDS(sprintf("%s/Annotations/Motif_cisbp-Matches-In-Peaks.rds",io$archR.directory))
52 | 
53 | # Rename TFs: upper-case the column names, keep the text before the first "_", then drop columns whose name is duplicated
54 | colnames(motifmatcher.se) <- colnames(motifmatcher.se) %>% toupper %>% stringr::str_split(.,"_") %>% map_chr(1)
55 | motifmatcher.se <- motifmatcher.se[,!duplicated(colnames(motifmatcher.se))]
56 | 
57 | # Rename peaks to "seqname:start-end" (NOTE(review): this differs from the "chr_start_end" format of peakSet.dt$peak - confirm)
58 | tmp <- rowRanges(motifmatcher.se)
59 | rownames(motifmatcher.se) <- sprintf("%s:%s-%s",seqnames(tmp), start(tmp), end(tmp))
60 | 
61 | # Subset peaks
62 | # motifmatcher.se <- motifmatcher.se[unique(cor_dt$peak),]
63 | 
64 | #############
65 | ## Overlap ##
66 | #############
67 | # Overlap peaks with gene coordinates; peaks without any match are kept with NA gene columns (nomatch = NA)
68 | ov <- foverlaps(
69 |   peakSet.dt,
70 |   gene_metadata[, c("chr","start","end","gene")],
71 |   nomatch = NA
72 | ) %>% 
73 |   setnames(c("i.start","i.end"),c("peak.start","peak.end")) %>%
74 |   setnames(c("start","end"),c("gene.start","gene.end")) %>%
75 |   .[,c("gene.start","gene.end") := list (gene.start+opts$gene_window, gene.end-opts$gene_window)] %>% # NOTE(review): gene_metadata is not extended by opts$gene_window anywhere above, yet the window is removed here - confirm
76 |   # .[,c("start_dist","end_dist"):=list( abs(gene.end-peak.start), abs(gene.start-peak.end))] %>%
77 |   .[,c("start_dist","end_dist"):=list( gene.end-peak.start, gene.start-peak.end)] %>%
78 |   .[,c("start_dist","end_dist"):=list( ifelse(end_dist<0 & start_dist>0,0,start_dist), ifelse(end_dist<0 & start_dist>0,0,end_dist) )] %>%
79 |   .[,dist:=ifelse(abs(start_dist)<abs(end_dist),abs(start_dist),abs(end_dist))] %>% .[,c("start_dist","end_dist"):=NULL]
80 | # dist is 0 when end_dist<0 and start_dist>0 (peak and gene overlap), otherwise the smaller of |start_dist| and |end_dist|
81 | # Select nearest gene: rows with the minimum dist per peak, drop rows containing NA, keep the first row per peak
82 | ov_nearest <- ov %>%
83 |   .[.[,.I[dist==min(dist)], by=c("peak")]$V1] %>%
84 |   .[complete.cases(.)] %>%
85 |   .[!duplicated(peak)]
86 | 
87 | # Sanity check  
88 | # ov_nearest$gene[(duplicated(ov_nearest$peak))]
89 | 
90 | ##########
91 | ## Save ##
92 | ##########
93 | # io$outdir is not created in this script; missing values are written as "NA"
94 | fwrite(ov, paste0(io$outdir,"/peaks2genes_all.txt.gz"), sep="\t", na="NA")
95 | fwrite(ov_nearest, paste0(io$outdir,"/peaks2genes_nearest.txt.gz"), sep="\t", na="NA")
96 | 


--------------------------------------------------------------------------------
/atac/archR/peak_calling/analysis/link_peaks2genes_genomic_distance.R:
--------------------------------------------------------------------------------
 1 | here::i_am("atac/archR/peak_calling/analysis/link_peaks2genes_genomic_distance.R")
 2 | # Links ATAC peaks to the genes whose window (gene body extended by --gene_window on each side) they overlap
 3 | source(here::here("settings.R"))
 4 | source(here::here("utils.R"))
 5 | 
 6 | ################################
 7 | ## Initialize argument parser ##
 8 | ################################
 9 | # ArgumentParser is not loaded in this file; presumably attached by settings.R/utils.R - TODO confirm
10 | p <- ArgumentParser(description='')
11 | p$add_argument('--gene_metadata',  type="character",                help='Gene metadata') 
12 | p$add_argument('--peak_metadata',  type="character",                help='Peak metadata') 
13 | p$add_argument('--gene_window',  type="integer",  default=1e5,               help='Genomic window size') 
14 | p$add_argument('--outdir',  type="character",                help='Output directory') 
15 | args <- p$parse_args(commandArgs(TRUE))
16 | 
17 | #####################
18 | ## Define settings ##
19 | #####################
20 | 
21 | ## START TEST ##
22 | # args <- list()
23 | # args$gene_metadata <- io$gene_metadata
24 | # args$peak_metadata <- file.path(io$basedir,"processed_new/atac/archR/PeakCalls/peak_metadata.tsv.gz")
25 | # args$gene_window <- 5e5  # maximum window length for the overlap
26 | # args$outdir <- file.path(io$basedir,"results_new/atac/archR/peak_calling/peaks2genes")
27 | ## END TEST ##
28 | 
29 | # I/O: create the output directory, including any missing parent directories
30 | dir.create(args$outdir, showWarnings=FALSE, recursive=TRUE)
31 | 
32 | ###############
33 | ## Load data ##
34 | ###############
35 | 
36 | # Load gene metadata: drop the first "chr" from chromosome names and rename symbol -> gene (strand is needed for the overlap below)
37 | gene_metadata <- fread(args$gene_metadata) %>% 
38 |   .[,chr:=as.factor(sub("chr","",chr))] %>%
39 |   setnames("symbol","gene") %>%
40 |   .[, c("chr","start","end","gene","ens_id","strand")]
41 | 
42 | # Load peak metadata: same chromosome naming as the genes; the peak identifier keeps the "chr" prefix ("chrN:start-end")
43 | peakSet.dt <- fread(args$peak_metadata) %>%
44 |   .[,chr:=as.factor(sub("chr","",chr))] %>%
45 |   .[,c("chr","start","end")] %>%
46 |   .[,peak:=sprintf("chr%s:%s-%s",chr,start,end)] %>%
47 |   setkey(chr,start,end)
48 | 
49 | #############
50 | ## Overlap ##
51 | #############
52 | # Genes: keep strand-aware gene.start/gene.end (gene.start > gene.end on the "-" strand) and widen start/end by the window
53 | gene_metadata.ov <- copy(gene_metadata) %>%
54 |   .[strand=="+",c("gene.start","gene.end"):=list(start,end)] %>%
55 |   .[strand=="-",c("gene.start","gene.end"):=list(end,start)] %>%
56 |   .[strand=="+",c("start","end"):=list (gene.start-args$gene_window, gene.end+args$gene_window)] %>%
57 |   .[strand=="-",c("end","start"):=list (gene.start+args$gene_window, gene.end-args$gene_window)] %>% 
58 |   # .[,strand:=NULL] %>% 
59 |   setkey(chr,start,end)
60 | # Every search window must have a positive width
61 | stopifnot((gene_metadata.ov$end-gene_metadata.ov$start)>0)
62 | # One row per (peak, overlapping gene window); peaks without any overlap are kept with NA gene columns (nomatch = NA)
63 | ov <- foverlaps(
64 |   peakSet.dt,
65 |   gene_metadata.ov,
66 |   nomatch = NA
67 | ) %>%  .[,c("start","end"):=NULL] %>%
68 |   setnames(c("i.start","i.end"),c("peak.start","peak.end")) %>%
69 |   .[,peak.mean:=(peak.start+peak.end)/2] %>%
70 |   # distance from the peak midpoint to the closest gene boundary; pmin() is row-wise, so no grouping is needed
71 |   .[,dist:=pmin(abs(gene.end-peak.mean), abs(gene.start-peak.mean))] %>%
72 |   .[strand=="+" & peak.mean>gene.start & peak.mean<gene.end,dist:=0] %>% # midpoint inside the gene body
73 |   .[strand=="-" & peak.mean<gene.start & peak.mean>gene.end,dist:=0]
74 | 
75 | 
76 | # ov[peak=="18_64485555_64486155"]
77 | # gene_metadata[gene=="Fech"]
78 | # gene_metadata.ov[gene=="Fech"]
79 | # ov[gene=="Fech" & peak=="chr18:64485555-64486155"]
80 | # ov[peak=="chr7:103850833-103851433"]
81 | 
82 | # Select nearest gene: rows with the minimum dist per peak, drop rows containing NA, keep the first row per peak
83 | ov_nearest <- ov %>%
84 |   .[.[,.I[dist==min(dist)], by=c("peak")]$V1] %>%
85 |   .[complete.cases(.)] %>%
86 |   .[!duplicated(peak)]
87 | 
88 | # Sanity check  
89 | # ov_nearest$gene[(duplicated(ov_nearest$peak))]
90 | 
91 | ##########
92 | ## Save ##
93 | ##########
94 | # Two tables: every peak-gene pair within the window, and the single nearest gene per peak
95 | fwrite(ov, file.path(args$outdir,"peaks2genes_all.txt.gz"), sep="\t", na="NA")
96 | fwrite(ov_nearest, file.path(args$outdir,"peaks2genes_nearest.txt.gz"), sep="\t", na="NA")
97 | 


--------------------------------------------------------------------------------
/atac/archR/plot_individual_peaks/compare_genotypes/pseudobulk_with_replicates/plot_individual_peaks_genotypes_pseudobulk_with_replicates.R:
--------------------------------------------------------------------------------
 1 | # here::i_am("atac/archR/processing/save_archr_matrices.R")
 2 | 
 3 | source(here::here("settings.R"))
 4 | source(here::here("utils.R"))
 5 | 
 6 | #####################
 7 | ## Define settings ##
 8 | #####################
 9 | 
10 | # I/O (the output directory is created together with any missing parent directories)
11 | io$pseudobulk_atac_peak_matrix <- file.path(io$basedir,"results/atac/archR/pseudobulk/celltype_genotype/PeakMatrix/PeakMatrix_pseudobulk_with_replicates.rds")
12 | io$outdir <- file.path(io$basedir,"results/atac/archR/plot_individual_peaks/genotype"); dir.create(io$outdir, showWarnings = FALSE, recursive = TRUE)
13 | 
14 | # Options
15 | opts$samples <- c(
16 |   "E8.5_CRISPR_T_KO",
17 |   "E8.5_CRISPR_T_WT"
18 | )
19 | # Cell types kept when the pseudobulk matrix is subset
20 | opts$celltypes <- c("Somitic_mesoderm", "NMP", "Spinal_cord")
21 | 
22 | ####################
23 | ## Load metadata  ##
24 | ####################
25 | 
26 | ##########################
27 | ## Load ATAC PeakMatrix ##
28 | ##########################
29 | # Read the pseudobulk peak matrix and keep only the columns whose cell type is in opts$celltypes
30 | atac_peak_matrix_pseudobulk.se <- readRDS(io$pseudobulk_atac_peak_matrix)
31 | keep.columns <- atac_peak_matrix_pseudobulk.se$celltype%in%opts$celltypes
32 | atac_peak_matrix_pseudobulk.se <- atac_peak_matrix_pseudobulk.se[,keep.columns]
33 | 
34 | # Normalise ATAC data: log(1e6 * counts / column total + 1), stored as the "logcounts" assay
35 | raw.counts <- assay(atac_peak_matrix_pseudobulk.se); column.totals <- colSums(raw.counts,na.rm=T)
36 | assay(atac_peak_matrix_pseudobulk.se,"logcounts") <- log(1e6*(sweep(raw.counts,2,column.totals,"/"))+1)
37 | 
38 | ####################################################
39 | ## Boxplots of chromatin accessibility (WT vs KO) ##
40 | ####################################################
41 | 
42 | peaks.to.plot <- c("chr7:144884955-144885555","chr7:79789147-79789747","chr7:126785067-126785667")
43 | # One PDF per peak: bars show the group mean, error bars +/- one standard deviation, points the individual columns
44 | # i <- "chr7:144884955-144885555"
45 | for (i in peaks.to.plot) {
46 |   # Accessibility of peak i per column of the matrix, with its cell type and genotype
47 |   to.plot <- data.table(
48 |     acc = assay(atac_peak_matrix_pseudobulk.se,"logcounts")[i,],
49 |     sample = colnames(atac_peak_matrix_pseudobulk.se),
50 |     celltype = atac_peak_matrix_pseudobulk.se$celltype,
51 |     genotype = atac_peak_matrix_pseudobulk.se$genotype
52 |   ) %>% .[celltype=="Caudal_Mesoderm",celltype:="Somitic_mesoderm"] %>% 
53 |     .[,celltype_genotype:=sprintf("%s (%s)",celltype,genotype)]
54 |   # NOTE(review): "Caudal_Mesoderm" is not in opts$celltypes, so the relabelling above matches no row after the subset - confirm
55 |   order <- c("Spinal_cord (WT)","Spinal_cord (T_KO)", "NMP (WT)", "NMP (T_KO)", "Somitic_mesoderm (WT)")
56 |   to.plot[,celltype_genotype:=factor(celltype_genotype, levels=order)]
57 |   # Groups not listed in `order` (e.g. "Somitic_mesoderm (T_KO)") become NA in the factor
58 |   my_comparisons <- list( c("NMP (WT)", "NMP (T_KO)"))
59 |   # Mean and standard deviation of the accessibility per group (sd is computed on the original acc column)
60 |   to.plot.means <- to.plot[,.(acc=mean(acc),sd=sd(acc)), by=c("celltype_genotype","celltype","genotype")]
61 |   
62 |   p <- ggplot(to.plot, aes_string(x="celltype_genotype", y="acc", fill="genotype")) +
63 |     geom_bar(stat="identity", color="black", alpha=1, data=to.plot.means) +
64 |     geom_jitter(size=3, width=0.05, shape=21) +
65 |     geom_errorbar(aes(ymin=acc-sd, ymax=acc+sd), width=0.15, alpha=1, size=0.6, data=to.plot.means) +
66 |     stat_compare_means(aes(label = paste0("p = ", ..p.format..)), comparisons = my_comparisons, method="t.test") +
67 |     # stat_summary(fun.data = give.n, geom = "text", position = position_dodge(width=0.75)) +
68 |     scale_fill_manual(values=c("#EE0000","#1C86EE")) +
69 |     labs(x="", y="Chromatin accessibility (log normalised counts)") +
70 |     # geom_violin(aes(fill=celltype)) +
71 |     theme_classic() +
72 |     theme(
73 |       axis.text.y = element_text(color="black"),
74 |       axis.text.x = element_text(color="black"),
75 |       legend.title = element_blank(),
76 |       legend.position = "none"
77 |     )
78 |   # ":" in the peak name is replaced by "-" to build the file name
79 |   pdf(file.path(io$outdir,sprintf("boxplots_acc_genotype_%s.pdf",gsub(":","-",i))), width=6, height=6)
80 |   print(p)
81 |   dev.off()
82 | }
83 | 


--------------------------------------------------------------------------------
/atac/archR/processing/0_create_arrow_files.R:
--------------------------------------------------------------------------------
 1 | here::i_am("atac/archR/processing/0_create_arrow_files.R")
 2 | 
 3 | source(here::here("settings.R"))
 4 | 
 5 | suppressPackageStartupMessages(library(ArchR))
 6 | 
 7 | ######################
 8 | ## Define arguments ##
 9 | ######################
10 | 
11 | p <- ArgumentParser(description='')
12 | p$add_argument('--samples',           type="character",  nargs='+',      help='Samples')
13 | p$add_argument('--fragments_files',           type="character",  nargs='+',      help='ATAC Fragments files')
14 | p$add_argument('--genome',           type="character", default="mm10",      help='Genome')
15 | p$add_argument('--min_fragments',     type="integer",    default=1000,   help='Minimum number of ATAC fragments')
16 | p$add_argument('--max_fragments',     type="integer",    default=1e7,    help='Maximum number of ATAC fragments')
17 | p$add_argument('--min_tss_score',   type="double",     default=2.5,    help='Minimum TSS score threshold')
18 | p$add_argument('--threads',     type="integer",    default=1,    help='Number of threads')
19 | p$add_argument('--outdir',          type="character",                               help='Output directory')
20 | 
21 | args <- p$parse_args(commandArgs(TRUE))
22 | 
23 | ## START TEST ##
24 | # args$fragments_files <- c(
25 | #   "/bi/group/reik/ricard/data/gastrulation_multiome_10x/original/E7.5_rep1/atac_fragments.tsv.gz",
26 | #   "/bi/group/reik/ricard/data/gastrulation_multiome_10x/original/E7.5_rep2/atac_fragments.tsv.gz"
27 | # )
28 | # args$samples <- c("E7.5_rep1","E7.5_rep2")
29 | # args$genome <- "mm10"
30 | # args$min_fragments <- 1000
31 | # args$max_fragments <- 1e7
32 | # args$min_tss_score <- 2.5
33 | # args$threads <- 1
34 | # args$outdir <- "/bi/group/reik/ricard/data/gastrulation_multiome_10x/processed/atac/archR"
35 | ## END TEST ##
36 | 
37 | #####################
38 | ## Define settings ##
39 | #####################
40 | # Side effect: the working directory is switched to args$outdir (setwd() fails if it does not exist)
41 | setwd(args$outdir)
42 | 
43 | # ArchR options
44 | addArchRThreads(threads=args$threads) 
45 | addArchRGenome(args$genome)
46 | # Turn off HDF5 file locking via rhdf5
47 | rhdf5::h5disableFileLocking()
48 | 
49 | ########################
50 | ## create Arrow Files ##
51 | ########################
52 | # One Arrow file per fragments file; samples and output files are both named after args$samples
53 | ArrowFiles <- createArrowFiles(
54 |   inputFiles = args$fragments_files,
55 |   sampleNames = args$samples,
56 |   outputNames = args$samples,
57 |   addTileMat = FALSE,
58 |   addGeneScoreMat = FALSE,
59 |   excludeChr = c("chrM", "chrY"),
60 | 
61 |   subThreading = FALSE, # parallel processing doesn't work well (https://github.com/GreenleafLab/ArchR/issues/248)
62 |   force = TRUE,
63 | 
64 |   # QC metrics
65 |   minFrags = args$min_fragments,  # The minimum number of fragments per cell
66 |   maxFrags = args$max_fragments,  # The maximum number of fragments per cell
67 |   minTSS = args$min_tss_score   # The minimum TSS enrichment score per cell
68 | )
69 | 


--------------------------------------------------------------------------------
/atac/archR/processing/1_create_archR_project.R:
--------------------------------------------------------------------------------
 1 | here::i_am("atac/archR/processing/1_create_archR_project.R")
 2 | 
 3 | source(here::here("settings.R"))
 4 | 
 5 | suppressPackageStartupMessages(library(ArchR))
 6 | 
 7 | ######################
 8 | ## Define arguments ##
 9 | ######################
10 | 
11 | p <- ArgumentParser(description='')
12 | p$add_argument('--arrow_files',     type="character",  nargs='+',      help='Arrow files')
13 | p$add_argument('--genome',          type="character", default="mm10",      help='Genome')
14 | p$add_argument('--outdir',          type="character",                               help='Output directory')
15 | 
16 | args <- p$parse_args(commandArgs(TRUE))
17 | 
18 | ## START TEST ##
19 | # args$arrow_files <- c(
20 | #   "/bi/group/reik/ricard/data/gastrulation_multiome_10x/processed/atac/archR_subset/ArrowFiles/E7.5_rep1.arrow",
21 | #   "/bi/group/reik/ricard/data/gastrulation_multiome_10x/processed/atac/archR_subset/ArrowFiles/E7.5_rep2.arrow"
22 | # )
23 | # args$genome <- "mm10"
24 | # args$threads <- 1
25 | # args$outdir <- "/bi/group/reik/ricard/data/gastrulation_multiome_10x/processed/atac/archR"
26 | ## END TEST ##
27 | 
28 | #####################
29 | ## Define settings ##
30 | #####################
31 | 
32 | # Genome annotation used by ArchR, taken from --genome
33 | addArchRGenome(args$genome)
34 | 
35 | ############################
36 | ## create an ArchRProject ##
37 | ############################
38 | # The Arrow files are referenced in place (copyArrows = FALSE); the project lives in args$outdir
39 | ArchRProject <- ArchRProject(
40 |   copyArrows = FALSE,
41 |   outputDirectory = args$outdir,
42 |   ArrowFiles = args$arrow_files
43 | )
44 | 
45 | ##########
46 | ## Save ##
47 | ##########
48 | # Write the newly created project to disk
49 | saveArchRProject(ArchRProject)


--------------------------------------------------------------------------------
/atac/archR/processing/save_atac_anndata.R:
--------------------------------------------------------------------------------
 1 | here::i_am("atac/archR/processing/save_atac_anndata.R")
 2 | # Converts a saved ATAC matrix (.rds) into an AnnData .h5ad file through reticulate + scanpy
 3 | source(here::here("settings.R"))
 4 | source(here::here("utils.R"))
 5 | 
 6 | suppressPackageStartupMessages(library(reticulate))
 7 | 
 8 | ######################
 9 | ## Define arguments ##
10 | ######################
11 | # --python: python binary handed to use_python(); --atac_matrix: .rds file read with readRDS()
12 | p <- ArgumentParser(description='')
13 | p$add_argument('--python',    type="character",    help='')
14 | p$add_argument('--atac_matrix',    type="character",    help='')
15 | p$add_argument('--metadata',    type="character",  help='Cell metadata file')
16 | p$add_argument('--outfile',     type="character",  help='Output file')
17 | 
18 | args <- p$parse_args(commandArgs(TRUE))
19 | 
20 | ## START TEST ##
21 | # io$basedir <- file.path(io$basedir,"test")
22 | # args <- list()
23 | # args$python = "/Users/argelagr/opt/anaconda3/envs/main/bin/python" # "/bi/group/reik/ricard/software/miniconda3/envs/main/bin/python"
24 | # args$metadata <- file.path(io$basedir,"results/atac/archR/celltype_assignment/sample_metadata_after_celltype_assignment.txt.gz")
25 | # args$atac_matrix <- file.path(io$basedir,"processed/atac/archR/Matrices/PeakMatrix_summarized_experiment.rds")
26 | # args$outfile <- file.path(io$basedir,"processed/atac/anndata/PeakMatrox_anndata.h5ad")
27 | ## END TEST ##
28 | 
29 | dir.create(dirname(args$outfile), showWarnings = F, recursive = T)
30 | 
31 | #####################################
32 | ## Reticulate connection to scanpy ##
33 | #####################################
34 | # required=TRUE: fail instead of silently falling back to another python installation
35 | use_python(args$python, required=TRUE)
36 | sc <- import("scanpy")
37 | 
38 | #####################
39 | ## Define settings ##
40 | #####################
41 | 
42 | ##########################
43 | ## Load sample metadata ##
44 | ##########################
45 | # Keep cells passing RNA and ATAC QC, not called as doublets, and with a cell type assigned
46 | metadata.dt <- fread(args$metadata) %>% 
47 |   .[pass_rnaQC==TRUE & pass_atacQC==TRUE & doublet_call==FALSE & !is.na(celltype)] %>%
48 |   .[,c("cell","sample","stage","genotype","celltype","nFrags_atac","nFeature_RNA")]# %>%
49 |   # setnames("celltype.predicted","celltype")
50 | # The filtered metadata is also written next to the output file
51 | fwrite(metadata.dt, file.path(dirname(args$outfile),"cell_metadata.txt.gz"), sep="\t", quote=F, na="NA")
52 | 
53 | ######################
54 | ## Load atac matrix ##
55 | ######################
56 | # Columns are selected (and ordered) by the cells kept in metadata.dt
57 | atac.se <- readRDS(args$atac_matrix)[,metadata.dt$cell]
58 | 
59 | #############################################
60 | ## Convert SummarizedExperiment to AnnData ##
61 | #############################################
62 | # X is the transposed assay (cells x features); obs/var come from colData/rowData of atac.se, not from metadata.dt
63 | adata <- sc$AnnData(
64 |     X   = t(assay(atac.se)),
65 |     obs = as.data.frame(colData(atac.se)),
66 |     var = as.data.frame(rowData(atac.se))
67 | )
68 | print(adata)
69 | print(head(adata$obs))
70 | print(head(adata$var))
71 | 
72 | ##########################
73 | ## Parse anndata object ##
74 | ##########################
75 | # Store colours for the cell types / stages present in obs; assumes colData has celltype and stage columns - TODO confirm
76 | adata$uns$update(celltype_colors = opts$celltype.colors[sort(unique(as.character(adata$obs$celltype)))])
77 | adata$uns$update(stage_colors = opts$stage.colors[sort(unique(as.character(adata$obs$stage)))])
78 | 
79 | ##########
80 | ## Save ##
81 | ##########
82 | 
83 | adata$write_h5ad(args$outfile)
84 | 


--------------------------------------------------------------------------------
/atac/archR/processing/save_atac_matrices.R:
--------------------------------------------------------------------------------
 1 | here::i_am("atac/archR/processing/save_atac_matrices.R")
 2 | 
 3 | source(here::here("settings.R"))
 4 | source(here::here("utils.R"))
 5 | 
 6 | suppressPackageStartupMessages(library(ArchR))
 7 | 
 8 | ######################
 9 | ## Define arguments ##
10 | ######################
11 | 
12 | p <- ArgumentParser(description='')
13 | p$add_argument('--archr_directory',    type="character",    help='ArchR directory')
14 | p$add_argument('--metadata',    type="character",  help='Cell metadata file')
15 | p$add_argument('--matrix',      type="character",  help='Matrix to save')
16 | p$add_argument('--outfile',     type="character",  help='Output file')
17 | 
18 | args <- p$parse_args(commandArgs(TRUE))
19 | 
20 | ## START TEST ##
21 | # args <- list()
22 | # args$archr_directory <- file.path(io$basedir,"processed/atac/archR")
23 | # args$metadata <- file.path(io$basedir,"results/atac/archR/qc/sample_metadata_after_qc.txt.gz")
24 | # args$matrix <- "GeneScoreMatrix_TSS"
25 | # args$outfile <- file.path(io$basedir,sprintf("processed/atac/archR/Matrices/%s_summarized_experiment.rds",args$matrix))
26 | ## END TEST ##
27 | 
28 | print(args)
29 | 
30 | # I/O: create the directory of the output file, including any missing parent directories
31 | dir.create(dirname(args$outfile), showWarnings=FALSE, recursive=TRUE)
32 | 
33 | ###################
34 | ## Load metadata ##
35 | ###################
36 | 
37 | cells_metadata.dt <- fread(args$metadata) %>%
38 |   # .[pass_atacQC==TRUE & doublet_call==FALSE]
39 |   .[pass_atacQC==TRUE]
40 | 
41 | ########################
42 | ## Load ArchR Project ##
43 | ########################
44 | 
45 | # source(here::here("atac/archR/load_archR_project.R"))
46 | 
47 | setwd(args$archr_directory)
48 | 
49 | addArchRGenome("mm10")
50 | addArchRThreads(threads = 1)
51 | 
52 | ArchRProject <- loadArchRProject(args$archr_directory)[cells_metadata.dt$cell]
53 | 
54 | # Sanity checks: the matrix must exist in the project and be one of the types handled below
55 | # mean(rownames(ArchRProject)%in%cells_metadata.dt$cell)
56 | # mean(cells_metadata.dt$cell%in%rownames(ArchRProject))
57 | # table(cells_metadata.dt[!cell%in%rownames(ArchRProject),sample])
58 | stopifnot(args$matrix %in% getAvailableMatrices(ArchRProject))
59 | if (!(args$matrix=="PeakMatrix" || grepl("GeneScoreMatrix",args$matrix))) stop("Matrix not recognised: only PeakMatrix and GeneScoreMatrix* are supported") # otherwise atac.se would never be created
60 | ################
61 | ## PeakMatrix ##
62 | ################
63 | 
64 | if (args$matrix=="PeakMatrix") {
65 | 
66 | 	atac.se <- getMatrixFromProject(ArchRProject, useMatrix = "PeakMatrix", binarize = FALSE)
67 | 
68 | 	# Name every peak by its coordinates, as "chr:start-end"
69 | 	peak.ranges <- rowRanges(atac.se)
70 | 	rownames(atac.se) <- sprintf(
71 | 	  "%s:%s-%s",
72 | 	  as.character(seqnames(peak.ranges)), start(peak.ranges), end(peak.ranges)
73 | 	)
74 | 
75 | }
76 | 
77 | #####################
78 | ## GeneScoreMatrix ##
79 | #####################
80 | 
81 | if (grepl("GeneScoreMatrix",args$matrix)) {
82 | 
83 | 	atac.se <- getMatrixFromProject(ArchRProject, useMatrix = args$matrix, binarize = FALSE)
84 | 
85 | 	# Rows are named after the `name` column of rowData
86 | 	rownames(atac.se) <- rowData(atac.se)$name
87 | 
88 | 	# Filter genes
89 | 	# atac.se <- atac.se[grep("^Rik|Rik$|^mt-|^Rps-|^Rpl-|^Gm|^Mir|^Olfr",rownames(atac.se),invert=T),]
90 | }
91 | 
92 | ##########
93 | ## Save ##
94 | ##########
95 | 
96 | # Sanity checks: row names must be unique before the object is written
97 | stopifnot(sum(duplicated(rownames(atac.se)))==0)
98 | 
99 | saveRDS(atac.se, args$outfile)


--------------------------------------------------------------------------------
/atac/archR/processing/update_archR_metadata.R:
--------------------------------------------------------------------------------
 1 | #####################
 2 | ## Define settings ##
 3 | #####################
 4 | 
 5 | here::i_am("atac/archR/processing/update_archR_metadata.R")
 6 | 
 7 | source(here::here("settings.R"))
 8 | source(here::here("utils.R"))
 9 | 
10 | 
11 | ########################
12 | ## Load ArchR project ##
13 | ########################
14 | 
15 | source(here::here("atac/archR/load_archR_project.R"))
16 | 
17 | ############################################
18 | ## Merge archR metadata with RNA metadata ##
19 | ############################################
20 | 
21 | # Metadata previously exported from ArchR; it must contain every cell of the loaded project
22 | io$archr.metadata <- paste0(io$basedir,"/processed/atac/archR/sample_metadata_after_archR.txt.gz")
23 | archr_metadata <- fread(io$archr.metadata)
24 | stopifnot(all(rownames(ArchRProject) %in% archr_metadata$cell))
25 | # cols.to.rename <- c("TSSEnrichment","ReadsInTSS","PromoterRatio","NucleosomeRatio","nFrags","BlacklistRatio")
26 | # idx.cols.to.rename <- which(colnames(archr_metadata)%in%cols.to.rename)
27 | # colnames(archr_metadata)[idx.cols.to.rename] <- paste0(colnames(archr_metadata)[idx.cols.to.rename], "_atac")
28 | 
29 | # Metadata file whose columns take precedence in the merge
30 | io$updated.metadata <- paste0(io$basedir,"/sample_metadata.txt.gz")
31 | updated_metadata <- fread(io$updated.metadata)
32 | colnames(updated_metadata)
33 | 
34 | # Drop from the archR metadata every column shared with the updated metadata, except the merge keys
35 | overlaping.columns <- setdiff(intersect(colnames(updated_metadata),colnames(archr_metadata)), c("sample","cell","barcode"))
36 | columns.to.keep <- which(!colnames(archr_metadata)%in%overlaping.columns)
37 | archr_metadata <- archr_metadata[,columns.to.keep,with=F]
38 | 
39 | ###########
40 | ## Merge ##
41 | ###########
42 | # Full outer join on (cell, sample, barcode): cells present in only one of the two tables are kept
43 | foo <- updated_metadata %>% 
44 |   merge(archr_metadata, by=c("cell","sample","barcode"), all=TRUE)
45 | 
46 | #############################
47 | ## Update ArchR's metadata ##
48 | #############################
49 | # Restrict to the cells of the project, order them as in the project, and move `cell` into the row names
50 | bar <- foo %>% 
51 |   .[cell%in%rownames(ArchRProject)] %>% setkey(cell) %>% .[rownames(ArchRProject)] %>%
52 |   as.data.frame() %>% tibble::column_to_rownames("cell")
53 | # The cell names are now the row names of `bar` (it has no `cell` column any more), so those are what is compared
54 | stopifnot(identical(rownames(bar), rownames(getCellColData(ArchRProject))))
55 | # Add (or overwrite, force = TRUE) one cell-level column per metadata column
56 | for (i in colnames(bar)) {
57 |   ArchRProject <- addCellColData(
58 |     ArchRProject,
59 |     data = bar[[i]], 
60 |     name = i,
61 |     cells = rownames(bar),
62 |     force = TRUE
63 |   )
64 | }
65 | 
66 | colnames(getCellColData(ArchRProject))
67 | 
68 | ##########
69 | ## Save ##
70 | ##########
71 | 
72 | io$metadata.out <- paste0(io$basedir,"/processed/atac/archR/sample_metadata_after_archR.txt.gz")
73 | fwrite(foo, io$metadata.out, sep="\t", na="NA", quote=F)
74 | 
75 | saveArchRProject(ArchRProject)
76 | 


--------------------------------------------------------------------------------
/atac/archR/pseudobulk/1_archR_add_GroupCoverage.R:
--------------------------------------------------------------------------------
  1 | # https://www.ArchRProject.com/bookdown/how-does-archr-make-pseudo-bulk-replicates.html
  2 | here::i_am("atac/archR/pseudobulk/1_archR_add_GroupCoverage.R")
  3 | 
  4 | source(here::here("settings.R"))
  5 | source(here::here("utils.R"))
  6 | 
  7 | suppressPackageStartupMessages(library(ArchR))
  8 | 
  9 | ######################
 10 | ## Define arguments ##
 11 | ######################
 12 | 
 13 | p <- ArgumentParser(description='')
 14 | p$add_argument('--archr_directory',    type="character",    help='ArchR directory')
 15 | p$add_argument('--metadata',    type="character",    help='metadata file')
 16 | p$add_argument('--group_by',     type="character",    help='Metadata column to group by')
 17 | p$add_argument('--min_cells',     type="integer",    default=50,   help='Minimum number of cells')
 18 | p$add_argument('--max_cells',     type="integer",    default=1000,   help='Maximum number of cells')
 19 | p$add_argument('--threads',     type="integer",    default=1,    help='Number of threads')
 20 | 
 21 | args <- p$parse_args(commandArgs(TRUE))
 22 | 
 23 | #####################
 24 | ## Define settings ##
 25 | #####################
 26 | 
 27 | ## START TEST ##
 28 | # args$metadata <- file.path(io$basedir,"results_new/atac/archR/qc/sample_metadata_after_qc.txt.gz")
 29 | # args$group_by <- "celltype.mapped_mnn"
 30 | # args$min_cells <- 100
 31 | # args$max_cells <- 5000
 32 | # args$threads <- 1
 33 | ## END TEST ##
 34 | 
 35 | ########################
 36 | ## Load cell metadata ##
 37 | ########################
 38 | 
 39 | sample_metadata <- fread(args$metadata) %>%
 40 |   .[pass_atacQC==TRUE & doublet_call==FALSE & genotype=="WT"]
 41 | stopifnot(args$group_by%in%colnames(sample_metadata))
 42 | sample_metadata <- sample_metadata[!is.na(sample_metadata[[args$group_by]])]
 43 | 
 44 | # Filter celltypes by minimum number of cells
 45 | sample_metadata <- sample_metadata[,N:=.N,by=c(args$group_by)] %>% .[N>=args$min_cells] %>% .[,N:=NULL]
 46 | 
 47 | ########################
 48 | ## Load ArchR project ##
 49 | ########################
 50 | 
 51 | # source(here::here("atac/archR/load_archR_project.R"))
 52 | 
 53 | setwd(args$archr_directory)
 54 | 
 55 | addArchRGenome("mm10")
 56 | addArchRThreads(threads = args$threads)
 57 | 
 58 | ArchRProject <- loadArchRProject(args$archr_directory)[sample_metadata$cell]
 59 | 
 60 | ###########################
 61 | ## Update ArchR metadata ##
 62 | ###########################
 63 | 
 64 | sample_metadata.to.archr <- sample_metadata %>% 
 65 |   .[cell%in%rownames(ArchRProject)] %>% setkey(cell) %>% .[rownames(ArchRProject)] %>%
 66 |   as.data.frame() %>% tibble::column_to_rownames("cell")
 67 | 
 68 | stopifnot(all(rownames(sample_metadata.to.archr) == rownames(getCellColData(ArchRProject))))
 69 | ArchRProject <- addCellColData(
 70 |   ArchRProject,
 71 |   data = sample_metadata.to.archr[[args$group_by]],
 72 |   name = args$group_by,
 73 |   cells = rownames(sample_metadata.to.archr),
 74 |   force = TRUE
 75 | )
 76 | 
 77 | # print cell numbers
 78 | table(getCellColData(ArchRProject,args$group_by)[[1]])
 79 | 
 80 | #########################
 81 | ## Add Group Coverages ##
 82 | #########################
 83 | 
 84 | # Check if group Coverages already exist
 85 | # ArchRProject@projectMetadata$GroupCoverages
 86 | 
 87 | # This function will merge cells within each designated cell group for the generation of pseudo-bulk replicates 
 88 | # and then merge these replicates into a single insertion coverage file.
 89 | # Output: creates files in archR/GroupCoverages/celltype: [X]._.Rep[Y].insertions.coverage.h5
 90 | ArchRProject <- addGroupCoverages(ArchRProject, 
 91 |   groupBy = args$group_by,
 92 |   useLabels = FALSE,  # do not use sample information
 93 |   minCells = args$min_cells,
 94 |   maxCells = args$max_cells,
 95 |   force = TRUE
 96 | )
 97 | 
 98 | ##########
 99 | ## Save ##
100 | ##########
101 | # Write next to the project that was actually loaded (--archr_directory), not the default from settings.R
102 | saveRDS(ArchRProject@projectMetadata, file.path(args$archr_directory,"projectMetadata.rds"))


--------------------------------------------------------------------------------
/atac/archR/snakemake/README.txt:
--------------------------------------------------------------------------------
1 | snakemake --cores 1
2 | snakemake --cores 1 --dry-run -p
3 | snakemake --cores 10 -j 99 --latency-wait 90 -p --cluster "sbatch -n {threads} --mem {resources.mem_mb}M"
4 | 
5 | 


--------------------------------------------------------------------------------
/atac/archR/snakemake/run_cluster.sh:
--------------------------------------------------------------------------------
1 | snakemake --cores 15 -j 99 --latency-wait 90 -p --cluster "sbatch -n {threads} --mem {resources.mem_mb}M"


--------------------------------------------------------------------------------
/atac/archR/snakemake/run_cluster_single.sh:
--------------------------------------------------------------------------------
1 | snakemake --cores 1 -j 1 --latency-wait 90 -p --cluster "sbatch -n {threads} --mem {resources.mem_mb}M"
2 | 


--------------------------------------------------------------------------------
/gastrulation_multiome_10x.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 


--------------------------------------------------------------------------------
/images/igv_screenshot_github.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rargelaguet/mouse_organogenesis_10x_multiome_publication/3ee0ba0ae5fbdf6817ef1d341ff483b3028c085f/images/igv_screenshot_github.png


--------------------------------------------------------------------------------
/images/overview_github.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rargelaguet/mouse_organogenesis_10x_multiome_publication/3ee0ba0ae5fbdf6817ef1d341ff483b3028c085f/images/overview_github.png


--------------------------------------------------------------------------------
/load_paga_graph.R:
--------------------------------------------------------------------------------
 1 | library(GGally)
 2 | library(network)
 3 | library(sna)
 4 | library(ggraph)
 5 | library(igraph)
 6 | library(tidygraph)
 7 | 
 8 | connectivity.mtx <- fread(io$paga.connectivity) %>%
 9 |   matrix.please %>% .[opts$celltypes,opts$celltypes]
10 | 
11 | df.coordinates <- fread(io$paga.coordinates) %>% 
12 |   matrix.please %>% .[opts$celltypes,]
13 | 
14 | # Parse data
15 | connectivity.mtx[connectivity.mtx<0.20] <- 0
16 | connectivity.mtx[connectivity.mtx>=0.20] <- 1
17 | 
18 | # Create igraph object
19 | igraph.paga <- graph_from_adjacency_matrix(connectivity.mtx, mode = "undirected")
20 | 
21 | # Create tbl_graph object
22 | igraph.paga.tbl <- as_tbl_graph(igraph.paga) %>%
23 |   activate(nodes) %>%
24 |   mutate(celltype=rownames(connectivity.mtx)) %>%
25 |   mutate(x=df.coordinates[,1]) %>% mutate(y=df.coordinates[,2])
26 | 
27 | # Create network object; the node layout comes from the PAGA coordinates, not from the adjacency matrix
28 | net.paga = network(connectivity.mtx)
29 | net.paga %v% "x" = df.coordinates[, 1]
30 | net.paga %v% "y" = df.coordinates[, 2]
31 | 
32 | ##########
33 | ## TEST ##
34 | ##########
35 | 
36 | # sum(connectivity.mtx==1)
37 | # connectivity.mtx["Epiblast","Rostral_neurectoderm"]
38 | # igraph.paga.tbl %>% activate(edges) %>% as.data.table()  %>% nrow
39 | # filter(celltype=="Epiblast")
40 | 


--------------------------------------------------------------------------------
/rna/TF2gene_coexpression/utils.R:
--------------------------------------------------------------------------------
 1 | library(Matrix)
 2 | library(qlcMatrix)
 3 | set.seed(42)
 4 | 
 5 | #' Replace non-zero entries in a sparse matrix with non-zero ranks
 6 | #'
 7 | #' This method creates a rank matrix for a sparse matrix X using the following approach:
 8 | #' 1. Use non-zero entries in a column to calculate the ranks
 9 | #' 2. Add (z-1)/2 to the ranks (only non-zero entries are changed). z is the number of zeros
10 | #' in the column
11 | #' Since all the entries are shifted by the same constant (the zeros
12 | #' are already shifted), the covariance matrix of this shifted matrix is
13 | #' the same as the rank matrix of the entire matrix (where the zeros would
14 | #' all also have a rank = (z+1)/2) where z is the number of zeros
15 | #'
16 | #' This rank matrix can then be used to calculate pearson correlation
17 | 
18 | SparsifiedRanks <- function(X) {
19 |   # Rank the stored entries of each column, then shift them by (z - 1) / 2 (z = zeros in the column)
20 |   X <- as(object = X, Class = "dgCMatrix")
21 |   nnz_per_col <- diff(X@p)
22 |   n_zeros_per_col <- nrow(X) - nnz_per_col
23 |   # X@p is the 0-based offset of each column's first stored value in X@x, so a column's values
24 |   # form one contiguous slice; this avoids rescanning all stored entries for every column
25 |   for (column in which(nnz_per_col > 0)) {
26 |     idx <- (X@p[column] + 1):X@p[column + 1]
27 |     # ranks are computed among the stored entries only; the shift accounts for the implicit zeros
28 |     X@x[idx] <- rank(X@x[idx]) + (n_zeros_per_col[column] - 1) / 2
29 |   }
30 |   return(X)
31 | }
32 | 
33 | SparseSpearmanCor <- function(X, Y = NULL, cov = FALSE) {
34 |   # Spearman correlation is obtained as the Pearson correlation of the (sparsified) rank matrices
35 |   ranked_x <- SparsifiedRanks(X)
36 |   if (!is.null(Y)) {
37 |     # Second matrix supplied: rank it the same way and pass both rank matrices to corSparse
38 |     ranked_y <- SparsifiedRanks(Y)
39 |     return(corSparse(X = ranked_x, Y = ranked_y, cov = cov))
40 |   }
41 |   # Only X supplied: corSparse is called on its rank matrix alone
42 |   corSparse(X = ranked_x, cov = cov)
43 | }
44 | 
45 | 
46 | SparsifiedRanks2 <- function(X) {
47 |   if (class(X)[1] != "dgCMatrix") {
48 |     X <- as(object = X, Class = "dgCMatrix")
49 |   }
50 |   non_zeros_per_col <- diff(x = X@p)
51 |   n_zeros_per_col <- nrow(x = X) - non_zeros_per_col
52 |   offsets <- (n_zeros_per_col - 1) / 2
53 |   ## split entries to columns; explicit levels keep all-zero columns as empty groups so offsets[i] stays aligned
54 |   col_id <- factor(rep.int(seq_len(ncol(X)), non_zeros_per_col), levels = seq_len(ncol(X)))
55 |   col_lst <- split(x = X@x, f = col_id)
56 |   ## calculate sparsified ranks and do shifting (as.numeric turns the NULL of an empty input into numeric(0))
57 |   sparsified_ranks <- as.numeric(unlist(x = lapply(X = seq_along(col_lst), FUN = function(i) rank(x = col_lst[[i]]) + offsets[i])))
58 |   ## Create template rank matrix
59 |   X.ranks <- X
60 |   X.ranks@x <- sparsified_ranks
61 |   return(X.ranks)
62 | }
63 | 
64 | 
65 | SparseSpearmanCor2 <- function(X, Y = NULL, cov = FALSE) {
66 |   # Same contract as SparseSpearmanCor, but the ranks come from SparsifiedRanks2
67 |   ranked_x <- SparsifiedRanks2(X)
68 |   if (!is.null(Y)) {
69 |     # Second matrix supplied: rank it the same way and pass both rank matrices to corSparse
70 |     ranked_y <- SparsifiedRanks2(Y)
71 |     return(corSparse(X = ranked_x, Y = ranked_y, cov = cov))
72 |   }
73 |   # Only X supplied: corSparse is called on its rank matrix alone
74 |   corSparse(X = ranked_x, cov = cov)
75 | }


--------------------------------------------------------------------------------
/rna/conversions/convert_anndata_to_SingleCellExperiment.R:
--------------------------------------------------------------------------------
 1 | here::i_am("rna/conversions/convert_anndata_to_SingleCellExperiment.R") # must be this script's own path within the project
 2 | 
 3 | # Load default settings
 4 | source(here::here("settings.R"))
 5 | source(here::here("utils.R"))
 6 | 
 7 | suppressPackageStartupMessages(library(reticulate))
 8 | suppressPackageStartupMessages(library(scuttle))
 9 | library(zellkonverter)
10 | 
11 | ################################
12 | ## Initialize argument parser ##
13 | ################################
14 | 
15 | p <- ArgumentParser(description='')
16 | p$add_argument('--python_path',   type="character",    help='Python path for reticulate')
17 | p$add_argument('--anndata',  type="character",              help='Anndata input file') 
18 | p$add_argument('--outfile',          type="character",                help='SingleCellExperiment output file')
19 | args <- p$parse_args(commandArgs(TRUE))
20 | 
21 | ## START TEST ## (kept commented out so the parsed command-line arguments are not overwritten)
22 | # args <- list()
23 | # args$python_path <- "/bi/group/reik/ricard/software/miniconda3/envs/main/bin/python" # "/Users/argelagr/opt/anaconda3/envs/main/bin/python"
24 | # args$anndata <- file.path(io$basedir,"processed/rna/velocyto/anndata_velocyto.h5ad")
25 | # args$outfile <- file.path(io$basedir,"processed/rna/velocyto/SingleCellExperiment_velocyto.rds")
26 | ## END TEST ##
27 | 
28 | ################
29 | ## Reticulate ##
30 | ################
31 | 
32 | reticulate::use_python(args$python_path, required = TRUE)
33 | 
34 | sc <- import("scanpy")
35 | 
36 | ############################################
37 | ## Load anndata into SingleCellExperiment ##
38 | ############################################
39 | 
40 | sce <- readH5AD(args$anndata, use_hdf5 = FALSE, reader = "python")
41 | 
42 | print("Overview of colData")
43 | head(colData(sce))
44 | 
45 | print("Overview of rowData")
46 | head(rowData(sce))
47 | 
48 | # Set gene names
49 | if (is.null(rownames(sce))) {
50 |   if ("gene"%in%colnames(rowData(sce))) {
51 |     rownames(sce) <- rowData(sce)$gene
52 |   }
53 | }
54 | stopifnot(!is.null(rownames(sce)))
55 | print("Overview of gene names")
56 | head(rownames(sce))
57 | 
58 | # Set cell names
59 | if (is.null(colnames(sce))) {
60 |   if ("cell"%in%colnames(colData(sce))) {
61 |     colnames(sce) <- colData(sce)$cell
62 |   }
63 | }
64 | stopifnot(!is.null(colnames(sce)))
65 | print("Overview of cell names")
66 | head(colnames(sce))
67 | 
68 | # set assay names
69 | # assayNames(sce) <- "counts"
70 | assayNames(sce) <- c("counts","spliced","unspliced")
71 | print("Overview of counts")
72 | counts(sce)[1:10,1:10]
73 | 
74 | # reducedDims
75 | 
76 | ###########
77 | ## Parse ##
78 | ###########
79 | 
80 | saveRDS(sce, args$outfile)
81 | 


--------------------------------------------------------------------------------
/rna/differential/cells/parse_differential_results.R:
--------------------------------------------------------------------------------
 1 | here::i_am("rna/differential/cells/parse_differential_results.R")
 2 | 
 3 | # Load default settings
 4 | source(here::here("settings.R"))
 5 | 
 6 | ######################
 7 | ## Define arguments ##
 8 | ######################
 9 | 
10 | p <- ArgumentParser(description='')
11 | p$add_argument('--diff_results_dir',   type="character",     help='File')
12 | p$add_argument('--min_cells',       type="integer",       default=5,      help='Minimum number of cells per group')
13 | p$add_argument('--outdir',             type="character",     help='File')
14 | args <- p$parse_args(commandArgs(TRUE))
15 | 
16 | ## START TEST ##
17 | # io$basedir <- file.path(io$basedir,"test")
18 | # args <- list()
19 | # args$diff_results_dir <- file.path(io$basedir,"results/rna/differential/metacells/celltype")
20 | # args$min_cells <- 5
21 | # args$outdir <- file.path(io$basedir,"results/rna/differential/metacells/celltype")
22 | ## END TEST ##
23 | 
24 | # I/O
25 | dir.create(args$outdir, showWarnings = F, recursive=T)
26 | 
27 | #########################
28 | ## Load and parse data ##
29 | #########################
30 | 
31 | stats.dt <- data.table(celltypeA=as.character(NA), celltypeB=as.character(NA), groupA_N=as.integer(NA), groupB_N=as.integer(NA), included=as.logical(NA)) # all-NA first row only sets the column types; it is dropped (stats.dt[-1]) when saving
32 | diff_results_list <- list()
33 | 
34 | # Visit every pair of distinct cell types once (i and j are integer indices into opts$celltypes, j > i)
35 | for (i in 1:length(opts$celltypes)) {
36 |   for (j in i:length(opts$celltypes)) {
37 |     
38 |     if (i!=j) {
39 |       file <- file.path(args$diff_results_dir,sprintf("%s_vs_%s.txt.gz",opts$celltypes[[i]],opts$celltypes[[j]]))
40 |       if (file.exists(file)) {
41 |         tmp <- fread(file) %>% 
42 |           # setnames(c("gene", "logFC", "padj_fdr", "groupA_N", "groupB_N", "groupA_detection_rate", "groupB_detection_rate")) %>%
43 |           .[,c("celltypeA","celltypeB"):=list(opts$celltypes[[i]],opts$celltypes[[j]])]
44 |         
45 |         # More than one row: real results, keep them. Otherwise the file is treated as empty (not enough cells to do DE) and only recorded in the stats
46 |         if (nrow(tmp)>1) {
47 |           stats.dt <- rbind(stats.dt, data.table(celltypeA=opts$celltypes[[i]], celltypeB=opts$celltypes[[j]], groupA_N=tmp[celltypeA==opts$celltypes[[i]],groupA_N][1], groupB_N=tmp[celltypeB==opts$celltypes[[j]],groupB_N][1], included=TRUE))
48 |           diff_results_list[[sprintf("%s_vs_%s",opts$celltypes[[i]],opts$celltypes[[j]])]] <- tmp
49 |         } else {
50 |           stats.dt <- rbind(stats.dt, data.table(celltypeA=opts$celltypes[[i]], celltypeB=opts$celltypes[[j]], groupA_N=tmp[celltypeA==opts$celltypes[[i]],groupA_N][1], groupB_N=tmp[celltypeB==opts$celltypes[[j]],groupB_N][1], included=FALSE)) 
51 |         }
52 |       } else {
53 |         print(sprintf("%s not found...",file))
54 |       }
55 |     }
56 |   }
57 | }
58 |  
59 | # if (tmp[celltypeA==opts$celltypes[[i]],groupA_N][1]>=args$min_cells & tmp[celltypeB==opts$celltypes[[j]],groupB_N][1]>=args$min_cells) {
60 |  
61 | ##########
62 | ## Save ##
63 | ##########
64 | 
65 | fwrite(stats.dt[-1], file.path(args$outdir,"diff_expr_stats.txt.gz"), sep="\t", quote=F, na="NA")
66 | fwrite(rbindlist(diff_results_list), file.path(args$outdir,"diff_expr_results.txt.gz"), sep="\t", quote=F, na="NA")
67 | 
68 | 


--------------------------------------------------------------------------------
/rna/differential/metacells/parse_differential_results.R:
--------------------------------------------------------------------------------
 1 | here::i_am("rna/differential/metacells/parse_differential_results.R")
 2 | 
 3 | # Load default settings
 4 | source(here::here("settings.R"))
 5 | # source(here::here("utils.R"))
 6 | 
 7 | ######################
 8 | ## Define arguments ##
 9 | ######################
10 | 
11 | p <- ArgumentParser(description='')
12 | p$add_argument('--diff_results_dir',   type="character",     help='File')
13 | p$add_argument('--min_cells',       type="integer",       default=5,      help='Minimum number of cells per group')
14 | p$add_argument('--outdir',             type="character",     help='File')
15 | args <- p$parse_args(commandArgs(TRUE))
16 | 
17 | ## START TEST ##
18 | # io$basedir <- file.path(io$basedir,"test")
19 | # args <- list()
20 | # args$diff_results_dir <- file.path(io$basedir,"results/rna/differential/metacells/celltype")
21 | # args$min_cells <- 5
22 | # args$outdir <- file.path(io$basedir,"results/rna/differential/metacells/celltype")
23 | ## END TEST ##
24 | 
25 | # I/O
26 | dir.create(args$outdir, showWarnings = F, recursive = T)
27 | 
28 | #########################
29 | ## Load and parse data ##
30 | #########################
31 | 
32 | stats.dt <- data.table(celltypeA=as.character(NA), celltypeB=as.character(NA), groupA_N=as.integer(NA), groupB_N=as.integer(NA), included=as.logical(NA)) # all-NA first row only sets the column types; it is dropped (stats.dt[-1]) when saving
33 | diff_results_list <- list()
34 | 
35 | # Visit every pair of distinct cell types once (i and j are integer indices into opts$celltypes, j > i)
36 | for (i in 1:length(opts$celltypes)) {
37 |   for (j in i:length(opts$celltypes)) {
38 |     
39 |     if (i!=j) {
40 |       file <- file.path(args$diff_results_dir,sprintf("%s_vs_%s.txt.gz",opts$celltypes[[i]],opts$celltypes[[j]]))
41 |       if (file.exists(file)) {
42 |         tmp <- fread(file) %>% 
43 |           # setnames(c("gene", "logFC", "padj_fdr", "groupA_N", "groupB_N", "groupA_detection_rate", "groupB_detection_rate")) %>%
44 |           .[,c("celltypeA","celltypeB"):=list(opts$celltypes[[i]],opts$celltypes[[j]])]
45 |         
46 |         # More than one row: real results, keep them. Otherwise the file is treated as empty (not enough cells to do DE) and only recorded in the stats
47 |         if (nrow(tmp)>1) {
48 |           stats.dt <- rbind(stats.dt, data.table(celltypeA=opts$celltypes[[i]], celltypeB=opts$celltypes[[j]], groupA_N=tmp[celltypeA==opts$celltypes[[i]],groupA_N][1], groupB_N=tmp[celltypeB==opts$celltypes[[j]],groupB_N][1], included=TRUE))
49 |           diff_results_list[[sprintf("%s_vs_%s",opts$celltypes[[i]],opts$celltypes[[j]])]] <- tmp
50 |         } else {
51 |           stats.dt <- rbind(stats.dt, data.table(celltypeA=opts$celltypes[[i]], celltypeB=opts$celltypes[[j]], groupA_N=tmp[celltypeA==opts$celltypes[[i]],groupA_N][1], groupB_N=tmp[celltypeB==opts$celltypes[[j]],groupB_N][1], included=FALSE)) 
52 |         }
53 |       } else {
54 |         print(sprintf("%s not found...",file))
55 |       }
56 |     }
57 |   }
58 | }
59 |  
60 | # if (tmp[celltypeA==opts$celltypes[[i]],groupA_N][1]>=args$min_cells & tmp[celltypeB==opts$celltypes[[j]],groupB_N][1]>=args$min_cells) {
61 | 
62 | ##########
63 | ## Save ##
64 | ##########
65 | 
66 | fwrite(stats.dt[-1], file.path(args$outdir,"diff_expr_stats.txt.gz"), sep="\t", quote=F, na="NA")
67 | fwrite(rbindlist(diff_results_list), file.path(args$outdir,"diff_expr_results.txt.gz"), sep="\t", quote=F, na="NA")
68 | 
69 | 


--------------------------------------------------------------------------------
/rna/differential/other/extract_TFs_diff.R:
--------------------------------------------------------------------------------
 1 | here::i_am("rna/differential/other/extract_TFs_diff.R")
 2 | 
 3 | # Load default settings
 4 | source(here::here("settings.R"))
 5 | 
 6 | ######################
 7 | ## Define arguments ##
 8 | ######################
 9 | 
10 | p <- ArgumentParser(description='')
11 | p$add_argument('--TFs',             type="character",     help='Cell metadata file')
12 | p$add_argument('--diff_results',   type="character",     help='File')
13 | p$add_argument('--outfile',             type="character",     help='File')
14 | args <- p$parse_args(commandArgs(TRUE))
15 | 
16 | ## START TEST ##
17 | # io$basedir <- file.path(io$basedir,"test")
18 | # args <- list()
19 | # args$TFs <- "/Users/argelagr/data/mm10_regulation/TFs/TFs.txt"
20 | # args$diff_results <- file.path(io$basedir,"results/rna/differential/metacells/celltype/parsed/diff_expr_results.txt.gz")
21 | # args$outfile <- file.path(io$basedir,"results/rna/differential/metacells/celltype/parsed/diff_expr_results_tfs.txt.gz")
22 | ## END TEST ##
23 | 
24 | dir.create(dirname(args$outfile), showWarnings = F)
25 | 
26 | ##############
27 | ## Load TFs ##
28 | ##############
29 | 
30 | # TFs <- fread(args$TFs)[["gene"]]
31 | TFs <- fread(args$TFs)[[1]] %>% str_to_title
32 | 
33 | ################################################
34 | ## Load differential expression and fetch TFs ##
35 | ################################################
36 | 
37 | diff_results.dt <- fread(args$diff_results) %>% 
38 |   .[gene%in%TFs] %>% .[,gene:=toupper(gene)]
39 | 
40 | # diff_tf.dt <- opts$celltypes %>% map(function(i) {
41 | #   opts$celltypes %>% map(function(j) {
42 | #     if (i!=j) {
43 | #       file <- file.path(args$diff_results_dir,sprintf("%s_vs_%s.txt.gz",i,j))
44 | #       if (file.exists(file)) {
45 | #         fread(file, select=c(1,2,4,6,7,8,9)) %>% 
46 | #           setnames(c("gene", "logFC", "padj_fdr", "groupA_N", "groupB_N", "groupA_detection_rate", "groupB_detection_rate")) %>%
47 | #           .[gene%in%TFs] %>% .[,c("celltypeA","celltypeB"):=list(i,j)]
48 | #       }
49 | #     }
50 | #   }) %>% rbindlist
51 | # }) %>% rbindlist %>% .[,gene:=toupper(gene)]
52 | 
53 | print(sprintf("Number of TFs in the differential expression results: %s",length(unique(diff_results.dt$gene))))
54 | 
55 | ##########
56 | ## Save ##
57 | ##########
58 | 
59 | fwrite(diff_results.dt, args$outfile, sep="\t", quote=F, na="NA")
60 | 
61 | 


--------------------------------------------------------------------------------
/rna/differential/pseudobulk/celltype/analysis/old/extract_TFs_diff.R:
--------------------------------------------------------------------------------
 1 | here::i_am("rna/differential/pseudobulk/celltype/analysis/old/extract_TFs_diff.R") # must be this script's own path within the project
 2 | 
 3 | # Load default settings
 4 | source(here::here("settings.R"))
 5 | 
 6 | ######################
 7 | ## Define arguments ##
 8 | ######################
 9 | 
10 | p <- ArgumentParser(description='')
11 | p$add_argument('--TFs',             type="character",     help='Cell metadata file')
12 | p$add_argument('--diff_results',   type="character",     help='File')
13 | p$add_argument('--outfile',             type="character",     help='File')
14 | args <- p$parse_args(commandArgs(TRUE))
15 | 
16 | ## START TEST ##
17 | # io$basedir <- file.path(io$basedir,"test")
18 | # args <- list()
19 | # args$TFs <- "/Users/argelagr/data/mm10_regulation/TFs/TFs.txt"
20 | # args$diff_results <- file.path(io$basedir,"results/rna/differential/metacells/celltype/parsed/diff_expr_results.txt.gz")
21 | # args$outfile <- file.path(io$basedir,"results/rna/differential/metacells/celltype/parsed/diff_expr_results_tfs.txt.gz")
22 | ## END TEST ##
23 | 
24 | dir.create(dirname(args$outfile), showWarnings = F)
25 | 
26 | ##############
27 | ## Load TFs ##
28 | ##############
29 | 
30 | # TFs <- fread(args$TFs)[["gene"]]
31 | TFs <- fread(args$TFs)[[1]] %>% str_to_title
32 | 
33 | ################################################
34 | ## Load differential expression and fetch TFs ##
35 | ################################################
36 | 
37 | diff_results.dt <- fread(args$diff_results) %>%  # the parser defines --diff_results; args$differential_results was always NULL
38 |   .[gene%in%TFs] %>% .[,gene:=toupper(gene)]
39 | 
40 | # diff_tf.dt <- opts$celltypes %>% map(function(i) {
41 | #   opts$celltypes %>% map(function(j) {
42 | #     if (i!=j) {
43 | #       file <- file.path(args$diff_results_dir,sprintf("%s_vs_%s.txt.gz",i,j))
44 | #       if (file.exists(file)) {
45 | #         fread(file, select=c(1,2,4,6,7,8,9)) %>% 
46 | #           setnames(c("gene", "logFC", "padj_fdr", "groupA_N", "groupB_N", "groupA_detection_rate", "groupB_detection_rate")) %>%
47 | #           .[gene%in%TFs] %>% .[,c("celltypeA","celltypeB"):=list(i,j)]
48 | #       }
49 | #     }
50 | #   }) %>% rbindlist
51 | # }) %>% rbindlist %>% .[,gene:=toupper(gene)]
52 | 
53 | print(sprintf("Number of TFs in the differential expression results: %s",length(unique(diff_results.dt$gene))))
54 | 
55 | ##########
56 | ## Save ##
57 | ##########
58 | 
59 | fwrite(diff_results.dt, args$outfile, sep="\t", quote=F, na="NA")
60 | 
61 | 


--------------------------------------------------------------------------------
/rna/differential/pseudobulk/celltype/analysis/plot_marker_genes_stats.R:
--------------------------------------------------------------------------------
  1 | # Load default settings
  2 | source(here::here("settings.R"))
  3 | source(here::here("utils.R"))
  4 | 
  5 | #####################
  6 | ## Define settings ##
  7 | #####################
  8 | 
  9 | io$basedir <- file.path(io$basedir,"test")
 10 | io$marker_genes <- file.path(io$basedir,"results/rna/differential/pseudobulk/celltype/parsed/marker_genes_filtered.txt.gz")
 11 | io$outdir <- file.path(io$basedir,"results/rna/differential/pseudobulk/celltype/parsed/pdf"); dir.create(io$outdir, showWarnings = F)
 12 | 
 13 | opts$celltypes <- c(
 14 |   "Epiblast",
 15 |   "Primitive_Streak",
 16 |   "Caudal_epiblast",
 17 |   "PGC",
 18 |   "Anterior_Primitive_Streak",
 19 |   "Notochord",
 20 |   "Def._endoderm",
 21 |   "Gut",
 22 |   "Nascent_mesoderm",
 23 |   "Mixed_mesoderm",
 24 |   "Intermediate_mesoderm",
 25 |   "Caudal_Mesoderm",
 26 |   "Paraxial_mesoderm",
 27 |   "Somitic_mesoderm",
 28 |   "Pharyngeal_mesoderm",
 29 |   "Cardiomyocytes",
 30 |   "Allantois",
 31 |   "ExE_mesoderm",
 32 |   "Mesenchyme",
 33 |   "Haematoendothelial_progenitors",
 34 |   "Endothelium",
 35 |   "Blood_progenitors_1",
 36 |   "Blood_progenitors_2",
 37 |   "Erythroid1",
 38 |   "Erythroid2",
 39 |   "Erythroid3",
 40 |   "NMP",
 41 |   "Rostral_neurectoderm",
 42 |   # "Caudal_neurectoderm",
 43 |   "Neural_crest",
 44 |   "Forebrain_Midbrain_Hindbrain",
 45 |   "Spinal_cord",
 46 |   "Surface_ectoderm"
 47 |   # "Visceral_endoderm"
 48 |   # "ExE_endoderm",
 49 |   # "ExE_ectoderm"
 50 |   # "Parietal_endoderm"
 51 | )
 52 | 
 53 | ###############################
 54 | ## Load differential results ##
 55 | ###############################
 56 | 
 57 | markers_genes.dt <- fread(io$marker_genes) %>% .[celltype%in%opts$celltypes]
 58 | 
 59 | ################################################
 60 | ## Plot number of marker genes per cell types ##
 61 | ################################################
 62 | 
 63 | to.plot <- markers_genes.dt %>% .[,.N,by=c("celltype")]
 64 | 
 65 | p <- ggbarplot(to.plot, x="celltype", y="N", fill="celltype") +
 66 |   scale_fill_manual(values=opts$celltype.colors) +
 67 |   labs(x="", y="Number of marker genes") +
 68 |   theme(
 69 |     axis.text.y = element_text(size=rel(0.65)),
 70 |     axis.text.x = element_text(colour="black",size=rel(0.7), angle=90, hjust=1, vjust=0.5),
 71 |     axis.title = element_text(colour="black",size=rel(0.75)),
 72 |     axis.ticks.x = element_blank(),
 73 |     legend.position = "none"
 74 | )
 75 | 
 76 | pdf(file.path(io$outdir,"barplot_number_marker_genes.pdf"), width = 6, height = 4)
 77 | print(p)
 78 | dev.off()
 79 | 
 80 | ##################################
 81 | ## Plot gene marker exclusivity ##
 82 | ##################################
 83 | 
 84 | to.plot <- markers_genes.dt %>%
 85 |   .[,.(Nx=.N),by="gene"] %>%
 86 |   .[,Nx:=factor(Nx)] %>%
 87 |   .[,.(Ny=.N),by="Nx"]
 88 | 
 89 | p <- ggbarplot(to.plot, x="Nx", y="Ny", fill="gray70") +
 90 |   labs(x="Number of different cell types per marker gene", y="") +
 91 |   theme(
 92 |     axis.text = element_text(size=rel(0.75))  # no trailing comma: an empty trailing argument can fail with "argument is empty" depending on the ggplot2 version
 93 |   )
 94 | 
 95 | pdf(file.path(io$outdir,"boxplot_exclusivity_per_gene.pdf"), width = 7, height = 5)
 96 | print(p)
 97 | dev.off()
 98 | 
 99 | ################################################
100 | ## Plot gene marker exclusivity per cell type ##
101 | ################################################
102 | 
103 | to.plot <- markers_genes.dt %>% .[,N:=.N,by="gene"]
104 | 
105 | p <- ggboxplot(to.plot, x="celltype", y="N", fill="celltype", color="black") +
106 |   scale_fill_manual(values=opts$celltype.colors) +
107 |   labs(x="", y="Exclusivity of gene markers\n(the smaller the more exclusive)") +
108 |   theme(
109 |     axis.text.y = element_text(size=rel(0.75)),
110 |     axis.title.y = element_text(size=rel(0.85)),
111 |     axis.text.x = element_text(colour="black",size=rel(0.7), angle=90, hjust=1, vjust=0.5),
112 |     legend.position = "none"
113 |   )
114 | 
115 | pdf(file.path(io$outdir,"boxplot_exclusivity_per_celltype.pdf"), width = 9, height = 5)
116 | print(p)
117 | dev.off()
118 | 
119 | 
120 | 


--------------------------------------------------------------------------------
/rna/differential/pseudobulk/celltype/differential_celltype_pseudobulk.R:
--------------------------------------------------------------------------------
  1 | here::i_am("rna/differential/pseudobulk/celltype/differential_celltype_pseudobulk.R")
  2 | 
  3 | # Load default settings
  4 | source(here::here("settings.R"))
  5 | source(here::here("utils.R"))
  6 | 
  7 | suppressMessages(library(edgeR))
  8 | suppressMessages(library(scater))
  9 | 
 10 | ######################
 11 | ## Define arguments ##
 12 | ######################
 13 | 
 14 | p <- ArgumentParser(description='')
 15 | p$add_argument('--sce',    type="character",    help='SingleCellExperiment file')
 16 | p$add_argument('--groupA',    type="character",    help='group A')
 17 | p$add_argument('--groupB',    type="character",    help='group B')
 18 | p$add_argument('--outfile',   type="character",    help='Output file')
 19 | args <- p$parse_args(commandArgs(TRUE))
 20 | 
 21 | ## START TEST
 22 | # io$basedir <- file.path(io$basedir,"test")
 23 | # args$sce <- file.path(io$basedir,"results/rna/pseudobulk/celltype/SingleCellExperiment_pseudobulk_with_replicates.rds")
 24 | # args$groupA <- "Epiblast"
 25 | # args$groupB <- "Erythroid2"
 26 | # args$outfile <- NULL
 27 | ## END TEST
 28 | 
 29 | dir.create(dirname(args$outfile), showWarnings = F)
 30 | 
 31 | #####################
 32 | ## Define settings ##
 33 | #####################
 34 | 
 35 | # Define groups
 36 | opts$groups <- c(args$groupA,args$groupB)
 37 | 
 38 | # stupid stuff but otherwise the snakemake pipeline doesn't work
 39 | if (args$groupA==args$groupB) {
 40 |   out <- data.table(feature=NA, logFC=NA, padj_fdr=NA)
 41 |   fwrite(out, args$outfile, sep="\t", na="NA", quote=F)
 42 |   warning("groupA and groupB are the same, saving an empty file...")
 43 |   quit(status=0)
 44 | }
 45 | 
 46 | #########################
 47 | ## Load RNA expression ##
 48 | #########################
 49 | 
 50 | # Load SingleCellExperiment object
 51 | sce <- readRDS(args$sce)
 52 | 
 53 | # temporary
 54 | if (!"celltype" %in% colnames(colData(sce))) {
 55 |   sce$celltype <- colnames(sce) %>% strsplit("_rep") %>% map_chr(1)
 56 | }
 57 | 
 58 | sce <- sce[,sce$celltype %in% opts$groups]
 59 | 
 60 | sce$celltype <- factor(sce$celltype, levels=opts$groups)
 61 | table(sce$celltype)
 62 | 
 63 | #########################################
 64 | ## Calculate average expression levels ##
 65 | #########################################
 66 | 
 67 | expr.dt <- data.table(
 68 |   gene = rownames(sce),
 69 |   mean_groupA = rowMeans(logcounts(sce[,sce$celltype==args$groupA])) %>% round(2),
 70 |   mean_groupB = rowMeans(logcounts(sce[,sce$celltype==args$groupB])) %>% round(2)
 71 | )
 72 | 
 73 | #######################
 74 | ## Feature selection ##
 75 | #######################
 76 | 
 77 | opts$min.expr <- 4 # 2**4 = 16, at least an average of 8 counts per milion for each group
 78 | 
 79 | genes.to.use <- expr.dt[mean_groupA>=opts$min.expr | mean_groupB>=opts$min.expr,gene]
 80 | 
 81 | ################################################
 82 | ## Differential expression testing with edgeR ##
 83 | ################################################
 84 | 
 85 | # Convert SCE to DGEList
 86 | sce_edger <- scran::convertTo(sce[genes.to.use,], type="edgeR")
 87 | 
 88 | # Define design matrix (with intercept)
 89 | design <- model.matrix(~sce$celltype)
 90 | 
 91 | # Estimate dispersions
 92 | sce_edger  <- estimateDisp(sce_edger,design)
 93 | 
 94 | # Fit GLM
 95 | fit <- glmQLFit(sce_edger,design)
 96 | 
 97 | # Likelihood ratio test
 98 | lrt <- glmQLFTest(fit)
 99 | 
100 | # Construct output data.frame
101 | out <- topTags(lrt, n=nrow(lrt))$table %>% as.data.table(keep.rownames=T) %>%
102 |   setnames(c("gene","logFC","logCPM","LR","p.value","padj_fdr")) %>%
103 |   .[,c("logCPM","LR","p.value"):=NULL] %>%
104 |   .[,c("padj_fdr","logFC"):=list(signif(padj_fdr,digits=3), round(logFC,3))] %>%
105 |   merge(expr.dt, by="gene", all.y=TRUE) %>%
106 |   setorder(padj_fdr, na.last=T)
107 | 
108 | ##################
109 | ## Save results ##
110 | ##################
111 | 
112 | fwrite(out, args$outfile, sep="\t", na="NA", quote=F)
113 | 


--------------------------------------------------------------------------------
/rna/differential/pseudobulk/celltype/old/run_diff_expr_celltype_pseudobulk.R:
--------------------------------------------------------------------------------
 1 | here::i_am("rna/differential/metacells/celltype/run_diff_expr_celltype.R")
 2 | 
 3 | # Load default settings
 4 | source(here::here("settings.R"))
 5 | 
 6 | ######################
 7 | ## Define arguments ##
 8 | ######################
 9 | 
10 | p <- ArgumentParser(description='')
11 | p$add_argument('--sce',             type="character",     help='SingleCellExperiment file')
12 | p$add_argument('--outdir',          type="character",     help='Output directory')
13 | p$add_argument('--test_mode',       action="store_true",  help='Test mode? subset data')
14 | 
15 | args <- p$parse_args(commandArgs(TRUE))
16 | 
17 | ## START TEST ##
18 | # io$basedir <- file.path(io$basedir,"test")
19 | # args <- list()
20 | # args$sce <- file.path(io$basedir,"results/rna/pseudobulk/celltype/SingleCellExperiment_pseudobulk_with_replicates.rds")
21 | # args$outdir <- file.path(io$basedir,"results/rna/differential/pseudobulk/with_replicates/celltype")
22 | # args$test_mode <- TRUE
23 | ## END TEST ##
24 | 
25 | #####################
26 | ## Define settings ##
27 | #####################
28 | 
29 | io$script <- here::here("rna/differential/pseudobulk/with_replicates/celltype/differential_rna_celltype_pseudobulk.R")
30 | dir.create(args$outdir, showWarnings=FALSE, recursive=TRUE)
31 | 
32 | #########
33 | ## Run ##
34 | #########
35 | 
36 | if (args$test_mode) {
37 |   print("Test mode activated, running only a few comparisons...")
38 |   opts$celltypes <- opts$celltypes %>% head(n=3)
39 | }
40 | 
41 | for (i in 1:length(opts$celltypes)) {
42 |   for (j in i:length(opts$celltypes)) {
43 |     if (i!=j) {
44 |       groupA <- opts$celltypes[[i]]
45 |       groupB <- opts$celltypes[[j]]
46 |       
47 |       outfile <- sprintf("%s/%s_vs_%s.txt.gz", args$outdir,groupA,groupB)
48 | 
49 |       # Define LSF command
50 |       if (grepl("BI",Sys.info()['nodename'])) {
51 |         lsf <- ""
52 |       } else if (grepl("pebble|headstone", Sys.info()['nodename'])) {
53 |         lsf <- sprintf("sbatch -n 1 --mem 7G --wrap")
54 |       }
55 |       cmd <- sprintf("%s 'Rscript %s --sce %s --groupA %s --groupB %s --outfile %s'", 
56 |         lsf, io$script, args$sce, groupA, groupB, outfile)
57 |       # if (isTRUE(opts$test_mode)) cmd <- paste0(cmd, " --test_mode")
58 | 
59 |       # Run
60 |       print(cmd)
61 |       system(cmd)
62 |     }
63 |   }
64 | }
65 | 
66 | 
67 | # Completion token
68 | file.create(file.path(args$outdir,"completed.txt"))


--------------------------------------------------------------------------------
/rna/differential/pseudobulk/celltype/parse_differential_results.R:
--------------------------------------------------------------------------------
 1 | here::i_am("rna/differential/pseudobulk/celltype/parse_differential_results.R")
 2 | 
 3 | # Load default settings
 4 | source(here::here("settings.R"))
 5 | # source(here::here("utils.R"))
 6 | 
 7 | ######################
 8 | ## Define arguments ##
 9 | ######################
10 | 
11 | p <- ArgumentParser(description='')
12 | p$add_argument('--diff_results_dir',   type="character",     help='File')
13 | p$add_argument('--outdir',             type="character",     help='File')
14 | args <- p$parse_args(commandArgs(TRUE))
15 | 
16 | ## START TEST ##
17 | # io$basedir <- file.path(io$basedir,"test")
18 | # args <- list()
19 | # args$diff_results_dir <- file.path(io$basedir,"results/rna/differential/pseudobulk/celltype")
20 | # args$outdir <- file.path(io$basedir,"results/rna/differential/pseudobulk/celltype")
21 | ## END TEST ##
22 | 
23 | # I/O
24 | dir.create(args$outdir, showWarnings = F, recursive = T)
25 | 
26 | ################################################
27 | ## Load differential expression and fetch TFs ##
28 | ################################################
29 | 
30 | diff_results_list <- list()
31 | 
32 | # i <- "Visceral_endoderm"; j <- "Surface_ectoderm"
33 | for (i in 1:length(opts$celltypes)) {
34 |   for (j in i:length(opts$celltypes)) {
35 |     
36 |     if (i!=j) {
37 |       file <- file.path(args$diff_results_dir,sprintf("%s_vs_%s.txt.gz",opts$celltypes[[i]],opts$celltypes[[j]]))
38 |       if (file.exists(file)) {
39 |         tmp <- fread(file) %>% .[,c("celltypeA","celltypeB"):=list(opts$celltypes[[i]],opts$celltypes[[j]])]
40 |         diff_results_list[[sprintf("%s_vs_%s",opts$celltypes[[i]],opts$celltypes[[j]])]] <- tmp
41 |       } else {
42 |         print(sprintf("%s not found...",file))
43 |       }
44 |     }
45 |   }
46 | }
47 |  
48 | ##########
49 | ## Save ##
50 | ##########
51 | 
52 | fwrite(rbindlist(diff_results_list), file.path(args$outdir,"diff_expr_results.txt.gz"), sep="\t", quote=F, na="NA")
53 | 
54 | 


--------------------------------------------------------------------------------
/rna/differential/pseudobulk/celltype_genotype/parse_differential_results.R:
--------------------------------------------------------------------------------
 1 | here::i_am("rna/differential/pseudobulk/celltype_genotype/parse_differential_results.R")
 2 | 
 3 | # Load default settings
 4 | source(here::here("settings.R"))
 5 | # source(here::here("utils.R"))
 6 | 
 7 | ######################
 8 | ## Define arguments ##
 9 | ######################
10 | 
11 | p <- ArgumentParser(description='')
12 | p$add_argument('--diff_results_dir',   type="character",     help='File')
13 | p$add_argument('--outfile',             type="character",     help='File')
14 | args <- p$parse_args(commandArgs(TRUE))
15 | 
16 | ## START TEST ##
17 | # io$basedir <- file.path(io$basedir,"test")
18 | # args <- list()
19 | # args$diff_results_dir <- file.path(io$basedir,"results/rna/differential/pseudobulk/celltype_genotype")
20 | # args$outfile <- file.path(io$basedir,"results/rna/differential/pseudobulk/celltype_genotype/parsed/diff_expr_results.txt.gz")
21 | ## END TEST ##
22 | 
23 | # I/O
24 | dir.create(dirname(args$outfile), showWarnings = F, recursive = T)
25 | 
26 | ##########################################
27 | ## Load differential expression results ##
28 | ##########################################
29 | 
30 | diff_results_list <- list()
31 | 
32 | # i <- "Visceral_endoderm"; j <- "Surface_ectoderm"
33 | for (i in 1:length(opts$celltypes)) {
34 |   file <- file.path(args$diff_results_dir,sprintf("%s.txt.gz",opts$celltypes[[i]]))
35 |   if (file.exists(file)) {
36 |     tmp <- fread(file) %>% .[,celltype:=opts$celltypes[[i]]]
37 |     if (nrow(tmp)>1) {
38 |       diff_results_list[[opts$celltypes[[i]]]] <- tmp
39 |     }
40 |   } else {
41 |     print(sprintf("%s not found...",file))
42 |   }
43 | }
44 |  
45 | print(names(diff_results_list))
46 | 
47 | ##########
48 | ## Save ##
49 | ##########
50 | 
51 | fwrite(rbindlist(diff_results_list), args$outfile, sep="\t", quote=F, na="NA")
52 | 
53 | 


--------------------------------------------------------------------------------
/rna/differential/utils.R:
--------------------------------------------------------------------------------
 1 | 
 2 | # Function to differential expression
 3 | # - sce: SingleCellExperiment object with the column "group" in the colData
 4 | # - groups: the names of the two groups
 5 | # - min_detection_rate_per_group: minimum detection rate per group
 6 | doDiffExpr <- function(sce, groups, min_detection_rate_per_group = 0.50) {
 7 |     
 8 |   # Sanity checks
 9 |   if (!is(sce, "SingleCellExperiment")) stop("'sce' has to be an instance of SingleCellExperiment")
10 |   stopifnot(length(groups)==2)
11 | 
12 |   # Filter genes by detection rate per group
13 |   cdr_A <- rowMeans(logcounts(sce[,sce$group==groups[1]])>0) >= min_detection_rate_per_group
14 |   cdr_B <- rowMeans(logcounts(sce[,sce$group==groups[2]])>0) >= min_detection_rate_per_group
15 |   out <- .edgeR(sce[cdr_B | cdr_A,])
16 |   
17 |   return(out)
18 | }
19 | 
20 | 
# Run an edgeR quasi-likelihood test on a SingleCellExperiment
# - sce: SingleCellExperiment object with the column "group" in the colData
# Returns a data.table with the columns gene, logFC and padj_fdr
.edgeR <- function(sce) {

  # Design matrix (with intercept): per-column detection rate as covariate, plus the group
  cdr <- colMeans(logcounts(sce)>0)
  design <- model.matrix(~cdr+sce$group)

  # Convert SCE to DGEList and estimate dispersions
  dge <- estimateDisp(scran::convertTo(sce, type="edgeR"), design)

  # Fit the quasi-likelihood GLM and run the F-test (glmQLFTest, default coefficient)
  qlf_test <- glmQLFTest(glmQLFit(dge,design))

  # Collect all genes; columns are renamed by position and only gene, logFC and padj_fdr are kept
  res <- as.data.table(topTags(qlf_test, n=nrow(qlf_test))$table, keep.rownames=T)
  setnames(res, c("gene","logFC","logCPM","LR","p.value","padj_fdr"))
  res[,c("logCPM","LR","p.value"):=NULL]

  return(res)
}
46 | 
47 | ################
48 | ## Plot utils ##
49 | ################
50 | 
51 | 
# Volcano plot of differential expression results
# - to.plot: data.table with the columns "gene", "logFC", "padj_fdr" and the logical column "sig"
# - top_genes: number of significant genes to label. The first `top_genes` rows with sig==TRUE
#   are taken in the order given, so sort the table beforehand to control which genes are labelled
# - xlim: x-axis limits. NULL (default) gives c(-6,6); a single number x is expanded to
#   c(-x,x); a longer vector is reduced to its range
# - ylim: y-axis limits. NULL (default) gives c(0,115); a single number y is expanded to
#   c(0,y); a longer vector is reduced to its range
# - label_groups: optional vector of length 2 with the names of the groups; the first name is
#   printed on the positive side and the second on the negative side
# Returns a ggplot object. The limits are applied through the scales, so points outside them
# are not drawn.
gg_volcano_plot <- function(to.plot, top_genes=10, xlim=NULL, ylim=NULL, label_groups = NULL) {

  # Resolve the axis limits (the defaults reproduce the previously hard-coded values)
  if (is.null(xlim)) {
    x_limits <- c(-6,6)
  } else if (length(xlim)==1) {
    x_limits <- c(-abs(xlim),abs(xlim))
  } else {
    x_limits <- range(xlim)
  }
  if (is.null(ylim)) {
    y_limits <- c(0,115)
  } else if (length(ylim)==1) {
    y_limits <- c(0,ylim)
  } else {
    y_limits <- range(ylim)
  }

  # Annotation positions, expressed relative to the limits so that they follow custom limits.
  # With the default limits these evaluate to x=-5/5, x=-4/4, y=115 and yend=105.
  x_counts <- (x_limits*5)/6
  x_groups <- (x_limits*4)/6
  y_bottom <- y_limits[1]
  y_top <- y_limits[2]
  y_axis_end <- (y_top*105)/115

  # Counts shown at the top of the plot (computed before rows with missing values are removed)
  negative_hits <- to.plot[sig==TRUE & logFC<0,gene]
  positive_hits <- to.plot[sig==TRUE & logFC>0,gene]
  all <- nrow(to.plot)

  # Rows without fold change or adjusted p-value cannot be drawn
  to.plot <- to.plot[!is.na(logFC) & !is.na(padj_fdr)]

  # The 1e-100 offset avoids -log10(0) for adjusted p-values equal to zero
  p <- ggplot(to.plot, aes(x=logFC, y=-log10(padj_fdr+1e-100))) +
    labs(x="Log fold change", y=expression(paste("-log"[10],"(q.value)"))) +
    ggrastr::geom_point_rast(aes(color=sig, size=sig)) +
    # geom_hline(yintercept = -log10(opts$threshold_fdr), color="blue") +
    geom_segment(aes(x=0, xend=0, y=y_bottom, yend=y_axis_end), color="orange", size=0.5) +
    scale_color_manual(values=c("black","red")) +
    scale_size_manual(values=c(0.5,1)) +
    scale_x_continuous(limits=x_limits) +
    scale_y_continuous(limits=y_limits) +
    annotate("text", x=0, y=y_top, size=4, label=sprintf("(%d)", all)) +
    annotate("text", x=x_counts[1], y=y_top, size=4, label=sprintf("%d (-)",length(negative_hits))) +
    annotate("text", x=x_counts[2], y=y_top, size=4, label=sprintf("%d (+)",length(positive_hits))) +
    ggrepel::geom_text_repel(data=head(to.plot[sig==T],n=top_genes), aes(x=logFC, y=-log10(padj_fdr+1e-100), label=gene), max.overlaps=Inf, size=4) +
    theme_classic() +
    theme(
      axis.text = element_text(size=rel(0.75), color='black'),
      axis.title = element_text(size=rel(1.0), color='black'),
      legend.position="none"
    )

  # Optional labels stating which group is up-regulated on each side
  if (length(label_groups)>0) {
    p <- p +
      annotate("text", x=x_groups[1], y=y_bottom, size=4, label=sprintf("Up in %s",label_groups[2])) +
      annotate("text", x=x_groups[2], y=y_bottom, size=4, label=sprintf("Up in %s",label_groups[1]))
  }

  return(p)
}
94 | 
95 | 


--------------------------------------------------------------------------------
/rna/mapping/analysis/plot_utils.R:
--------------------------------------------------------------------------------
 1 | 
 2 | plot.dimred <- function(plot_df, query.label, atlas.label = "Atlas") {
 3 |   
 4 |   # Define dot size  
 5 |   size.values <- c(opts$size.mapped, opts$size.nomapped)
 6 |   names(size.values) <- c(query.label, atlas.label)
 7 |   
 8 |   # Define dot alpha  
 9 |   alpha.values <- c(opts$alpha.mapped, opts$alpha.nomapped)
10 |   names(alpha.values) <- c(query.label, atlas.label)
11 |   
12 |   # Define dot colours  
13 |   colour.values <- c("red", "lightgrey")
14 |   names(colour.values) <- c(query.label, atlas.label)
15 |   
16 |   # Plot
17 |   ggplot(plot_df, aes(x=V1, y=V2)) +
18 |     ggrastr::geom_point_rast(aes(size=mapped, alpha=mapped, colour=mapped)) +
19 |     scale_size_manual(values = size.values) +
20 |     scale_alpha_manual(values = alpha.values) +
21 |     scale_colour_manual(values = colour.values) +
22 |     # labs(x="UMAP Dimension 1", y="UMAP Dimension 2") +
23 |     guides(colour = guide_legend(override.aes = list(size=6))) +
24 |     theme_classic() +
25 |     theme(
26 |       legend.position = "top", 
27 |       legend.title = element_blank(),
28 |       axis.line = element_blank(),
29 |       axis.text = element_blank(),
30 |       axis.title = element_blank(),
31 |       axis.ticks = element_blank()
32 |     )
33 | }
34 | 
# Scatter plot of a 2D embedding highlighting WT and KO cells on top of the background cells
# - plot_df: table with the coordinates V1 and V2 and the column "mapped", whose values are
#   wt.label, ko.label or nomapped.label
# - wt.label, ko.label: values of "mapped" for the two highlighted groups (red and blue)
# - nomapped.label: value of "mapped" for the background cells (light grey)
# Dot size and transparency are read from the global opts (size.mapped, size.nomapped,
# alpha.mapped, alpha.nomapped). Returns a ggplot object.
plot.dimred.wtko <- function(plot_df, wt.label = "WT", ko.label = "KO", nomapped.label = "-") {

  # Aesthetic values are named by the label they apply to: WT, KO, background
  labels <- c(wt.label, ko.label, nomapped.label)
  size.values <- setNames(c(opts$size.mapped, opts$size.mapped, opts$size.nomapped), labels)
  alpha.values <- setNames(c(opts$alpha.mapped, opts$alpha.mapped, opts$alpha.nomapped), labels)
  colour.values <- setNames(c("red", "blue", "lightgrey"), labels)

  # Legend on top, no axis text, titles or ticks (axis lines are kept)
  bare_theme <- theme(
    legend.position = "top",
    legend.title = element_blank(),
    axis.text = element_blank(),
    axis.title = element_blank(),
    axis.ticks = element_blank()
  )

  # Plot
  ggplot(plot_df, aes(x=V1, y=V2)) +
    ggrastr::geom_point_rast(aes(size=mapped, alpha=mapped, colour=mapped)) +
    scale_size_manual(values = size.values) +
    scale_alpha_manual(values = alpha.values) +
    scale_colour_manual(values = colour.values) +
    guides(colour = guide_legend(override.aes = list(size=6))) +
    theme_classic() +
    bare_theme
}
65 | 


--------------------------------------------------------------------------------
/rna/mapping/run/parse_sample_metadata_after_mapping.R:
--------------------------------------------------------------------------------
 1 | here::i_am("rna/mapping/run/parse_sample_metadata_after_mapping.R")
 2 | 
 3 | source(here::here("settings.R"))
 4 | 
 5 | ######################
 6 | ## Define arguments ##
 7 | ######################
 8 | 
 9 | p <- ArgumentParser(description='')
10 | # p$add_argument('--query_samples',   type="character",   nargs='+',  help='Query samples')
11 | p$add_argument('--metadata',    type="character",  help='Metadata file to use as input')
12 | # p$add_argument('--mapping_seurat',    type="character", nargs="+", help='Results of the Seurat mapping')
13 | p$add_argument('--mapping_mnn',    type="character",  nargs="+", help='Results of the MNN mapping')
14 | p$add_argument('--outfile',          type="character",               help='Output file')
15 | args <- p$parse_args(commandArgs(TRUE))
16 | 
17 | ###################
18 | ## Load settings ##
19 | ###################
20 | 
21 | 
22 | ## START TEST ##
23 | # args$query_samples <- opts$samples
24 | # args$metadata <- file.path(io$basedir,"results/rna/qc/sample_metadata_after_qc.txt.gz")
25 | # # args$mapping_dir <- file.path(io$basedir,"results/rna/mapping")
26 | # args$mapping_mnn <- file.path(io$basedir,"results/rna/mapping(..)")
27 | # args$mapping_seurat <- file.path(io$basedir,"results/rna/mapping/(..)")
28 | # args$outfile <- file.path(io$basedir,"results/rna/mapping/sample_metadata_after_mapping.txt.gz")
29 | ## END TEST ##
30 | 
31 | 
32 | ###################
33 | ## Load metadata ##
34 | ###################
35 | 
36 | sample_metadata <- fread(args$metadata)
37 | 
38 | ##########################
39 | ## Load mapping results ##
40 | ##########################
41 | 
42 | # MNN
43 | mapping_mnn.dt <- args$mapping_mnn %>% map(~ fread(.)) %>% rbindlist
44 | stopifnot(mapping_mnn.dt$cell%in%sample_metadata$cell)
45 | 
46 | # Seurat
47 | # mapping_seurat.dt <- args$mapping_seurat %>% map(~ fread(.)) %>% rbindlist
48 | # stopifnot(mapping_seurat.dt$cell%in%sample_metadata$cell)
49 | 
50 | ###########
51 | ## Merge ##
52 | ###########
53 | 
54 | # mapping.dt <- merge(mapping_mnn.dt, mapping_seurat.dt, by="cell", suffixes=c("_mnn","_seurat"))
55 | # to.save <- sample_metadata %>% merge(mapping.dt,by="cell",all.x=TRUE)
56 | 
57 | to.save <- sample_metadata %>% merge(mapping_mnn.dt, by="cell", all.x=TRUE)
58 | 
59 | #  .[,celltype_genotype:=sprintf("%s-%s",celltype,genotype)] %>%
60 | 
61 | #################
62 | ## Save output ##
63 | #################
64 | 
65 | fwrite(to.save, args$outfile, sep="\t", na="NA", quote=F)
66 | 
67 | ######################
68 | ## Compare mappings ##
69 | ######################
70 | 
71 | # mapping_mnn.dt <- readRDS(sprintf("%s/mapping_mnn_%s.rds",io$mapping.dir,paste(opts$samples,collapse="-")))$mapping %>% .[,c("cell","celltype.mapped","celltype.score","closest.cell")] %>% as.data.table
72 | # mapping_seurat.dt <- fread(sprintf("%s/mapping_seurat_%s.txt.gz",io$mapping.dir,paste(opts$samples,collapse="-"))) %>% .[,c("predicted.id")] %>% as.data.table
73 | # 
74 | # foo <- merge(
75 | #   mapping_mnn.dt[,c("cell","celltype.mapped")] %>% setnames("celltype.mapped","celltype_mnn"),
76 | #   mapping_seurat.dt[,c("cell","predicted.id")] %>% setnames("predicted.id","celltype_seurat"),
77 | #   by = c("cell")
78 | # )
79 | 


--------------------------------------------------------------------------------
/rna/mapping/trajectories/parse_sample_metadata_after_mapping.R:
--------------------------------------------------------------------------------
 1 | here::i_am("mapping/trajectories/parse_sample_metadata_after_mapping.R")
 2 | 
 3 | source(here::here("settings.R"))
 4 | 
 5 | ######################
 6 | ## Define arguments ##
 7 | ######################
 8 | 
 9 | p <- ArgumentParser(description='')
10 | p$add_argument('--metadata',    type="character",  help='Metadata file to use as input')
11 | # p$add_argument('--mapping_seurat',    type="character", nargs="+", help='Results of the Seurat mapping')
12 | p$add_argument('--mapping_mnn',    type="character",  nargs="+", help='Results of the MNN mapping')
13 | p$add_argument('--outfile',          type="character",               help='Output file')
14 | args <- p$parse_args(commandArgs(TRUE))
15 | 
16 | ###################
17 | ## Load settings ##
18 | ###################
19 | 
20 | ## START TEST ##
21 | # args$metadata <- file.path(io$basedir,"results/mapping/sample_metadata_after_mapping.txt.gz")
22 | # args$mapping_mnn <- file.path(io$basedir,sprintf("results/mapping/trajectories/NMP/mapping_mnn_%s.txt.gz",opts$samples))
23 | # args$outfile <- file.path(io$basedir,"results/mapping/trajectories/NMP/sample_metadata_after_mapping.txt.gz")
24 | ## END TEST ##
25 | 
26 | stopifnot(file.exists(args$mapping_mnn))
27 | 
28 | ###################
29 | ## Load metadata ##
30 | ###################
31 | 
32 | sample_metadata <- fread(args$metadata) %>%
33 |   .[,c("cell","sample","class","alias","celltype.mapped")] %>%
34 |   setnames("celltype.mapped","global_mapping")
35 | 
36 | ##########################
37 | ## Load mapping results ##
38 | ##########################
39 | 
40 | mapping_mnn.dt <- args$mapping_mnn %>% map(~ fread(.)) %>% rbindlist
41 | stopifnot(mapping_mnn.dt$cell%in%sample_metadata$cell)
42 | 
43 | ###########
44 | ## Merge ##
45 | ###########
46 | 
47 | to.save <- sample_metadata %>% 
48 |   merge(mapping_mnn.dt, by=c("cell","sample","class"))
49 | 
50 | #################
51 | ## Save output ##
52 | #################
53 | 
54 | fwrite(to.save, args$outfile, sep="\t", na="NA", quote=F)
55 | 


--------------------------------------------------------------------------------
/rna/mapping/trajectories/plot_mapping_trajectory_wt_vs_ko.R:
--------------------------------------------------------------------------------
 1 | here::i_am("rna/mapping/trajectories/plot_mapping_dimred.R")
 2 | 
 3 | source(here::here("settings.R"))
 4 | source(here::here("rna/mapping/analysis/plot_utils.R"))
 5 | 
 6 | ######################
 7 | ## Define arguments ##
 8 | ######################
 9 | 
10 | p <- ArgumentParser(description='')
11 | p$add_argument('--query_metadata',        type="character",                               help='Cell metadata (after mapping)')
12 | p$add_argument('--atlas_metadata',        type="character",                               help='Cell metadata (after mapping)')
13 | p$add_argument('--outdir',          type="character",                               help='Output file')
14 | 
15 | args <- p$parse_args(commandArgs(TRUE))
16 | 
17 | ## START TEST ##
18 | args$query_metadata <- file.path(io$basedir,"results/rna/mapping/trajectories/nmp_somitic_spinal/sample_metadata_after_mapping.txt.gz")
19 | args$atlas_metadata <- file.path(io$atlas.basedir,"results/trajectories/nmp_somitic_spinal/nmp_trajectory.txt.gz")
20 | args$outdir <- file.path(io$basedir,"results/rna/mapping/trajectories/nmp_somitic_spinal/pdf")
21 | ## END TEST ##
22 | 
23 | dir.create(args$outdir, showWarnings = F, recursive = T)
24 | 
25 | #####################
26 | ## Define settings ##
27 | #####################
28 | 
29 | # Options
30 | 
31 | # Dot size
32 | opts$size.mapped <- 1.20
33 | opts$size.nomapped <- 0.1
34 | 
35 | # Transparency
36 | opts$alpha.mapped <- 0.80
37 | opts$alpha.nomapped <- 0.35
38 | 
39 | #########################
40 | ## Load query metadata ##
41 | #########################
42 | 
43 | sample_metadata <- fread(args$query_metadata) %>%
44 |   .[!is.na(closest.cell)]
45 | 
46 | stopifnot("closest.cell"%in%colnames(sample_metadata))
47 | 
48 | ###########################
49 | ## Load atlas trajectory ##
50 | ###########################
51 | 
52 | meta_atlas <- fread(args$atlas_metadata) %>%
53 |   setnames(c("cell","V1","V2"))
54 | 
55 | ##########
56 | ## Plot ##
57 | ##########
58 | 
59 | # i <- "E7.5"
60 | to.plot <- meta_atlas %>% copy %>%
61 |   .[,index.wt:=match(cell, sample_metadata[genotype=="WT",closest.cell] )] %>%
62 |   .[,index.ko:=match(cell, sample_metadata[genotype=="T_KO",closest.cell] )] %>%
63 |   .[,mapped.wt:=c(0,-10)[as.numeric(as.factor(!is.na(index.wt)))]] %>%
64 |   .[,mapped.ko:=c(0,10)[as.numeric(as.factor(!is.na(index.ko)))]] %>%
65 |   .[,mapped:=factor(mapped.wt + mapped.ko, levels=c("0","-10","10"))] %>%
66 |   .[,mapped:=plyr::mapvalues(mapped, from = c("0","-10","10"), to = c("Atlas","WT","T_KO"))] %>% setorder(mapped)
67 | 
68 | p <- plot.dimred.wtko(to.plot, wt.label = "WT", ko.label = "T_KO", nomapped.label = "Atlas") +
69 |   theme(legend.position = "none", axis.line = element_blank())
70 | 
71 | pdf(file.path(args$outdir,"umap_mapped_trajectory_WT_and_KO.pdf"), width=4.5, height=5)
72 | print(p)
73 | dev.off()
74 | 


--------------------------------------------------------------------------------
/rna/metacells/SEACell_env.yml:
--------------------------------------------------------------------------------
 1 | # https://github.com/dpeerlab/SEACells
 2 | 
 3 | channels:
 4 |   - conda-forge
 5 |   - bioconda
 6 | dependencies:
 7 |   - scanpy=1.8.2
 8 |   - loompy=3.0.6
 9 |   - jupyter
10 |   - louvain
11 |   - python-igraph
12 |   - louvain>=0.6,!=0.6.2
13 |   - leidenalg
14 |   # - harmonypy
15 |   # - scanorama
16 |   - seaborn
17 |   - cython
18 |   - pyranges
19 |   - pip
20 |   - pip:
21 |     - dfply
22 |     # - palantir
23 |     # - PhenoGraph
24 | 
25 | 
26 | # commands:
27 | # conda create -n metacells python==3.9 --yes
28 | # conda activate metacells
29 | # conda install mamba --yes
30 | # mamba env update -n metacells --file SEACell_env.yml
31 | # pip install git+https://github.com/settylab/Palantir@removeTSNE
32 | # python setup.py develop # inside SEACells
33 | 
34 | 
35 | # mamba remove -n metacells --all


--------------------------------------------------------------------------------
/rna/metacells/analysis/overlay_metacells_atlas_umap.R:
--------------------------------------------------------------------------------
 1 | # here::i_am("atac/archR/processing/save_archr_matrices.R")
 2 | 
 3 | source(here::here("settings.R"))
 4 | source(here::here("utils.R"))
 5 | 
 6 | source(here::here("rna/mapping/analysis/plot_utils.R"))
 7 | 
 8 | #####################
 9 | ## Define settings ##
10 | #####################
11 | 
12 | ## I/O
13 | io$metacell_metadata <- file.path(io$basedir,"results/rna/metacells/metacells_metadata.txt.gz")
14 | io$metacell_sce <- file.path(io$basedir,"results/rna/metacells/SingleCellExperiment_metacells.rds")
15 | # io$umap <- file.path(io$basedir,"results/rna/dimensionality_reduction/sce/batch_correction_by_sample_remove_ExE_cells_False/umap_features2500_pcs50_neigh25_dist0.5.txt.gz")
16 | io$outdir <- file.path(io$basedir,"results/rna/metacells/pdf"); dir.create(io$outdir, showWarnings = F)
17 | 
18 | # Dot size
19 | opts$size.mapped <- 0.30
20 | opts$size.nomapped <- 0.1
21 | 
22 | # Dot transparency
23 | opts$alpha.mapped <- 0.75
24 | opts$alpha.nomapped <- 0.35
25 | 
26 | ###################
27 | ## Load metadata ##
28 | ###################
29 | 
30 | metacell_metadata.dt <- fread(io$metacell_metadata)
31 | # sample_metadata.dt <- fread(io$metadata)
32 | 
33 | ###############################
34 | ## Load SingleCellExperiment ##
35 | ###############################
36 | 
37 | sce <- readRDS(io$metacell_sce)
38 | 
39 | ###########################
40 | ## Load precomputed UMAP ##
41 | ###########################
42 | 
43 | # umap.dt <- fread(io$umap, select=c(3,1,2)) %>% setnames(c("cell","V1","V2"))
44 | 
45 | umap.dt <- fread(io$rna.atlas.metadata) %>%
46 |   .[stripped==F & doublet==F] %>%
47 |   .[,c("cell","umapX","umapY","celltype")] %>%
48 |   setnames(c("umapX","umapY"),c("V1","V2"))
49 | 
50 | #########################################################
51 | ## Plot dimensionality reduction: one sample at a time ##
52 | #########################################################
53 | 
54 | to.plot <- umap.dt %>% copy %>%
55 |   .[,index:=match(cell, metacell_metadata.dt$closest.cell)] %>% 
56 |   .[,mapped:=as.factor(!is.na(index))] %>% 
57 |   .[,mapped:=plyr::mapvalues(mapped, from = c("FALSE","TRUE"), to = c("Atlas","Metacell"))] %>%
58 |   setorder(mapped) 
59 | 
60 | p <- plot.dimred(to.plot, query.label = "Metacell", atlas.label = "Atlas")
61 | 
62 | pdf(file.path(io$outdir,"umap_metacell.pdf"), width=8, height=6.5)
63 | print(p)
64 | dev.off()
65 | 


--------------------------------------------------------------------------------
/rna/metacells/analysis/trajectories/overlay_metacells_atlas_trajectory.R:
--------------------------------------------------------------------------------
 1 | # here::i_am("atac/archR/processing/save_archr_matrices.R")
 2 | 
 3 | source(here::here("settings.R"))
 4 | source(here::here("utils.R"))
 5 | 
 6 | source(here::here("rna/mapping/analysis/plot_utils.R"))
 7 | 
 8 | #####################
 9 | ## Define settings ##
10 | #####################
11 | 
12 | ## I/O
13 | io$metacell_metadata <- file.path(io$basedir,"results/rna/metacells/trajectories/nmp/metacells_metadata.txt.gz")
14 | io$metacell_sce <- file.path(io$basedir,"results/rna/metacells/trajectories/nmp/SingleCellExperiment_metacells.rds")
15 | io$trajectory <- file.path(io$basedir,"results/rna/trajectories/nmp/nmp_trajectory.txt.gz")
16 | io$atlas_trajectory <- file.path(io$atlas.basedir,"results/trajectories/nmp_somitic_spinal/nmp_trajectory.txt.gz")
17 | io$outdir <- file.path(io$basedir,"results/rna/metacells/trajectories/nmp/pdf"); dir.create(io$outdir, showWarnings = F)
18 | 
19 | # Dot size
20 | opts$size.mapped <- 1
21 | opts$size.nomapped <- 0.1
22 | 
23 | # Dot transparency
24 | opts$alpha.mapped <- 0.85
25 | opts$alpha.nomapped <- 0.35
26 | 
27 | ###################
28 | ## Load metadata ##
29 | ###################
30 | 
31 | metacell_metadata.dt <- fread(io$metacell_metadata)
32 | # sample_metadata.dt <- fread(io$metadata)
33 | 
34 | ###############################
35 | ## Load SingleCellExperiment ##
36 | ###############################
37 | 
38 | sce <- readRDS(io$metacell_sce)
39 | 
40 | #####################
41 | ## Load trajectory ##
42 | #####################
43 | 
44 | # trajectory.dt <- fread(io$atlas_trajectory) %>% setnames(c("cell","V1","V2"))
45 | trajectory.dt <- fread(io$trajectory) %>% setnames(c("cell","V1","V2"))
46 | 
47 | #################################################
48 | ## Plot mapping of metacells to the trajectory ##
49 | #################################################
50 | 
51 | to.plot <- trajectory.dt %>% copy %>%
52 |   # .[,index:=match(cell, metacell_metadata.dt$closest.cell)] %>% 
53 |   .[,index:=match(cell, metacell_metadata.dt$metacell)] %>% 
54 |   .[,mapped:=as.factor(!is.na(index))] %>% 
55 |   .[,mapped:=plyr::mapvalues(mapped, from = c("FALSE","TRUE"), to = c("Atlas","Metacell"))] %>%
56 |   setorder(mapped) 
57 | 
58 | p <- plot.dimred(to.plot, query.label = "Metacell", atlas.label = "Atlas")
59 | 
60 | pdf(file.path(io$outdir,"trajectory_highlight_metacells.pdf"), width=8, height=6.5)
61 | print(p)
62 | dev.off()
63 | 


--------------------------------------------------------------------------------
/rna/plot_individual_genes/pseudobulk/plot_paga_individual_genes_pseudobulk.R:
--------------------------------------------------------------------------------
  1 | #####################
  2 | ## Define settings ##
  3 | #####################
  4 | # Load the project-wide settings and helper functions from the machine-specific checkout
  5 | if (grepl("ricard",Sys.info()['nodename'])) {
  6 |   source("/Users/ricard/gastrulation_multiome_10x/settings.R")
  7 |   source("/Users/ricard/gastrulation_multiome_10x/utils.R")
  8 | } else if (grepl("ebi",Sys.info()['nodename'])) {
  9 |   source("/homes/ricard/gastrulation_multiome_10x/settings.R")
 10 |   source("/homes/ricard/gastrulation_multiome_10x/utils.R")
 11 | } else { stop("Computer not recognised") }  # fail with a clear message instead of a later "object 'io' not found"
 12 | 
 13 | # I/O: output directory for the per-gene PNG files (created together with any missing parent directories)
 14 | io$outdir <- paste0(io$basedir,"/results/rna/individual_genes/pseudobulk"); dir.create(io$outdir, showWarnings = F, recursive = T)
 15 | 
 16 | # Options: names of the pseudobulk columns (cell types) to keep, in the order they are selected below
 17 | opts$celltypes = c(
 18 | 	"Epiblast",
 19 | 	"Primitive_Streak",
 20 | 	"Caudal_epiblast",
 21 | 	"PGC",
 22 | 	"Anterior_Primitive_Streak",
 23 | 	"Notochord",
 24 | 	"Def._endoderm",
 25 | 	"Gut",
 26 | 	"Nascent_mesoderm",
 27 | 	"Mixed_mesoderm",
 28 | 	"Intermediate_mesoderm",
 29 | 	"Caudal_Mesoderm",
 30 | 	"Paraxial_mesoderm",
 31 | 	"Somitic_mesoderm",
 32 | 	"Pharyngeal_mesoderm",
 33 | 	"Cardiomyocytes",
 34 | 	"Allantois",
 35 | 	"ExE_mesoderm",
 36 | 	"Mesenchyme",
 37 | 	"Haematoendothelial_progenitors",
 38 | 	"Endothelium",
 39 | 	"Blood_progenitors_1",
 40 | 	"Blood_progenitors_2",
 41 | 	"Erythroid1",
 42 | 	"Erythroid2",
 43 | 	"Erythroid3",
 44 | 	"NMP",
 45 | 	"Rostral_neurectoderm",
 46 | 	"Caudal_neurectoderm",
 47 | 	"Neural_crest",
 48 | 	"Forebrain_Midbrain_Hindbrain",
 49 | 	"Spinal_cord",
 50 | 	"Surface_ectoderm",
 51 | 	"Visceral_endoderm",
 52 | 	"ExE_endoderm",
 53 | 	"Parietal_endoderm",
 54 | 	"ExE_ectoderm"
 55 | )
 56 | 
 57 | ###############
 58 | ## Load data ##
 59 | ###############
 60 | 
 61 | # Load the pseudobulk SingleCellExperiment object, keeping only the columns named in opts$celltypes
 62 | rna.sce <- readRDS(io$rna.pseudobulk.sce)[,opts$celltypes]
 63 | 
 64 | # Load gene metadata, restricted to symbols present in rna.sce (not referenced again in this script)
 65 | gene_metadata <- fread(io$gene_metadata) %>%
 66 |   .[symbol%in%rownames(rna.sce)]
 67 | 
 68 | ###############
 69 | ## Load PAGA ##
 70 | ###############
 71 | # load_paga_graph.R is expected to define net.paga, which is used below (assumption: the file is not visible here -- confirm)
 72 | if (grepl("ricard",Sys.info()['nodename'])) {
 73 |   source("/Users/ricard/gastrulation_multiome_10x/load_paga_graph.R")
 74 | } else if (grepl("ebi",Sys.info()['nodename'])) {
 75 |   source("/homes/ricard/gastrulation_multiome_10x/load_paga_graph.R")
 76 | } else {
 77 |   stop("Computer not recognised")
 78 | }
 79 | 
 80 | # Plot graph structure: edges only (node.size = 0, label = FALSE); this base plot is reused for every gene below
 81 | p <- ggnet2(
 82 |   net = net.paga,
 83 |   mode = c("x", "y"),  # presumably the vertex attributes holding the node coordinates -- confirm
 84 |   node.size = 0,
 85 |   edge.size = 0.15,
 86 |   edge.color = "grey",
 87 |   label = FALSE,
 88 |   label.size = 2.3
 89 | )
 90 | 
 91 | 
 92 | ##########
 93 | ## Plot ##
 94 | ##########
 95 | 
 96 | # Define color scale: 11 colours from gray92 to darkgreen, looked up by expression rounded to one decimal (0, 0.1, ..., 1)
 97 | rna.col.seq <- chromvar.col.seq <- round(seq(0,1,0.1), 2)
 98 | rna.colors <- colorRampPalette(c("gray92", "darkgreen"))(length(rna.col.seq))
 99 | 
100 | # Define genes to plot
101 | # genes.to.plot <- rownames(rna.sce)[grep("Gata",rownames(rna.sce))]
102 | genes.to.plot <- c("Foxa2","Tfap2a","Mesp1")
103 | # seq_along() rather than 1:length(): an empty gene vector then skips the loop instead of iterating over c(1,0)
104 | for (i in seq_along(genes.to.plot)) {
105 |   gene <- genes.to.plot[i]
106 |   print(sprintf("%s/%s: %s",i,length(genes.to.plot),gene))
107 |   # Rescale expression with minmax.normalisation() (project helper; presumably maps to [0,1] -- confirm), then pick one colour per value
108 |   expr.values <- logcounts(rna.sce[gene,])[1,] %>% minmax.normalisation()
109 |   expr.colors <- round(expr.values,1) %>% map(~ rna.colors[which(rna.col.seq == .)]) %>% unlist
110 |   # Draw one glyph (U+25D0) per graph node, coloured by expression; NOTE(review): assumes p$data rows follow the column order of rna.sce -- confirm
111 |   p.rna <- p + geom_text(label = "\u25D0", aes(x=x, y=y), color=expr.colors, size=20, family = "Arial Unicode MS",
112 |                       data = p$data[,c("x","y")] %>% dplyr::mutate(expr=expr.colors)) +
113 |     scale_colour_manual(values=expr.colors) + 
114 |     labs(title=gene) +
115 |     theme(
116 |       plot.title = element_text(hjust = 0.5)
117 |     )
118 |   
119 |   # Write one PNG per gene into io$outdir
120 |   png(sprintf("%s/%s_rna_expression_paga.png",io$outdir,gene), width = 350, height = 400)
121 |   # pdf(sprintf("%s/%s_rna_expression_paga.pdf",io$outdir,i), width=5, height=3.5)
122 |   print(p.rna)
123 |   dev.off()
124 | }
125 | 
126 | 
127 | 


--------------------------------------------------------------------------------
/rna/processing/3_seurat_to_SCE.R:
--------------------------------------------------------------------------------
  1 | here::i_am("rna/processing/3_seurat_to_SCE.R")
  2 | # Convert a Seurat object into a SingleCellExperiment with size factors (and, optionally, log-normalised counts)
  3 | # Load default settings
  4 | source(here::here("settings.R"))
  5 | 
  6 | suppressPackageStartupMessages(library(Seurat))
  7 | suppressPackageStartupMessages(library(scater))
  8 | suppressPackageStartupMessages(library(scran))
  9 | 
 10 | ######################
 11 | ## Define arguments ##
 12 | ######################
 13 | # ArgumentParser is not loaded in this file; presumably argparse is attached by settings.R -- confirm
 14 | p <- ArgumentParser(description='')
 15 | p$add_argument('--test',            action="store_true",                 help='Testing mode')
 16 | p$add_argument('--normalise',       action="store_true",                 help='Log-Normalise?')
 17 | p$add_argument('--samples',         type="character",       nargs="+",   help='Samples')
 18 | p$add_argument('--seurat',         type="character", help='Seurat object (input)')
 19 | p$add_argument('--metadata',         type="character", help='Metadata file')
 20 | p$add_argument('--outfile',         type="character", help='Output file')
 21 | args <- p$parse_args(commandArgs(TRUE))
 22 | 
 23 | #####################
 24 | ## Define settings ##
 25 | #####################
 26 | 
 27 | ## START TEST ##
 28 | # args <- list()
 29 | # args$outfile <- io$rna.sce
 30 | # args$outfile <- paste0(io$basedir,"/processed/rna_new/SingleCellExperiment.rds")
 31 | # args$samples <- opts$samples
 32 | # args$metadata <- paste0(io$basedir,"/results/rna_new/qc/sample_metadata_after_qc.txt.gz")
 33 | # args$seurat <- paste0(io$basedir,"/processed/rna_new/seurat.rds")
 34 | # args$test <- FALSE
 35 | # args$normalise <- FALSE
 36 | ## END TEST ##
 37 | 
 38 | # Sanity checks: every requested sample must be listed in opts$samples; test mode keeps only the first two samples
 39 | stopifnot(args$samples%in%opts$samples)
 40 | if (args$test) args$samples <- head(args$samples,n=2)
 41 | 
 42 | ###############
 43 | ## Load data ##
 44 | ###############
 45 | 
 46 | # Load sample metadata: cells flagged pass_rnaQC that belong to the requested samples
 47 | sample_metadata <- fread(args$metadata) %>% .[pass_rnaQC==TRUE & sample%in%args$samples]
 48 | table(sample_metadata$sample)
 49 | 
 50 | # Load seurat, subset to the cells retained in sample_metadata
 51 | seurat <- readRDS(args$seurat)[,sample_metadata$cell]
 52 | 
 53 | #####################################
 54 | ## Convert to SingleCellExperiment ##
 55 | #####################################
 56 | 
 57 | sce <- as.SingleCellExperiment(seurat)
 58 | 
 59 | # remove logcounts assays (NOTE(review): this writes to the object's internal slots rather than using an accessor -- consider assay(sce, "logcounts") <- NULL)
 60 | sce@assays@data[["logcounts"]] <- NULL
 61 | 
 62 | # Add metadata: reorder the metadata rows to the column order of sce, check the alignment, then replace colData wholesale
 63 | # stopifnot(sample_metadata$cell%in%colnames(sce))
 64 | # stopifnot(colnames(sce)%in%sample_metadata$cell)
 65 | sample_metadata <- sample_metadata %>% .[cell%in%colnames(sce)] %>% setkey(cell) %>% .[colnames(sce)]
 66 | stopifnot(sample_metadata$cell == colnames(sce))
 67 | colData(sce) <- sample_metadata %>% as.data.frame %>% tibble::column_to_rownames("cell") %>%
 68 |   .[colnames(sce),] %>% DataFrame()
 69 | 
 70 | ##########################
 71 | ## Compute size factors ##
 72 | ##########################
 73 | # Pre-cluster the cells, derive three pool sizes from half the smallest cluster size, then compute pooled size factors
 74 | cluster.ids <- quickCluster(sce, method = "igraph", min.size = 100, BPPARAM = mcparam) %>% as.numeric
 75 | # cluster.ids <- as.numeric(quickCluster(sce))
 76 | half.smallest <- min(table(cluster.ids)) / 2
 77 | pool.sizes <- floor(half.smallest / c(3, 2, 1))
 78 | sce <- computeSumFactors(sce, clusters = cluster.ids, sizes = pool.sizes, max.cluster.size = 3000)
 79 | 
 80 | ###################
 81 | ## Log Normalise ##
 82 | ###################
 83 | # Only when --normalise was passed on the command line
 84 | if (args$normalise)
 85 |   sce <- logNormCounts(sce)
 86 | 
 87 | 
 88 | ##########
 89 | ## Plot ##
 90 | ##########
 91 | 
 92 | # to.plot <- data.frame(X = Matrix::colSums(counts(sce)), Y = sizeFactors(sce))
 93 | # ggplot(to.plot, mapping = aes(x = X, y = Y)) +
 94 | #   geom_point() +
 95 | #   labs(x = "Number of UMIs", y = "Size Factor") +
 96 | #   theme_classic()
 97 | 
 98 | ##########
 99 | ## Save ##
100 | ##########
101 | # Serialise the SingleCellExperiment to the requested output file
102 | saveRDS(object = sce, file = args$outfile)
103 | 


--------------------------------------------------------------------------------
/rna/processing/4_doublet_detection.R:
--------------------------------------------------------------------------------
 1 | here::i_am("rna/processing/4_doublet_detection.R")
 2 | # Score and call doublets for the requested sample(s) and write one row per cell to --outfile
 3 | source(here::here("settings.R"))
 4 | source(here::here("utils.R"))
 5 | 
 6 | suppressPackageStartupMessages(library(SingleCellExperiment))
 7 | suppressPackageStartupMessages(library(scds))
 8 | suppressPackageStartupMessages(library(scran))
 9 | suppressPackageStartupMessages(library(scater))
10 | 
11 | ######################
12 | ## Define arguments ##
13 | ######################
14 | 
15 | p <- ArgumentParser(description='')
16 | p$add_argument('--sce',         type="character",                            help='SingleCellExperiment file')
17 | p$add_argument('--metadata',    type="character",                            help='metadata file')
18 | p$add_argument('--samples',                 type="character",   nargs='+',     help='Sample(s)')
19 | p$add_argument('--doublet_score_threshold',  type="double",      default=1.25,   help='Doublet score threshold')
20 | p$add_argument('--test',                    action = "store_true",             help='Testing mode')
21 | p$add_argument('--outfile',                  type="character",                  help='Output file')
22 | args <- p$parse_args(commandArgs(TRUE))
23 | 
24 | ## START TEST ##
25 | # args <- list()
26 | # args$sce <- io$rna.sce
27 | # args$metadata <- paste0(io$basedir,"/results/rna/mapping/sample_metadata_after_mapping.txt.gz") # .io$metadata
28 | # args$samples <- c("E8.5_rep2")
29 | # args$doublet_score_threshold <- 1.0
30 | # args$test <- TRUE
31 | ## END TEST ##
32 | 
33 | # Parse arguments: create the output directory (including missing parents) without warning when it already exists
34 | dir.create(dirname(args$outfile), showWarnings = FALSE, recursive = TRUE)
35 | if (isTRUE(args$test)) print("Test mode activated...")
36 | 
37 | ##########################
38 | ## Load sample metadata ##
39 | ##########################
40 | # Keep cells flagged pass_rnaQC that belong to the requested samples
41 | sample_metadata <- fread(args$metadata) %>%
42 |   .[pass_rnaQC==TRUE & sample%in%args$samples]
43 | table(sample_metadata$sample)
44 | 
45 | ###############
46 | ## Load data ##
47 | ###############
48 | 
49 | # Load SingleCellExperiment object (load_SingleCellExperiment is a project helper, presumably from utils.R -- confirm)
50 | sce <- load_SingleCellExperiment(args$sce, cells=sample_metadata$cell, normalise = TRUE)
51 | dim(sce)
52 | 
53 | # Add sample metadata as colData (NOTE(review): assumes the columns of sce are in the same order as the rows of sample_metadata -- confirm)
54 | colData(sce) <- sample_metadata %>% tibble::column_to_rownames("cell") %>% DataFrame
55 | 
56 | #############################
57 | ## Calculate doublet score ##
58 | #############################
59 | # The cxds_score, bcds_score and hybrid_score columns read from colData below are the ones this call is expected to add
60 | sce <- cxds_bcds_hybrid(sce, estNdbl=TRUE)
61 | # Collect the per-cell scores into a data.table, rounded to two decimals
62 | dt <- colData(sce) %>%
63 |   .[,c("sample","cxds_score", "bcds_score", "hybrid_score")] %>%
64 |   as.data.frame %>% tibble::rownames_to_column("cell") %>% as.data.table %>%
65 |   .[,c("cxds_score","bcds_score","hybrid_score"):=list(round(cxds_score,2),round(bcds_score,2),round(hybrid_score,2))]
66 | 
67 | # Call doublets: a cell is a doublet when its (rounded) hybrid score is strictly above the threshold
68 | dt[,doublet_call:=hybrid_score>args$doublet_score_threshold]
69 | table(dt$doublet_call)
70 | 
71 | # Save as a tab-separated file
72 | # io$outfile <- sprintf("%s/doublets_%s_%s.txt.gz",args$outdir, paste(args$samples,collapse="-"),round(args$doublet_score_threshold,2))
73 | fwrite(dt, args$outfile, sep="\t", na="NA", quote=F)
74 | 
75 | 
76 | 


--------------------------------------------------------------------------------
/rna/processing/5_parse_sample_metadata_after_doublets.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages(library(argparse))
 2 | # Merge the per-sample doublet detection results into the cell metadata table
 3 | here::i_am("rna/processing/5_parse_sample_metadata_after_doublets.R")
 4 | source(here::here("settings.R"))
 5 | 
 6 | 
 7 | ######################
 8 | ## Define arguments ##
 9 | ######################
10 | 
11 | p <- ArgumentParser(description='')
12 | p$add_argument('--metadata',         type="character",   help='Metadata file to use as input')
13 | p$add_argument('--doublet_files',    type="character", nargs="+",  help='Results of the doublet score detection algorithm')
14 | p$add_argument('--outfile',          type="character",   help='Output file')
15 | args <- p$parse_args(commandArgs(TRUE))
16 | 
17 | ###################
18 | ## Load settings ##
19 | ###################
20 | 
21 | 
22 | ## START TEST ##
23 | # args$metadata <- file.path(io$basedir,"results/rna/qc/sample_metadata_after_qc.txt.gz")
24 | # args$doublet_files <- file.path(io$basedir,"results/rna/doublet_detection/doublets_AGTCAA_R7_L001_mm10_sorted_merged_rmdup_mtx2_1.25.txt.gz")
25 | # args$outfile <- file.path(io$basedir,"results/rna/doublet_detection/sample_metadata_after_doublets.txt.gz")
26 | ## END TEST ##
27 | 
28 | ##########################
29 | ## Load doublet results ##
30 | ##########################
31 | # Read every doublet file and stack them into a single table
32 | doublet.dt <- args$doublet_files %>% map(~ fread(.)) %>% rbindlist
33 | 
34 | # stopifnot(mapping_mnn.dt$cell%in%sample_metadata$cell)
35 | # stopifnot(mapping_seurat.dt$cell%in%sample_metadata$cell)
36 | 
37 | ####################
38 | ## Merge and save ##
39 | ####################
40 | # Left join on "cell": hybrid_score is renamed to doublet_score; cells without a doublet result get NA in the new columns
41 | to.save <- fread(args$metadata) %>% 
42 |   merge(doublet.dt[,c("cell","hybrid_score","doublet_call")] %>% setnames("hybrid_score","doublet_score"), by="cell", all.x=TRUE)
43 | fwrite(to.save, args$outfile, sep="\t", na="NA", quote=F)
44 | 
45 | # to.save[pass_rnaQC==T & is.na(celltype.mapped_mnn)]
46 | # stopifnot(to.save[pass_rnaQC==T & is.na(celltype.mapped_mnn),.N]==0)
47 | # stopifnot(to.save[pass_rnaQC==T & is.na(celltype.mapped_seurat),.N]==0)
48 | 


--------------------------------------------------------------------------------
/rna/processing/extract_TFs_from_SingleCellExperiment.R:
--------------------------------------------------------------------------------
 1 | here::i_am("rna/processing/extract_TFs_from_SingleCellExperiment.R")
 2 | # Subset a SingleCellExperiment to the genes listed in a TF file and save it with upper-case row names
 3 | source(here::here("settings.R"))
 4 | source(here::here("utils.R"))
 5 | 
 6 | ################################
 7 | ## Initialize argument parser ##
 8 | ################################
 9 | 
10 | p <- ArgumentParser(description='')
11 | p$add_argument('--sce',    type="character",    help='')
12 | p$add_argument('--motif_annotation',    type="character",    help='')  # only referenced by the commented-out TEST block in this script
13 | # p$add_argument('--motif2gene',    type="character",    help='')
14 | p$add_argument('--TF_file',    type="character",    help='')  # table whose first column holds the TF names
15 | p$add_argument('--outfile',   type="character",    help='Output file')
16 | args <- p$parse_args(commandArgs(TRUE))
17 | 
18 | ## START TEST
19 | # io$basedir <- file.path(io$basedir,"test")
20 | # args$sce <- file.path(io$basedir,"results/rna/pseudobulk/celltype/SingleCellExperiment_pseudobulk.rds")
21 | # args$motif_annotation <- "JASPAR"
22 | # # args$motif2gene <- file.path(io$basedir,sprintf("processed/atac/archR/Annotations/%s_motif2gene.txt.gz",args$motif_annotation))
23 | # args$TF_file <- "/Users/argelagr/data/mm10_regulation/TFs/TFs.txt"
24 | # args$outfile <- file.path(io$basedir,sprintf("processed/rna/SingleCellExperiment_TFs_%s.rds",args$motif_annotation))
25 | ## END TEST
26 | 
27 | #########################
28 | ## Load RNA expression ##
29 | #########################
30 | 
31 | rna.sce <- readRDS(args$sce)
32 | 
33 | ################
34 | ## Subset TFs ##
35 | ################
36 | 
37 | # Load TF annotation
38 | # motif2gene.dt <- fread(args$motif2gene) %>%
39 | #   .[gene%in%toupper(rownames(rna.sce))]
40 | # rna_tf.sce <- rna.sce[str_to_title(motif2gene.dt$gene),]
41 | # TF names are taken from the first column of the TF file and kept only if they match an upper-cased gene name in rna.sce
42 | TFs <- fread(args$TF_file)[[1]]
43 | TFs <- TFs[TFs%in%toupper(rownames(rna.sce))]
44 | 
45 | # Subset TFs: index rows through the same upper-cased names used by the filter above; str_to_title() does not invert toupper() for every symbol
46 | rna_tf.sce <- rna.sce[match(TFs, toupper(rownames(rna.sce))),]
47 | rownames(rna_tf.sce) <- toupper(rownames(rna_tf.sce))
48 | 
49 | ##########
50 | ## Save ##
51 | ##########
52 | 
53 | saveRDS(rna_tf.sce, args$outfile)
54 | 


--------------------------------------------------------------------------------
/rna/pseudobulk/old/create_pseudobulk_metadata_with_replicates.R:
--------------------------------------------------------------------------------
 1 | here::i_am("rna/pseudobulk/create_pseudobulk_metadata_with_replicates.R")
 2 | # NOTE(review): in this dump the file sits under rna/pseudobulk/old/, which the path given to here::i_am() above does not include -- confirm
 3 | 
 4 | source(here::here("settings.R"))
 5 | source(here::here("utils.R"))
 6 | 
 7 | ######################
 8 | ## Define arguments ##
 9 | ######################
10 | 
11 | p <- ArgumentParser(description='')
12 | p$add_argument('--metadata',    type="character",    help='metadata file')
13 | p$add_argument('--group_by',    type="character",    help='')  # name of the metadata column that defines the groups
14 | p$add_argument('--nrep',       type="integer",       default=5,      help='Number of replicates per group (cells sampled with replacement)')
15 | p$add_argument('--min_cells',       type="integer",       default=5,      help='Minimum number of cells per replicate')
16 | p$add_argument('--percentage_cells_per_replicate',       type="double",       default=0.3,      help='Percentage of cells per replicate')
17 | p$add_argument('--outdir',      type="character",    help='Output directory')  # NOTE(review): parsed but never read; the script writes to args$outfile, which is only set in the TEST block below
18 | 
19 | args <- p$parse_args(commandArgs(TRUE))
20 | 
21 | ## START TEST ## NOTE(review): this block is live (not commented out), so it overrides whatever was passed on the command line
22 | io$basedir <- file.path(io$basedir,"test")
23 | args$metadata <- file.path(io$basedir,"results/rna/mapping/sample_metadata_after_mapping.txt.gz")
24 | args$sce <- file.path(io$basedir,"processed/rna/SingleCellExperiment.rds")
25 | args$group_by <- "celltype"
26 | args$nrep <- 5
27 | args$min_cells <- 25
28 | args$percentage_cells_per_replicate <- 0.30
29 | args$outfile <- file.path(io$basedir,sprintf("results/rna/pseudobulk/%s/cell2replicate.txt.gz",args$group_by))
30 | ## END TEST ##
31 | 
32 | dir.create(dirname(args$outfile), showWarnings = F, recursive = T)
33 | 
34 | ###################
35 | ## Load metadata ##
36 | ###################
37 | 
38 | # Load cell metadata: keep QC-passing, non-doublet cells with a non-missing grouping value, and expose the grouping column as "group"
39 | cell_metadata.dt <- fread(args$metadata) %>%
40 |   .[,celltype_genotype:=sprintf("%s-%s",celltype,genotype)] %>%
41 |   .[pass_rnaQC==TRUE & doublet_call==FALSE & !is.na(eval(as.name(args$group_by)))] %>%
42 |   setnames(args$group_by,"group")
43 | 
44 | print(table(cell_metadata.dt$group))
45 | 
46 | ##################################
47 | ## Create pseudobulk replicates ##
48 | ##################################
49 | # For each group draw args$nrep replicates; each replicate is an independent draw, so a cell can appear in several replicates
50 | cell2group.dt <- unique(cell_metadata.dt$group) %>% map(function(i) {
51 |   tmp <- cell_metadata.dt[group==i]
52 |   if ((args$percentage_cells_per_replicate*nrow(tmp))<=args$min_cells) {
53 |     ncells_per_replicate <- args$min_cells
54 |   } else {
55 |     ncells_per_replicate <- round(args$percentage_cells_per_replicate*nrow(tmp))
56 |   }
57 |   seq(1,args$nrep) %>% map(function(j) {
58 |     tmp[sample.int(nrow(tmp),min(ncells_per_replicate,nrow(tmp)))] %>%  # capped at the group size: sample.int() errors when asked for more cells than exist
59 |       .[,replicate:=sprintf("%s_rep%s",i,j)] %>%
60 |       .[,c("cell","group","replicate")] %>% 
61 |       return
62 |   }) %>% rbindlist %>% return
63 | }) %>% rbindlist
64 | 
65 | # Number of cells assigned to each replicate
66 | stats.dt <- cell2group.dt[,.(ncells=.N),c("group","replicate")]
67 | print(stats.dt)
68 | 
69 | ##########
70 | ## Save ##
71 | ##########
72 | 
73 | fwrite(cell2group.dt, args$outfile, sep="\t", quote = F)
74 | 
75 | 


--------------------------------------------------------------------------------
/rna/pseudobulk/old/old_code.R:
--------------------------------------------------------------------------------
 1 | 
 2 | ###########################################################################
 3 | ## Calculate average expression as the average of log-transformed values ##
 4 | ###########################################################################
 5 | 
 6 | # expr.dt <- unique(sce$celltype.mapped) %>% map(function(i) {
 7 | #   dt <- logcounts(sce[,sce$celltype.mapped==i]) %>% as.matrix %>% as.data.table(keep.rownames = T) %>%
 8 | #     melt(id.vars="rn") %>% setnames(c("symbol","cell","value")) %>%
 9 | #     .[,.(mean_expr=round(mean(value),3)),by="symbol"] %>%
10 | #     .[,celltype:=i]
11 | #   return(dt)
12 | # }) %>% rbindlist
13 | # 
14 | # length(unique(expr.dt$symbol))
15 | # length(unique(expr.dt$celltype))
16 | 
17 | #################################################################
18 | ## Calculate average expression using the average count values ##
19 | #################################################################
20 | 
21 | # NOTE: NOT WORKING
22 | # expr.dt <- unique(sce$celltype.mapped) %>% map(function(i) {
23 | #   dt <- counts(sce[,sce$celltype.mapped==i]) %>% as.matrix %>% as.data.table(keep.rownames = T) %>%
24 | #     melt(id.vars="rn") %>% setnames(c("symbol","cell","value")) %>%
25 | #     .[,.(counts=sum(value), mean_counts=round(mean(value),3)),by="symbol"] %>%
26 | #     .[,celltype:=i]
27 | #   return(dt)
28 | # }) %>% rbindlist
29 | # 
30 | # foo <-expr.dt %>%
31 | #   .[,sum_counts:=sum(counts),by="celltype"] %>%
32 | #   .[,.(mean_counts=unique(mean_counts), mean_counts2=counts/unique(sum_counts)),by="symbol"]
33 | # 
34 | # length(unique(expr.dt$symbol))
35 | # length(unique(expr.dt$celltype))
36 | 
37 | ##########
38 | ## Save ##
39 | ##########
40 | 
41 | # to.save <- expr.dt %>%
42 | #   merge(gene_metadata[,c("symbol","ens_id")], all.x=T) %>%
43 | #   setnames("symbol","gene")
44 | # fwrite(to.save, paste0(io$outdir,"/avg_expr_per_celltype_and_gene.txt.gz"), sep="\t")
45 | 


--------------------------------------------------------------------------------
/rna/pseudobulk/old/pseudobulk_rna.R:
--------------------------------------------------------------------------------
 1 | library(muscat)
 2 | library(DESeq2)
 3 | # Sum RNA counts per mapped cell type and store a variance-stabilised version as logcounts
 4 | #####################
 5 | ## Define settings ##
 6 | #####################
 7 | # NOTE(review): only settings.R is sourced here, yet load_SingleCellExperiment() is called below -- confirm where it is defined
 8 | if (grepl("ricard",Sys.info()['nodename'])) {
 9 |   source("/Users/ricard/gastrulation_multiome_10x/settings.R")
10 | } else if (grepl("ebi",Sys.info()['nodename'])) {
11 |   source("/homes/ricard/gastrulation_multiome_10x/settings.R")
12 | } else {
13 |   stop("Computer not recognised")
14 | }
15 | 
16 | # I/O (NOTE(review): io$outdir is not created in this script; saveRDS below needs it to exist)
17 | io$outdir <- paste0(io$basedir,"/results/rna/pseudobulk")
18 | 
19 | # Options
20 | opts$samples <- c(
21 |   "E7.5_rep1",
22 |   "E7.5_rep2",
23 |   "E8.0_rep1",
24 |   "E8.0_rep2",
25 |   "E8.5_rep1",
26 |   "E8.5_rep2"
27 | )
28 | 
29 | ###############
30 | ## Load data ##
31 | ###############
32 | 
33 | # Load cell metadata: QC-passing, non-doublet cells from the selected samples that have a mapped cell type
34 | # io$metadata <- "/Users/ricard/data/gastrulation_multiome_10x/results/rna/doublets/sample_metadata_after_doublets.txt.gz"
35 | sample_metadata <- fread(io$metadata) %>%
36 |   .[pass_rnaQC==TRUE & doublet_call==FALSE & sample%in%opts$samples & !is.na(celltype.mapped)]
37 | 
38 | # Load SingleCellExperiment and attach the metadata as colData (assumes matching cell order -- confirm)
39 | sce <- load_SingleCellExperiment(io$rna.sce, cells=sample_metadata$cell)
40 | colData(sce) <- sample_metadata %>% tibble::column_to_rownames("cell") %>% DataFrame
41 | 
42 | ###################################
43 | ## Aggregate counts per celltype ##
44 | ###################################
45 | 
46 | # assays(sce)$cpm <- edgeR::cpm(assay(sce), normalized.lib.sizes = FALSE, log = FALSE)
47 | # Sum the raw counts of all cells sharing the same celltype.mapped value
48 | sce_pseudobulk <- aggregateData(
49 |   sce,
50 |   assay = "counts",
51 |   by = c("celltype.mapped"),
52 |   fun = c("sum"),
53 |   scale = FALSE # Should pseudo-bulks be scaled with the effective library size & multiplied by 1M?
54 | )
55 | # Name the single aggregated assay "counts"
56 | assayNames(sce_pseudobulk) <- "counts"
57 | 
58 | ###############
59 | ## Normalise ##
60 | ###############
61 | 
62 | # create DESeq object (intercept-only design)
63 | dds <- DESeqDataSet(sce_pseudobulk, design=~1)
64 | 
65 | # This function calculates a variance stabilizing transformation (VST) from the fitted dispersion-mean relation(s)
66 | # and then transforms the count data (normalized by division by the size factors or normalization factors),
67 | # yielding a matrix of values which are now approximately homoskedastic
68 | dds <- varianceStabilizingTransformation(dds)
69 | # Store the VST values as the logcounts assay of the pseudobulk object
70 | logcounts(sce_pseudobulk) <- assay(dds)
71 | 
72 | ###################
73 | ## Sanity checks ##
74 | ###################
75 | 
76 | # cor(
77 | #   colMeans(logcounts(sce_pseudobulk)),
78 | #   metadata(sce_pseudobulk)$n_cells
79 | # )
80 | 
81 | ##########
82 | ## Save ##
83 | ##########
84 | 
85 | saveRDS(sce_pseudobulk, paste0(io$outdir,"/SingleCellExperiment.rds"))
86 | 


--------------------------------------------------------------------------------
/rna/pseudobulk/old/pseudobulk_rna_intronic_exonic.R:
--------------------------------------------------------------------------------
 1 | library(muscat)
 2 | library(DESeq2)
 3 | # Sum spliced and unspliced counts per mapped cell type and store raw plus variance-stabilised matrices in one object
 4 | #####################
 5 | ## Define settings ##
 6 | #####################
 7 | # NOTE(review): only settings.R is sourced here, yet load_SingleCellExperiment() is called below -- confirm where it is defined
 8 | if (grepl("ricard",Sys.info()['nodename'])) {
 9 |   source("/Users/ricard/gastrulation_multiome_10x/settings.R")
10 | } else if (grepl("ebi",Sys.info()['nodename'])) {
11 |   source("/homes/ricard/gastrulation_multiome_10x/settings.R")
12 | } else {
13 |   stop("Computer not recognised")
14 | }
15 | 
16 | # I/O: the input is the velocyto-derived object (NOTE(review): io$outdir is not created in this script)
17 | io$rna.sce  <- paste0(io$basedir,"/processed/rna/SingleCellExperiment_velocyto.rds")
18 | io$outdir <- paste0(io$basedir,"/results/rna/pseudobulk")
19 | 
20 | # Options
21 | opts$samples <- c(
22 |   "E7.5_rep1",
23 |   "E7.5_rep2",
24 |   "E8.0_rep1",
25 |   "E8.0_rep2",
26 |   "E8.5_rep1",
27 |   "E8.5_rep2"
28 | )
29 | 
30 | ###############
31 | ## Load data ##
32 | ###############
33 | 
34 | # Load cell metadata: QC-passing, non-doublet cells from the selected samples that have a mapped cell type
35 | sample_metadata <- fread(io$metadata) %>%
36 |   .[pass_rnaQC==TRUE & doublet_call==FALSE & sample%in%opts$samples & !is.na(celltype.mapped)]
37 | 
38 | # Load velocyto SingleCellExperiment (NOTE(review): sample_metadata is never attached as colData here, yet aggregateData() groups by "celltype.mapped" -- confirm the loaded object already has that column)
39 | sce <- load_SingleCellExperiment(io$rna.sce, cells=sample_metadata$cell)
40 | 
41 | # Filter genes: keep the first occurrence of each duplicated row name
42 | # gene_metadata <- fread(io$gene_metadata)
43 | # genes <- unique(gene_metadata$symbol)
44 | # sce <- sce[rownames(sce)%in%genes,]
45 | sce <- sce[!duplicated(rownames(sce)),]
46 | 
47 | ###################################
48 | ## Aggregate counts per celltype ##
49 | ###################################
50 | # Sum the unspliced counts per cell type and name the resulting assay "counts"
51 | sce_pseudobulk_unspliced <- aggregateData(
52 |   sce,
53 |   assay = "unspliced",
54 |   by = c("celltype.mapped"),
55 |   fun = c("sum"),
56 |   scale = FALSE # Should pseudo-bulks be scaled with the effective library size & multiplied by 1M?
57 | ); assayNames(sce_pseudobulk_unspliced) <- "counts"
58 | # Same aggregation for the spliced counts
59 | sce_pseudobulk_spliced <- aggregateData(
60 |   sce,
61 |   assay = "spliced",
62 |   by = c("celltype.mapped"),
63 |   fun = c("sum"),
64 |   scale = FALSE # Should pseudo-bulks be scaled with the effective library size & multiplied by 1M?
65 | ); assayNames(sce_pseudobulk_spliced) <- "counts"
66 | 
67 | 
68 | ###############
69 | ## Normalise ##
70 | ###############
71 | 
72 | # create DESeq objects (intercept-only design), one per count type
73 | dds.unspliced <- DESeqDataSet(sce_pseudobulk_unspliced, design=~1)
74 | dds.spliced <- DESeqDataSet(sce_pseudobulk_spliced, design=~1)
75 | 
76 | # This function calculates a variance stabilizing transformation (VST) from the fitted dispersion-mean relation(s)
77 | # and then transforms the count data (normalized by division by the size factors or normalization factors),
78 | # yielding a matrix of values which are now approximately homoskedastic
79 | dds.unspliced <- varianceStabilizingTransformation(dds.unspliced)
80 | dds.spliced <- varianceStabilizingTransformation(dds.spliced)
81 | # Combine the raw sums and the VST values into a single object with four assays
82 | sce_pseudobulk <- SingleCellExperiment(
83 |   assays = list(
84 |     "spliced" = assay(sce_pseudobulk_spliced), 
85 |     "unspliced" = assay(sce_pseudobulk_unspliced),
86 |     "unspliced_log" = assay(dds.unspliced),
87 |     "spliced_log" = assay(dds.spliced)
88 |     )
89 | )
90 | # Quick look at the resulting column and row names
91 | colnames(sce_pseudobulk) %>% head
92 | rownames(sce_pseudobulk) %>% head
93 | 
94 | ##########
95 | ## Save ##
96 | ##########
97 | 
98 | saveRDS(sce_pseudobulk, paste0(io$outdir,"/SingleCellExperiment_velocyto.rds"))
99 | 


--------------------------------------------------------------------------------
/rna/scanpy/create_anndata_from_SingleCellExperiment.R:
--------------------------------------------------------------------------------
 1 | here::i_am("rna/scanpy/create_anndata_from_SingleCellExperiment.R")
 2 | # Export the RNA SingleCellExperiment as an AnnData .h5ad file via reticulate and scanpy
 3 | # Load default settings
 4 | source(here::here("settings.R"))
 5 | source(here::here("utils.R"))
 6 | 
 7 | # Load libraries
 8 | suppressPackageStartupMessages({
 9 |   library("reticulate")
10 |   library("SingleCellExperiment")
11 | })
12 | 
13 | #####################
14 | ## Define settings ##
15 | #####################
16 | # Output path is fixed (no command-line arguments in this script)
17 | io$outfile <- file.path(io$basedir,"processed/rna/anndata.h5ad")
18 | 
19 | #####################################
20 | ## Reticulate connection to scanpy ##
21 | #####################################
22 | 
23 | sc <- import("scanpy")
24 | 
25 | ##########################
26 | ## Load sample metadata ##
27 | ##########################
28 | # Keep QC-passing, non-doublet cells that have a mapped cell type
29 | sample_metadata <- fread(io$metadata) %>% 
30 |   .[pass_rnaQC==TRUE & doublet_call==FALSE & !is.na(celltype.mapped)]
31 | 
32 | ###############
33 | ## Load data ##
34 | ###############
35 | 
36 | # Load RNA expression data as SingleCellExperiment object (not normalised; only counts are exported below)
37 | sce <- load_SingleCellExperiment(io$rna.sce, cells=sample_metadata$cell, normalise = FALSE)
38 | 
39 | # Add sample metadata as colData (assumes sce columns follow the row order of sample_metadata -- confirm)
40 | colData(sce) <- sample_metadata %>% tibble::column_to_rownames("cell") %>% DataFrame
41 | 
42 | 
43 | #############################################
44 | ## Convert SingleCellExperiment to AnnData ##
45 | #############################################
46 | # The counts matrix is transposed so that rows are cells and columns are genes
47 | adata_sce <- sc$AnnData(
48 |     X   = t(counts(sce)),
49 |     obs = as.data.frame(colData(sce)),
50 |     var = data.frame(gene=rownames(sce), row.names=rownames(sce))
51 | )
52 | # adata_sce$obsm$update(umap = reducedDim(sce, "umap"))
53 | 
54 | adata_sce
55 | 
56 | # Add cell type and stage colors to uns, ordered by the sorted unique labels found in obs
57 | # colPalette_celltypes = [opts["celltype_colors"][i.replace(" ","_").replace("/","_")] for i in sorted(np.unique(adata.obs['celltype']))]
58 | # adata.uns['celltype'] = colPalette_celltypes
59 | # colPalette_stages = [opts["stage_colors"][i.replace(" ","_").replace("/","_")] for i in sorted(np.unique(adata.obs['stage']))]
60 | # adata.uns['stage_colors'] = colPalette_stages
61 | adata_sce$uns$update(celltype.mapped_colors = opts$celltype.colors[sort(unique(as.character(adata_sce$obs$celltype.mapped)))])  # NOTE(review): the colour vector keeps its R names; check how reticulate converts it and whether scanpy accepts the result
62 | adata_sce$uns$update(stage_colors = opts$stage.colors[sort(unique(as.character(adata_sce$obs$stage)))])
63 | adata_sce$uns["celltype.mapped_colors"]
64 | adata_sce$uns["stage_colors"]
65 | 
66 | ##########
67 | ## Save ##
68 | ##########
69 | 
70 | adata_sce$write_h5ad(io$outfile)
71 | 


--------------------------------------------------------------------------------
/rna/scanpy/create_anndata_scvelo.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import scvelo as scv
 3 | 
 4 | ######################
 5 | ## Define arguments ##
 6 | ######################
 7 | 
 8 | p = argparse.ArgumentParser( description='' )
 9 | p.add_argument( '--loom_directory',               type=str,                       help='Input directory for the loom files (after velocyto)' )
10 | p.add_argument( '--anndata',               type=str,                       help='Anndata object' )
11 | p.add_argument( '--outfile',               type=str,                       help='Output file (anndata)' )
12 | p.add_argument( '--metadata',               type=str,                       help='Metadata file' )
13 | p.add_argument( '--samples',               type=str, nargs="+",                       help='Samples' )
14 | args = p.parse_args()
15 | 
16 | #####################
17 | ## Define settings ##
18 | #####################
19 | 
20 | exec(open('../../settings.py').read())
21 | exec(open('../../utils.py').read())
22 | 
23 | ## START TEST ##
24 | args.outfile = io["basedir"]+"/processed/rna/anndata_scvelo.h5ad"
25 | args.anndata = io["basedir"]+"/processed/rna/anndata.h5ad"
26 | args.loom_directory = io["basedir"]+"/processed/rna/loom"
27 | args.metadata = io["basedir"]+"/results_new2/rna/mapping/sample_metadata_after_mapping.txt.gz"
28 | args.samples = ["E7.5_rep1", "E7.5_rep2"]
29 | ## END TEST ##
30 | 
31 | 
32 | ###################
33 | ## Load metadata ##
34 | ###################
35 | 
36 | print("Loading metadata...")
37 | 
38 | metadata = (pd.read_table(args.metadata) >>
39 |     mask(X.pass_rnaQC==True, X.doublet_call==False) >>
40 |     mask(X["sample"].isin(args.samples))
41 | ).set_index("cell", drop=False)
42 | print(metadata.shape)
43 | 
44 | #########################
45 | ## Load anndata object ##
46 | #########################
47 | 
48 | print("Loading anndata...")
49 | 
50 | adata = load_adata(adata_file = args.anndata, metadata_file = args.metadata, normalise = False, cells = metadata.index.values)
51 | 
52 | ###############################################################
53 | ## Load spliced and unspliced count matrices from loom files ##
54 | ###############################################################
55 | 
56 | print("Loading loom files...")
57 | 
58 | looms = [None for i in range(len(args.samples))]
59 | 
60 | for i in range(len(args.samples)):
61 |     loom_file = args.loom_directory + "/" + args.samples[i] + ".loom"
62 |     looms[i] = sc.read_loom(loom_file, sparse=True, X_name='spliced', obs_names='CellID', obsm_names=None, var_names='Gene')
63 |     # looms[i].var_names_make_unique()
64 |     # looms[i].obs.index = looms[i].obs.index.str.replace(rename_dict[args.samples[i]]+":",args.samples[i]+"_").str.replace("x","-1")
65 |     # print(looms[i].shape)
66 |     # print(looms[i].obs.head())
67 | 
68 | ####################
69 | ## Create anndata ##
70 | ####################
71 | 
72 | print("Creating anndata file...")
73 | 
74 | # Concatenate the per-sample loom objects into one AnnData (join='inner'; cell names left unchanged via index_unique=None)
75 | adata_loom = anndata.AnnData.concatenate(*looms, join='inner', batch_key=None, index_unique=None)
76 | del looms
77 | 
78 | # Remove non-used layers to save memory (the remaining layers are left untouched)
79 | del adata_loom.layers["ambiguous"]
80 | del adata_loom.layers["matrix"]
81 | 
82 | # Merge the expression anndata with the loom-derived anndata; both inputs are deleted afterwards
83 | adata_final = scv.utils.merge(adata, adata_loom)
84 | del adata_loom
85 | del adata
86 | adata_final  # bare expression: only displays the object in an interactive session
87 | # Drop the name of the obs index
88 | adata_final.obs.index.name = None
89 | 
90 | ##########
91 | ## Save ##
92 | ##########
93 | 
94 | print("Saving anndata object...")
95 | 
96 | adata.write_h5ad(args.outfile)


--------------------------------------------------------------------------------
/rna/scanpy/velocyto/run_velocyto.sh:
--------------------------------------------------------------------------------
 1 | 
 2 | # velocyto run10x -m repeat_msk.gtf mypath/sample01 somepath/refdata-cellranger-mm10-1.2.0/genes/genes.gtf
 3 | 
 4 | indir="/bi/group/reik/ricard/data/gastrulation_multiome_10x/original"
 5 | 
 6 | # samples=( "E7.5_rep1" "E7.5_rep2" "E8.0_rep1" "E8.0_rep2" "E8.5_rep1" "E8.5_rep2" "E8.75_rep1" "E8.75_rep2" )
 7 | # samples=( "E7.5_rep2" "E8.5_rep1" "E8.5_rep2" "E8.75_rep1" "E8.75_rep2" )
 8 | samples=( "E7.75_rep1" "E8.5_CRISPR_T_KO" "E8.5_CRISPR_T_WT" )
 9 | 
10 | threads=1
11 | # mem=1000
12 | 
13 | mask_file="/bi/group/reik/ricard/data/mm10_sequence/repeats/mm10_rmsk.gtf"
14 | 
15 | for i in "${samples[@]}"; do
16 | 	echo "$i"
17 | 	# cmd="velocyto run10x -m ${mask_file} --samtools-threads $threads --samtools-memory 40000 ${indir}/${i} /bi/scratch/Stephen_Clark/annotations/gtf/Mus_musculus.GRCm38.98.gtf"
18 | 	cmd="velocyto run10x -m ${mask_file} ${indir}/${i} /bi/scratch/Stephen_Clark/annotations/gtf/Mus_musculus.GRCm38.98.gtf"
19 | 	echo "$cmd"
20 | 	sbatch -n "$threads" --mem 90G --wrap "$cmd"  # quoted: --wrap takes the whole command as ONE string
21 | done
22 | 


--------------------------------------------------------------------------------
/rna/scanpy/velocyto/velocyto_env.yml:
--------------------------------------------------------------------------------
 1 | name: velocyto
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 | dependencies:
 6 |   - samtools
 7 |   - velocyto.py
 8 | 
 9 | 
10 | # add scvelo et al
11 | # WARNING: Unable to create progress bar. Consider installing `tqdm` as `pip install tqdm` and `ipywidgets` as `pip install ipywidgets`,


--------------------------------------------------------------------------------
/rna/snakemake/README.txt:
--------------------------------------------------------------------------------
 1 | #################
 2 | ## Run locally ##
 3 | #################
 4 | 
 5 | snakemake --use-conda --cores 1
 6 | snakemake --forceall --use-conda --cores 1
 7 | snakemake --forceall --use-conda --cores 1 --dry-run
 8 | 
 9 | #################################
10 | ## Run on the Babraham cluster ##
11 | #################################
12 | 
13 | sbatch -n 1 --mem 5G snakemake --forceall -j 4 --use-conda --latency-wait 90 --cluster "sbatch -n 1 --mem 5G"
14 | snakemake --forceall -j 4 --use-conda --latency-wait 90 --cluster "sbatch -n 1 --mem 12G"
15 | snakemake -j 4 --use-conda --latency-wait 90 --cluster "sbatch -n 1 --mem {params.memory}G"


--------------------------------------------------------------------------------
/rna/snakemake/environment.yaml:
--------------------------------------------------------------------------------
 1 | name: gastrulation_multiome_10x_rna_snakemake
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 | dependencies:
 6 |   # - python=3.9.10
 7 |   - python=3.8
 8 |   - cython
 9 |   - ipython
10 |   # - jupyter 
11 |   - ipywidgets
12 |   - bioconductor-scater
13 |   - bioconductor-scran
14 |   - bioconductor-singlecellexperiment
15 |   - bioconductor-batchelor
16 |   - bioconductor-scds
17 |   - bioconductor-edger
18 |   - bioconductor-deseq2
19 |   # - bioconductor-destiny = 3.4 # THIS IS DEPRECATED, DEPENDS ON bioconductor-singlecellexperiment 1.12
20 |   # - r-xml2 =1.3.2
21 |   - r-r.utils
22 |   - r-matrix
23 |   - r-future
24 |   - r-argparse
25 |   - r-ggpubr
26 |   - r-data.table
27 |   - r-purrr
28 |   - r-furrr
29 |   - r-argparse
30 |   - r-seurat=4.1.0
31 |   - r-pheatmap
32 |   - r-ggrastr
33 |   - scanpy=1.8.2
34 |   - loompy=3.0.6
35 |   - velocyto.py
36 |   # - h5py=2.10.0
37 |   - python-igraph
38 |   - louvain>=0.6,!=0.6.2
39 |   - fa2
40 |   - leidenalg
41 |   - harmonypy
42 |   - scanorama
43 |   - seaborn
44 |   - samtools=1.15
45 |   - snakemake
46 |   - pip
47 |   - pip:
48 |     - dfply
49 |     - scvelo
50 |     - git+https://github.com/settylab/Palantir@removeTSNE
51 |     - git+https://github.com/dpeerlab/SEACells
52 | 
53 | 
54 | # conda create -n gastrulation_multiome_10x_rna_snakemake python==3.9 --yes 
55 | # conda activate gastrulation_multiome_10x_rna_snakemake
56 | # conda install mamba --yes
57 | # mamba env update -n gastrulation_multiome_10x_rna_snakemake --file environment.yaml 


--------------------------------------------------------------------------------
/rna/snakemake/run_cluster.sh:
--------------------------------------------------------------------------------
1 | # snakemake --use-conda --cores 15 -j 99 --latency-wait 90 -p --cluster "sbatch -n {threads} --mem {resources.mem_mb}M"
2 | snakemake --cores 15 -j 99 --latency-wait 90 -p --cluster "sbatch -n {threads} --mem {resources.mem_mb}M"


--------------------------------------------------------------------------------
/rna_atac/rna_vs_acc/metacells/gene_expr_vs_promoter_acc/cor_gene_expr_vs_promoter_acc_metacells.R:
--------------------------------------------------------------------------------
 1 | here::i_am("rna_atac/rna_vs_acc/metacells/gene_expr_vs_promoter_acc/cor_gene_expr_vs_promoter_acc_metacells.R")
 2 | 
 3 | source(here::here("settings.R"))
 4 | source(here::here("utils.R"))
 5 | 
 6 | ################################
 7 | ## Initialize argument parser ##
 8 | ################################
 9 | 
10 | p <- ArgumentParser(description='')
11 | p$add_argument('--sce',  type="character",              help='RNA SingleCellExperiment (metacells)') 
12 | p$add_argument('--gene_score_matrix',  type="character",              help='ATAC Gene score matrix (metacells)') 
13 | p$add_argument('--outfile',          type="character",                help='Output file') # a file path: its dirname is created and fwrite() writes to it
14 | args <- p$parse_args(commandArgs(TRUE))
15 | 
16 | ## START TEST ##
17 | # io$basedir <- file.path(io$basedir,"test")
18 | # args <- list()
19 | # args$sce <- file.path(io$basedir,"results/rna/metacells/all_cells/SingleCellExperiment_metacells.rds")
20 | # args$gene_score_matrix <- file.path(io$basedir,"results/atac/archR/metacells/all_cells/GeneScoreMatrix_TSS/GeneScoreMatrix_TSS_summarized_experiment_metacells.rds")
21 | # args$outfile <- file.path(io$basedir,"results/rna_atac/rna_vs_acc/metacells/gene_expr_vs_promoter_acc/cor_gene_expr_vs_promoter_acc_metacells.txt.gz")
22 | ## END TEST ##
23 | 
24 | #####################
25 | ## Define settings ##
26 | #####################
27 | 
28 | # I/O
29 | dir.create(dirname(args$outfile), showWarnings=FALSE, recursive=TRUE)
30 | 
31 | # Options
32 | 
33 | #######################################
34 | ## Load pseudobulk RNA and ATAC data ##
35 | #######################################
36 | 
37 | # Load SingleCellExperiment
38 | rna_metacells.sce <- readRDS(args$sce)
39 | 
40 | # Load ATAC SummarizedExperiment
41 | atac_GeneScoreMatrix_metacells.se <- readRDS(args$gene_score_matrix)
42 | 
43 | # Normalise ATAC data
44 | assayNames(atac_GeneScoreMatrix_metacells.se) <- "counts"
45 | assay(atac_GeneScoreMatrix_metacells.se,"logcounts") <- log(1e6*(sweep(assay(atac_GeneScoreMatrix_metacells.se),2,colSums(assay(atac_GeneScoreMatrix_metacells.se),na.rm=T),"/"))+1)
46 | 
47 | # hist(assay(atac_GeneScoreMatrix_metacells.se,"logcounts")[1:1000,])
48 | 
49 | ###########################################
50 | ## Convert to long data.tables and merge ##
51 | ###########################################
52 | 
53 | rna_metacells.dt <- logcounts(rna_metacells.sce) %>%
54 |   as.data.table(keep.rownames = T) %>%
55 |   setnames("rn","gene") %>%
56 |   melt(id.vars="gene", variable.name="celltype", value.name="expr")
57 | 
58 | atac_gene_scores_metacells.dt <- as.matrix(assay(atac_GeneScoreMatrix_metacells.se,"logcounts")) %>% t %>%
59 |   as.data.table(keep.rownames = T) %>%
60 |   setnames("rn","celltype") %>%
61 |   melt(id.vars=c("celltype"), variable.name="gene", value.name="acc")
62 | 
63 | 
64 | # Merge
65 | rna_atac.dt <- merge(rna_metacells.dt, atac_gene_scores_metacells.dt, by = c("gene","celltype"))
66 | 
67 | ##########################
68 | ## Correlation analysis ##
69 | ##########################
70 | 
71 | cor.dt <- rna_atac.dt %>% copy %>% # work on a copy so rna_atac.dt is not modified by reference
72 |   .[,c("acc","expr"):=list(acc + rnorm(n=.N,mean=0,sd=1e-5), expr + rnorm(n=.N,mean=0,sd=1e-5))] %>% # tiny gaussian jitter (sd=1e-5), presumably to avoid constant vectors in cor.test; no seed is set in this script
73 |   .[, .(V1 = unlist(cor.test(acc, expr)[c("estimate", "p.value")])), by = c("gene")] %>% # two rows per gene: estimate, then p.value
74 |   .[, para := rep(c("r","p"), .N/2)] %>% # label the alternating rows as r (estimate) and p (p.value)
75 |   data.table::dcast(gene ~ para, value.var = "V1") %>% # back to one row per gene with columns p and r
76 |   .[,"padj_fdr" := list(p.adjust(p, method="fdr"))] %>%
77 |   # .[, sig := padj_fdr<=0.10] %>% 
78 |   setorder(padj_fdr, na.last = T) # most significant genes first, NAs at the bottom
79 | # Format for output: p-values are converted with format() (3 significant digits), r is rounded to 3 decimals
80 | cor.dt[,c("p","r","padj_fdr"):=list(format(p,digits=3),round(r,3), format(padj_fdr,digits=3))]
81 | 
82 | # Save
83 | fwrite(cor.dt, args$outfile, sep="\t", quote=F)
84 | 


--------------------------------------------------------------------------------
/rna_atac/rna_vs_acc/pseudobulk/TFexpr_vs_peakAcc/README.txt:
--------------------------------------------------------------------------------
1 | Fix duplicated TF-motif pairs in JASPAR:
2 | ASCL1, BACH2, BHLHE22, CEBPG, CREB3L4, HNF4A, JDP2, JUN, JUNB, JUND, MEIS1, MEIS2, MZF1, NEUROG2, NFIC, NFIX, NR2C2, NR2F1, PAX3, POU6F1, RARA, RORA, RXRB, RXRG, SREBF1, SREBF2, TFAP2A, TFAP2B, TFAP2C, TFAP4, THRB


--------------------------------------------------------------------------------
/rna_atac/rna_vs_acc/pseudobulk/TFexpr_vs_peakAcc/analysis/plot_TFexpr_vs_peakAcc_general_stats.R:
--------------------------------------------------------------------------------
 1 | here::i_am("rna_atac/rna_vs_acc/pseudobulk/TFexpr_vs_peakAcc/analysis/plot_TFexpr_vs_peakAcc_general_stats.R")
 2 | 
 3 | source(here::here("settings.R"))
 4 | source(here::here("utils.R"))
 5 | 
 6 | ################################
 7 | ## Initialize argument parser ##
 8 | ################################
 9 | 
10 | p <- ArgumentParser(description='')
11 | p$add_argument('--sce',  type="character",              help='RNA SingleCellExperiment (pseudobulk)') 
12 | p$add_argument('--atac_peak_matrix',  type="character",              help='ATAC Peak matrix (pseudobulk)') 
13 | p$add_argument('--tf2peak_cor',  type="character",              help='Correlations between TF RNA expression and peak accessibility') 
14 | p$add_argument('--outdir',          type="character",                help='Output directory')
15 | args <- p$parse_args(commandArgs(TRUE))
16 | 
17 | #####################
18 | ## Define settings ##
19 | #####################
20 | 
21 | ## START TEST ## (kept commented out: when live, this block replaced the parsed command-line arguments)
22 | # io$basedir <- file.path(io$basedir,"test")
23 | # args <- list()
24 | # args$sce <- file.path(io$basedir,"results/rna/pseudobulk/celltype/SingleCellExperiment_pseudobulk.rds")
25 | # args$atac_peak_matrix <- file.path(io$basedir,"results/atac/archR/pseudobulk/celltype/PeakMatrix/pseudobulk_PeakMatrix_summarized_experiment.rds")
26 | # args$tf2peak_cor <- file.path(io$basedir,"results/rna_atac/rna_vs_acc/pseudobulk/TFexpr_vs_peakAcc/CISBP_cor_TFexpr_vs_peakAcc.rds")
27 | # args$outdir <- file.path(io$basedir,"results/rna_atac/rna_vs_acc/pseudobulk/TFexpr_vs_peakAcc/stats")
28 | ## END TEST ##
29 | 
30 | # I/O (recursive so that missing parent directories are created as well)
31 | dir.create(args$outdir, showWarnings = F, recursive = T)
32 | 
33 | ###############
34 | ## Load data ##
35 | ###############
36 | 
37 | # Load SingleCellExperiment
38 | rna_pseudobulk.sce <- readRDS(args$sce)
39 | 
40 | # Load ATAC SummarizedExperiment
41 | atac_peakMatrix_pseudobulk.se <- readRDS(args$atac_peak_matrix)
42 | 
43 | # Normalise ATAC data
44 | assayNames(atac_peakMatrix_pseudobulk.se) <- "counts"
45 | assay(atac_peakMatrix_pseudobulk.se,"logcounts") <- log(1e6*(sweep(assay(atac_peakMatrix_pseudobulk.se),2,colSums(assay(atac_peakMatrix_pseudobulk.se),na.rm=T),"/"))+1)
46 | 
47 | ###############################
48 | ## Load TF2peak correlations ##
49 | ###############################
50 | 
51 | tf2peak_cor.se <- readRDS(args$tf2peak_cor)
52 | 
53 | ############
54 | ## Filter ##
55 | ############
56 | 
57 | TFs <- colnames(tf2peak_cor.se)
58 | peaks <- rownames(tf2peak_cor.se)
59 | 
60 | ##########
61 | ## Plot ##
62 | ##########
63 | 
64 | # tmp <- assay(tf2peak_cor.se[,i],"cor")[,1]
65 | i <- "T"
66 | j <- "chr5:4894870-4895470"
67 | 
68 | to.plot <- data.table(
69 |   atac = assay(atac_peakMatrix_pseudobulk.se[j,],"logcounts")[1,],
70 |   rna = logcounts(rna_pseudobulk.sce[i,])[1,],
71 |   celltype = colnames(rna_pseudobulk.sce)
72 | )
73 | 
74 | p <- ggplot(to.plot, aes(x=rna, y=atac, fill=celltype)) +
75 |   geom_point(color="black", size=4, shape=21) +
76 |   # geom_smooth(method="lm") +
77 |   stat_cor(method = "pearson") +
78 |   scale_fill_manual(values=opts$celltype.colors) +
79 |   ggrepel::geom_text_repel(aes(label=celltype), size=3, data=to.plot[rna>5 & atac>0.3]) +
80 |   labs(x="RNA expression", y="Peak accessibility", title=sprintf("%s expression vs %s accessibility",i,j)) +
81 |   theme_classic() + 
82 |   theme(
83 |     plot.title = element_text(hjust=0.5, size=rel(0.8)),
84 |     axis.text = element_text(color="black"),
85 |     legend.position = "none"
86 |   )
87 | 
88 | pdf(file.path(args$outdir,sprintf("%s_vs_%s_rna_vs_acc_pseudobulk.pdf",i,gsub("[:_]","-",j))), width = 8, height = 5)
89 | print(p)
90 | dev.off()


--------------------------------------------------------------------------------
/rna_atac/rna_vs_acc/pseudobulk/TFexpr_vs_peakAcc/analysis/plot_TFexpr_vs_peakAcc_individual_examples.R:
--------------------------------------------------------------------------------
 1 | here::i_am("rna_atac/rna_vs_acc/pseudobulk/TFexpr_vs_peakAcc/analysis/plot_TFexpr_vs_peakAcc_individual_examples.R")
 2 | # NOTE: the path above must be this script's real location relative to the project root (it lives in analysis/)
 3 | source(here::here("settings.R"))
 4 | source(here::here("utils.R"))
 5 | 
 6 | ################################
 7 | ## Initialize argument parser ##
 8 | ################################
 9 | 
10 | p <- ArgumentParser(description='')
11 | p$add_argument('--sce',  type="character",              help='RNA SingleCellExperiment (pseudobulk)') 
12 | p$add_argument('--atac_peak_matrix',  type="character",              help='ATAC Peak matrix (pseudobulk)') 
13 | p$add_argument('--tf2peak_cor',  type="character",              help='Correlations between TF RNA expression and peak accessibility') 
14 | p$add_argument('--outdir',          type="character",                help='Output directory')
15 | args <- p$parse_args(commandArgs(TRUE))
16 | 
17 | #####################
18 | ## Define settings ##
19 | #####################
20 | 
21 | ## START TEST ##
22 | # io$basedir <- file.path(io$basedir,"test")
23 | # args <- list()
24 | # args$sce <- file.path(io$basedir,"results/rna/pseudobulk/celltype/SingleCellExperiment_pseudobulk.rds")
25 | # args$atac_peak_matrix <- file.path(io$basedir,"results/atac/archR/pseudobulk/celltype/PeakMatrix/pseudobulk_PeakMatrix_summarized_experiment.rds")
26 | # args$tf2peak_cor <- file.path(io$basedir,"results/rna_atac/rna_vs_acc/pseudobulk/TFexpr_vs_peakAcc/CISBP_cor_TFexpr_vs_peakAcc.rds")
27 | # args$outdir <- file.path(io$basedir,"results/rna_atac/rna_vs_acc/pseudobulk/TFexpr_vs_peakAcc/individual_examples")
28 | ## END TEST ##
29 | 
30 | # I/O (recursive so that missing parent directories are created as well)
31 | dir.create(args$outdir, showWarnings = F, recursive = T)
32 | 
33 | ###############
34 | ## Load data ##
35 | ###############
36 | 
37 | # Load SingleCellExperiment
38 | rna_pseudobulk.sce <- readRDS(args$sce)
39 | 
40 | # Load ATAC SummarizedExperiment
41 | atac_peakMatrix_pseudobulk.se <- readRDS(args$atac_peak_matrix)
42 | 
43 | # Normalise ATAC data
44 | assayNames(atac_peakMatrix_pseudobulk.se) <- "counts"
45 | assay(atac_peakMatrix_pseudobulk.se,"logcounts") <- log(1e6*(sweep(assay(atac_peakMatrix_pseudobulk.se),2,colSums(assay(atac_peakMatrix_pseudobulk.se),na.rm=T),"/"))+1)
46 | 
47 | ###############################
48 | ## Load TF2peak correlations ##
49 | ###############################
50 | 
51 | tf2peak_cor.se <- readRDS(args$tf2peak_cor)
52 | 
53 | ############
54 | ## Filter ##
55 | ############
56 | 
57 | TFs <- colnames(tf2peak_cor.se)
58 | peaks <- rownames(tf2peak_cor.se)
59 | 
60 | ##########
61 | ## Plot ##
62 | ##########
63 | 
64 | # tmp <- assay(tf2peak_cor.se[,i],"cor")[,1]
65 | i <- "T"
66 | j <- "chr5:4894870-4895470"
67 | 
68 | to.plot <- data.table(
69 |   atac = assay(atac_peakMatrix_pseudobulk.se[j,],"logcounts")[1,],
70 |   rna = logcounts(rna_pseudobulk.sce[i,])[1,],
71 |   celltype = colnames(rna_pseudobulk.sce)
72 | )
73 | 
74 | p <- ggplot(to.plot, aes(x=rna, y=atac, fill=celltype)) +
75 |   geom_point(color="black", size=4, shape=21) +
76 |   # geom_smooth(method="lm") +
77 |   stat_cor(method = "pearson") +
78 |   scale_fill_manual(values=opts$celltype.colors) +
79 |   ggrepel::geom_text_repel(aes(label=celltype), size=3, data=to.plot[rna>5 & atac>0.3]) +
80 |   labs(x="RNA expression", y="Peak accessibility", title=sprintf("%s expression vs %s accessibility",i,j)) +
81 |   theme_classic() + 
82 |   theme(
83 |     plot.title = element_text(hjust=0.5, size=rel(0.8)),
84 |     axis.text = element_text(color="black"),
85 |     legend.position = "none"
86 |   )
87 | 
88 | pdf(file.path(args$outdir,sprintf("%s_vs_%s_rna_vs_acc_pseudobulk.pdf",i,gsub("[:_]","-",j))), width = 8, height = 5)
89 | print(p)
90 | dev.off()


--------------------------------------------------------------------------------
/rna_atac/rna_vs_acc/pseudobulk/gene_expr_vs_promoter_acc/cor_gene_expr_vs_promoter_acc_pseudobulk.R:
--------------------------------------------------------------------------------
 1 | here::i_am("rna_atac/rna_vs_acc/pseudobulk/gene_expr_vs_promoter_acc/cor_gene_expr_vs_promoter_acc_pseudobulk.R")
 2 | 
 3 | source(here::here("settings.R"))
 4 | source(here::here("utils.R"))
 5 | 
 6 | ################################
 7 | ## Initialize argument parser ##
 8 | ################################
 9 | 
10 | p <- ArgumentParser(description='')
11 | p$add_argument('--sce',  type="character",              help='RNA SingleCellExperiment (pseudobulk)') 
12 | p$add_argument('--gene_score_matrix',  type="character",              help='ATAC Gene score matrix (pseudobulk)') 
13 | p$add_argument('--outfile',          type="character",                help='Output file') # a file path: its dirname is created and fwrite() writes to it
14 | args <- p$parse_args(commandArgs(TRUE))
15 | 
16 | ## START TEST ##
17 | # args <- list()
18 | # args$sce <- file.path(io$basedir,"results/rna/pseudobulk/SingleCellExperiment_pseudobulk_celltype.mapped.rds") 
19 | # args$gene_score_matrix <- file.path(io$basedir,"results/atac/archR/pseudobulk/celltype.mapped/pseudobulk_GeneScoreMatrix_TSS_summarized_experiment.rds") # io$archR.pseudobulk.GeneMatrix.se
20 | # args$outfile <- file.path(io$basedir,"results/rna_atac/gene_expr_vs_promoter_acc/pseudobulk/cor_gene_expr_vs_promoter_acc_pseudobulk.txt.gz")
21 | ## END TEST ##
22 | 
23 | #####################
24 | ## Define settings ##
25 | #####################
26 | 
27 | # I/O
28 | dir.create(dirname(args$outfile), showWarnings=FALSE, recursive=TRUE)
29 | 
30 | # Options
31 | 
32 | #######################################
33 | ## Load pseudobulk RNA and ATAC data ##
34 | #######################################
35 | 
36 | # Load SingleCellExperiment
37 | rna_pseudobulk.sce <- readRDS(args$sce)
38 | 
39 | # Load ATAC SummarizedExperiment
40 | atac_pseudobulk_GeneScoreMatrix.se <- readRDS(args$gene_score_matrix)
41 | assayNames(atac_pseudobulk_GeneScoreMatrix.se) <- "counts"
42 | 
43 | # Normalise ATAC data
44 | assay(atac_pseudobulk_GeneScoreMatrix.se,"logcounts") <- log(1e6*(sweep(assay(atac_pseudobulk_GeneScoreMatrix.se),2,colSums(assay(atac_pseudobulk_GeneScoreMatrix.se),na.rm=T),"/"))+1)
45 | 
46 | ###########################################
47 | ## Convert to long data.tables and merge ##
48 | ###########################################
49 | 
50 | rna_pseudobulk.dt <- logcounts(rna_pseudobulk.sce) %>%
51 |   as.data.table(keep.rownames = T) %>%
52 |   setnames("rn","gene") %>%
53 |   melt(id.vars="gene", variable.name="celltype", value.name="expr")
54 | # Use the normalised "logcounts" assay computed above (assay() without a name returns the first assay, i.e. the raw counts)
55 | atac_gene_scores_pseudobulk.dt <- as.matrix(assay(atac_pseudobulk_GeneScoreMatrix.se,"logcounts")) %>% t %>%
56 |   as.data.table(keep.rownames = T) %>%
57 |   setnames("rn","celltype") %>%
58 |   melt(id.vars=c("celltype"), variable.name="gene", value.name="acc")
59 | 
60 | # Merge: keep only (gene, celltype) pairs present in both RNA and ATAC
61 | rna_atac.dt <- merge(rna_pseudobulk.dt, atac_gene_scores_pseudobulk.dt, by = c("gene","celltype"))
62 | 
63 | ##########################
64 | ## Correlation analysis ##
65 | ##########################
66 | 
67 | cor.dt <- rna_atac.dt %>% copy %>% # work on a copy so rna_atac.dt is not modified by reference
68 |   .[,c("acc","expr"):=list(acc + rnorm(n=.N,mean=0,sd=1e-5), expr + rnorm(n=.N,mean=0,sd=1e-5))] %>% # tiny gaussian jitter (sd=1e-5), presumably to avoid constant vectors in cor.test; no seed is set in this script
69 |   .[, .(V1 = unlist(cor.test(acc, expr)[c("estimate", "p.value")])), by = c("gene")] %>% # two rows per gene: estimate, then p.value
70 |   .[, para := rep(c("r","p"), .N/2)] %>% # label the alternating rows as r (estimate) and p (p.value)
71 |   data.table::dcast(gene ~ para, value.var = "V1") %>% # back to one row per gene with columns p and r
72 |   .[,"padj_fdr" := list(p.adjust(p, method="fdr"))] %>%
73 |   # .[, sig := padj_fdr<=0.10] %>% 
74 |   setorder(padj_fdr, na.last = T) # most significant genes first, NAs at the bottom
75 | # Format for output: p-values are converted with format() (3 significant digits), r is rounded to 3 decimals
76 | cor.dt[,c("p","r","padj_fdr"):=list(format(p,digits=3),round(r,3), format(padj_fdr,digits=3))]
77 | 
78 | # Save
79 | fwrite(cor.dt, args$outfile, sep="\t", quote=F)
80 | 


--------------------------------------------------------------------------------
/rna_atac/rna_vs_acc/pseudobulk/gene_markers_rna_vs_acc/plot_number_markers.R:
--------------------------------------------------------------------------------
  1 | #####################
  2 | ## Define settings ##
  3 | #####################
  4 | 
  5 | source(here::here("settings.R"))
  6 | source(here::here("utils.R"))
  7 | 
  8 | # I/O (the output directory is created up front because pdf() cannot write into a missing directory)
  9 | io$basedir <- file.path(io$basedir,"test")
 10 | io$marker_genes_rna <- file.path(io$basedir,"results/rna/differential/pseudobulk/celltype/parsed/marker_genes_filtered.txt.gz")
 11 | io$marker_peaks_atac <- file.path(io$basedir,"results/atac/archR/differential/pseudobulk/celltype/PeakMatrix/parsed/markers_filt.txt.gz")
 12 | io$outdir <- file.path(io$basedir,"results/rna_atac/rna_vs_acc/pseudobulk/gene_markers_rna_vs_acc"); dir.create(io$outdir, showWarnings=F, recursive=T)
 13 | 
 14 | # Options
 15 | opts$celltypes <- c(
 16 |   "Epiblast",
 17 |   "Primitive_Streak",
 18 |   "Caudal_epiblast",
 19 |   "PGC",
 20 |   # "Anterior_Primitive_Streak",
 21 |   "Notochord",
 22 |   "Def._endoderm",
 23 |   "Gut",
 24 |   "Nascent_mesoderm",
 25 |   # "Mixed_mesoderm",
 26 |   "Intermediate_mesoderm",
 27 |   "Caudal_Mesoderm",
 28 |   "Paraxial_mesoderm",
 29 |   "Somitic_mesoderm",
 30 |   "Pharyngeal_mesoderm",
 31 |   "Cardiomyocytes",
 32 |   "Allantois",
 33 |   "ExE_mesoderm",
 34 |   # "Mesenchyme",
 35 |   "Haematoendothelial_progenitors",
 36 |   "Endothelium",
 37 |   "Blood_progenitors_1",
 38 |   "Blood_progenitors_2",
 39 |   "Erythroid1",
 40 |   "Erythroid2",
 41 |   "Erythroid3",
 42 |   "NMP",
 43 |   "Rostral_neurectoderm",
 44 |   # "Caudal_neurectoderm",
 45 |   "Neural_crest",
 46 |   "Forebrain_Midbrain_Hindbrain",
 47 |   "Spinal_cord",
 48 |   "Surface_ectoderm"
 49 |   # "Visceral_endoderm",
 50 |   # "ExE_endoderm",
 51 |   # "ExE_ectoderm",
 52 |   # "Parietal_endoderm"
 53 | )
 54 | 
 55 | 
 56 | ###############
 57 | ## Load data ##
 58 | ###############
 59 | 
 60 | markers_genes_rna.dt <- fread(io$marker_genes_rna) %>% .[celltype%in%opts$celltypes]
 61 | marker_peaks_atac.dt <- fread(io$marker_peaks_atac) %>% .[celltype%in%opts$celltypes]
 62 | 
 63 | ##########
 64 | ## Plot ##
 65 | ##########
 66 | 
 67 | # Plot number of marker genes per cell types
 68 | to.plot <- rbind(
 69 |   markers_genes_rna.dt %>% .[,.N,by=c("celltype")] %>% .[,class:="Genes"],
 70 |   marker_peaks_atac.dt %>% .[,.N,by=c("celltype")] %>% .[,class:="ATAC peaks"]
 71 | ) %>% .[,class:=factor(class, levels=c("Genes","ATAC peaks"))]
 72 | 
 73 | # Rename celltypes; every renamed label (incl. "Brain") inherits the colour of its original celltype so no bar is left without a fill
 74 | opts$rename.celltypes <- c(
 75 |   "Forebrain_Midbrain_Hindbrain" = "Brain",
 76 |   "Haematoendothelial_progenitors" = "Haematoend. progenitors"
 77 | )
 78 | to.plot %>% .[,celltype:=stringr::str_replace_all(celltype,opts$rename.celltypes)] %>% .[,celltype:=gsub("_"," ",celltype)]
 79 | opts$celltype.colors[unname(opts$rename.celltypes)] <- opts$celltype.colors[names(opts$rename.celltypes)]
 80 | names(opts$celltype.colors) <- gsub("_"," ",names(opts$celltype.colors))
 81 | 
 82 | # Plot
 83 | p <- ggbarplot(to.plot, x="celltype", y="N", fill="celltype") +
 84 |   facet_wrap(~class, ncol=2, scales="free_y") +
 85 |   scale_fill_manual(values=opts$celltype.colors) +
 86 |   labs(x="", y="Number of markers") +
 87 |   theme(
 88 |     strip.background = element_rect(colour="black", fill=NA),
 89 |     axis.text.y = element_text(size=rel(0.65)),
 90 |     axis.text.x = element_text(colour="black",size=rel(0.6), angle=90, hjust=1, vjust=0.5),
 91 |     axis.title = element_text(colour="black",size=rel(0.75)),
 92 |     axis.ticks.x = element_blank(),
 93 |     legend.position = "none"
 94 | )
 95 | 
 96 | pdf(file.path(io$outdir,"barplot_number_markers.pdf"), width = 7, height = 4)
 97 | print(p)
 98 | dev.off()
 99 | 
100 | 


--------------------------------------------------------------------------------
/rna_atac/rna_vs_chromvar_chip/pseudobulk/per_gene/fig/plot_rna_vs_chromvar_per_gene_pseudobulk_fig.R:
--------------------------------------------------------------------------------
 1 | source(here::here("settings.R"))
 2 | source(here::here("utils.R"))
 3 | 
 4 | #####################
 5 | ## Define settings ##
 6 | #####################
 7 | 
 8 | # Options
 9 | opts$motif_annotation <- "CISBP"
10 | 
11 | # I/O
12 | io$rna_sce_pseudobulk_file <- file.path(io$basedir,"results/rna/pseudobulk/celltype/SingleCellExperiment_TFs_pseudobulk.rds")
13 | io$atac_chromvar_chip_pseudobulk_file <- file.path(io$basedir,sprintf("results/atac/archR/chromvar_chip/pseudobulk/chromVAR_chip_%s_archr.rds",opts$motif_annotation))
14 | io$outdir <- file.path(io$basedir,"results/rna_atac/rna_vs_chromvar_chip/pseudobulk/per_gene/fig"); dir.create(io$outdir, showWarnings=F, recursive = T)
15 | 
16 | #######################################
17 | ## Load pseudobulk RNA and ATAC data ##
18 | #######################################
19 | 
20 | # Load pseudobulk RNA expression
21 | rna_pseudobulk_tf.se <- readRDS(io$rna_sce_pseudobulk_file)
22 | 
23 | # Load chromVAR matrix
24 | atac_chromvar_pseudobulk.se <- readRDS(io$atac_chromvar_chip_pseudobulk_file)
25 | 
26 | # Select TFs
27 | TFs <- intersect(rownames(rna_pseudobulk_tf.se),rownames(atac_chromvar_pseudobulk.se))
28 | rna_pseudobulk_tf.se <- rna_pseudobulk_tf.se[TFs,]
29 | atac_chromvar_pseudobulk.se <- atac_chromvar_pseudobulk.se[TFs,]
30 | 
31 | ########################
32 | ## Prepare data table ##
33 | ########################
34 | 
35 | atac_chromvar_pseudobulk.dt <- assay(atac_chromvar_pseudobulk.se) %>% t %>%
36 |   as.data.table(keep.rownames = T) %>%
37 |   setnames("rn","celltype") %>%
38 |   melt(id.vars=c("celltype"), variable.name="gene", value.name="chromvar_zscore")
39 | 
40 | rna_tf_pseudobulk.dt <- logcounts(rna_pseudobulk_tf.se) %>%
41 |   as.data.table(keep.rownames = T) %>%
42 |   setnames("rn","gene") %>%
43 |   data.table::melt(id.vars="gene", variable.name="celltype", value.name="expr")
44 | 
45 | ###########
46 | ## Merge ##
47 | ###########
48 | 
49 | rna_chromvar.dt <- merge(
50 |   rna_tf_pseudobulk.dt,
51 |   atac_chromvar_pseudobulk.dt,
52 |   by = c("celltype","gene")
53 | )
54 | 
55 | ######################################
56 | ## Scatter plot of individual genes ##
57 | ######################################
58 | 
59 | genes.to.plot <- unique(rna_chromvar.dt$gene)
60 | genes.to.plot <- c("FOXA2","FOXB1","FOXC2")
61 | 
62 | # i <- "FOXA2"
63 | for (i in genes.to.plot) {
64 | 
65 |   to.plot <- rna_chromvar.dt[gene==i]
66 | 
67 |   to.plot.text <- rbind(
68 |     to.plot %>% setorder(-expr) %>% head(n=7),
69 |     to.plot %>% setorder(-chromvar_zscore) %>% head(n=7)
70 |   ) %>% unique
71 |   
72 |   p <- ggscatter(to.plot, x="expr", y="chromvar_zscore", fill="celltype", size=5.5, shape=21, 
73 |                   add="reg.line", add.params = list(color="black", fill="lightgray"), conf.int=TRUE) +
74 |     stat_cor(method = "pearson", label.x.npc = "middle", label.y.npc = "bottom") +
75 |     ggrepel::geom_text_repel(data=to.plot.text, aes(label=gsub("_"," ",celltype)), size=3.5) +
76 |     scale_fill_manual(values=opts$celltype.colors) +
77 |     # labs(x=sprintf("%s expression",i), y=sprintf("Accessibility of %s targets (z-score)",i)) +
78 |     labs(x="RNA expression", y="chromVAR-ChIP") +
79 |     guides(fill="none") +
80 |     theme(
81 |       axis.text = element_text(size=rel(0.85))
82 |     )
83 |   
84 |   pdf(file.path(io$outdir,sprintf("%s_%s_rna_vs_chromvar_chip_pseudobulk.pdf",i,opts$motif_annotation)), width = 5.5, height = 4)
85 |   print(p)
86 |   dev.off()
87 | }
88 | 
89 | 


--------------------------------------------------------------------------------
/rna_atac/snakemake/run_cluster.sh:
--------------------------------------------------------------------------------
1 | snakemake --cores 15 -j 99 --latency-wait 90 -p --cluster "sbatch -n {threads} --mem {resources.mem_mb}M"


--------------------------------------------------------------------------------
/rna_atac/virtual_chipseq_library/metacells/analysis/virtual_chipseq_metacells_exploration.R:
--------------------------------------------------------------------------------
 1 | 
 2 | source(here::here("settings.R"))
 3 | source(here::here("utils.R"))
 4 | 
 5 | #####################
 6 | ## Define settings ##
 7 | #####################
 8 | 
 9 | # Options
10 | opts$motif_annotation <- "CISBP"
11 | opts$trajectory <- "nmp"
12 | 
13 | # I/O
14 | io$rna_metacells.sce <- file.path(io$basedir, 'results/rna/metacells/trajectories/nmp/SingleCellExperiment_metacells.rds')
15 | io$metacell_metadata <- file.path(io$basedir, 'results/atac/archR/metacells/trajectories/nmp/PeakMatrix/metacells_metadata.txt.gz')
16 | io$archR.peakMatrix.metacells <- file.path(io$basedir,"results/atac/archR/metacells/trajectories/nmp/PeakMatrix/PeakMatrix_summarized_experiment_metacells.rds")
17 | io$virtual_chip.mtx <- file.path(io$basedir,sprintf("results/rna_atac/virtual_chipseq/metacells/trajectories/nmp/%s/virtual_chip_matrix.rds",opts$motif_annotation))
18 | io$trajectory <- "nmp"
19 | io$trajectory_file <- file.path(io$basedir,"results/rna/metacells/trajectories/nmp/metacell_trajectory.txt.gz")
20 | io$outdir <-  file.path(io$basedir,sprintf("results/rna_atac/virtual_chipseq/metacells/trajectories/nmp/%s/pdf",opts$trajectory)); dir.create(io$outdir, showWarnings = F)
21 | 
22 | 
23 | if (io$trajectory=="nmp") {
24 |   celltypes.to.plot <- c("Caudal_Mesoderm", "Somitic_mesoderm", "NMP", "Spinal_cord")
25 | }
26 | 
27 | ###################
28 | ## Load metadata ##
29 | ###################
30 | 
31 | metadata.dt <- fread(io$metacell_metadata) %>% .[celltype%in%celltypes.to.plot]
32 | 
33 | #####################
34 | ## Load trajectory ##
35 | #####################
36 | 
37 | trajectory.dt <- fread(io$trajectory_file) %>% setnames(c("metacell","V1","V2"))
38 | 
39 | ##################################
40 | ## Load virtual ChIP-seq matrix ##
41 | ##################################
42 | 
43 | virtual_chip.mtx <- readRDS(io$virtual_chip.mtx)
44 | 
45 | ##################################
46 | ## Load chromatin accessibility ##
47 | ##################################
48 | 
49 | atac_peakMatrix_metacells.se <- readRDS(io$archR.peakMatrix.metacells)
50 | 
51 | metacells <- intersect(trajectory.dt$metacell,colnames(atac_peakMatrix_metacells.se))
52 | 
53 | # Normalise ATAC data
54 | assayNames(atac_peakMatrix_metacells.se) <- "counts"
55 | assay(atac_peakMatrix_metacells.se,"logcounts") <- log(1e6*(sweep(assay(atac_peakMatrix_metacells.se),2,colSums(assay(atac_peakMatrix_metacells.se),na.rm=T),"/"))+1)
56 | 
57 | ##########
58 | ## Plot ##
59 | ##########
60 | 
61 | brachyury_binding_sites <- virtual_chip.mtx[,"T"][virtual_chip.mtx[,"T"]>=0.40] %>% sort
62 | 
63 | tmp <- assay(atac_peakMatrix_metacells.se,"logcounts")[names(brachyury_binding_sites),]
64 | 
65 | to.plot <- data.table(
66 |   acc = colMeans(tmp),
67 |   metacell = colnames(tmp)
68 | ) %>% merge(trajectory.dt,by="metacell")
69 | 
70 | p <- ggplot(to.plot, aes(x=V1, y=V2)) +
71 |   geom_point(aes(fill=acc), size=2.5, shape=21, stroke=0.25) +
72 |   # facet_wrap(~gene) +
73 |   scale_fill_gradient(low = "gray95", high = "darkgreen") +
74 |   labs(x="Force-directed layout (Dim 1)", y="Force-directed layout (Dim 2)") +
75 |   theme_classic() +
76 |   ggplot_theme_NoAxes() +
77 |   theme(
78 |     legend.position = "right"
79 |   )
80 | 
81 | pdf(file.path(io$outdir,sprintf("network_coloured_by_%s_expr.pdf",i)), width = 5, height = 5.5)
82 | print(p)
83 | dev.off()
84 | 
85 | 
86 | 


--------------------------------------------------------------------------------
/rna_atac/virtual_chipseq_library/pseudobulk/analysis/virtual_chipseq_exploration.R:
--------------------------------------------------------------------------------
 1 | # here::i_am("rna_atac/virtual_chipseq_library/virtual_chipseq_plot_stats.R")
 2 | 
 3 | # Load default settings
 4 | source(here::here("settings.R"))
 5 | source(here::here("utils.R"))
 6 | 
 7 | #####################
 8 | ## Define settings ##
 9 | #####################
10 | 
11 | # Options
12 | opts$motif_annotation <- "CISBP"
13 | 
14 | opts$TFs <- c("TAL1", "GATA1", "RUNX1", "FOXA2", "GATA4", "CDX2","NKX2-5","TBX5", "SOX10")
15 | opts$TFs <- c("T")
16 | 
17 | # I/O
18 | io$virtual_chip.dir <- file.path(io$basedir,sprintf("results/rna_atac/virtual_chipseq/pseudobulk/%s",opts$motif_annotation))
19 | io$virtual_chip.mtx <- file.path(io$virtual_chip.dir,"virtual_chip_matrix.rds")
20 | io$outdir <- file.path(io$basedir,sprintf("results/rna_atac/virtual_chipseq/pseudobulk/%s/test",opts$motif_annotation)); dir.create(io$outdir, showWarnings = F)
21 | 
22 | ###################################
23 | ## Load virtual ChIP-seq library ##
24 | ###################################
25 | 
26 | # Load detailed data.tables
27 | virtual_chip.dt <- opts$TFs %>% map(function(i) {
28 |     fread(sprintf("%s/%s.txt.gz",io$virtual_chip.dir,i)) %>%
29 |     # .[,c("chr","start","end"):=NULL] %>%
30 |     .[,tf:=i] %>%
31 |     return
32 | }) %>% rbindlist
33 | 
34 | # Load matrix
35 | virtual_chip.mtx <- readRDS(io$virtual_chip.mtx)
36 | 
37 | 
38 | #######################
39 | ## Explore Brachyury ##
40 | #######################
41 | 
42 | to.plot <- virtual_chip.dt[abs(score)>=0.20 & motif_score>=0.30] %>%
43 |     .[,sign:=as.factor(c("Repressor","Activator")[(correlation_score>0)+1])]
44 | 
45 | ggbarplot(to.plot[,.N,by=c("sign")], x="sign", y="N", fill="gray70") +
46 |   labs(x="", y="Number of in silico T binding events")
47 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
import anndata
import numpy as np
import pandas as pd
import scanpy as sc
import scipy as s
from scipy.sparse import csr_matrix, issparse

 6 | def load_adata(adata_file, metadata_file = None, normalise = False, cells = None, cell_column = "cell", features = None, filter_lowly_expressed_genes = False, set_colors = False, keep_counts=False):
 7 | 
 8 | 	adata = sc.read(adata_file)
 9 | 
10 | 	# Convert to sparse matrices
11 | 	if not s.sparse.issparse(adata.X):
12 | 		adata.X = csr_matrix(adata.X)
13 | 	if len(adata.layers.keys())>0:
14 | 		for i in list(adata.layers.keys()):
15 | 			if not issparse(adata.layers[i]):
16 | 				adata.layers[i] = csr_matrix(adata.layers[i])
17 | 
18 | 	if cells is not None:
19 | 		tmp = np.mean(np.isin(cells,adata.obs.index.values)==False)
20 | 		if tmp<1: print("%.2f%% of cells provided are not observed in the adata, taking the intersect..." % (100*tmp))
21 | 		cells = np.intersect1d(cells,adata.obs.index.values)
22 | 		adata = adata[cells,:]
23 | 
24 | 	if features is not None:
25 | 		adata = adata[:,features]
26 | 
27 | 	if metadata_file is not None:
28 | 		metadata = pd.read_table(metadata_file, delimiter="\t", header=0).set_index(cell_column, drop=False)
29 | 		metadata = metadata.loc[cells]
30 | 		assert np.all(adata.obs.index.isin(metadata[cell_column]))
31 | 		# assert np.all(metadata.cell.isin(adata.obs.index))
32 | 		assert metadata.shape[0] == adata.shape[0]
33 | 		adata.obs = metadata#.reindex(adata.obs.index)
34 | 
35 | 	if filter_lowly_expressed_genes:
36 | 		sc.pp.filter_genes(adata, min_counts=10)
37 | 
38 | 	if keep_counts:
39 | 		adata.layers["raw"] = adata.X.copy()
40 | 
41 | 	if normalise:
42 | 		sc.pp.normalize_total(adata, target_sum=None, exclude_highly_expressed=False)
43 | 		sc.pp.log1p(adata)
44 | 
45 | 	if set_colors:
46 | 		colPalette_celltypes = [opts["celltype_colors"][i.replace(" ","_").replace("/","_")] for i in sorted(np.unique(adata.obs['celltype']))]
47 | 		adata.uns['celltype_colors'] = colPalette_celltypes
48 | 		colPalette_stages = [opts["stage_colors"][i.replace(" ","_").replace("/","_")] for i in sorted(np.unique(adata.obs['stage']))]
49 | 		adata.uns['stage_colors'] = colPalette_stages
50 | 
51 | 	return adata
52 | 
def scale(X, x_min, x_max):
    """Min-max scale X along axis 0 to the range [x_min, x_max].

    For 2-D input every column is rescaled independently; 1-D input is
    rescaled as a single vector. A column (or vector) whose values are all
    equal has zero range and is mapped to ``x_min``.

    Args:
        X: Array-like exposing ``min(axis=0)`` / ``max(axis=0)``, e.g. a NumPy
            array.
        x_min: Lower bound of the target range.
        x_max: Upper bound of the target range.

    Returns:
        The rescaled values, with the same shape as ``X``.
    """
    col_min = X.min(axis=0)
    span = X.max(axis=0) - col_min
    if np.ndim(span) == 0:
        # 1-D input: the range is a scalar, which has no item assignment
        if span == 0:
            span = 1
    else:
        # Avoid dividing by zero for constant columns
        span[span == 0] = 1
    return x_min + (X - col_min) * (x_max - x_min) / span


# Example: cmap = custom_div_cmap(11, mincol='g', midcol='0.9', maxcol='CornflowerBlue')
def custom_div_cmap(numcolors=11, name='custom_div_cmap',
                    mincol='blue', midcol='white', maxcol='red'):
    """Build a diverging colormap running mincol -> midcol -> maxcol.

    With the defaults this is a blue-white-red map quantised into 11 colors.
    Each color may be given in any form accepted by
    matplotlib.colors.ColorConverter.to_rgb().
    """
    # Imported here so that matplotlib is only needed when a colormap is built
    from matplotlib.colors import LinearSegmentedColormap

    anchor_colors = [mincol, midcol, maxcol]
    return LinearSegmentedColormap.from_list(name=name, colors=anchor_colors, N=numcolors)


--------------------------------------------------------------------------------